diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,126231 @@ +{ + "best_metric": 0.29905760288238525, + "best_model_checkpoint": "./w2v-bert-2.0-chichewa_34_102h/checkpoint-13000", + "epoch": 14.173296573454115, + "eval_steps": 1000, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007877116975187081, + "grad_norm": 19.958507537841797, + "learning_rate": 3.0000000000000004e-09, + "loss": 8.4444, + "step": 1 + }, + { + "epoch": 0.0015754233950374162, + "grad_norm": 19.452104568481445, + "learning_rate": 6.000000000000001e-09, + "loss": 8.0868, + "step": 2 + }, + { + "epoch": 0.0023631350925561244, + "grad_norm": 21.658512115478516, + "learning_rate": 9e-09, + "loss": 8.7593, + "step": 3 + }, + { + "epoch": 0.0031508467900748325, + "grad_norm": 20.213897705078125, + "learning_rate": 1.2000000000000002e-08, + "loss": 8.3121, + "step": 4 + }, + { + "epoch": 0.003938558487593541, + "grad_norm": 20.489686965942383, + "learning_rate": 1.5000000000000002e-08, + "loss": 8.4089, + "step": 5 + }, + { + "epoch": 0.004726270185112249, + "grad_norm": 23.097007751464844, + "learning_rate": 1.8e-08, + "loss": 9.2043, + "step": 6 + }, + { + "epoch": 0.005513981882630957, + "grad_norm": 27.332561492919922, + "learning_rate": 2.1e-08, + "loss": 10.5783, + "step": 7 + }, + { + "epoch": 0.006301693580149665, + "grad_norm": 28.446813583374023, + "learning_rate": 2.4000000000000003e-08, + "loss": 10.9717, + "step": 8 + }, + { + "epoch": 0.0070894052776683735, + "grad_norm": 25.912334442138672, + "learning_rate": 2.7e-08, + "loss": 10.0423, + "step": 9 + }, + { + "epoch": 0.007877116975187082, + "grad_norm": 26.995702743530273, + "learning_rate": 3.0000000000000004e-08, + "loss": 10.421, + "step": 10 + }, + { + "epoch": 0.00866482867270579, + "grad_norm": 26.43002700805664, + "learning_rate": 3.3000000000000004e-08, + "loss": 10.1292, + "step": 11 + }, + { + "epoch": 0.009452540370224497, + "grad_norm": 25.203683853149414, + "learning_rate": 3.6e-08, + "loss": 9.7723, + "step": 12 + }, + { + "epoch": 0.010240252067743205, + "grad_norm": 26.768142700195312, + "learning_rate": 3.9e-08, + "loss": 10.3128, + "step": 13 + }, + { + "epoch": 0.011027963765261915, + "grad_norm": 26.809391021728516, + "learning_rate": 4.2e-08, + "loss": 10.3475, + "step": 14 + }, + { + "epoch": 0.011815675462780622, + "grad_norm": 26.187631607055664, + "learning_rate": 4.5e-08, + "loss": 10.1165, + "step": 15 + }, + { + "epoch": 0.01260338716029933, + "grad_norm": 26.220365524291992, + "learning_rate": 4.8000000000000006e-08, + "loss": 10.1158, + "step": 16 + }, + { + "epoch": 0.01339109885781804, + "grad_norm": 26.761127471923828, + "learning_rate": 5.1e-08, + "loss": 10.2245, + "step": 17 + }, + { + "epoch": 0.014178810555336747, + "grad_norm": 25.401134490966797, + "learning_rate": 5.4e-08, + "loss": 9.8784, + "step": 18 + }, + { + "epoch": 0.014966522252855455, + "grad_norm": 28.075042724609375, + "learning_rate": 5.7e-08, + "loss": 10.4969, + "step": 19 + }, + { + "epoch": 0.015754233950374164, + "grad_norm": 26.30954933166504, + "learning_rate": 6.000000000000001e-08, + "loss": 10.2208, + "step": 20 + }, + { + "epoch": 0.01654194564789287, + "grad_norm": 26.42831039428711, + "learning_rate": 6.3e-08, + "loss": 10.1168, + "step": 21 + }, + { + "epoch": 0.01732965734541158, + "grad_norm": 25.461009979248047, + "learning_rate": 
6.600000000000001e-08, + "loss": 9.9133, + "step": 22 + }, + { + "epoch": 0.01811736904293029, + "grad_norm": 26.674026489257812, + "learning_rate": 6.9e-08, + "loss": 10.1306, + "step": 23 + }, + { + "epoch": 0.018905080740448995, + "grad_norm": 27.038665771484375, + "learning_rate": 7.2e-08, + "loss": 10.1647, + "step": 24 + }, + { + "epoch": 0.019692792437967704, + "grad_norm": 26.607969284057617, + "learning_rate": 7.500000000000001e-08, + "loss": 10.1849, + "step": 25 + }, + { + "epoch": 0.02048050413548641, + "grad_norm": 25.706941604614258, + "learning_rate": 7.8e-08, + "loss": 9.8337, + "step": 26 + }, + { + "epoch": 0.02126821583300512, + "grad_norm": 26.11414337158203, + "learning_rate": 8.100000000000001e-08, + "loss": 10.2067, + "step": 27 + }, + { + "epoch": 0.02205592753052383, + "grad_norm": 25.452116012573242, + "learning_rate": 8.4e-08, + "loss": 9.8234, + "step": 28 + }, + { + "epoch": 0.022843639228042535, + "grad_norm": 27.956953048706055, + "learning_rate": 8.7e-08, + "loss": 10.3007, + "step": 29 + }, + { + "epoch": 0.023631350925561245, + "grad_norm": 25.951942443847656, + "learning_rate": 9e-08, + "loss": 9.9008, + "step": 30 + }, + { + "epoch": 0.024419062623079954, + "grad_norm": 25.920316696166992, + "learning_rate": 9.3e-08, + "loss": 9.8724, + "step": 31 + }, + { + "epoch": 0.02520677432059866, + "grad_norm": 26.323375701904297, + "learning_rate": 9.600000000000001e-08, + "loss": 10.1081, + "step": 32 + }, + { + "epoch": 0.02599448601811737, + "grad_norm": 26.962894439697266, + "learning_rate": 9.9e-08, + "loss": 10.0565, + "step": 33 + }, + { + "epoch": 0.02678219771563608, + "grad_norm": 24.09636116027832, + "learning_rate": 1.02e-07, + "loss": 9.3971, + "step": 34 + }, + { + "epoch": 0.027569909413154785, + "grad_norm": 25.85160255432129, + "learning_rate": 1.05e-07, + "loss": 9.7801, + "step": 35 + }, + { + "epoch": 0.028357621110673494, + "grad_norm": 26.924654006958008, + "learning_rate": 1.08e-07, + "loss": 9.9326, + "step": 36 + }, + { + "epoch": 0.0291453328081922, + "grad_norm": 25.226865768432617, + "learning_rate": 1.11e-07, + "loss": 9.326, + "step": 37 + }, + { + "epoch": 0.02993304450571091, + "grad_norm": 27.227313995361328, + "learning_rate": 1.14e-07, + "loss": 9.8736, + "step": 38 + }, + { + "epoch": 0.03072075620322962, + "grad_norm": 26.056306838989258, + "learning_rate": 1.17e-07, + "loss": 9.5151, + "step": 39 + }, + { + "epoch": 0.03150846790074833, + "grad_norm": 27.066823959350586, + "learning_rate": 1.2000000000000002e-07, + "loss": 9.7922, + "step": 40 + }, + { + "epoch": 0.032296179598267034, + "grad_norm": 25.500141143798828, + "learning_rate": 1.23e-07, + "loss": 9.6827, + "step": 41 + }, + { + "epoch": 0.03308389129578574, + "grad_norm": 25.60626792907715, + "learning_rate": 1.26e-07, + "loss": 9.4549, + "step": 42 + }, + { + "epoch": 0.03387160299330445, + "grad_norm": 25.086023330688477, + "learning_rate": 1.29e-07, + "loss": 9.2719, + "step": 43 + }, + { + "epoch": 0.03465931469082316, + "grad_norm": 26.919702529907227, + "learning_rate": 1.3200000000000002e-07, + "loss": 9.7412, + "step": 44 + }, + { + "epoch": 0.035447026388341865, + "grad_norm": 25.042985916137695, + "learning_rate": 1.35e-07, + "loss": 9.2755, + "step": 45 + }, + { + "epoch": 0.03623473808586058, + "grad_norm": 25.056926727294922, + "learning_rate": 1.38e-07, + "loss": 9.1482, + "step": 46 + }, + { + "epoch": 0.037022449783379284, + "grad_norm": 24.650362014770508, + "learning_rate": 1.41e-07, + "loss": 9.0814, + "step": 47 + }, + { + "epoch": 
0.03781016148089799, + "grad_norm": 24.338790893554688, + "learning_rate": 1.44e-07, + "loss": 8.9702, + "step": 48 + }, + { + "epoch": 0.0385978731784167, + "grad_norm": 26.17897605895996, + "learning_rate": 1.47e-07, + "loss": 9.4058, + "step": 49 + }, + { + "epoch": 0.03938558487593541, + "grad_norm": 23.722810745239258, + "learning_rate": 1.5000000000000002e-07, + "loss": 8.6713, + "step": 50 + }, + { + "epoch": 0.040173296573454115, + "grad_norm": 22.906557083129883, + "learning_rate": 1.53e-07, + "loss": 8.9662, + "step": 51 + }, + { + "epoch": 0.04096100827097282, + "grad_norm": 21.931827545166016, + "learning_rate": 1.56e-07, + "loss": 8.5539, + "step": 52 + }, + { + "epoch": 0.041748719968491534, + "grad_norm": 20.994794845581055, + "learning_rate": 1.59e-07, + "loss": 8.063, + "step": 53 + }, + { + "epoch": 0.04253643166601024, + "grad_norm": 20.733474731445312, + "learning_rate": 1.6200000000000002e-07, + "loss": 8.0299, + "step": 54 + }, + { + "epoch": 0.043324143363528946, + "grad_norm": 22.25860023498535, + "learning_rate": 1.6499999999999998e-07, + "loss": 8.4037, + "step": 55 + }, + { + "epoch": 0.04411185506104766, + "grad_norm": 23.774017333984375, + "learning_rate": 1.68e-07, + "loss": 8.9388, + "step": 56 + }, + { + "epoch": 0.044899566758566364, + "grad_norm": 25.937002182006836, + "learning_rate": 1.71e-07, + "loss": 9.4147, + "step": 57 + }, + { + "epoch": 0.04568727845608507, + "grad_norm": 28.94572639465332, + "learning_rate": 1.74e-07, + "loss": 10.1063, + "step": 58 + }, + { + "epoch": 0.04647499015360378, + "grad_norm": 27.67974853515625, + "learning_rate": 1.77e-07, + "loss": 10.0893, + "step": 59 + }, + { + "epoch": 0.04726270185112249, + "grad_norm": 29.100597381591797, + "learning_rate": 1.8e-07, + "loss": 10.1185, + "step": 60 + }, + { + "epoch": 0.048050413548641195, + "grad_norm": 28.854867935180664, + "learning_rate": 1.83e-07, + "loss": 9.8934, + "step": 61 + }, + { + "epoch": 0.04883812524615991, + "grad_norm": 28.347305297851562, + "learning_rate": 1.86e-07, + "loss": 9.8679, + "step": 62 + }, + { + "epoch": 0.049625836943678614, + "grad_norm": 29.261558532714844, + "learning_rate": 1.89e-07, + "loss": 10.1203, + "step": 63 + }, + { + "epoch": 0.05041354864119732, + "grad_norm": 29.437442779541016, + "learning_rate": 1.9200000000000003e-07, + "loss": 10.077, + "step": 64 + }, + { + "epoch": 0.05120126033871603, + "grad_norm": 29.34528350830078, + "learning_rate": 1.9499999999999999e-07, + "loss": 10.0344, + "step": 65 + }, + { + "epoch": 0.05198897203623474, + "grad_norm": 28.63390350341797, + "learning_rate": 1.98e-07, + "loss": 9.8725, + "step": 66 + }, + { + "epoch": 0.052776683733753445, + "grad_norm": 29.416051864624023, + "learning_rate": 2.01e-07, + "loss": 9.9658, + "step": 67 + }, + { + "epoch": 0.05356439543127216, + "grad_norm": 29.293710708618164, + "learning_rate": 2.04e-07, + "loss": 9.8626, + "step": 68 + }, + { + "epoch": 0.054352107128790864, + "grad_norm": 28.185775756835938, + "learning_rate": 2.0700000000000001e-07, + "loss": 9.7124, + "step": 69 + }, + { + "epoch": 0.05513981882630957, + "grad_norm": 29.568822860717773, + "learning_rate": 2.1e-07, + "loss": 9.7968, + "step": 70 + }, + { + "epoch": 0.055927530523828275, + "grad_norm": 29.27025604248047, + "learning_rate": 2.1300000000000001e-07, + "loss": 9.6618, + "step": 71 + }, + { + "epoch": 0.05671524222134699, + "grad_norm": 28.70044708251953, + "learning_rate": 2.16e-07, + "loss": 9.6289, + "step": 72 + }, + { + "epoch": 0.057502953918865694, + "grad_norm": 
28.25299072265625, + "learning_rate": 2.1900000000000002e-07, + "loss": 9.5471, + "step": 73 + }, + { + "epoch": 0.0582906656163844, + "grad_norm": 29.431013107299805, + "learning_rate": 2.22e-07, + "loss": 9.5698, + "step": 74 + }, + { + "epoch": 0.05907837731390311, + "grad_norm": 30.497535705566406, + "learning_rate": 2.25e-07, + "loss": 9.7989, + "step": 75 + }, + { + "epoch": 0.05986608901142182, + "grad_norm": 29.945486068725586, + "learning_rate": 2.28e-07, + "loss": 9.6243, + "step": 76 + }, + { + "epoch": 0.060653800708940525, + "grad_norm": 30.446632385253906, + "learning_rate": 2.3100000000000002e-07, + "loss": 9.7724, + "step": 77 + }, + { + "epoch": 0.06144151240645924, + "grad_norm": 31.395265579223633, + "learning_rate": 2.34e-07, + "loss": 9.7699, + "step": 78 + }, + { + "epoch": 0.062229224103977944, + "grad_norm": 29.687305450439453, + "learning_rate": 2.3700000000000002e-07, + "loss": 9.4155, + "step": 79 + }, + { + "epoch": 0.06301693580149666, + "grad_norm": 30.59857177734375, + "learning_rate": 2.4000000000000003e-07, + "loss": 9.5445, + "step": 80 + }, + { + "epoch": 0.06380464749901536, + "grad_norm": 29.442880630493164, + "learning_rate": 2.43e-07, + "loss": 9.3082, + "step": 81 + }, + { + "epoch": 0.06459235919653407, + "grad_norm": 29.976268768310547, + "learning_rate": 2.46e-07, + "loss": 9.4012, + "step": 82 + }, + { + "epoch": 0.06538007089405277, + "grad_norm": 30.01825523376465, + "learning_rate": 2.49e-07, + "loss": 9.36, + "step": 83 + }, + { + "epoch": 0.06616778259157148, + "grad_norm": 28.501314163208008, + "learning_rate": 2.52e-07, + "loss": 8.9912, + "step": 84 + }, + { + "epoch": 0.06695549428909019, + "grad_norm": 28.62384605407715, + "learning_rate": 2.5500000000000005e-07, + "loss": 9.0183, + "step": 85 + }, + { + "epoch": 0.0677432059866089, + "grad_norm": 30.718854904174805, + "learning_rate": 2.58e-07, + "loss": 9.4174, + "step": 86 + }, + { + "epoch": 0.06853091768412761, + "grad_norm": 32.05936050415039, + "learning_rate": 2.6099999999999997e-07, + "loss": 9.6124, + "step": 87 + }, + { + "epoch": 0.06931862938164632, + "grad_norm": 30.501577377319336, + "learning_rate": 2.6400000000000003e-07, + "loss": 9.0303, + "step": 88 + }, + { + "epoch": 0.07010634107916502, + "grad_norm": 31.52448081970215, + "learning_rate": 2.67e-07, + "loss": 9.3106, + "step": 89 + }, + { + "epoch": 0.07089405277668373, + "grad_norm": 30.112722396850586, + "learning_rate": 2.7e-07, + "loss": 8.953, + "step": 90 + }, + { + "epoch": 0.07168176447420244, + "grad_norm": 31.769365310668945, + "learning_rate": 2.73e-07, + "loss": 9.2196, + "step": 91 + }, + { + "epoch": 0.07246947617172116, + "grad_norm": 30.418306350708008, + "learning_rate": 2.76e-07, + "loss": 8.7356, + "step": 92 + }, + { + "epoch": 0.07325718786923986, + "grad_norm": 31.046571731567383, + "learning_rate": 2.79e-07, + "loss": 8.8623, + "step": 93 + }, + { + "epoch": 0.07404489956675857, + "grad_norm": 33.64398193359375, + "learning_rate": 2.82e-07, + "loss": 9.2121, + "step": 94 + }, + { + "epoch": 0.07483261126427727, + "grad_norm": 31.792766571044922, + "learning_rate": 2.85e-07, + "loss": 8.8605, + "step": 95 + }, + { + "epoch": 0.07562032296179598, + "grad_norm": 30.769004821777344, + "learning_rate": 2.88e-07, + "loss": 8.7387, + "step": 96 + }, + { + "epoch": 0.07640803465931469, + "grad_norm": 28.40618324279785, + "learning_rate": 2.91e-07, + "loss": 8.2935, + "step": 97 + }, + { + "epoch": 0.0771957463568334, + "grad_norm": 30.933502197265625, + "learning_rate": 2.94e-07, + "loss": 8.5306, + 
"step": 98 + }, + { + "epoch": 0.07798345805435211, + "grad_norm": 31.91015625, + "learning_rate": 2.97e-07, + "loss": 8.6779, + "step": 99 + }, + { + "epoch": 0.07877116975187082, + "grad_norm": 30.529985427856445, + "learning_rate": 3.0000000000000004e-07, + "loss": 8.3288, + "step": 100 + }, + { + "epoch": 0.07955888144938952, + "grad_norm": 25.08968162536621, + "learning_rate": 3.03e-07, + "loss": 7.8755, + "step": 101 + }, + { + "epoch": 0.08034659314690823, + "grad_norm": 23.59964942932129, + "learning_rate": 3.06e-07, + "loss": 7.553, + "step": 102 + }, + { + "epoch": 0.08113430484442694, + "grad_norm": 24.057044982910156, + "learning_rate": 3.0900000000000003e-07, + "loss": 7.598, + "step": 103 + }, + { + "epoch": 0.08192201654194564, + "grad_norm": 23.894298553466797, + "learning_rate": 3.12e-07, + "loss": 7.3477, + "step": 104 + }, + { + "epoch": 0.08270972823946436, + "grad_norm": 27.038135528564453, + "learning_rate": 3.15e-07, + "loss": 7.7735, + "step": 105 + }, + { + "epoch": 0.08349743993698307, + "grad_norm": 30.176294326782227, + "learning_rate": 3.18e-07, + "loss": 8.2365, + "step": 106 + }, + { + "epoch": 0.08428515163450177, + "grad_norm": 33.44722366333008, + "learning_rate": 3.21e-07, + "loss": 8.6445, + "step": 107 + }, + { + "epoch": 0.08507286333202048, + "grad_norm": 34.5628662109375, + "learning_rate": 3.2400000000000004e-07, + "loss": 8.6668, + "step": 108 + }, + { + "epoch": 0.08586057502953919, + "grad_norm": Infinity, + "learning_rate": 3.2400000000000004e-07, + "loss": 9.0652, + "step": 109 + }, + { + "epoch": 0.08664828672705789, + "grad_norm": 36.42795181274414, + "learning_rate": 3.27e-07, + "loss": 8.937, + "step": 110 + }, + { + "epoch": 0.08743599842457661, + "grad_norm": 38.14984893798828, + "learning_rate": 3.2999999999999996e-07, + "loss": 9.135, + "step": 111 + }, + { + "epoch": 0.08822371012209532, + "grad_norm": 38.88314437866211, + "learning_rate": 3.3300000000000003e-07, + "loss": 9.0913, + "step": 112 + }, + { + "epoch": 0.08901142181961402, + "grad_norm": 40.67845153808594, + "learning_rate": 3.36e-07, + "loss": 9.1756, + "step": 113 + }, + { + "epoch": 0.08979913351713273, + "grad_norm": 38.57084655761719, + "learning_rate": 3.39e-07, + "loss": 8.9131, + "step": 114 + }, + { + "epoch": 0.09058684521465143, + "grad_norm": 37.576499938964844, + "learning_rate": 3.42e-07, + "loss": 8.7922, + "step": 115 + }, + { + "epoch": 0.09137455691217014, + "grad_norm": 41.61140823364258, + "learning_rate": 3.45e-07, + "loss": 9.004, + "step": 116 + }, + { + "epoch": 0.09216226860968886, + "grad_norm": 42.311622619628906, + "learning_rate": 3.48e-07, + "loss": 8.8639, + "step": 117 + }, + { + "epoch": 0.09294998030720757, + "grad_norm": 40.926517486572266, + "learning_rate": 3.51e-07, + "loss": 8.8315, + "step": 118 + }, + { + "epoch": 0.09373769200472627, + "grad_norm": 42.721439361572266, + "learning_rate": 3.54e-07, + "loss": 8.816, + "step": 119 + }, + { + "epoch": 0.09452540370224498, + "grad_norm": 43.165794372558594, + "learning_rate": 3.5700000000000003e-07, + "loss": 8.7576, + "step": 120 + }, + { + "epoch": 0.09531311539976368, + "grad_norm": 41.680824279785156, + "learning_rate": 3.6e-07, + "loss": 8.6062, + "step": 121 + }, + { + "epoch": 0.09610082709728239, + "grad_norm": 45.38103485107422, + "learning_rate": 3.63e-07, + "loss": 8.7043, + "step": 122 + }, + { + "epoch": 0.0968885387948011, + "grad_norm": 43.03227233886719, + "learning_rate": 3.66e-07, + "loss": 8.5265, + "step": 123 + }, + { + "epoch": 0.09767625049231982, + "grad_norm": 
43.56718444824219, + "learning_rate": 3.6900000000000004e-07, + "loss": 8.4168, + "step": 124 + }, + { + "epoch": 0.09846396218983852, + "grad_norm": 43.58745193481445, + "learning_rate": 3.72e-07, + "loss": 8.3287, + "step": 125 + }, + { + "epoch": 0.09925167388735723, + "grad_norm": 44.702945709228516, + "learning_rate": 3.75e-07, + "loss": 8.3133, + "step": 126 + }, + { + "epoch": 0.10003938558487593, + "grad_norm": 44.41905975341797, + "learning_rate": 3.78e-07, + "loss": 8.2255, + "step": 127 + }, + { + "epoch": 0.10082709728239464, + "grad_norm": 45.80497360229492, + "learning_rate": 3.81e-07, + "loss": 8.2665, + "step": 128 + }, + { + "epoch": 0.10161480897991335, + "grad_norm": 41.79016876220703, + "learning_rate": 3.8400000000000005e-07, + "loss": 7.9926, + "step": 129 + }, + { + "epoch": 0.10240252067743207, + "grad_norm": 49.180091857910156, + "learning_rate": 3.87e-07, + "loss": 8.3142, + "step": 130 + }, + { + "epoch": 0.10319023237495077, + "grad_norm": 47.868595123291016, + "learning_rate": 3.8999999999999997e-07, + "loss": 8.1112, + "step": 131 + }, + { + "epoch": 0.10397794407246948, + "grad_norm": 49.065120697021484, + "learning_rate": 3.9300000000000004e-07, + "loss": 8.0694, + "step": 132 + }, + { + "epoch": 0.10476565576998818, + "grad_norm": 45.4450798034668, + "learning_rate": 3.96e-07, + "loss": 7.6934, + "step": 133 + }, + { + "epoch": 0.10555336746750689, + "grad_norm": 47.7122802734375, + "learning_rate": 3.99e-07, + "loss": 7.9072, + "step": 134 + }, + { + "epoch": 0.1063410791650256, + "grad_norm": 50.639102935791016, + "learning_rate": 4.02e-07, + "loss": 7.8934, + "step": 135 + }, + { + "epoch": 0.10712879086254432, + "grad_norm": 52.65062713623047, + "learning_rate": 4.05e-07, + "loss": 7.9366, + "step": 136 + }, + { + "epoch": 0.10791650256006302, + "grad_norm": 47.36444091796875, + "learning_rate": 4.08e-07, + "loss": 7.4184, + "step": 137 + }, + { + "epoch": 0.10870421425758173, + "grad_norm": 47.60418701171875, + "learning_rate": 4.11e-07, + "loss": 7.4393, + "step": 138 + }, + { + "epoch": 0.10949192595510043, + "grad_norm": 51.04661560058594, + "learning_rate": 4.1400000000000003e-07, + "loss": 7.6219, + "step": 139 + }, + { + "epoch": 0.11027963765261914, + "grad_norm": 50.47011184692383, + "learning_rate": 4.17e-07, + "loss": 7.4966, + "step": 140 + }, + { + "epoch": 0.11106734935013785, + "grad_norm": 52.339473724365234, + "learning_rate": 4.2e-07, + "loss": 7.49, + "step": 141 + }, + { + "epoch": 0.11185506104765655, + "grad_norm": 48.282737731933594, + "learning_rate": 4.23e-07, + "loss": 7.0961, + "step": 142 + }, + { + "epoch": 0.11264277274517527, + "grad_norm": 47.427616119384766, + "learning_rate": 4.2600000000000003e-07, + "loss": 7.0464, + "step": 143 + }, + { + "epoch": 0.11343048444269398, + "grad_norm": 49.6283073425293, + "learning_rate": 4.2900000000000004e-07, + "loss": 7.0621, + "step": 144 + }, + { + "epoch": 0.11421819614021268, + "grad_norm": 48.020755767822266, + "learning_rate": 4.32e-07, + "loss": 6.8987, + "step": 145 + }, + { + "epoch": 0.11500590783773139, + "grad_norm": 47.63594436645508, + "learning_rate": 4.35e-07, + "loss": 6.7289, + "step": 146 + }, + { + "epoch": 0.1157936195352501, + "grad_norm": 49.88113784790039, + "learning_rate": 4.3800000000000003e-07, + "loss": 6.8327, + "step": 147 + }, + { + "epoch": 0.1165813312327688, + "grad_norm": 49.09914016723633, + "learning_rate": 4.41e-07, + "loss": 6.7332, + "step": 148 + }, + { + "epoch": 0.11736904293028752, + "grad_norm": 50.70106887817383, + "learning_rate": 
4.44e-07, + "loss": 6.6795, + "step": 149 + }, + { + "epoch": 0.11815675462780623, + "grad_norm": 45.103919982910156, + "learning_rate": 4.47e-07, + "loss": 6.3573, + "step": 150 + }, + { + "epoch": 0.11894446632532493, + "grad_norm": 40.395267486572266, + "learning_rate": 4.5e-07, + "loss": 6.3099, + "step": 151 + }, + { + "epoch": 0.11973217802284364, + "grad_norm": 38.48122024536133, + "learning_rate": 4.5300000000000005e-07, + "loss": 6.1513, + "step": 152 + }, + { + "epoch": 0.12051988972036234, + "grad_norm": 40.56327438354492, + "learning_rate": 4.56e-07, + "loss": 6.1272, + "step": 153 + }, + { + "epoch": 0.12130760141788105, + "grad_norm": 34.909912109375, + "learning_rate": 4.5899999999999997e-07, + "loss": 5.7603, + "step": 154 + }, + { + "epoch": 0.12209531311539977, + "grad_norm": 39.74394226074219, + "learning_rate": 4.6200000000000003e-07, + "loss": 5.9479, + "step": 155 + }, + { + "epoch": 0.12288302481291848, + "grad_norm": 47.732749938964844, + "learning_rate": 4.65e-07, + "loss": 6.1843, + "step": 156 + }, + { + "epoch": 0.12367073651043718, + "grad_norm": 50.35280227661133, + "learning_rate": 4.68e-07, + "loss": 6.2748, + "step": 157 + }, + { + "epoch": 0.12445844820795589, + "grad_norm": 53.14102554321289, + "learning_rate": 4.7099999999999997e-07, + "loss": 6.3745, + "step": 158 + }, + { + "epoch": 0.1252461599054746, + "grad_norm": 53.38798522949219, + "learning_rate": 4.7400000000000004e-07, + "loss": 6.3346, + "step": 159 + }, + { + "epoch": 0.1260338716029933, + "grad_norm": 56.268272399902344, + "learning_rate": 4.77e-07, + "loss": 6.2724, + "step": 160 + }, + { + "epoch": 0.12682158330051202, + "grad_norm": 55.922481536865234, + "learning_rate": 4.800000000000001e-07, + "loss": 6.2263, + "step": 161 + }, + { + "epoch": 0.12760929499803073, + "grad_norm": 53.57609558105469, + "learning_rate": 4.83e-07, + "loss": 6.1563, + "step": 162 + }, + { + "epoch": 0.12839700669554943, + "grad_norm": 51.562660217285156, + "learning_rate": 4.86e-07, + "loss": 6.0375, + "step": 163 + }, + { + "epoch": 0.12918471839306814, + "grad_norm": 47.105289459228516, + "learning_rate": 4.89e-07, + "loss": 5.8266, + "step": 164 + }, + { + "epoch": 0.12997243009058684, + "grad_norm": 50.56541442871094, + "learning_rate": 4.92e-07, + "loss": 5.8813, + "step": 165 + }, + { + "epoch": 0.13076014178810555, + "grad_norm": 45.97529220581055, + "learning_rate": 4.95e-07, + "loss": 5.7156, + "step": 166 + }, + { + "epoch": 0.13154785348562426, + "grad_norm": 46.09469985961914, + "learning_rate": 4.98e-07, + "loss": 5.6751, + "step": 167 + }, + { + "epoch": 0.13233556518314296, + "grad_norm": 43.56045150756836, + "learning_rate": 5.01e-07, + "loss": 5.5143, + "step": 168 + }, + { + "epoch": 0.13312327688066167, + "grad_norm": 41.57733154296875, + "learning_rate": 5.04e-07, + "loss": 5.4799, + "step": 169 + }, + { + "epoch": 0.13391098857818037, + "grad_norm": 42.79589080810547, + "learning_rate": 5.07e-07, + "loss": 5.4735, + "step": 170 + }, + { + "epoch": 0.1346987002756991, + "grad_norm": 40.96742630004883, + "learning_rate": 5.100000000000001e-07, + "loss": 5.4217, + "step": 171 + }, + { + "epoch": 0.1354864119732178, + "grad_norm": 36.37437057495117, + "learning_rate": 5.13e-07, + "loss": 5.2361, + "step": 172 + }, + { + "epoch": 0.13627412367073652, + "grad_norm": 33.45048141479492, + "learning_rate": 5.16e-07, + "loss": 5.1587, + "step": 173 + }, + { + "epoch": 0.13706183536825522, + "grad_norm": 34.097023010253906, + "learning_rate": 5.19e-07, + "loss": 5.166, + "step": 174 + }, + { + 
"epoch": 0.13784954706577393, + "grad_norm": 34.22423553466797, + "learning_rate": 5.219999999999999e-07, + "loss": 5.1332, + "step": 175 + }, + { + "epoch": 0.13863725876329264, + "grad_norm": 29.603321075439453, + "learning_rate": 5.250000000000001e-07, + "loss": 4.9919, + "step": 176 + }, + { + "epoch": 0.13942497046081134, + "grad_norm": 26.78028678894043, + "learning_rate": 5.280000000000001e-07, + "loss": 4.9226, + "step": 177 + }, + { + "epoch": 0.14021268215833005, + "grad_norm": 26.757226943969727, + "learning_rate": 5.31e-07, + "loss": 4.9252, + "step": 178 + }, + { + "epoch": 0.14100039385584875, + "grad_norm": 24.002351760864258, + "learning_rate": 5.34e-07, + "loss": 4.8669, + "step": 179 + }, + { + "epoch": 0.14178810555336746, + "grad_norm": 21.158519744873047, + "learning_rate": 5.37e-07, + "loss": 4.7844, + "step": 180 + }, + { + "epoch": 0.14257581725088617, + "grad_norm": 19.352439880371094, + "learning_rate": 5.4e-07, + "loss": 4.8041, + "step": 181 + }, + { + "epoch": 0.14336352894840487, + "grad_norm": 17.103342056274414, + "learning_rate": 5.43e-07, + "loss": 4.7186, + "step": 182 + }, + { + "epoch": 0.14415124064592358, + "grad_norm": 13.757027626037598, + "learning_rate": 5.46e-07, + "loss": 4.6864, + "step": 183 + }, + { + "epoch": 0.1449389523434423, + "grad_norm": 12.518247604370117, + "learning_rate": 5.490000000000001e-07, + "loss": 4.671, + "step": 184 + }, + { + "epoch": 0.14572666404096102, + "grad_norm": 11.68359661102295, + "learning_rate": 5.52e-07, + "loss": 4.6459, + "step": 185 + }, + { + "epoch": 0.14651437573847972, + "grad_norm": 8.856922149658203, + "learning_rate": 5.55e-07, + "loss": 4.5778, + "step": 186 + }, + { + "epoch": 0.14730208743599843, + "grad_norm": 8.255171775817871, + "learning_rate": 5.58e-07, + "loss": 4.5879, + "step": 187 + }, + { + "epoch": 0.14808979913351714, + "grad_norm": 8.243420600891113, + "learning_rate": 5.61e-07, + "loss": 4.5794, + "step": 188 + }, + { + "epoch": 0.14887751083103584, + "grad_norm": 8.021942138671875, + "learning_rate": 5.64e-07, + "loss": 4.6042, + "step": 189 + }, + { + "epoch": 0.14966522252855455, + "grad_norm": 8.823857307434082, + "learning_rate": 5.67e-07, + "loss": 4.5523, + "step": 190 + }, + { + "epoch": 0.15045293422607325, + "grad_norm": 10.121891021728516, + "learning_rate": 5.7e-07, + "loss": 4.4945, + "step": 191 + }, + { + "epoch": 0.15124064592359196, + "grad_norm": 9.714287757873535, + "learning_rate": 5.73e-07, + "loss": 4.5033, + "step": 192 + }, + { + "epoch": 0.15202835762111067, + "grad_norm": 10.8416109085083, + "learning_rate": 5.76e-07, + "loss": 4.5147, + "step": 193 + }, + { + "epoch": 0.15281606931862937, + "grad_norm": 10.361454963684082, + "learning_rate": 5.790000000000001e-07, + "loss": 4.4745, + "step": 194 + }, + { + "epoch": 0.15360378101614808, + "grad_norm": 10.876596450805664, + "learning_rate": 5.82e-07, + "loss": 4.493, + "step": 195 + }, + { + "epoch": 0.1543914927136668, + "grad_norm": 10.443202018737793, + "learning_rate": 5.85e-07, + "loss": 4.542, + "step": 196 + }, + { + "epoch": 0.15517920441118552, + "grad_norm": 10.679583549499512, + "learning_rate": 5.88e-07, + "loss": 4.4426, + "step": 197 + }, + { + "epoch": 0.15596691610870422, + "grad_norm": 9.800992965698242, + "learning_rate": 5.909999999999999e-07, + "loss": 4.4232, + "step": 198 + }, + { + "epoch": 0.15675462780622293, + "grad_norm": 9.405814170837402, + "learning_rate": 5.94e-07, + "loss": 4.4149, + "step": 199 + }, + { + "epoch": 0.15754233950374164, + "grad_norm": 9.38916301727295, + 
"learning_rate": 5.970000000000001e-07, + "loss": 4.3823, + "step": 200 + }, + { + "epoch": 0.15833005120126034, + "grad_norm": 9.38918399810791, + "learning_rate": 6.000000000000001e-07, + "loss": 4.2793, + "step": 201 + }, + { + "epoch": 0.15911776289877905, + "grad_norm": 6.689791679382324, + "learning_rate": 6.03e-07, + "loss": 4.1609, + "step": 202 + }, + { + "epoch": 0.15990547459629775, + "grad_norm": 6.517496109008789, + "learning_rate": 6.06e-07, + "loss": 4.1812, + "step": 203 + }, + { + "epoch": 0.16069318629381646, + "grad_norm": 6.752288818359375, + "learning_rate": 6.09e-07, + "loss": 4.0984, + "step": 204 + }, + { + "epoch": 0.16148089799133516, + "grad_norm": 6.817626476287842, + "learning_rate": 6.12e-07, + "loss": 4.1173, + "step": 205 + }, + { + "epoch": 0.16226860968885387, + "grad_norm": 6.2976789474487305, + "learning_rate": 6.15e-07, + "loss": 4.2063, + "step": 206 + }, + { + "epoch": 0.16305632138637258, + "grad_norm": 6.458749771118164, + "learning_rate": 6.180000000000001e-07, + "loss": 4.2664, + "step": 207 + }, + { + "epoch": 0.16384403308389128, + "grad_norm": 6.207608222961426, + "learning_rate": 6.21e-07, + "loss": 4.2995, + "step": 208 + }, + { + "epoch": 0.16463174478141002, + "grad_norm": 6.422799587249756, + "learning_rate": 6.24e-07, + "loss": 4.2493, + "step": 209 + }, + { + "epoch": 0.16541945647892872, + "grad_norm": 6.500354290008545, + "learning_rate": 6.27e-07, + "loss": 4.2731, + "step": 210 + }, + { + "epoch": 0.16620716817644743, + "grad_norm": 6.4180521965026855, + "learning_rate": 6.3e-07, + "loss": 4.3471, + "step": 211 + }, + { + "epoch": 0.16699487987396613, + "grad_norm": 6.663375377655029, + "learning_rate": 6.33e-07, + "loss": 4.2726, + "step": 212 + }, + { + "epoch": 0.16778259157148484, + "grad_norm": 6.011595726013184, + "learning_rate": 6.36e-07, + "loss": 4.2358, + "step": 213 + }, + { + "epoch": 0.16857030326900355, + "grad_norm": 5.915055751800537, + "learning_rate": 6.39e-07, + "loss": 4.2138, + "step": 214 + }, + { + "epoch": 0.16935801496652225, + "grad_norm": 5.826460361480713, + "learning_rate": 6.42e-07, + "loss": 4.2037, + "step": 215 + }, + { + "epoch": 0.17014572666404096, + "grad_norm": 5.587107181549072, + "learning_rate": 6.45e-07, + "loss": 4.1831, + "step": 216 + }, + { + "epoch": 0.17093343836155966, + "grad_norm": 5.783921241760254, + "learning_rate": 6.480000000000001e-07, + "loss": 4.1661, + "step": 217 + }, + { + "epoch": 0.17172115005907837, + "grad_norm": 5.678048133850098, + "learning_rate": 6.51e-07, + "loss": 4.1848, + "step": 218 + }, + { + "epoch": 0.17250886175659708, + "grad_norm": 5.76167631149292, + "learning_rate": 6.54e-07, + "loss": 4.1549, + "step": 219 + }, + { + "epoch": 0.17329657345411578, + "grad_norm": 5.5433502197265625, + "learning_rate": 6.57e-07, + "loss": 4.1744, + "step": 220 + }, + { + "epoch": 0.1740842851516345, + "grad_norm": 5.738646030426025, + "learning_rate": 6.599999999999999e-07, + "loss": 4.1037, + "step": 221 + }, + { + "epoch": 0.17487199684915322, + "grad_norm": 5.422011375427246, + "learning_rate": 6.63e-07, + "loss": 4.0673, + "step": 222 + }, + { + "epoch": 0.17565970854667193, + "grad_norm": 6.266214847564697, + "learning_rate": 6.660000000000001e-07, + "loss": 4.0757, + "step": 223 + }, + { + "epoch": 0.17644742024419063, + "grad_norm": 7.809333801269531, + "learning_rate": 6.690000000000001e-07, + "loss": 4.062, + "step": 224 + }, + { + "epoch": 0.17723513194170934, + "grad_norm": 5.3605523109436035, + "learning_rate": 6.72e-07, + "loss": 4.0654, + "step": 225 + }, 
+ { + "epoch": 0.17802284363922805, + "grad_norm": 4.962398529052734, + "learning_rate": 6.75e-07, + "loss": 4.0437, + "step": 226 + }, + { + "epoch": 0.17881055533674675, + "grad_norm": 5.3401899337768555, + "learning_rate": 6.78e-07, + "loss": 4.0105, + "step": 227 + }, + { + "epoch": 0.17959826703426546, + "grad_norm": 4.891468524932861, + "learning_rate": 6.81e-07, + "loss": 3.9898, + "step": 228 + }, + { + "epoch": 0.18038597873178416, + "grad_norm": 4.404794692993164, + "learning_rate": 6.84e-07, + "loss": 3.9906, + "step": 229 + }, + { + "epoch": 0.18117369042930287, + "grad_norm": 6.199390888214111, + "learning_rate": 6.87e-07, + "loss": 4.0063, + "step": 230 + }, + { + "epoch": 0.18196140212682158, + "grad_norm": 4.984107971191406, + "learning_rate": 6.9e-07, + "loss": 3.9499, + "step": 231 + }, + { + "epoch": 0.18274911382434028, + "grad_norm": 4.681666851043701, + "learning_rate": 6.93e-07, + "loss": 4.0103, + "step": 232 + }, + { + "epoch": 0.183536825521859, + "grad_norm": 4.33386754989624, + "learning_rate": 6.96e-07, + "loss": 3.957, + "step": 233 + }, + { + "epoch": 0.18432453721937772, + "grad_norm": 4.675302028656006, + "learning_rate": 6.990000000000001e-07, + "loss": 3.9471, + "step": 234 + }, + { + "epoch": 0.18511224891689643, + "grad_norm": 4.081276893615723, + "learning_rate": 7.02e-07, + "loss": 3.949, + "step": 235 + }, + { + "epoch": 0.18589996061441513, + "grad_norm": 4.932337760925293, + "learning_rate": 7.05e-07, + "loss": 3.8802, + "step": 236 + }, + { + "epoch": 0.18668767231193384, + "grad_norm": 4.202552318572998, + "learning_rate": 7.08e-07, + "loss": 3.8895, + "step": 237 + }, + { + "epoch": 0.18747538400945254, + "grad_norm": 5.111434459686279, + "learning_rate": 7.11e-07, + "loss": 3.9131, + "step": 238 + }, + { + "epoch": 0.18826309570697125, + "grad_norm": 4.370693683624268, + "learning_rate": 7.140000000000001e-07, + "loss": 3.935, + "step": 239 + }, + { + "epoch": 0.18905080740448996, + "grad_norm": 4.223780632019043, + "learning_rate": 7.170000000000001e-07, + "loss": 3.8416, + "step": 240 + }, + { + "epoch": 0.18983851910200866, + "grad_norm": 4.129825592041016, + "learning_rate": 7.2e-07, + "loss": 3.8306, + "step": 241 + }, + { + "epoch": 0.19062623079952737, + "grad_norm": 4.608582019805908, + "learning_rate": 7.23e-07, + "loss": 3.8781, + "step": 242 + }, + { + "epoch": 0.19141394249704607, + "grad_norm": 7.216768741607666, + "learning_rate": 7.26e-07, + "loss": 3.7639, + "step": 243 + }, + { + "epoch": 0.19220165419456478, + "grad_norm": 5.199926376342773, + "learning_rate": 7.29e-07, + "loss": 3.7917, + "step": 244 + }, + { + "epoch": 0.1929893658920835, + "grad_norm": 4.570247650146484, + "learning_rate": 7.32e-07, + "loss": 3.7881, + "step": 245 + }, + { + "epoch": 0.1937770775896022, + "grad_norm": 4.029629230499268, + "learning_rate": 7.350000000000001e-07, + "loss": 3.7389, + "step": 246 + }, + { + "epoch": 0.19456478928712093, + "grad_norm": 3.9306652545928955, + "learning_rate": 7.380000000000001e-07, + "loss": 3.7328, + "step": 247 + }, + { + "epoch": 0.19535250098463963, + "grad_norm": 6.7592339515686035, + "learning_rate": 7.41e-07, + "loss": 3.739, + "step": 248 + }, + { + "epoch": 0.19614021268215834, + "grad_norm": 4.472005844116211, + "learning_rate": 7.44e-07, + "loss": 3.746, + "step": 249 + }, + { + "epoch": 0.19692792437967704, + "grad_norm": 4.5147705078125, + "learning_rate": 7.47e-07, + "loss": 3.7453, + "step": 250 + }, + { + "epoch": 0.19771563607719575, + "grad_norm": 6.1832075119018555, + "learning_rate": 7.5e-07, + 
"loss": 3.6381, + "step": 251 + }, + { + "epoch": 0.19850334777471446, + "grad_norm": 6.3091020584106445, + "learning_rate": 7.53e-07, + "loss": 3.5443, + "step": 252 + }, + { + "epoch": 0.19929105947223316, + "grad_norm": 4.653207302093506, + "learning_rate": 7.56e-07, + "loss": 3.5391, + "step": 253 + }, + { + "epoch": 0.20007877116975187, + "grad_norm": 4.053643226623535, + "learning_rate": 7.59e-07, + "loss": 3.516, + "step": 254 + }, + { + "epoch": 0.20086648286727057, + "grad_norm": 3.9116573333740234, + "learning_rate": 7.62e-07, + "loss": 3.4746, + "step": 255 + }, + { + "epoch": 0.20165419456478928, + "grad_norm": 5.611377716064453, + "learning_rate": 7.65e-07, + "loss": 3.5181, + "step": 256 + }, + { + "epoch": 0.20244190626230799, + "grad_norm": 6.6079912185668945, + "learning_rate": 7.680000000000001e-07, + "loss": 3.5607, + "step": 257 + }, + { + "epoch": 0.2032296179598267, + "grad_norm": 7.817990779876709, + "learning_rate": 7.71e-07, + "loss": 3.5802, + "step": 258 + }, + { + "epoch": 0.20401732965734543, + "grad_norm": 7.744387149810791, + "learning_rate": 7.74e-07, + "loss": 3.5791, + "step": 259 + }, + { + "epoch": 0.20480504135486413, + "grad_norm": 7.251423358917236, + "learning_rate": 7.77e-07, + "loss": 3.5435, + "step": 260 + }, + { + "epoch": 0.20559275305238284, + "grad_norm": 5.06251859664917, + "learning_rate": 7.799999999999999e-07, + "loss": 3.5131, + "step": 261 + }, + { + "epoch": 0.20638046474990154, + "grad_norm": 6.222503662109375, + "learning_rate": 7.830000000000001e-07, + "loss": 3.5576, + "step": 262 + }, + { + "epoch": 0.20716817644742025, + "grad_norm": 6.228431224822998, + "learning_rate": 7.860000000000001e-07, + "loss": 3.5216, + "step": 263 + }, + { + "epoch": 0.20795588814493896, + "grad_norm": 4.4483466148376465, + "learning_rate": 7.89e-07, + "loss": 3.4843, + "step": 264 + }, + { + "epoch": 0.20874359984245766, + "grad_norm": 5.789809226989746, + "learning_rate": 7.92e-07, + "loss": 3.4937, + "step": 265 + }, + { + "epoch": 0.20953131153997637, + "grad_norm": 5.729975700378418, + "learning_rate": 7.95e-07, + "loss": 3.4584, + "step": 266 + }, + { + "epoch": 0.21031902323749507, + "grad_norm": 8.085867881774902, + "learning_rate": 7.98e-07, + "loss": 3.4269, + "step": 267 + }, + { + "epoch": 0.21110673493501378, + "grad_norm": 7.715183258056641, + "learning_rate": 8.01e-07, + "loss": 3.4534, + "step": 268 + }, + { + "epoch": 0.21189444663253248, + "grad_norm": 6.287649631500244, + "learning_rate": 8.04e-07, + "loss": 3.4328, + "step": 269 + }, + { + "epoch": 0.2126821583300512, + "grad_norm": 7.075897216796875, + "learning_rate": 8.070000000000001e-07, + "loss": 3.4083, + "step": 270 + }, + { + "epoch": 0.2134698700275699, + "grad_norm": 9.932754516601562, + "learning_rate": 8.1e-07, + "loss": 3.3919, + "step": 271 + }, + { + "epoch": 0.21425758172508863, + "grad_norm": 5.832574367523193, + "learning_rate": 8.13e-07, + "loss": 3.3759, + "step": 272 + }, + { + "epoch": 0.21504529342260734, + "grad_norm": 4.111095905303955, + "learning_rate": 8.16e-07, + "loss": 3.4046, + "step": 273 + }, + { + "epoch": 0.21583300512012604, + "grad_norm": 5.41343879699707, + "learning_rate": 8.19e-07, + "loss": 3.3679, + "step": 274 + }, + { + "epoch": 0.21662071681764475, + "grad_norm": 4.866332530975342, + "learning_rate": 8.22e-07, + "loss": 3.357, + "step": 275 + }, + { + "epoch": 0.21740842851516345, + "grad_norm": 11.282758712768555, + "learning_rate": 8.25e-07, + "loss": 3.3367, + "step": 276 + }, + { + "epoch": 0.21819614021268216, + "grad_norm": 
10.144689559936523, + "learning_rate": 8.280000000000001e-07, + "loss": 3.3336, + "step": 277 + }, + { + "epoch": 0.21898385191020087, + "grad_norm": 4.519490718841553, + "learning_rate": 8.31e-07, + "loss": 3.3219, + "step": 278 + }, + { + "epoch": 0.21977156360771957, + "grad_norm": 6.456394195556641, + "learning_rate": 8.34e-07, + "loss": 3.3155, + "step": 279 + }, + { + "epoch": 0.22055927530523828, + "grad_norm": 4.7481489181518555, + "learning_rate": 8.370000000000001e-07, + "loss": 3.2969, + "step": 280 + }, + { + "epoch": 0.22134698700275698, + "grad_norm": 3.2007181644439697, + "learning_rate": 8.4e-07, + "loss": 3.2752, + "step": 281 + }, + { + "epoch": 0.2221346987002757, + "grad_norm": 8.362846374511719, + "learning_rate": 8.43e-07, + "loss": 3.3067, + "step": 282 + }, + { + "epoch": 0.2229224103977944, + "grad_norm": 5.8566741943359375, + "learning_rate": 8.46e-07, + "loss": 3.2855, + "step": 283 + }, + { + "epoch": 0.2237101220953131, + "grad_norm": 5.3818888664245605, + "learning_rate": 8.489999999999999e-07, + "loss": 3.2328, + "step": 284 + }, + { + "epoch": 0.22449783379283184, + "grad_norm": 7.2556891441345215, + "learning_rate": 8.520000000000001e-07, + "loss": 3.2483, + "step": 285 + }, + { + "epoch": 0.22528554549035054, + "grad_norm": 5.602944374084473, + "learning_rate": 8.550000000000001e-07, + "loss": 3.2759, + "step": 286 + }, + { + "epoch": 0.22607325718786925, + "grad_norm": 3.7117702960968018, + "learning_rate": 8.580000000000001e-07, + "loss": 3.2119, + "step": 287 + }, + { + "epoch": 0.22686096888538795, + "grad_norm": 6.258291721343994, + "learning_rate": 8.61e-07, + "loss": 3.2794, + "step": 288 + }, + { + "epoch": 0.22764868058290666, + "grad_norm": 4.86312198638916, + "learning_rate": 8.64e-07, + "loss": 3.2232, + "step": 289 + }, + { + "epoch": 0.22843639228042537, + "grad_norm": 5.1380109786987305, + "learning_rate": 8.67e-07, + "loss": 3.2454, + "step": 290 + }, + { + "epoch": 0.22922410397794407, + "grad_norm": 9.04436206817627, + "learning_rate": 8.7e-07, + "loss": 3.2273, + "step": 291 + }, + { + "epoch": 0.23001181567546278, + "grad_norm": 4.309777736663818, + "learning_rate": 8.73e-07, + "loss": 3.2537, + "step": 292 + }, + { + "epoch": 0.23079952737298148, + "grad_norm": 5.5641350746154785, + "learning_rate": 8.760000000000001e-07, + "loss": 3.195, + "step": 293 + }, + { + "epoch": 0.2315872390705002, + "grad_norm": 4.682958602905273, + "learning_rate": 8.79e-07, + "loss": 3.2099, + "step": 294 + }, + { + "epoch": 0.2323749507680189, + "grad_norm": 4.243220806121826, + "learning_rate": 8.82e-07, + "loss": 3.1741, + "step": 295 + }, + { + "epoch": 0.2331626624655376, + "grad_norm": 6.358564376831055, + "learning_rate": 8.85e-07, + "loss": 3.2343, + "step": 296 + }, + { + "epoch": 0.23395037416305633, + "grad_norm": 3.9667258262634277, + "learning_rate": 8.88e-07, + "loss": 3.1965, + "step": 297 + }, + { + "epoch": 0.23473808586057504, + "grad_norm": 8.902915954589844, + "learning_rate": 8.91e-07, + "loss": 3.1986, + "step": 298 + }, + { + "epoch": 0.23552579755809375, + "grad_norm": 7.593542098999023, + "learning_rate": 8.94e-07, + "loss": 3.1875, + "step": 299 + }, + { + "epoch": 0.23631350925561245, + "grad_norm": 6.558197498321533, + "learning_rate": 8.97e-07, + "loss": 3.1652, + "step": 300 + }, + { + "epoch": 0.23710122095313116, + "grad_norm": 13.535314559936523, + "learning_rate": 9e-07, + "loss": 3.3151, + "step": 301 + }, + { + "epoch": 0.23788893265064986, + "grad_norm": 10.61730670928955, + "learning_rate": 9.03e-07, + "loss": 3.2791, 
+ "step": 302 + }, + { + "epoch": 0.23867664434816857, + "grad_norm": 9.696962356567383, + "learning_rate": 9.060000000000001e-07, + "loss": 3.2319, + "step": 303 + }, + { + "epoch": 0.23946435604568728, + "grad_norm": 6.610024929046631, + "learning_rate": 9.09e-07, + "loss": 3.1809, + "step": 304 + }, + { + "epoch": 0.24025206774320598, + "grad_norm": 6.176677703857422, + "learning_rate": 9.12e-07, + "loss": 3.2174, + "step": 305 + }, + { + "epoch": 0.2410397794407247, + "grad_norm": 3.9954397678375244, + "learning_rate": 9.15e-07, + "loss": 3.1739, + "step": 306 + }, + { + "epoch": 0.2418274911382434, + "grad_norm": 8.678698539733887, + "learning_rate": 9.179999999999999e-07, + "loss": 3.2395, + "step": 307 + }, + { + "epoch": 0.2426152028357621, + "grad_norm": 9.144721031188965, + "learning_rate": 9.210000000000001e-07, + "loss": 3.1968, + "step": 308 + }, + { + "epoch": 0.2434029145332808, + "grad_norm": 9.356255531311035, + "learning_rate": 9.240000000000001e-07, + "loss": 3.2528, + "step": 309 + }, + { + "epoch": 0.24419062623079954, + "grad_norm": 10.471453666687012, + "learning_rate": 9.270000000000001e-07, + "loss": 3.2664, + "step": 310 + }, + { + "epoch": 0.24497833792831825, + "grad_norm": 7.638957500457764, + "learning_rate": 9.3e-07, + "loss": 3.1918, + "step": 311 + }, + { + "epoch": 0.24576604962583695, + "grad_norm": 7.724191188812256, + "learning_rate": 9.33e-07, + "loss": 3.1954, + "step": 312 + }, + { + "epoch": 0.24655376132335566, + "grad_norm": 11.812409400939941, + "learning_rate": 9.36e-07, + "loss": 3.1543, + "step": 313 + }, + { + "epoch": 0.24734147302087436, + "grad_norm": 6.739223480224609, + "learning_rate": 9.39e-07, + "loss": 3.1199, + "step": 314 + }, + { + "epoch": 0.24812918471839307, + "grad_norm": 6.5116658210754395, + "learning_rate": 9.419999999999999e-07, + "loss": 3.1989, + "step": 315 + }, + { + "epoch": 0.24891689641591178, + "grad_norm": 7.403290271759033, + "learning_rate": 9.450000000000001e-07, + "loss": 3.1745, + "step": 316 + }, + { + "epoch": 0.24970460811343048, + "grad_norm": 6.9632487297058105, + "learning_rate": 9.480000000000001e-07, + "loss": 3.1561, + "step": 317 + }, + { + "epoch": 0.2504923198109492, + "grad_norm": 11.451126098632812, + "learning_rate": 9.51e-07, + "loss": 3.1406, + "step": 318 + }, + { + "epoch": 0.2512800315084679, + "grad_norm": 7.065820693969727, + "learning_rate": 9.54e-07, + "loss": 3.1076, + "step": 319 + }, + { + "epoch": 0.2520677432059866, + "grad_norm": 15.716169357299805, + "learning_rate": 9.57e-07, + "loss": 3.1384, + "step": 320 + }, + { + "epoch": 0.2528554549035053, + "grad_norm": 4.571938991546631, + "learning_rate": 9.600000000000001e-07, + "loss": 3.1543, + "step": 321 + }, + { + "epoch": 0.25364316660102404, + "grad_norm": 4.824108600616455, + "learning_rate": 9.63e-07, + "loss": 3.1259, + "step": 322 + }, + { + "epoch": 0.2544308782985427, + "grad_norm": 7.471798896789551, + "learning_rate": 9.66e-07, + "loss": 3.1568, + "step": 323 + }, + { + "epoch": 0.25521858999606145, + "grad_norm": 3.9762988090515137, + "learning_rate": 9.690000000000002e-07, + "loss": 3.1083, + "step": 324 + }, + { + "epoch": 0.25600630169358013, + "grad_norm": 3.956766128540039, + "learning_rate": 9.72e-07, + "loss": 3.1052, + "step": 325 + }, + { + "epoch": 0.25679401339109886, + "grad_norm": 6.85633659362793, + "learning_rate": 9.75e-07, + "loss": 3.0521, + "step": 326 + }, + { + "epoch": 0.25758172508861754, + "grad_norm": 4.2610883712768555, + "learning_rate": 9.78e-07, + "loss": 3.1218, + "step": 327 + }, + { + 
"epoch": 0.2583694367861363, + "grad_norm": 8.925460815429688, + "learning_rate": 9.81e-07, + "loss": 3.09, + "step": 328 + }, + { + "epoch": 0.259157148483655, + "grad_norm": 2.957304000854492, + "learning_rate": 9.84e-07, + "loss": 3.0854, + "step": 329 + }, + { + "epoch": 0.2599448601811737, + "grad_norm": 3.802190065383911, + "learning_rate": 9.87e-07, + "loss": 3.1569, + "step": 330 + }, + { + "epoch": 0.2607325718786924, + "grad_norm": 8.973787307739258, + "learning_rate": 9.9e-07, + "loss": 3.14, + "step": 331 + }, + { + "epoch": 0.2615202835762111, + "grad_norm": 5.716228008270264, + "learning_rate": 9.929999999999999e-07, + "loss": 3.1249, + "step": 332 + }, + { + "epoch": 0.26230799527372983, + "grad_norm": 5.87473726272583, + "learning_rate": 9.96e-07, + "loss": 3.1283, + "step": 333 + }, + { + "epoch": 0.2630957069712485, + "grad_norm": 5.9342732429504395, + "learning_rate": 9.99e-07, + "loss": 3.1106, + "step": 334 + }, + { + "epoch": 0.26388341866876724, + "grad_norm": 7.754244327545166, + "learning_rate": 1.002e-06, + "loss": 3.0796, + "step": 335 + }, + { + "epoch": 0.2646711303662859, + "grad_norm": 3.5852208137512207, + "learning_rate": 1.0050000000000001e-06, + "loss": 3.0979, + "step": 336 + }, + { + "epoch": 0.26545884206380466, + "grad_norm": 5.56411600112915, + "learning_rate": 1.008e-06, + "loss": 3.1012, + "step": 337 + }, + { + "epoch": 0.26624655376132333, + "grad_norm": 22.257902145385742, + "learning_rate": 1.0110000000000001e-06, + "loss": 3.0788, + "step": 338 + }, + { + "epoch": 0.26703426545884207, + "grad_norm": 10.102089881896973, + "learning_rate": 1.014e-06, + "loss": 3.1026, + "step": 339 + }, + { + "epoch": 0.26782197715636075, + "grad_norm": 5.072144031524658, + "learning_rate": 1.017e-06, + "loss": 3.046, + "step": 340 + }, + { + "epoch": 0.2686096888538795, + "grad_norm": 5.284185886383057, + "learning_rate": 1.0200000000000002e-06, + "loss": 3.1028, + "step": 341 + }, + { + "epoch": 0.2693974005513982, + "grad_norm": 6.365428447723389, + "learning_rate": 1.023e-06, + "loss": 3.0764, + "step": 342 + }, + { + "epoch": 0.2701851122489169, + "grad_norm": 14.387186050415039, + "learning_rate": 1.026e-06, + "loss": 3.0952, + "step": 343 + }, + { + "epoch": 0.2709728239464356, + "grad_norm": 6.568045616149902, + "learning_rate": 1.029e-06, + "loss": 3.0715, + "step": 344 + }, + { + "epoch": 0.2717605356439543, + "grad_norm": NaN, + "learning_rate": 1.029e-06, + "loss": 3.1145, + "step": 345 + }, + { + "epoch": 0.27254824734147304, + "grad_norm": 8.732887268066406, + "learning_rate": 1.032e-06, + "loss": 3.0334, + "step": 346 + }, + { + "epoch": 0.2733359590389917, + "grad_norm": 10.508849143981934, + "learning_rate": 1.035e-06, + "loss": 3.1111, + "step": 347 + }, + { + "epoch": 0.27412367073651045, + "grad_norm": 16.092422485351562, + "learning_rate": 1.038e-06, + "loss": 3.1055, + "step": 348 + }, + { + "epoch": 0.27491138243402913, + "grad_norm": 9.130645751953125, + "learning_rate": 1.041e-06, + "loss": 3.0462, + "step": 349 + }, + { + "epoch": 0.27569909413154786, + "grad_norm": 24.408676147460938, + "learning_rate": 1.0439999999999999e-06, + "loss": 3.1199, + "step": 350 + }, + { + "epoch": 0.27648680582906654, + "grad_norm": 12.097480773925781, + "learning_rate": 1.047e-06, + "loss": 3.2197, + "step": 351 + }, + { + "epoch": 0.2772745175265853, + "grad_norm": 9.480462074279785, + "learning_rate": 1.0500000000000001e-06, + "loss": 3.1871, + "step": 352 + }, + { + "epoch": 0.27806222922410395, + "grad_norm": 4.990139007568359, + "learning_rate": 
1.053e-06, + "loss": 3.1389, + "step": 353 + }, + { + "epoch": 0.2788499409216227, + "grad_norm": 4.295811653137207, + "learning_rate": 1.0560000000000001e-06, + "loss": 3.0887, + "step": 354 + }, + { + "epoch": 0.2796376526191414, + "grad_norm": 5.059241771697998, + "learning_rate": 1.059e-06, + "loss": 3.1147, + "step": 355 + }, + { + "epoch": 0.2804253643166601, + "grad_norm": 5.86007022857666, + "learning_rate": 1.062e-06, + "loss": 3.1256, + "step": 356 + }, + { + "epoch": 0.28121307601417883, + "grad_norm": 7.495848178863525, + "learning_rate": 1.065e-06, + "loss": 3.1292, + "step": 357 + }, + { + "epoch": 0.2820007877116975, + "grad_norm": 7.542823791503906, + "learning_rate": 1.068e-06, + "loss": 3.1186, + "step": 358 + }, + { + "epoch": 0.28278849940921624, + "grad_norm": 5.334527015686035, + "learning_rate": 1.0710000000000002e-06, + "loss": 3.0662, + "step": 359 + }, + { + "epoch": 0.2835762111067349, + "grad_norm": 6.712592124938965, + "learning_rate": 1.074e-06, + "loss": 3.1256, + "step": 360 + }, + { + "epoch": 0.28436392280425365, + "grad_norm": 3.8070201873779297, + "learning_rate": 1.077e-06, + "loss": 3.075, + "step": 361 + }, + { + "epoch": 0.28515163450177233, + "grad_norm": 4.047352313995361, + "learning_rate": 1.08e-06, + "loss": 3.0152, + "step": 362 + }, + { + "epoch": 0.28593934619929107, + "grad_norm": 8.286897659301758, + "learning_rate": 1.083e-06, + "loss": 3.1298, + "step": 363 + }, + { + "epoch": 0.28672705789680974, + "grad_norm": 9.559124946594238, + "learning_rate": 1.086e-06, + "loss": 3.0814, + "step": 364 + }, + { + "epoch": 0.2875147695943285, + "grad_norm": 8.545434951782227, + "learning_rate": 1.089e-06, + "loss": 3.0868, + "step": 365 + }, + { + "epoch": 0.28830248129184716, + "grad_norm": 9.708534240722656, + "learning_rate": 1.092e-06, + "loss": 3.0783, + "step": 366 + }, + { + "epoch": 0.2890901929893659, + "grad_norm": 8.198746681213379, + "learning_rate": 1.0949999999999999e-06, + "loss": 3.0696, + "step": 367 + }, + { + "epoch": 0.2898779046868846, + "grad_norm": 14.207758903503418, + "learning_rate": 1.0980000000000001e-06, + "loss": 3.0981, + "step": 368 + }, + { + "epoch": 0.2906656163844033, + "grad_norm": 13.575934410095215, + "learning_rate": 1.1010000000000001e-06, + "loss": 3.0839, + "step": 369 + }, + { + "epoch": 0.29145332808192204, + "grad_norm": 5.160454750061035, + "learning_rate": 1.104e-06, + "loss": 3.0093, + "step": 370 + }, + { + "epoch": 0.2922410397794407, + "grad_norm": 6.382524013519287, + "learning_rate": 1.1070000000000002e-06, + "loss": 3.0758, + "step": 371 + }, + { + "epoch": 0.29302875147695945, + "grad_norm": 10.913718223571777, + "learning_rate": 1.11e-06, + "loss": 3.0577, + "step": 372 + }, + { + "epoch": 0.2938164631744781, + "grad_norm": 26.941162109375, + "learning_rate": 1.113e-06, + "loss": 3.073, + "step": 373 + }, + { + "epoch": 0.29460417487199686, + "grad_norm": 6.947534561157227, + "learning_rate": 1.116e-06, + "loss": 3.0778, + "step": 374 + }, + { + "epoch": 0.29539188656951554, + "grad_norm": 4.768919944763184, + "learning_rate": 1.119e-06, + "loss": 3.0497, + "step": 375 + }, + { + "epoch": 0.29617959826703427, + "grad_norm": 7.091861248016357, + "learning_rate": 1.122e-06, + "loss": 3.0356, + "step": 376 + }, + { + "epoch": 0.29696730996455295, + "grad_norm": 5.327103614807129, + "learning_rate": 1.125e-06, + "loss": 3.0319, + "step": 377 + }, + { + "epoch": 0.2977550216620717, + "grad_norm": 11.311358451843262, + "learning_rate": 1.128e-06, + "loss": 3.0484, + "step": 378 + }, + { + "epoch": 
0.2985427333595904, + "grad_norm": 4.320501804351807, + "learning_rate": 1.131e-06, + "loss": 3.0726, + "step": 379 + }, + { + "epoch": 0.2993304450571091, + "grad_norm": 3.3752224445343018, + "learning_rate": 1.134e-06, + "loss": 3.0624, + "step": 380 + }, + { + "epoch": 0.30011815675462783, + "grad_norm": 4.963978290557861, + "learning_rate": 1.137e-06, + "loss": 3.0524, + "step": 381 + }, + { + "epoch": 0.3009058684521465, + "grad_norm": 6.269223213195801, + "learning_rate": 1.14e-06, + "loss": 3.0607, + "step": 382 + }, + { + "epoch": 0.30169358014966524, + "grad_norm": 4.914637088775635, + "learning_rate": 1.1430000000000001e-06, + "loss": 3.054, + "step": 383 + }, + { + "epoch": 0.3024812918471839, + "grad_norm": 6.588011741638184, + "learning_rate": 1.146e-06, + "loss": 3.0316, + "step": 384 + }, + { + "epoch": 0.30326900354470265, + "grad_norm": 25.187252044677734, + "learning_rate": 1.1490000000000001e-06, + "loss": 3.0949, + "step": 385 + }, + { + "epoch": 0.30405671524222133, + "grad_norm": 11.086193084716797, + "learning_rate": 1.152e-06, + "loss": 3.0741, + "step": 386 + }, + { + "epoch": 0.30484442693974007, + "grad_norm": 6.609203815460205, + "learning_rate": 1.155e-06, + "loss": 3.0451, + "step": 387 + }, + { + "epoch": 0.30563213863725874, + "grad_norm": 5.583180904388428, + "learning_rate": 1.1580000000000002e-06, + "loss": 3.0254, + "step": 388 + }, + { + "epoch": 0.3064198503347775, + "grad_norm": 5.788333892822266, + "learning_rate": 1.161e-06, + "loss": 3.0526, + "step": 389 + }, + { + "epoch": 0.30720756203229616, + "grad_norm": 6.242170810699463, + "learning_rate": 1.164e-06, + "loss": 3.1087, + "step": 390 + }, + { + "epoch": 0.3079952737298149, + "grad_norm": 3.79465389251709, + "learning_rate": 1.167e-06, + "loss": 3.0145, + "step": 391 + }, + { + "epoch": 0.3087829854273336, + "grad_norm": 5.768966197967529, + "learning_rate": 1.17e-06, + "loss": 3.0468, + "step": 392 + }, + { + "epoch": 0.3095706971248523, + "grad_norm": 5.131977081298828, + "learning_rate": 1.173e-06, + "loss": 3.0138, + "step": 393 + }, + { + "epoch": 0.31035840882237103, + "grad_norm": 3.707314968109131, + "learning_rate": 1.176e-06, + "loss": 3.0073, + "step": 394 + }, + { + "epoch": 0.3111461205198897, + "grad_norm": 4.692445278167725, + "learning_rate": 1.179e-06, + "loss": 2.9971, + "step": 395 + }, + { + "epoch": 0.31193383221740845, + "grad_norm": 7.342323303222656, + "learning_rate": 1.1819999999999999e-06, + "loss": 3.0303, + "step": 396 + }, + { + "epoch": 0.3127215439149271, + "grad_norm": 3.6735751628875732, + "learning_rate": 1.185e-06, + "loss": 2.9811, + "step": 397 + }, + { + "epoch": 0.31350925561244586, + "grad_norm": 11.098773002624512, + "learning_rate": 1.188e-06, + "loss": 3.041, + "step": 398 + }, + { + "epoch": 0.31429696730996454, + "grad_norm": 6.451319694519043, + "learning_rate": 1.191e-06, + "loss": 3.0379, + "step": 399 + }, + { + "epoch": 0.31508467900748327, + "grad_norm": 5.416855335235596, + "learning_rate": 1.1940000000000001e-06, + "loss": 3.0305, + "step": 400 + }, + { + "epoch": 0.31587239070500195, + "grad_norm": 7.048165798187256, + "learning_rate": 1.197e-06, + "loss": 3.1535, + "step": 401 + }, + { + "epoch": 0.3166601024025207, + "grad_norm": 6.660574436187744, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.0694, + "step": 402 + }, + { + "epoch": 0.31744781410003936, + "grad_norm": 4.550227165222168, + "learning_rate": 1.203e-06, + "loss": 3.0481, + "step": 403 + }, + { + "epoch": 0.3182355257975581, + "grad_norm": 3.6928353309631348, + 
"learning_rate": 1.206e-06, + "loss": 3.0477, + "step": 404 + }, + { + "epoch": 0.31902323749507683, + "grad_norm": 5.430210113525391, + "learning_rate": 1.2090000000000002e-06, + "loss": 3.0504, + "step": 405 + }, + { + "epoch": 0.3198109491925955, + "grad_norm": 5.120702743530273, + "learning_rate": 1.212e-06, + "loss": 3.0525, + "step": 406 + }, + { + "epoch": 0.32059866089011424, + "grad_norm": 5.877532958984375, + "learning_rate": 1.215e-06, + "loss": 3.0455, + "step": 407 + }, + { + "epoch": 0.3213863725876329, + "grad_norm": 7.078686237335205, + "learning_rate": 1.218e-06, + "loss": 3.0178, + "step": 408 + }, + { + "epoch": 0.32217408428515165, + "grad_norm": 4.936050891876221, + "learning_rate": 1.221e-06, + "loss": 3.0271, + "step": 409 + }, + { + "epoch": 0.32296179598267033, + "grad_norm": 3.3682117462158203, + "learning_rate": 1.224e-06, + "loss": 3.0002, + "step": 410 + }, + { + "epoch": 0.32374950768018906, + "grad_norm": 15.289687156677246, + "learning_rate": 1.227e-06, + "loss": 3.0122, + "step": 411 + }, + { + "epoch": 0.32453721937770774, + "grad_norm": 8.485540390014648, + "learning_rate": 1.23e-06, + "loss": 3.0174, + "step": 412 + }, + { + "epoch": 0.3253249310752265, + "grad_norm": 8.567049980163574, + "learning_rate": 1.2329999999999999e-06, + "loss": 3.0221, + "step": 413 + }, + { + "epoch": 0.32611264277274515, + "grad_norm": 7.974210262298584, + "learning_rate": 1.2360000000000001e-06, + "loss": 3.0296, + "step": 414 + }, + { + "epoch": 0.3269003544702639, + "grad_norm": 6.598056793212891, + "learning_rate": 1.2390000000000001e-06, + "loss": 3.0535, + "step": 415 + }, + { + "epoch": 0.32768806616778257, + "grad_norm": 9.896428108215332, + "learning_rate": 1.242e-06, + "loss": 3.0089, + "step": 416 + }, + { + "epoch": 0.3284757778653013, + "grad_norm": 6.9402971267700195, + "learning_rate": 1.2450000000000002e-06, + "loss": 2.9808, + "step": 417 + }, + { + "epoch": 0.32926348956282003, + "grad_norm": 8.137408256530762, + "learning_rate": 1.248e-06, + "loss": 2.9786, + "step": 418 + }, + { + "epoch": 0.3300512012603387, + "grad_norm": 7.3277082443237305, + "learning_rate": 1.251e-06, + "loss": 3.0269, + "step": 419 + }, + { + "epoch": 0.33083891295785745, + "grad_norm": 6.309098720550537, + "learning_rate": 1.254e-06, + "loss": 3.0344, + "step": 420 + }, + { + "epoch": 0.3316266246553761, + "grad_norm": 5.348298072814941, + "learning_rate": 1.257e-06, + "loss": 3.0181, + "step": 421 + }, + { + "epoch": 0.33241433635289486, + "grad_norm": 12.52122688293457, + "learning_rate": 1.26e-06, + "loss": 2.9354, + "step": 422 + }, + { + "epoch": 0.33320204805041354, + "grad_norm": 10.916754722595215, + "learning_rate": 1.263e-06, + "loss": 2.985, + "step": 423 + }, + { + "epoch": 0.33398975974793227, + "grad_norm": 6.322619915008545, + "learning_rate": 1.266e-06, + "loss": 2.9851, + "step": 424 + }, + { + "epoch": 0.33477747144545095, + "grad_norm": 5.217950344085693, + "learning_rate": 1.269e-06, + "loss": 3.0274, + "step": 425 + }, + { + "epoch": 0.3355651831429697, + "grad_norm": 5.328530788421631, + "learning_rate": 1.272e-06, + "loss": 3.0523, + "step": 426 + }, + { + "epoch": 0.33635289484048836, + "grad_norm": 10.175230026245117, + "learning_rate": 1.275e-06, + "loss": 3.0136, + "step": 427 + }, + { + "epoch": 0.3371406065380071, + "grad_norm": 7.5981903076171875, + "learning_rate": 1.278e-06, + "loss": 2.9665, + "step": 428 + }, + { + "epoch": 0.33792831823552577, + "grad_norm": 10.223776817321777, + "learning_rate": 1.281e-06, + "loss": 2.9896, + "step": 429 + }, + { 
+ "epoch": 0.3387160299330445, + "grad_norm": 13.228652954101562, + "learning_rate": 1.284e-06, + "loss": 2.9947, + "step": 430 + }, + { + "epoch": 0.33950374163056324, + "grad_norm": 5.675848960876465, + "learning_rate": 1.2870000000000001e-06, + "loss": 2.993, + "step": 431 + }, + { + "epoch": 0.3402914533280819, + "grad_norm": 4.715620517730713, + "learning_rate": 1.29e-06, + "loss": 2.9631, + "step": 432 + }, + { + "epoch": 0.34107916502560065, + "grad_norm": 5.10097074508667, + "learning_rate": 1.293e-06, + "loss": 3.0143, + "step": 433 + }, + { + "epoch": 0.34186687672311933, + "grad_norm": 6.7617597579956055, + "learning_rate": 1.2960000000000002e-06, + "loss": 3.0083, + "step": 434 + }, + { + "epoch": 0.34265458842063806, + "grad_norm": 17.813114166259766, + "learning_rate": 1.299e-06, + "loss": 2.9979, + "step": 435 + }, + { + "epoch": 0.34344230011815674, + "grad_norm": 10.038800239562988, + "learning_rate": 1.302e-06, + "loss": 3.0087, + "step": 436 + }, + { + "epoch": 0.3442300118156755, + "grad_norm": 7.763751983642578, + "learning_rate": 1.305e-06, + "loss": 2.9638, + "step": 437 + }, + { + "epoch": 0.34501772351319415, + "grad_norm": 4.817987442016602, + "learning_rate": 1.308e-06, + "loss": 2.9396, + "step": 438 + }, + { + "epoch": 0.3458054352107129, + "grad_norm": 15.853002548217773, + "learning_rate": 1.311e-06, + "loss": 2.9735, + "step": 439 + }, + { + "epoch": 0.34659314690823156, + "grad_norm": 19.025659561157227, + "learning_rate": 1.314e-06, + "loss": 2.9477, + "step": 440 + }, + { + "epoch": 0.3473808586057503, + "grad_norm": 3.736346483230591, + "learning_rate": 1.317e-06, + "loss": 2.9453, + "step": 441 + }, + { + "epoch": 0.348168570303269, + "grad_norm": 5.593940258026123, + "learning_rate": 1.3199999999999999e-06, + "loss": 3.0248, + "step": 442 + }, + { + "epoch": 0.3489562820007877, + "grad_norm": 5.00626277923584, + "learning_rate": 1.323e-06, + "loss": 3.0009, + "step": 443 + }, + { + "epoch": 0.34974399369830644, + "grad_norm": 4.503805160522461, + "learning_rate": 1.326e-06, + "loss": 2.9515, + "step": 444 + }, + { + "epoch": 0.3505317053958251, + "grad_norm": 4.232930660247803, + "learning_rate": 1.3290000000000001e-06, + "loss": 2.96, + "step": 445 + }, + { + "epoch": 0.35131941709334386, + "grad_norm": 5.650477409362793, + "learning_rate": 1.3320000000000001e-06, + "loss": 2.9606, + "step": 446 + }, + { + "epoch": 0.35210712879086253, + "grad_norm": 18.473041534423828, + "learning_rate": 1.335e-06, + "loss": 3.0121, + "step": 447 + }, + { + "epoch": 0.35289484048838127, + "grad_norm": 6.920109748840332, + "learning_rate": 1.3380000000000001e-06, + "loss": 2.9874, + "step": 448 + }, + { + "epoch": 0.35368255218589995, + "grad_norm": 7.728922367095947, + "learning_rate": 1.341e-06, + "loss": 2.9547, + "step": 449 + }, + { + "epoch": 0.3544702638834187, + "grad_norm": 4.263467788696289, + "learning_rate": 1.344e-06, + "loss": 2.987, + "step": 450 + }, + { + "epoch": 0.35525797558093736, + "grad_norm": 4.650792121887207, + "learning_rate": 1.3470000000000002e-06, + "loss": 3.1063, + "step": 451 + }, + { + "epoch": 0.3560456872784561, + "grad_norm": 3.902890682220459, + "learning_rate": 1.35e-06, + "loss": 3.044, + "step": 452 + }, + { + "epoch": 0.35683339897597477, + "grad_norm": 5.16432523727417, + "learning_rate": 1.353e-06, + "loss": 3.0441, + "step": 453 + }, + { + "epoch": 0.3576211106734935, + "grad_norm": 2.9720873832702637, + "learning_rate": 1.356e-06, + "loss": 3.0133, + "step": 454 + }, + { + "epoch": 0.35840882237101224, + "grad_norm": 
3.9596035480499268, + "learning_rate": 1.359e-06, + "loss": 2.9884, + "step": 455 + }, + { + "epoch": 0.3591965340685309, + "grad_norm": 3.3732471466064453, + "learning_rate": 1.362e-06, + "loss": 2.9691, + "step": 456 + }, + { + "epoch": 0.35998424576604965, + "grad_norm": 12.752083778381348, + "learning_rate": 1.365e-06, + "loss": 2.9719, + "step": 457 + }, + { + "epoch": 0.3607719574635683, + "grad_norm": 3.6770858764648438, + "learning_rate": 1.368e-06, + "loss": 2.9463, + "step": 458 + }, + { + "epoch": 0.36155966916108706, + "grad_norm": 3.7643487453460693, + "learning_rate": 1.3709999999999999e-06, + "loss": 2.9454, + "step": 459 + }, + { + "epoch": 0.36234738085860574, + "grad_norm": 8.50643253326416, + "learning_rate": 1.374e-06, + "loss": 2.9626, + "step": 460 + }, + { + "epoch": 0.3631350925561245, + "grad_norm": 4.350733757019043, + "learning_rate": 1.3770000000000001e-06, + "loss": 2.9378, + "step": 461 + }, + { + "epoch": 0.36392280425364315, + "grad_norm": 12.259222030639648, + "learning_rate": 1.38e-06, + "loss": 2.9556, + "step": 462 + }, + { + "epoch": 0.3647105159511619, + "grad_norm": 7.144326686859131, + "learning_rate": 1.3830000000000001e-06, + "loss": 2.9521, + "step": 463 + }, + { + "epoch": 0.36549822764868056, + "grad_norm": 4.518127918243408, + "learning_rate": 1.386e-06, + "loss": 2.9495, + "step": 464 + }, + { + "epoch": 0.3662859393461993, + "grad_norm": 11.50539779663086, + "learning_rate": 1.389e-06, + "loss": 2.9482, + "step": 465 + }, + { + "epoch": 0.367073651043718, + "grad_norm": 9.436210632324219, + "learning_rate": 1.392e-06, + "loss": 2.971, + "step": 466 + }, + { + "epoch": 0.3678613627412367, + "grad_norm": 5.534749507904053, + "learning_rate": 1.395e-06, + "loss": 2.9316, + "step": 467 + }, + { + "epoch": 0.36864907443875544, + "grad_norm": 5.906118392944336, + "learning_rate": 1.3980000000000002e-06, + "loss": 2.946, + "step": 468 + }, + { + "epoch": 0.3694367861362741, + "grad_norm": 8.404918670654297, + "learning_rate": 1.401e-06, + "loss": 2.9667, + "step": 469 + }, + { + "epoch": 0.37022449783379285, + "grad_norm": Infinity, + "learning_rate": 1.401e-06, + "loss": 2.9622, + "step": 470 + }, + { + "epoch": 0.37101220953131153, + "grad_norm": 6.801695346832275, + "learning_rate": 1.404e-06, + "loss": 2.9741, + "step": 471 + }, + { + "epoch": 0.37179992122883027, + "grad_norm": 6.5663533210754395, + "learning_rate": 1.407e-06, + "loss": 2.9605, + "step": 472 + }, + { + "epoch": 0.37258763292634894, + "grad_norm": 13.700033187866211, + "learning_rate": 1.41e-06, + "loss": 2.9808, + "step": 473 + }, + { + "epoch": 0.3733753446238677, + "grad_norm": 5.247633457183838, + "learning_rate": 1.413e-06, + "loss": 2.9489, + "step": 474 + }, + { + "epoch": 0.37416305632138636, + "grad_norm": 8.073824882507324, + "learning_rate": 1.416e-06, + "loss": 2.994, + "step": 475 + }, + { + "epoch": 0.3749507680189051, + "grad_norm": 4.412911891937256, + "learning_rate": 1.419e-06, + "loss": 2.9593, + "step": 476 + }, + { + "epoch": 0.37573847971642377, + "grad_norm": 6.819213390350342, + "learning_rate": 1.422e-06, + "loss": 2.9465, + "step": 477 + }, + { + "epoch": 0.3765261914139425, + "grad_norm": 5.330704212188721, + "learning_rate": 1.4250000000000001e-06, + "loss": 2.9792, + "step": 478 + }, + { + "epoch": 0.3773139031114612, + "grad_norm": 12.255523681640625, + "learning_rate": 1.4280000000000001e-06, + "loss": 2.9516, + "step": 479 + }, + { + "epoch": 0.3781016148089799, + "grad_norm": 8.835041046142578, + "learning_rate": 1.431e-06, + "loss": 2.9653, + 
"step": 480 + }, + { + "epoch": 0.37888932650649865, + "grad_norm": 6.3353590965271, + "learning_rate": 1.4340000000000002e-06, + "loss": 2.9525, + "step": 481 + }, + { + "epoch": 0.3796770382040173, + "grad_norm": 14.417763710021973, + "learning_rate": 1.437e-06, + "loss": 2.9872, + "step": 482 + }, + { + "epoch": 0.38046474990153606, + "grad_norm": 5.643706321716309, + "learning_rate": 1.44e-06, + "loss": 2.968, + "step": 483 + }, + { + "epoch": 0.38125246159905474, + "grad_norm": 6.521895408630371, + "learning_rate": 1.443e-06, + "loss": 2.9542, + "step": 484 + }, + { + "epoch": 0.38204017329657347, + "grad_norm": 15.12179946899414, + "learning_rate": 1.446e-06, + "loss": 2.9493, + "step": 485 + }, + { + "epoch": 0.38282788499409215, + "grad_norm": 4.613198757171631, + "learning_rate": 1.449e-06, + "loss": 2.9726, + "step": 486 + }, + { + "epoch": 0.3836155966916109, + "grad_norm": 4.991598129272461, + "learning_rate": 1.452e-06, + "loss": 2.9897, + "step": 487 + }, + { + "epoch": 0.38440330838912956, + "grad_norm": 5.223840713500977, + "learning_rate": 1.455e-06, + "loss": 2.9751, + "step": 488 + }, + { + "epoch": 0.3851910200866483, + "grad_norm": 18.718603134155273, + "learning_rate": 1.458e-06, + "loss": 2.9849, + "step": 489 + }, + { + "epoch": 0.385978731784167, + "grad_norm": 5.200051307678223, + "learning_rate": 1.461e-06, + "loss": 2.9345, + "step": 490 + }, + { + "epoch": 0.3867664434816857, + "grad_norm": 12.832660675048828, + "learning_rate": 1.464e-06, + "loss": 2.9189, + "step": 491 + }, + { + "epoch": 0.3875541551792044, + "grad_norm": 15.693609237670898, + "learning_rate": 1.467e-06, + "loss": 2.918, + "step": 492 + }, + { + "epoch": 0.3883418668767231, + "grad_norm": 8.23716926574707, + "learning_rate": 1.4700000000000001e-06, + "loss": 2.9339, + "step": 493 + }, + { + "epoch": 0.38912957857424185, + "grad_norm": 5.50828218460083, + "learning_rate": 1.473e-06, + "loss": 2.9423, + "step": 494 + }, + { + "epoch": 0.38991729027176053, + "grad_norm": 25.409927368164062, + "learning_rate": 1.4760000000000001e-06, + "loss": 2.9613, + "step": 495 + }, + { + "epoch": 0.39070500196927926, + "grad_norm": 6.568357467651367, + "learning_rate": 1.479e-06, + "loss": 2.9141, + "step": 496 + }, + { + "epoch": 0.39149271366679794, + "grad_norm": 7.352271556854248, + "learning_rate": 1.482e-06, + "loss": 2.9467, + "step": 497 + }, + { + "epoch": 0.3922804253643167, + "grad_norm": 3.6445388793945312, + "learning_rate": 1.4850000000000002e-06, + "loss": 2.9359, + "step": 498 + }, + { + "epoch": 0.39306813706183535, + "grad_norm": 26.993850708007812, + "learning_rate": 1.488e-06, + "loss": 2.9235, + "step": 499 + }, + { + "epoch": 0.3938558487593541, + "grad_norm": 9.328468322753906, + "learning_rate": 1.491e-06, + "loss": 2.942, + "step": 500 + }, + { + "epoch": 0.39464356045687277, + "grad_norm": 5.654018402099609, + "learning_rate": 1.494e-06, + "loss": 3.0477, + "step": 501 + }, + { + "epoch": 0.3954312721543915, + "grad_norm": 3.729282855987549, + "learning_rate": 1.497e-06, + "loss": 3.0116, + "step": 502 + }, + { + "epoch": 0.3962189838519102, + "grad_norm": 3.4503934383392334, + "learning_rate": 1.5e-06, + "loss": 2.9652, + "step": 503 + }, + { + "epoch": 0.3970066955494289, + "grad_norm": 11.940032005310059, + "learning_rate": 1.503e-06, + "loss": 2.9791, + "step": 504 + }, + { + "epoch": 0.3977944072469476, + "grad_norm": 6.245434284210205, + "learning_rate": 1.506e-06, + "loss": 2.9647, + "step": 505 + }, + { + "epoch": 0.3985821189444663, + "grad_norm": 7.571252822875977, + 
"learning_rate": 1.5089999999999999e-06, + "loss": 2.9729, + "step": 506 + }, + { + "epoch": 0.39936983064198506, + "grad_norm": 4.826038360595703, + "learning_rate": 1.512e-06, + "loss": 2.9781, + "step": 507 + }, + { + "epoch": 0.40015754233950374, + "grad_norm": 64.6466064453125, + "learning_rate": 1.5150000000000001e-06, + "loss": 2.9578, + "step": 508 + }, + { + "epoch": 0.40094525403702247, + "grad_norm": 6.013833999633789, + "learning_rate": 1.518e-06, + "loss": 2.9605, + "step": 509 + }, + { + "epoch": 0.40173296573454115, + "grad_norm": 4.100379943847656, + "learning_rate": 1.5210000000000001e-06, + "loss": 2.9382, + "step": 510 + }, + { + "epoch": 0.4025206774320599, + "grad_norm": 8.676809310913086, + "learning_rate": 1.524e-06, + "loss": 2.948, + "step": 511 + }, + { + "epoch": 0.40330838912957856, + "grad_norm": 12.685815811157227, + "learning_rate": 1.5270000000000002e-06, + "loss": 2.9623, + "step": 512 + }, + { + "epoch": 0.4040961008270973, + "grad_norm": 10.464503288269043, + "learning_rate": 1.53e-06, + "loss": 2.9061, + "step": 513 + }, + { + "epoch": 0.40488381252461597, + "grad_norm": 6.26030969619751, + "learning_rate": 1.533e-06, + "loss": 2.9066, + "step": 514 + }, + { + "epoch": 0.4056715242221347, + "grad_norm": 9.786190032958984, + "learning_rate": 1.5360000000000002e-06, + "loss": 2.9288, + "step": 515 + }, + { + "epoch": 0.4064592359196534, + "grad_norm": 9.137029647827148, + "learning_rate": 1.539e-06, + "loss": 2.8863, + "step": 516 + }, + { + "epoch": 0.4072469476171721, + "grad_norm": 6.453967094421387, + "learning_rate": 1.542e-06, + "loss": 2.9611, + "step": 517 + }, + { + "epoch": 0.40803465931469085, + "grad_norm": 6.627200126647949, + "learning_rate": 1.545e-06, + "loss": 2.9674, + "step": 518 + }, + { + "epoch": 0.40882237101220953, + "grad_norm": 6.247026443481445, + "learning_rate": 1.548e-06, + "loss": 2.9676, + "step": 519 + }, + { + "epoch": 0.40961008270972826, + "grad_norm": 5.725870132446289, + "learning_rate": 1.551e-06, + "loss": 2.905, + "step": 520 + }, + { + "epoch": 0.41039779440724694, + "grad_norm": 5.8602190017700195, + "learning_rate": 1.554e-06, + "loss": 2.9517, + "step": 521 + }, + { + "epoch": 0.4111855061047657, + "grad_norm": 18.588109970092773, + "learning_rate": 1.557e-06, + "loss": 2.962, + "step": 522 + }, + { + "epoch": 0.41197321780228435, + "grad_norm": 38.5172004699707, + "learning_rate": 1.5599999999999999e-06, + "loss": 2.9499, + "step": 523 + }, + { + "epoch": 0.4127609294998031, + "grad_norm": 7.698236465454102, + "learning_rate": 1.5630000000000001e-06, + "loss": 2.9274, + "step": 524 + }, + { + "epoch": 0.41354864119732176, + "grad_norm": 31.03251838684082, + "learning_rate": 1.5660000000000001e-06, + "loss": 2.934, + "step": 525 + }, + { + "epoch": 0.4143363528948405, + "grad_norm": 5.372282981872559, + "learning_rate": 1.569e-06, + "loss": 2.9664, + "step": 526 + }, + { + "epoch": 0.4151240645923592, + "grad_norm": 10.512112617492676, + "learning_rate": 1.5720000000000002e-06, + "loss": 2.8887, + "step": 527 + }, + { + "epoch": 0.4159117762898779, + "grad_norm": 4.788158893585205, + "learning_rate": 1.575e-06, + "loss": 2.9209, + "step": 528 + }, + { + "epoch": 0.4166994879873966, + "grad_norm": 6.5681376457214355, + "learning_rate": 1.578e-06, + "loss": 2.9233, + "step": 529 + }, + { + "epoch": 0.4174871996849153, + "grad_norm": 44.41343307495117, + "learning_rate": 1.581e-06, + "loss": 2.9077, + "step": 530 + }, + { + "epoch": 0.41827491138243406, + "grad_norm": 6.633177280426025, + "learning_rate": 
1.584e-06, + "loss": 2.9507, + "step": 531 + }, + { + "epoch": 0.41906262307995273, + "grad_norm": 8.874907493591309, + "learning_rate": 1.5870000000000002e-06, + "loss": 2.9034, + "step": 532 + }, + { + "epoch": 0.41985033477747147, + "grad_norm": 6.798246383666992, + "learning_rate": 1.59e-06, + "loss": 2.9198, + "step": 533 + }, + { + "epoch": 0.42063804647499015, + "grad_norm": 9.883596420288086, + "learning_rate": 1.593e-06, + "loss": 2.931, + "step": 534 + }, + { + "epoch": 0.4214257581725089, + "grad_norm": 9.287618637084961, + "learning_rate": 1.596e-06, + "loss": 2.8948, + "step": 535 + }, + { + "epoch": 0.42221346987002756, + "grad_norm": 17.261999130249023, + "learning_rate": 1.599e-06, + "loss": 2.9177, + "step": 536 + }, + { + "epoch": 0.4230011815675463, + "grad_norm": 6.711757183074951, + "learning_rate": 1.602e-06, + "loss": 2.9109, + "step": 537 + }, + { + "epoch": 0.42378889326506497, + "grad_norm": 8.325769424438477, + "learning_rate": 1.605e-06, + "loss": 2.9368, + "step": 538 + }, + { + "epoch": 0.4245766049625837, + "grad_norm": 8.21493148803711, + "learning_rate": 1.608e-06, + "loss": 2.9161, + "step": 539 + }, + { + "epoch": 0.4253643166601024, + "grad_norm": 8.994702339172363, + "learning_rate": 1.611e-06, + "loss": 2.927, + "step": 540 + }, + { + "epoch": 0.4261520283576211, + "grad_norm": 3.5007236003875732, + "learning_rate": 1.6140000000000001e-06, + "loss": 2.9399, + "step": 541 + }, + { + "epoch": 0.4269397400551398, + "grad_norm": 9.40645980834961, + "learning_rate": 1.6170000000000001e-06, + "loss": 2.9062, + "step": 542 + }, + { + "epoch": 0.4277274517526585, + "grad_norm": 13.810895919799805, + "learning_rate": 1.62e-06, + "loss": 2.9009, + "step": 543 + }, + { + "epoch": 0.42851516345017726, + "grad_norm": 16.642290115356445, + "learning_rate": 1.6230000000000002e-06, + "loss": 2.9402, + "step": 544 + }, + { + "epoch": 0.42930287514769594, + "grad_norm": 6.338722229003906, + "learning_rate": 1.626e-06, + "loss": 2.92, + "step": 545 + }, + { + "epoch": 0.4300905868452147, + "grad_norm": 6.34137487411499, + "learning_rate": 1.629e-06, + "loss": 2.8953, + "step": 546 + }, + { + "epoch": 0.43087829854273335, + "grad_norm": 5.905561923980713, + "learning_rate": 1.632e-06, + "loss": 2.9037, + "step": 547 + }, + { + "epoch": 0.4316660102402521, + "grad_norm": 26.460729598999023, + "learning_rate": 1.635e-06, + "loss": 2.9186, + "step": 548 + }, + { + "epoch": 0.43245372193777076, + "grad_norm": 15.366682052612305, + "learning_rate": 1.638e-06, + "loss": 2.9443, + "step": 549 + }, + { + "epoch": 0.4332414336352895, + "grad_norm": 11.808568954467773, + "learning_rate": 1.641e-06, + "loss": 2.9328, + "step": 550 + }, + { + "epoch": 0.4340291453328082, + "grad_norm": 7.734588623046875, + "learning_rate": 1.644e-06, + "loss": 3.006, + "step": 551 + }, + { + "epoch": 0.4348168570303269, + "grad_norm": 9.05081558227539, + "learning_rate": 1.6469999999999999e-06, + "loss": 3.0397, + "step": 552 + }, + { + "epoch": 0.4356045687278456, + "grad_norm": 5.760892868041992, + "learning_rate": 1.65e-06, + "loss": 2.9384, + "step": 553 + }, + { + "epoch": 0.4363922804253643, + "grad_norm": 10.124921798706055, + "learning_rate": 1.653e-06, + "loss": 2.9691, + "step": 554 + }, + { + "epoch": 0.437179992122883, + "grad_norm": 5.164519309997559, + "learning_rate": 1.6560000000000001e-06, + "loss": 2.9375, + "step": 555 + }, + { + "epoch": 0.43796770382040173, + "grad_norm": 20.4421443939209, + "learning_rate": 1.6590000000000001e-06, + "loss": 2.8953, + "step": 556 + }, + { + 
"epoch": 0.43875541551792047, + "grad_norm": 4.481269836425781, + "learning_rate": 1.662e-06, + "loss": 2.9535, + "step": 557 + }, + { + "epoch": 0.43954312721543914, + "grad_norm": 5.2533860206604, + "learning_rate": 1.6650000000000002e-06, + "loss": 2.9224, + "step": 558 + }, + { + "epoch": 0.4403308389129579, + "grad_norm": 16.052967071533203, + "learning_rate": 1.668e-06, + "loss": 2.9208, + "step": 559 + }, + { + "epoch": 0.44111855061047656, + "grad_norm": 4.025826454162598, + "learning_rate": 1.671e-06, + "loss": 2.905, + "step": 560 + }, + { + "epoch": 0.4419062623079953, + "grad_norm": 5.7356953620910645, + "learning_rate": 1.6740000000000002e-06, + "loss": 2.896, + "step": 561 + }, + { + "epoch": 0.44269397400551397, + "grad_norm": 6.793007850646973, + "learning_rate": 1.677e-06, + "loss": 2.9275, + "step": 562 + }, + { + "epoch": 0.4434816857030327, + "grad_norm": 4.742384433746338, + "learning_rate": 1.68e-06, + "loss": 2.9152, + "step": 563 + }, + { + "epoch": 0.4442693974005514, + "grad_norm": 8.130260467529297, + "learning_rate": 1.683e-06, + "loss": 2.915, + "step": 564 + }, + { + "epoch": 0.4450571090980701, + "grad_norm": 5.794771194458008, + "learning_rate": 1.686e-06, + "loss": 2.926, + "step": 565 + }, + { + "epoch": 0.4458448207955888, + "grad_norm": 5.188541889190674, + "learning_rate": 1.689e-06, + "loss": 2.9324, + "step": 566 + }, + { + "epoch": 0.4466325324931075, + "grad_norm": 4.144057273864746, + "learning_rate": 1.692e-06, + "loss": 2.874, + "step": 567 + }, + { + "epoch": 0.4474202441906262, + "grad_norm": 6.729897975921631, + "learning_rate": 1.695e-06, + "loss": 2.8972, + "step": 568 + }, + { + "epoch": 0.44820795588814494, + "grad_norm": NaN, + "learning_rate": 1.695e-06, + "loss": 2.9093, + "step": 569 + }, + { + "epoch": 0.44899566758566367, + "grad_norm": 5.1365532875061035, + "learning_rate": 1.6979999999999999e-06, + "loss": 2.9494, + "step": 570 + }, + { + "epoch": 0.44978337928318235, + "grad_norm": 3.857987403869629, + "learning_rate": 1.701e-06, + "loss": 2.9046, + "step": 571 + }, + { + "epoch": 0.4505710909807011, + "grad_norm": 4.504978656768799, + "learning_rate": 1.7040000000000001e-06, + "loss": 2.9046, + "step": 572 + }, + { + "epoch": 0.45135880267821976, + "grad_norm": 14.969274520874023, + "learning_rate": 1.707e-06, + "loss": 2.8938, + "step": 573 + }, + { + "epoch": 0.4521465143757385, + "grad_norm": 7.5694804191589355, + "learning_rate": 1.7100000000000001e-06, + "loss": 2.9687, + "step": 574 + }, + { + "epoch": 0.4529342260732572, + "grad_norm": 19.504871368408203, + "learning_rate": 1.713e-06, + "loss": 2.9533, + "step": 575 + }, + { + "epoch": 0.4537219377707759, + "grad_norm": 14.33553409576416, + "learning_rate": 1.7160000000000002e-06, + "loss": 2.9262, + "step": 576 + }, + { + "epoch": 0.4545096494682946, + "grad_norm": 9.721388816833496, + "learning_rate": 1.719e-06, + "loss": 2.9492, + "step": 577 + }, + { + "epoch": 0.4552973611658133, + "grad_norm": 7.896555423736572, + "learning_rate": 1.722e-06, + "loss": 2.9286, + "step": 578 + }, + { + "epoch": 0.456085072863332, + "grad_norm": 5.8212361335754395, + "learning_rate": 1.7250000000000002e-06, + "loss": 2.9108, + "step": 579 + }, + { + "epoch": 0.45687278456085073, + "grad_norm": 6.00703763961792, + "learning_rate": 1.728e-06, + "loss": 2.9488, + "step": 580 + }, + { + "epoch": 0.4576604962583694, + "grad_norm": 5.844910621643066, + "learning_rate": 1.731e-06, + "loss": 2.9309, + "step": 581 + }, + { + "epoch": 0.45844820795588814, + "grad_norm": 5.616259574890137, + 
"learning_rate": 1.734e-06, + "loss": 2.9174, + "step": 582 + }, + { + "epoch": 0.4592359196534069, + "grad_norm": 9.855520248413086, + "learning_rate": 1.737e-06, + "loss": 2.8942, + "step": 583 + }, + { + "epoch": 0.46002363135092555, + "grad_norm": 8.823214530944824, + "learning_rate": 1.74e-06, + "loss": 2.9371, + "step": 584 + }, + { + "epoch": 0.4608113430484443, + "grad_norm": 5.850224494934082, + "learning_rate": 1.743e-06, + "loss": 2.9235, + "step": 585 + }, + { + "epoch": 0.46159905474596297, + "grad_norm": 7.153159141540527, + "learning_rate": 1.746e-06, + "loss": 2.8827, + "step": 586 + }, + { + "epoch": 0.4623867664434817, + "grad_norm": 24.55967903137207, + "learning_rate": 1.749e-06, + "loss": 2.8615, + "step": 587 + }, + { + "epoch": 0.4631744781410004, + "grad_norm": 7.238897800445557, + "learning_rate": 1.7520000000000001e-06, + "loss": 2.8888, + "step": 588 + }, + { + "epoch": 0.4639621898385191, + "grad_norm": 9.293492317199707, + "learning_rate": 1.7550000000000001e-06, + "loss": 2.8868, + "step": 589 + }, + { + "epoch": 0.4647499015360378, + "grad_norm": 4.436881065368652, + "learning_rate": 1.758e-06, + "loss": 2.8886, + "step": 590 + }, + { + "epoch": 0.4655376132335565, + "grad_norm": 5.460833549499512, + "learning_rate": 1.7610000000000002e-06, + "loss": 2.8643, + "step": 591 + }, + { + "epoch": 0.4663253249310752, + "grad_norm": 6.941051006317139, + "learning_rate": 1.764e-06, + "loss": 2.8833, + "step": 592 + }, + { + "epoch": 0.46711303662859394, + "grad_norm": 22.04667091369629, + "learning_rate": 1.767e-06, + "loss": 2.8941, + "step": 593 + }, + { + "epoch": 0.46790074832611267, + "grad_norm": 13.834938049316406, + "learning_rate": 1.77e-06, + "loss": 2.8985, + "step": 594 + }, + { + "epoch": 0.46868846002363135, + "grad_norm": 6.131762981414795, + "learning_rate": 1.773e-06, + "loss": 2.8624, + "step": 595 + }, + { + "epoch": 0.4694761717211501, + "grad_norm": 8.991756439208984, + "learning_rate": 1.776e-06, + "loss": 2.9043, + "step": 596 + }, + { + "epoch": 0.47026388341866876, + "grad_norm": 6.86785888671875, + "learning_rate": 1.779e-06, + "loss": 2.8875, + "step": 597 + }, + { + "epoch": 0.4710515951161875, + "grad_norm": 9.442583084106445, + "learning_rate": 1.782e-06, + "loss": 2.9379, + "step": 598 + }, + { + "epoch": 0.47183930681370617, + "grad_norm": 4.0587592124938965, + "learning_rate": 1.785e-06, + "loss": 2.8923, + "step": 599 + }, + { + "epoch": 0.4726270185112249, + "grad_norm": 7.948934555053711, + "learning_rate": 1.788e-06, + "loss": 2.9647, + "step": 600 + }, + { + "epoch": 0.4734147302087436, + "grad_norm": 19.64722442626953, + "learning_rate": 1.791e-06, + "loss": 3.0647, + "step": 601 + }, + { + "epoch": 0.4742024419062623, + "grad_norm": 9.444209098815918, + "learning_rate": 1.794e-06, + "loss": 2.9941, + "step": 602 + }, + { + "epoch": 0.474990153603781, + "grad_norm": 10.560823440551758, + "learning_rate": 1.7970000000000001e-06, + "loss": 2.9763, + "step": 603 + }, + { + "epoch": 0.47577786530129973, + "grad_norm": 11.41232681274414, + "learning_rate": 1.8e-06, + "loss": 2.9751, + "step": 604 + }, + { + "epoch": 0.4765655769988184, + "grad_norm": 5.039262771606445, + "learning_rate": 1.8030000000000001e-06, + "loss": 2.9853, + "step": 605 + }, + { + "epoch": 0.47735328869633714, + "grad_norm": 5.058529853820801, + "learning_rate": 1.806e-06, + "loss": 2.8853, + "step": 606 + }, + { + "epoch": 0.4781410003938559, + "grad_norm": 3.5510830879211426, + "learning_rate": 1.809e-06, + "loss": 2.9141, + "step": 607 + }, + { + "epoch": 
0.47892871209137455, + "grad_norm": 8.935461044311523, + "learning_rate": 1.8120000000000002e-06, + "loss": 2.9146, + "step": 608 + }, + { + "epoch": 0.4797164237888933, + "grad_norm": 5.6419596672058105, + "learning_rate": 1.815e-06, + "loss": 2.8788, + "step": 609 + }, + { + "epoch": 0.48050413548641197, + "grad_norm": 5.757681846618652, + "learning_rate": 1.818e-06, + "loss": 2.9023, + "step": 610 + }, + { + "epoch": 0.4812918471839307, + "grad_norm": 4.210809707641602, + "learning_rate": 1.821e-06, + "loss": 2.8734, + "step": 611 + }, + { + "epoch": 0.4820795588814494, + "grad_norm": 5.079986095428467, + "learning_rate": 1.824e-06, + "loss": 2.8664, + "step": 612 + }, + { + "epoch": 0.4828672705789681, + "grad_norm": 6.335360527038574, + "learning_rate": 1.827e-06, + "loss": 2.8911, + "step": 613 + }, + { + "epoch": 0.4836549822764868, + "grad_norm": 5.43281888961792, + "learning_rate": 1.83e-06, + "loss": 2.8776, + "step": 614 + }, + { + "epoch": 0.4844426939740055, + "grad_norm": 5.438951015472412, + "learning_rate": 1.833e-06, + "loss": 2.9309, + "step": 615 + }, + { + "epoch": 0.4852304056715242, + "grad_norm": 12.708145141601562, + "learning_rate": 1.8359999999999999e-06, + "loss": 2.8588, + "step": 616 + }, + { + "epoch": 0.48601811736904293, + "grad_norm": 12.37439250946045, + "learning_rate": 1.839e-06, + "loss": 2.8872, + "step": 617 + }, + { + "epoch": 0.4868058290665616, + "grad_norm": 9.357802391052246, + "learning_rate": 1.8420000000000001e-06, + "loss": 2.8823, + "step": 618 + }, + { + "epoch": 0.48759354076408035, + "grad_norm": 3.517868995666504, + "learning_rate": 1.8450000000000001e-06, + "loss": 2.8898, + "step": 619 + }, + { + "epoch": 0.4883812524615991, + "grad_norm": 4.747567653656006, + "learning_rate": 1.8480000000000001e-06, + "loss": 2.8714, + "step": 620 + }, + { + "epoch": 0.48916896415911776, + "grad_norm": 11.26347827911377, + "learning_rate": 1.851e-06, + "loss": 2.8806, + "step": 621 + }, + { + "epoch": 0.4899566758566365, + "grad_norm": 3.921535015106201, + "learning_rate": 1.8540000000000002e-06, + "loss": 2.8925, + "step": 622 + }, + { + "epoch": 0.49074438755415517, + "grad_norm": 8.622675895690918, + "learning_rate": 1.857e-06, + "loss": 2.8845, + "step": 623 + }, + { + "epoch": 0.4915320992516739, + "grad_norm": 6.823661804199219, + "learning_rate": 1.86e-06, + "loss": 2.8921, + "step": 624 + }, + { + "epoch": 0.4923198109491926, + "grad_norm": 7.1662678718566895, + "learning_rate": 1.8630000000000002e-06, + "loss": 2.8729, + "step": 625 + }, + { + "epoch": 0.4931075226467113, + "grad_norm": 7.455286026000977, + "learning_rate": 1.866e-06, + "loss": 2.8917, + "step": 626 + }, + { + "epoch": 0.49389523434423, + "grad_norm": 5.859024524688721, + "learning_rate": 1.869e-06, + "loss": 2.8763, + "step": 627 + }, + { + "epoch": 0.49468294604174873, + "grad_norm": 11.08643627166748, + "learning_rate": 1.872e-06, + "loss": 2.9097, + "step": 628 + }, + { + "epoch": 0.4954706577392674, + "grad_norm": 7.339168548583984, + "learning_rate": 1.875e-06, + "loss": 2.868, + "step": 629 + }, + { + "epoch": 0.49625836943678614, + "grad_norm": 13.947766304016113, + "learning_rate": 1.878e-06, + "loss": 2.8846, + "step": 630 + }, + { + "epoch": 0.4970460811343048, + "grad_norm": 8.443973541259766, + "learning_rate": 1.8810000000000003e-06, + "loss": 2.9037, + "step": 631 + }, + { + "epoch": 0.49783379283182355, + "grad_norm": 7.051575660705566, + "learning_rate": 1.8839999999999999e-06, + "loss": 2.8785, + "step": 632 + }, + { + "epoch": 0.4986215045293423, + 
"grad_norm": 3.4976377487182617, + "learning_rate": 1.8869999999999999e-06, + "loss": 2.8995, + "step": 633 + }, + { + "epoch": 0.49940921622686096, + "grad_norm": 14.746891975402832, + "learning_rate": 1.8900000000000001e-06, + "loss": 2.8802, + "step": 634 + }, + { + "epoch": 0.5001969279243796, + "grad_norm": 10.462647438049316, + "learning_rate": 1.8930000000000001e-06, + "loss": 2.8882, + "step": 635 + }, + { + "epoch": 0.5009846396218984, + "grad_norm": 6.567530155181885, + "learning_rate": 1.8960000000000001e-06, + "loss": 2.942, + "step": 636 + }, + { + "epoch": 0.5017723513194171, + "grad_norm": 9.316963195800781, + "learning_rate": 1.899e-06, + "loss": 2.9077, + "step": 637 + }, + { + "epoch": 0.5025600630169358, + "grad_norm": 5.960690021514893, + "learning_rate": 1.902e-06, + "loss": 2.9047, + "step": 638 + }, + { + "epoch": 0.5033477747144545, + "grad_norm": 6.092120170593262, + "learning_rate": 1.905e-06, + "loss": 2.8749, + "step": 639 + }, + { + "epoch": 0.5041354864119733, + "grad_norm": 11.290860176086426, + "learning_rate": 1.908e-06, + "loss": 2.9413, + "step": 640 + }, + { + "epoch": 0.5049231981094919, + "grad_norm": 8.951945304870605, + "learning_rate": 1.9110000000000004e-06, + "loss": 2.8523, + "step": 641 + }, + { + "epoch": 0.5057109098070106, + "grad_norm": 5.414577007293701, + "learning_rate": 1.914e-06, + "loss": 2.9175, + "step": 642 + }, + { + "epoch": 0.5064986215045293, + "grad_norm": 8.224546432495117, + "learning_rate": 1.917e-06, + "loss": 2.8617, + "step": 643 + }, + { + "epoch": 0.5072863332020481, + "grad_norm": 4.7914862632751465, + "learning_rate": 1.9200000000000003e-06, + "loss": 2.8971, + "step": 644 + }, + { + "epoch": 0.5080740448995668, + "grad_norm": 15.83332633972168, + "learning_rate": 1.923e-06, + "loss": 2.8917, + "step": 645 + }, + { + "epoch": 0.5088617565970854, + "grad_norm": 6.276189804077148, + "learning_rate": 1.926e-06, + "loss": 2.8498, + "step": 646 + }, + { + "epoch": 0.5096494682946042, + "grad_norm": 7.166374206542969, + "learning_rate": 1.929e-06, + "loss": 2.9195, + "step": 647 + }, + { + "epoch": 0.5104371799921229, + "grad_norm": 8.013594627380371, + "learning_rate": 1.932e-06, + "loss": 2.9121, + "step": 648 + }, + { + "epoch": 0.5112248916896416, + "grad_norm": 7.00277042388916, + "learning_rate": 1.935e-06, + "loss": 2.864, + "step": 649 + }, + { + "epoch": 0.5120126033871603, + "grad_norm": 4.1169562339782715, + "learning_rate": 1.9380000000000003e-06, + "loss": 2.8816, + "step": 650 + }, + { + "epoch": 0.512800315084679, + "grad_norm": 7.136220932006836, + "learning_rate": 1.9409999999999997e-06, + "loss": 3.0224, + "step": 651 + }, + { + "epoch": 0.5135880267821977, + "grad_norm": 6.829476833343506, + "learning_rate": 1.944e-06, + "loss": 2.9536, + "step": 652 + }, + { + "epoch": 0.5143757384797164, + "grad_norm": 4.9768877029418945, + "learning_rate": 1.947e-06, + "loss": 2.9651, + "step": 653 + }, + { + "epoch": 0.5151634501772351, + "grad_norm": 3.683809757232666, + "learning_rate": 1.95e-06, + "loss": 2.9226, + "step": 654 + }, + { + "epoch": 0.5159511618747539, + "grad_norm": 10.050484657287598, + "learning_rate": 1.953e-06, + "loss": 2.9443, + "step": 655 + }, + { + "epoch": 0.5167388735722726, + "grad_norm": 14.102240562438965, + "learning_rate": 1.956e-06, + "loss": 2.9569, + "step": 656 + }, + { + "epoch": 0.5175265852697912, + "grad_norm": 8.555575370788574, + "learning_rate": 1.959e-06, + "loss": 2.9315, + "step": 657 + }, + { + "epoch": 0.51831429696731, + "grad_norm": 10.728520393371582, + 
"learning_rate": 1.962e-06, + "loss": 2.9065, + "step": 658 + }, + { + "epoch": 0.5191020086648287, + "grad_norm": 5.401296615600586, + "learning_rate": 1.9650000000000002e-06, + "loss": 2.8769, + "step": 659 + }, + { + "epoch": 0.5198897203623474, + "grad_norm": 6.646932601928711, + "learning_rate": 1.968e-06, + "loss": 2.8817, + "step": 660 + }, + { + "epoch": 0.520677432059866, + "grad_norm": 15.059279441833496, + "learning_rate": 1.971e-06, + "loss": 2.8936, + "step": 661 + }, + { + "epoch": 0.5214651437573848, + "grad_norm": 7.535762786865234, + "learning_rate": 1.974e-06, + "loss": 2.9099, + "step": 662 + }, + { + "epoch": 0.5222528554549035, + "grad_norm": 5.606179237365723, + "learning_rate": 1.977e-06, + "loss": 2.8702, + "step": 663 + }, + { + "epoch": 0.5230405671524222, + "grad_norm": 6.90221643447876, + "learning_rate": 1.98e-06, + "loss": 2.8721, + "step": 664 + }, + { + "epoch": 0.5238282788499409, + "grad_norm": 7.724842548370361, + "learning_rate": 1.9830000000000003e-06, + "loss": 2.8602, + "step": 665 + }, + { + "epoch": 0.5246159905474597, + "grad_norm": 7.838924884796143, + "learning_rate": 1.9859999999999997e-06, + "loss": 2.8646, + "step": 666 + }, + { + "epoch": 0.5254037022449783, + "grad_norm": 5.319076061248779, + "learning_rate": 1.989e-06, + "loss": 2.8891, + "step": 667 + }, + { + "epoch": 0.526191413942497, + "grad_norm": 6.2576212882995605, + "learning_rate": 1.992e-06, + "loss": 2.8698, + "step": 668 + }, + { + "epoch": 0.5269791256400157, + "grad_norm": 7.276239395141602, + "learning_rate": 1.995e-06, + "loss": 2.8774, + "step": 669 + }, + { + "epoch": 0.5277668373375345, + "grad_norm": 5.378846168518066, + "learning_rate": 1.998e-06, + "loss": 2.8837, + "step": 670 + }, + { + "epoch": 0.5285545490350532, + "grad_norm": 9.823685646057129, + "learning_rate": 2.001e-06, + "loss": 2.8862, + "step": 671 + }, + { + "epoch": 0.5293422607325718, + "grad_norm": 21.40079116821289, + "learning_rate": 2.004e-06, + "loss": 2.8516, + "step": 672 + }, + { + "epoch": 0.5301299724300906, + "grad_norm": 14.23735237121582, + "learning_rate": 2.007e-06, + "loss": 2.848, + "step": 673 + }, + { + "epoch": 0.5309176841276093, + "grad_norm": 3.8338639736175537, + "learning_rate": 2.0100000000000002e-06, + "loss": 2.8587, + "step": 674 + }, + { + "epoch": 0.531705395825128, + "grad_norm": 4.297427177429199, + "learning_rate": 2.0130000000000005e-06, + "loss": 2.8758, + "step": 675 + }, + { + "epoch": 0.5324931075226467, + "grad_norm": 5.852993488311768, + "learning_rate": 2.016e-06, + "loss": 2.8797, + "step": 676 + }, + { + "epoch": 0.5332808192201655, + "grad_norm": 99.50304412841797, + "learning_rate": 2.019e-06, + "loss": 2.8881, + "step": 677 + }, + { + "epoch": 0.5340685309176841, + "grad_norm": 4.5439605712890625, + "learning_rate": 2.0220000000000003e-06, + "loss": 2.8581, + "step": 678 + }, + { + "epoch": 0.5348562426152028, + "grad_norm": 6.661430358886719, + "learning_rate": 2.025e-06, + "loss": 2.8515, + "step": 679 + }, + { + "epoch": 0.5356439543127215, + "grad_norm": NaN, + "learning_rate": 2.025e-06, + "loss": 2.8735, + "step": 680 + }, + { + "epoch": 0.5364316660102403, + "grad_norm": 8.937329292297363, + "learning_rate": 2.028e-06, + "loss": 2.854, + "step": 681 + }, + { + "epoch": 0.537219377707759, + "grad_norm": 10.572710037231445, + "learning_rate": 2.031e-06, + "loss": 2.8237, + "step": 682 + }, + { + "epoch": 0.5380070894052776, + "grad_norm": 5.516214370727539, + "learning_rate": 2.034e-06, + "loss": 2.8565, + "step": 683 + }, + { + "epoch": 
0.5387948011027964, + "grad_norm": 12.032327651977539, + "learning_rate": 2.037e-06, + "loss": 2.862, + "step": 684 + }, + { + "epoch": 0.5395825128003151, + "grad_norm": 4.675620079040527, + "learning_rate": 2.0400000000000004e-06, + "loss": 2.8675, + "step": 685 + }, + { + "epoch": 0.5403702244978338, + "grad_norm": 5.928946495056152, + "learning_rate": 2.0429999999999998e-06, + "loss": 2.847, + "step": 686 + }, + { + "epoch": 0.5411579361953525, + "grad_norm": 6.6510701179504395, + "learning_rate": 2.046e-06, + "loss": 2.8798, + "step": 687 + }, + { + "epoch": 0.5419456478928713, + "grad_norm": 5.8854217529296875, + "learning_rate": 2.049e-06, + "loss": 2.8409, + "step": 688 + }, + { + "epoch": 0.5427333595903899, + "grad_norm": 9.621444702148438, + "learning_rate": 2.052e-06, + "loss": 2.8487, + "step": 689 + }, + { + "epoch": 0.5435210712879086, + "grad_norm": 5.803048610687256, + "learning_rate": 2.0550000000000002e-06, + "loss": 2.8806, + "step": 690 + }, + { + "epoch": 0.5443087829854273, + "grad_norm": 7.4785332679748535, + "learning_rate": 2.058e-06, + "loss": 2.8454, + "step": 691 + }, + { + "epoch": 0.5450964946829461, + "grad_norm": 3.604034662246704, + "learning_rate": 2.061e-06, + "loss": 2.8868, + "step": 692 + }, + { + "epoch": 0.5458842063804648, + "grad_norm": 6.985630035400391, + "learning_rate": 2.064e-06, + "loss": 2.8404, + "step": 693 + }, + { + "epoch": 0.5466719180779834, + "grad_norm": 13.945947647094727, + "learning_rate": 2.0670000000000003e-06, + "loss": 2.8404, + "step": 694 + }, + { + "epoch": 0.5474596297755022, + "grad_norm": 14.329117774963379, + "learning_rate": 2.07e-06, + "loss": 2.8506, + "step": 695 + }, + { + "epoch": 0.5482473414730209, + "grad_norm": 8.316871643066406, + "learning_rate": 2.073e-06, + "loss": 2.8313, + "step": 696 + }, + { + "epoch": 0.5490350531705396, + "grad_norm": 15.510714530944824, + "learning_rate": 2.076e-06, + "loss": 2.8875, + "step": 697 + }, + { + "epoch": 0.5498227648680583, + "grad_norm": 9.466656684875488, + "learning_rate": 2.079e-06, + "loss": 2.9177, + "step": 698 + }, + { + "epoch": 0.550610476565577, + "grad_norm": 9.893726348876953, + "learning_rate": 2.082e-06, + "loss": 2.8709, + "step": 699 + }, + { + "epoch": 0.5513981882630957, + "grad_norm": 4.60153865814209, + "learning_rate": 2.0850000000000004e-06, + "loss": 2.9101, + "step": 700 + }, + { + "epoch": 0.5521858999606144, + "grad_norm": 4.687081336975098, + "learning_rate": 2.0879999999999997e-06, + "loss": 3.0386, + "step": 701 + }, + { + "epoch": 0.5529736116581331, + "grad_norm": 3.6009700298309326, + "learning_rate": 2.091e-06, + "loss": 2.9353, + "step": 702 + }, + { + "epoch": 0.5537613233556519, + "grad_norm": 4.916120529174805, + "learning_rate": 2.094e-06, + "loss": 2.9679, + "step": 703 + }, + { + "epoch": 0.5545490350531705, + "grad_norm": 4.996851921081543, + "learning_rate": 2.097e-06, + "loss": 2.928, + "step": 704 + }, + { + "epoch": 0.5553367467506892, + "grad_norm": 4.834575176239014, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.9153, + "step": 705 + }, + { + "epoch": 0.5561244584482079, + "grad_norm": 4.9449639320373535, + "learning_rate": 2.103e-06, + "loss": 2.8984, + "step": 706 + }, + { + "epoch": 0.5569121701457267, + "grad_norm": 6.858949661254883, + "learning_rate": 2.106e-06, + "loss": 2.8734, + "step": 707 + }, + { + "epoch": 0.5576998818432454, + "grad_norm": 8.15453052520752, + "learning_rate": 2.109e-06, + "loss": 2.8611, + "step": 708 + }, + { + "epoch": 0.558487593540764, + "grad_norm": 5.372587203979492, + 
"learning_rate": 2.1120000000000003e-06, + "loss": 2.8806, + "step": 709 + }, + { + "epoch": 0.5592753052382828, + "grad_norm": 8.374645233154297, + "learning_rate": 2.1149999999999997e-06, + "loss": 2.8689, + "step": 710 + }, + { + "epoch": 0.5600630169358015, + "grad_norm": 9.488783836364746, + "learning_rate": 2.118e-06, + "loss": 2.8685, + "step": 711 + }, + { + "epoch": 0.5608507286333202, + "grad_norm": 5.85254430770874, + "learning_rate": 2.121e-06, + "loss": 2.8971, + "step": 712 + }, + { + "epoch": 0.5616384403308389, + "grad_norm": 3.8994786739349365, + "learning_rate": 2.124e-06, + "loss": 2.8696, + "step": 713 + }, + { + "epoch": 0.5624261520283577, + "grad_norm": 4.133752822875977, + "learning_rate": 2.127e-06, + "loss": 2.826, + "step": 714 + }, + { + "epoch": 0.5632138637258763, + "grad_norm": 5.342362880706787, + "learning_rate": 2.13e-06, + "loss": 2.854, + "step": 715 + }, + { + "epoch": 0.564001575423395, + "grad_norm": 7.272205829620361, + "learning_rate": 2.133e-06, + "loss": 2.8524, + "step": 716 + }, + { + "epoch": 0.5647892871209137, + "grad_norm": 4.9538397789001465, + "learning_rate": 2.136e-06, + "loss": 2.8249, + "step": 717 + }, + { + "epoch": 0.5655769988184325, + "grad_norm": 7.379202365875244, + "learning_rate": 2.139e-06, + "loss": 2.8703, + "step": 718 + }, + { + "epoch": 0.5663647105159512, + "grad_norm": 7.3580803871154785, + "learning_rate": 2.1420000000000004e-06, + "loss": 2.8554, + "step": 719 + }, + { + "epoch": 0.5671524222134698, + "grad_norm": 9.854637145996094, + "learning_rate": 2.145e-06, + "loss": 2.853, + "step": 720 + }, + { + "epoch": 0.5679401339109886, + "grad_norm": 21.05218505859375, + "learning_rate": 2.148e-06, + "loss": 2.8023, + "step": 721 + }, + { + "epoch": 0.5687278456085073, + "grad_norm": 7.692500114440918, + "learning_rate": 2.1510000000000002e-06, + "loss": 2.8701, + "step": 722 + }, + { + "epoch": 0.569515557306026, + "grad_norm": 4.691758632659912, + "learning_rate": 2.154e-06, + "loss": 2.8607, + "step": 723 + }, + { + "epoch": 0.5703032690035447, + "grad_norm": 4.079660892486572, + "learning_rate": 2.1570000000000003e-06, + "loss": 2.8428, + "step": 724 + }, + { + "epoch": 0.5710909807010635, + "grad_norm": 5.782022953033447, + "learning_rate": 2.16e-06, + "loss": 2.8538, + "step": 725 + }, + { + "epoch": 0.5718786923985821, + "grad_norm": 4.623614311218262, + "learning_rate": 2.163e-06, + "loss": 2.8732, + "step": 726 + }, + { + "epoch": 0.5726664040961008, + "grad_norm": 9.634215354919434, + "learning_rate": 2.166e-06, + "loss": 2.8613, + "step": 727 + }, + { + "epoch": 0.5734541157936195, + "grad_norm": 5.7070722579956055, + "learning_rate": 2.1690000000000003e-06, + "loss": 2.8033, + "step": 728 + }, + { + "epoch": 0.5742418274911383, + "grad_norm": 6.7316813468933105, + "learning_rate": 2.172e-06, + "loss": 2.8796, + "step": 729 + }, + { + "epoch": 0.575029539188657, + "grad_norm": 10.98059368133545, + "learning_rate": 2.175e-06, + "loss": 2.9108, + "step": 730 + }, + { + "epoch": 0.5758172508861756, + "grad_norm": 19.880403518676758, + "learning_rate": 2.178e-06, + "loss": 2.8557, + "step": 731 + }, + { + "epoch": 0.5766049625836943, + "grad_norm": 7.932878494262695, + "learning_rate": 2.181e-06, + "loss": 2.8233, + "step": 732 + }, + { + "epoch": 0.5773926742812131, + "grad_norm": 26.12215805053711, + "learning_rate": 2.184e-06, + "loss": 2.8776, + "step": 733 + }, + { + "epoch": 0.5781803859787318, + "grad_norm": 13.937017440795898, + "learning_rate": 2.1870000000000004e-06, + "loss": 2.8315, + "step": 734 + }, 
+ { + "epoch": 0.5789680976762505, + "grad_norm": 8.766471862792969, + "learning_rate": 2.1899999999999998e-06, + "loss": 2.8671, + "step": 735 + }, + { + "epoch": 0.5797558093737692, + "grad_norm": 11.888102531433105, + "learning_rate": 2.193e-06, + "loss": 2.8647, + "step": 736 + }, + { + "epoch": 0.5805435210712879, + "grad_norm": 9.377480506896973, + "learning_rate": 2.1960000000000002e-06, + "loss": 2.8612, + "step": 737 + }, + { + "epoch": 0.5813312327688066, + "grad_norm": 6.994541645050049, + "learning_rate": 2.199e-06, + "loss": 2.8428, + "step": 738 + }, + { + "epoch": 0.5821189444663253, + "grad_norm": 6.701540946960449, + "learning_rate": 2.2020000000000003e-06, + "loss": 2.8284, + "step": 739 + }, + { + "epoch": 0.5829066561638441, + "grad_norm": 5.453619956970215, + "learning_rate": 2.205e-06, + "loss": 2.8739, + "step": 740 + }, + { + "epoch": 0.5836943678613628, + "grad_norm": 9.64240837097168, + "learning_rate": 2.208e-06, + "loss": 2.831, + "step": 741 + }, + { + "epoch": 0.5844820795588814, + "grad_norm": 26.42766761779785, + "learning_rate": 2.211e-06, + "loss": 2.8304, + "step": 742 + }, + { + "epoch": 0.5852697912564001, + "grad_norm": 8.842597961425781, + "learning_rate": 2.2140000000000003e-06, + "loss": 2.8108, + "step": 743 + }, + { + "epoch": 0.5860575029539189, + "grad_norm": 7.504437446594238, + "learning_rate": 2.2169999999999997e-06, + "loss": 2.8247, + "step": 744 + }, + { + "epoch": 0.5868452146514376, + "grad_norm": 16.623641967773438, + "learning_rate": 2.22e-06, + "loss": 2.8739, + "step": 745 + }, + { + "epoch": 0.5876329263489563, + "grad_norm": 4.994149208068848, + "learning_rate": 2.223e-06, + "loss": 2.8525, + "step": 746 + }, + { + "epoch": 0.588420638046475, + "grad_norm": 4.573817729949951, + "learning_rate": 2.226e-06, + "loss": 2.8435, + "step": 747 + }, + { + "epoch": 0.5892083497439937, + "grad_norm": 12.632636070251465, + "learning_rate": 2.229e-06, + "loss": 2.8031, + "step": 748 + }, + { + "epoch": 0.5899960614415124, + "grad_norm": 33.94064712524414, + "learning_rate": 2.232e-06, + "loss": 2.8592, + "step": 749 + }, + { + "epoch": 0.5907837731390311, + "grad_norm": 7.081697463989258, + "learning_rate": 2.2349999999999998e-06, + "loss": 2.9044, + "step": 750 + }, + { + "epoch": 0.5915714848365499, + "grad_norm": 10.216980934143066, + "learning_rate": 2.238e-06, + "loss": 3.0028, + "step": 751 + }, + { + "epoch": 0.5923591965340685, + "grad_norm": 10.8344144821167, + "learning_rate": 2.2410000000000002e-06, + "loss": 2.9335, + "step": 752 + }, + { + "epoch": 0.5931469082315872, + "grad_norm": 5.619133472442627, + "learning_rate": 2.244e-06, + "loss": 2.9123, + "step": 753 + }, + { + "epoch": 0.5939346199291059, + "grad_norm": 8.002742767333984, + "learning_rate": 2.247e-06, + "loss": 2.9329, + "step": 754 + }, + { + "epoch": 0.5947223316266247, + "grad_norm": 6.635711669921875, + "learning_rate": 2.25e-06, + "loss": 2.9188, + "step": 755 + }, + { + "epoch": 0.5955100433241434, + "grad_norm": 4.261734962463379, + "learning_rate": 2.253e-06, + "loss": 2.8557, + "step": 756 + }, + { + "epoch": 0.596297755021662, + "grad_norm": 8.110092163085938, + "learning_rate": 2.256e-06, + "loss": 2.8448, + "step": 757 + }, + { + "epoch": 0.5970854667191808, + "grad_norm": 3.4941134452819824, + "learning_rate": 2.2590000000000003e-06, + "loss": 2.8453, + "step": 758 + }, + { + "epoch": 0.5978731784166995, + "grad_norm": 4.314650535583496, + "learning_rate": 2.262e-06, + "loss": 2.8281, + "step": 759 + }, + { + "epoch": 0.5986608901142182, + "grad_norm": 
5.90473747253418, + "learning_rate": 2.265e-06, + "loss": 2.8439, + "step": 760 + }, + { + "epoch": 0.5994486018117369, + "grad_norm": 7.077005386352539, + "learning_rate": 2.268e-06, + "loss": 2.8595, + "step": 761 + }, + { + "epoch": 0.6002363135092557, + "grad_norm": 9.909887313842773, + "learning_rate": 2.2710000000000004e-06, + "loss": 2.7762, + "step": 762 + }, + { + "epoch": 0.6010240252067743, + "grad_norm": 4.287919998168945, + "learning_rate": 2.274e-06, + "loss": 2.8058, + "step": 763 + }, + { + "epoch": 0.601811736904293, + "grad_norm": 10.988466262817383, + "learning_rate": 2.277e-06, + "loss": 2.8213, + "step": 764 + }, + { + "epoch": 0.6025994486018117, + "grad_norm": 4.901302814483643, + "learning_rate": 2.28e-06, + "loss": 2.805, + "step": 765 + }, + { + "epoch": 0.6033871602993305, + "grad_norm": 15.819864273071289, + "learning_rate": 2.283e-06, + "loss": 2.8763, + "step": 766 + }, + { + "epoch": 0.6041748719968492, + "grad_norm": 4.5066328048706055, + "learning_rate": 2.2860000000000002e-06, + "loss": 2.7669, + "step": 767 + }, + { + "epoch": 0.6049625836943678, + "grad_norm": 5.915872097015381, + "learning_rate": 2.2890000000000004e-06, + "loss": 2.787, + "step": 768 + }, + { + "epoch": 0.6057502953918865, + "grad_norm": 5.932315349578857, + "learning_rate": 2.292e-06, + "loss": 2.8347, + "step": 769 + }, + { + "epoch": 0.6065380070894053, + "grad_norm": 7.7249860763549805, + "learning_rate": 2.295e-06, + "loss": 2.8203, + "step": 770 + }, + { + "epoch": 0.607325718786924, + "grad_norm": 24.64418601989746, + "learning_rate": 2.2980000000000003e-06, + "loss": 2.7896, + "step": 771 + }, + { + "epoch": 0.6081134304844427, + "grad_norm": 8.233770370483398, + "learning_rate": 2.301e-06, + "loss": 2.8216, + "step": 772 + }, + { + "epoch": 0.6089011421819615, + "grad_norm": 5.895259380340576, + "learning_rate": 2.304e-06, + "loss": 2.8754, + "step": 773 + }, + { + "epoch": 0.6096888538794801, + "grad_norm": 11.134256362915039, + "learning_rate": 2.307e-06, + "loss": 2.8302, + "step": 774 + }, + { + "epoch": 0.6104765655769988, + "grad_norm": 4.455706596374512, + "learning_rate": 2.31e-06, + "loss": 2.8328, + "step": 775 + }, + { + "epoch": 0.6112642772745175, + "grad_norm": 6.043004512786865, + "learning_rate": 2.313e-06, + "loss": 2.8467, + "step": 776 + }, + { + "epoch": 0.6120519889720363, + "grad_norm": 5.738504409790039, + "learning_rate": 2.3160000000000004e-06, + "loss": 2.7751, + "step": 777 + }, + { + "epoch": 0.612839700669555, + "grad_norm": 10.544304847717285, + "learning_rate": 2.3189999999999997e-06, + "loss": 2.8156, + "step": 778 + }, + { + "epoch": 0.6136274123670736, + "grad_norm": 5.155287742614746, + "learning_rate": 2.322e-06, + "loss": 2.8122, + "step": 779 + }, + { + "epoch": 0.6144151240645923, + "grad_norm": 4.9154791831970215, + "learning_rate": 2.325e-06, + "loss": 2.8159, + "step": 780 + }, + { + "epoch": 0.6152028357621111, + "grad_norm": 3.673779249191284, + "learning_rate": 2.328e-06, + "loss": 2.7928, + "step": 781 + }, + { + "epoch": 0.6159905474596298, + "grad_norm": 13.24332332611084, + "learning_rate": 2.3310000000000002e-06, + "loss": 2.808, + "step": 782 + }, + { + "epoch": 0.6167782591571485, + "grad_norm": 5.738521099090576, + "learning_rate": 2.334e-06, + "loss": 2.7831, + "step": 783 + }, + { + "epoch": 0.6175659708546672, + "grad_norm": 7.271434783935547, + "learning_rate": 2.337e-06, + "loss": 2.7958, + "step": 784 + }, + { + "epoch": 0.6183536825521859, + "grad_norm": 4.814392566680908, + "learning_rate": 2.34e-06, + "loss": 2.7875, 
+ "step": 785 + }, + { + "epoch": 0.6191413942497046, + "grad_norm": 5.820189476013184, + "learning_rate": 2.3430000000000003e-06, + "loss": 2.754, + "step": 786 + }, + { + "epoch": 0.6199291059472233, + "grad_norm": 13.24390983581543, + "learning_rate": 2.346e-06, + "loss": 2.7886, + "step": 787 + }, + { + "epoch": 0.6207168176447421, + "grad_norm": 5.787917613983154, + "learning_rate": 2.349e-06, + "loss": 2.7796, + "step": 788 + }, + { + "epoch": 0.6215045293422607, + "grad_norm": 7.650834083557129, + "learning_rate": 2.352e-06, + "loss": 2.7697, + "step": 789 + }, + { + "epoch": 0.6222922410397794, + "grad_norm": 4.177215099334717, + "learning_rate": 2.355e-06, + "loss": 2.7086, + "step": 790 + }, + { + "epoch": 0.6230799527372981, + "grad_norm": 6.4744343757629395, + "learning_rate": 2.358e-06, + "loss": 2.7426, + "step": 791 + }, + { + "epoch": 0.6238676644348169, + "grad_norm": 4.7306928634643555, + "learning_rate": 2.3610000000000003e-06, + "loss": 2.8243, + "step": 792 + }, + { + "epoch": 0.6246553761323356, + "grad_norm": 18.65669822692871, + "learning_rate": 2.3639999999999997e-06, + "loss": 2.7921, + "step": 793 + }, + { + "epoch": 0.6254430878298542, + "grad_norm": 3.6662704944610596, + "learning_rate": 2.367e-06, + "loss": 2.8271, + "step": 794 + }, + { + "epoch": 0.6262307995273729, + "grad_norm": 10.99417495727539, + "learning_rate": 2.37e-06, + "loss": 2.8152, + "step": 795 + }, + { + "epoch": 0.6270185112248917, + "grad_norm": 6.267129421234131, + "learning_rate": 2.373e-06, + "loss": 2.7929, + "step": 796 + }, + { + "epoch": 0.6278062229224104, + "grad_norm": 4.155924320220947, + "learning_rate": 2.376e-06, + "loss": 2.7987, + "step": 797 + }, + { + "epoch": 0.6285939346199291, + "grad_norm": 5.971964359283447, + "learning_rate": 2.379e-06, + "loss": 2.7693, + "step": 798 + }, + { + "epoch": 0.6293816463174479, + "grad_norm": 9.999753952026367, + "learning_rate": 2.382e-06, + "loss": 2.7772, + "step": 799 + }, + { + "epoch": 0.6301693580149665, + "grad_norm": 4.541004180908203, + "learning_rate": 2.385e-06, + "loss": 2.7762, + "step": 800 + }, + { + "epoch": 0.6309570697124852, + "grad_norm": 12.461874008178711, + "learning_rate": 2.3880000000000003e-06, + "loss": 2.9569, + "step": 801 + }, + { + "epoch": 0.6317447814100039, + "grad_norm": 6.426452159881592, + "learning_rate": 2.391e-06, + "loss": 2.9137, + "step": 802 + }, + { + "epoch": 0.6325324931075227, + "grad_norm": 3.3654088973999023, + "learning_rate": 2.394e-06, + "loss": 2.8745, + "step": 803 + }, + { + "epoch": 0.6333202048050414, + "grad_norm": 4.380138397216797, + "learning_rate": 2.397e-06, + "loss": 2.861, + "step": 804 + }, + { + "epoch": 0.63410791650256, + "grad_norm": 5.910552024841309, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.8437, + "step": 805 + }, + { + "epoch": 0.6348956282000787, + "grad_norm": 12.019161224365234, + "learning_rate": 2.403e-06, + "loss": 2.8096, + "step": 806 + }, + { + "epoch": 0.6356833398975975, + "grad_norm": 5.579248905181885, + "learning_rate": 2.406e-06, + "loss": 2.7705, + "step": 807 + }, + { + "epoch": 0.6364710515951162, + "grad_norm": 4.317277908325195, + "learning_rate": 2.409e-06, + "loss": 2.7601, + "step": 808 + }, + { + "epoch": 0.6372587632926349, + "grad_norm": 4.391234397888184, + "learning_rate": 2.412e-06, + "loss": 2.7464, + "step": 809 + }, + { + "epoch": 0.6380464749901537, + "grad_norm": 5.695110321044922, + "learning_rate": 2.415e-06, + "loss": 2.7318, + "step": 810 + }, + { + "epoch": 0.6388341866876723, + "grad_norm": 5.645821571350098, 
+ "learning_rate": 2.4180000000000004e-06, + "loss": 2.7594, + "step": 811 + }, + { + "epoch": 0.639621898385191, + "grad_norm": 15.202424049377441, + "learning_rate": 2.4209999999999998e-06, + "loss": 2.7473, + "step": 812 + }, + { + "epoch": 0.6404096100827097, + "grad_norm": 7.115016937255859, + "learning_rate": 2.424e-06, + "loss": 2.8013, + "step": 813 + }, + { + "epoch": 0.6411973217802285, + "grad_norm": 4.909542083740234, + "learning_rate": 2.4270000000000002e-06, + "loss": 2.7614, + "step": 814 + }, + { + "epoch": 0.6419850334777472, + "grad_norm": 5.563431262969971, + "learning_rate": 2.43e-06, + "loss": 2.7585, + "step": 815 + }, + { + "epoch": 0.6427727451752658, + "grad_norm": 6.04931640625, + "learning_rate": 2.4330000000000003e-06, + "loss": 2.7369, + "step": 816 + }, + { + "epoch": 0.6435604568727845, + "grad_norm": 19.162864685058594, + "learning_rate": 2.436e-06, + "loss": 2.7367, + "step": 817 + }, + { + "epoch": 0.6443481685703033, + "grad_norm": 4.426506042480469, + "learning_rate": 2.439e-06, + "loss": 2.7201, + "step": 818 + }, + { + "epoch": 0.645135880267822, + "grad_norm": 7.10182523727417, + "learning_rate": 2.442e-06, + "loss": 2.7357, + "step": 819 + }, + { + "epoch": 0.6459235919653407, + "grad_norm": 5.579860210418701, + "learning_rate": 2.4450000000000003e-06, + "loss": 2.7534, + "step": 820 + }, + { + "epoch": 0.6467113036628593, + "grad_norm": 5.292620658874512, + "learning_rate": 2.448e-06, + "loss": 2.7379, + "step": 821 + }, + { + "epoch": 0.6474990153603781, + "grad_norm": 3.888976812362671, + "learning_rate": 2.451e-06, + "loss": 2.777, + "step": 822 + }, + { + "epoch": 0.6482867270578968, + "grad_norm": 6.729205131530762, + "learning_rate": 2.454e-06, + "loss": 2.7676, + "step": 823 + }, + { + "epoch": 0.6490744387554155, + "grad_norm": 8.74480152130127, + "learning_rate": 2.457e-06, + "loss": 2.7885, + "step": 824 + }, + { + "epoch": 0.6498621504529343, + "grad_norm": 5.220530033111572, + "learning_rate": 2.46e-06, + "loss": 2.7726, + "step": 825 + }, + { + "epoch": 0.650649862150453, + "grad_norm": 4.802467346191406, + "learning_rate": 2.4630000000000004e-06, + "loss": 2.7188, + "step": 826 + }, + { + "epoch": 0.6514375738479716, + "grad_norm": 5.095948696136475, + "learning_rate": 2.4659999999999998e-06, + "loss": 2.782, + "step": 827 + }, + { + "epoch": 0.6522252855454903, + "grad_norm": 6.239099979400635, + "learning_rate": 2.469e-06, + "loss": 2.7186, + "step": 828 + }, + { + "epoch": 0.6530129972430091, + "grad_norm": 2.9244749546051025, + "learning_rate": 2.4720000000000002e-06, + "loss": 2.7729, + "step": 829 + }, + { + "epoch": 0.6538007089405278, + "grad_norm": 9.764730453491211, + "learning_rate": 2.475e-06, + "loss": 2.7793, + "step": 830 + }, + { + "epoch": 0.6545884206380465, + "grad_norm": 4.770854949951172, + "learning_rate": 2.4780000000000002e-06, + "loss": 2.7659, + "step": 831 + }, + { + "epoch": 0.6553761323355651, + "grad_norm": 7.600482940673828, + "learning_rate": 2.481e-06, + "loss": 2.8746, + "step": 832 + }, + { + "epoch": 0.6561638440330839, + "grad_norm": 4.031045913696289, + "learning_rate": 2.484e-06, + "loss": 2.7193, + "step": 833 + }, + { + "epoch": 0.6569515557306026, + "grad_norm": 7.161046504974365, + "learning_rate": 2.487e-06, + "loss": 2.7621, + "step": 834 + }, + { + "epoch": 0.6577392674281213, + "grad_norm": 5.090322494506836, + "learning_rate": 2.4900000000000003e-06, + "loss": 2.7516, + "step": 835 + }, + { + "epoch": 0.6585269791256401, + "grad_norm": 4.827559947967529, + "learning_rate": 
2.4929999999999997e-06, + "loss": 2.7087, + "step": 836 + }, + { + "epoch": 0.6593146908231587, + "grad_norm": 4.857949733734131, + "learning_rate": 2.496e-06, + "loss": 2.7414, + "step": 837 + }, + { + "epoch": 0.6601024025206774, + "grad_norm": 6.309624671936035, + "learning_rate": 2.499e-06, + "loss": 2.7688, + "step": 838 + }, + { + "epoch": 0.6608901142181961, + "grad_norm": 4.706110954284668, + "learning_rate": 2.502e-06, + "loss": 2.7543, + "step": 839 + }, + { + "epoch": 0.6616778259157149, + "grad_norm": 4.942398548126221, + "learning_rate": 2.505e-06, + "loss": 2.7504, + "step": 840 + }, + { + "epoch": 0.6624655376132336, + "grad_norm": 6.5483479499816895, + "learning_rate": 2.508e-06, + "loss": 2.7883, + "step": 841 + }, + { + "epoch": 0.6632532493107522, + "grad_norm": 5.3630690574646, + "learning_rate": 2.5109999999999998e-06, + "loss": 2.7679, + "step": 842 + }, + { + "epoch": 0.6640409610082709, + "grad_norm": 4.537487506866455, + "learning_rate": 2.514e-06, + "loss": 2.7969, + "step": 843 + }, + { + "epoch": 0.6648286727057897, + "grad_norm": 7.507015705108643, + "learning_rate": 2.517e-06, + "loss": 2.7183, + "step": 844 + }, + { + "epoch": 0.6656163844033084, + "grad_norm": 5.378933429718018, + "learning_rate": 2.52e-06, + "loss": 2.7301, + "step": 845 + }, + { + "epoch": 0.6664040961008271, + "grad_norm": 3.144681453704834, + "learning_rate": 2.523e-06, + "loss": 2.7245, + "step": 846 + }, + { + "epoch": 0.6671918077983459, + "grad_norm": 6.10594367980957, + "learning_rate": 2.526e-06, + "loss": 2.7591, + "step": 847 + }, + { + "epoch": 0.6679795194958645, + "grad_norm": 9.208066940307617, + "learning_rate": 2.5290000000000003e-06, + "loss": 2.747, + "step": 848 + }, + { + "epoch": 0.6687672311933832, + "grad_norm": 4.90377140045166, + "learning_rate": 2.532e-06, + "loss": 2.7623, + "step": 849 + }, + { + "epoch": 0.6695549428909019, + "grad_norm": 3.9081766605377197, + "learning_rate": 2.5350000000000003e-06, + "loss": 2.7393, + "step": 850 + }, + { + "epoch": 0.6703426545884207, + "grad_norm": 5.882458209991455, + "learning_rate": 2.538e-06, + "loss": 2.9494, + "step": 851 + }, + { + "epoch": 0.6711303662859394, + "grad_norm": 4.277505397796631, + "learning_rate": 2.541e-06, + "loss": 2.8396, + "step": 852 + }, + { + "epoch": 0.671918077983458, + "grad_norm": 7.640064239501953, + "learning_rate": 2.544e-06, + "loss": 2.8631, + "step": 853 + }, + { + "epoch": 0.6727057896809767, + "grad_norm": 5.824072360992432, + "learning_rate": 2.5470000000000003e-06, + "loss": 2.799, + "step": 854 + }, + { + "epoch": 0.6734935013784955, + "grad_norm": 3.70347261428833, + "learning_rate": 2.55e-06, + "loss": 2.7459, + "step": 855 + }, + { + "epoch": 0.6742812130760142, + "grad_norm": 3.4522409439086914, + "learning_rate": 2.553e-06, + "loss": 2.71, + "step": 856 + }, + { + "epoch": 0.6750689247735329, + "grad_norm": 3.8990273475646973, + "learning_rate": 2.556e-06, + "loss": 2.6903, + "step": 857 + }, + { + "epoch": 0.6758566364710515, + "grad_norm": 2.2706780433654785, + "learning_rate": 2.559e-06, + "loss": 2.7187, + "step": 858 + }, + { + "epoch": 0.6766443481685703, + "grad_norm": 4.869923114776611, + "learning_rate": 2.562e-06, + "loss": 2.7088, + "step": 859 + }, + { + "epoch": 0.677432059866089, + "grad_norm": 4.3550004959106445, + "learning_rate": 2.5650000000000004e-06, + "loss": 2.7134, + "step": 860 + }, + { + "epoch": 0.6782197715636077, + "grad_norm": 2.999727249145508, + "learning_rate": 2.568e-06, + "loss": 2.6976, + "step": 861 + }, + { + "epoch": 0.6790074832611265, 
+ "grad_norm": 5.465621471405029, + "learning_rate": 2.571e-06, + "loss": 2.7196, + "step": 862 + }, + { + "epoch": 0.6797951949586452, + "grad_norm": 3.6364669799804688, + "learning_rate": 2.5740000000000003e-06, + "loss": 2.7572, + "step": 863 + }, + { + "epoch": 0.6805829066561638, + "grad_norm": 3.3633182048797607, + "learning_rate": 2.577e-06, + "loss": 2.6926, + "step": 864 + }, + { + "epoch": 0.6813706183536825, + "grad_norm": 3.398127317428589, + "learning_rate": 2.58e-06, + "loss": 2.6806, + "step": 865 + }, + { + "epoch": 0.6821583300512013, + "grad_norm": 7.115653038024902, + "learning_rate": 2.583e-06, + "loss": 2.7101, + "step": 866 + }, + { + "epoch": 0.68294604174872, + "grad_norm": 4.248654365539551, + "learning_rate": 2.586e-06, + "loss": 2.723, + "step": 867 + }, + { + "epoch": 0.6837337534462387, + "grad_norm": 2.7735235691070557, + "learning_rate": 2.589e-06, + "loss": 2.6921, + "step": 868 + }, + { + "epoch": 0.6845214651437573, + "grad_norm": 2.600982189178467, + "learning_rate": 2.5920000000000003e-06, + "loss": 2.6728, + "step": 869 + }, + { + "epoch": 0.6853091768412761, + "grad_norm": 3.555784225463867, + "learning_rate": 2.5949999999999997e-06, + "loss": 2.6862, + "step": 870 + }, + { + "epoch": 0.6860968885387948, + "grad_norm": 2.826633930206299, + "learning_rate": 2.598e-06, + "loss": 2.6588, + "step": 871 + }, + { + "epoch": 0.6868846002363135, + "grad_norm": 3.0642995834350586, + "learning_rate": 2.601e-06, + "loss": 2.6558, + "step": 872 + }, + { + "epoch": 0.6876723119338323, + "grad_norm": 11.33929443359375, + "learning_rate": 2.604e-06, + "loss": 2.7236, + "step": 873 + }, + { + "epoch": 0.688460023631351, + "grad_norm": 3.578444719314575, + "learning_rate": 2.607e-06, + "loss": 2.7176, + "step": 874 + }, + { + "epoch": 0.6892477353288696, + "grad_norm": 3.0572242736816406, + "learning_rate": 2.61e-06, + "loss": 2.7049, + "step": 875 + }, + { + "epoch": 0.6900354470263883, + "grad_norm": 3.767564535140991, + "learning_rate": 2.613e-06, + "loss": 2.6302, + "step": 876 + }, + { + "epoch": 0.6908231587239071, + "grad_norm": 5.0737409591674805, + "learning_rate": 2.616e-06, + "loss": 2.7326, + "step": 877 + }, + { + "epoch": 0.6916108704214258, + "grad_norm": 2.980607032775879, + "learning_rate": 2.6190000000000003e-06, + "loss": 2.7139, + "step": 878 + }, + { + "epoch": 0.6923985821189445, + "grad_norm": 3.3015060424804688, + "learning_rate": 2.622e-06, + "loss": 2.7427, + "step": 879 + }, + { + "epoch": 0.6931862938164631, + "grad_norm": 8.270029067993164, + "learning_rate": 2.625e-06, + "loss": 2.7397, + "step": 880 + }, + { + "epoch": 0.6939740055139819, + "grad_norm": 6.810901641845703, + "learning_rate": 2.628e-06, + "loss": 2.6775, + "step": 881 + }, + { + "epoch": 0.6947617172115006, + "grad_norm": 5.2713847160339355, + "learning_rate": 2.631e-06, + "loss": 2.7296, + "step": 882 + }, + { + "epoch": 0.6955494289090193, + "grad_norm": 6.353626251220703, + "learning_rate": 2.634e-06, + "loss": 2.7493, + "step": 883 + }, + { + "epoch": 0.696337140606538, + "grad_norm": 10.858621597290039, + "learning_rate": 2.6370000000000003e-06, + "loss": 2.7275, + "step": 884 + }, + { + "epoch": 0.6971248523040567, + "grad_norm": 7.564162254333496, + "learning_rate": 2.6399999999999997e-06, + "loss": 2.7104, + "step": 885 + }, + { + "epoch": 0.6979125640015754, + "grad_norm": 7.291218280792236, + "learning_rate": 2.643e-06, + "loss": 2.7182, + "step": 886 + }, + { + "epoch": 0.6987002756990941, + "grad_norm": 7.852723121643066, + "learning_rate": 2.646e-06, + "loss": 
2.7282, + "step": 887 + }, + { + "epoch": 0.6994879873966129, + "grad_norm": 11.244953155517578, + "learning_rate": 2.649e-06, + "loss": 2.7366, + "step": 888 + }, + { + "epoch": 0.7002756990941316, + "grad_norm": 6.892150402069092, + "learning_rate": 2.652e-06, + "loss": 2.7472, + "step": 889 + }, + { + "epoch": 0.7010634107916502, + "grad_norm": 12.440132141113281, + "learning_rate": 2.655e-06, + "loss": 2.8171, + "step": 890 + }, + { + "epoch": 0.7018511224891689, + "grad_norm": 3.6120615005493164, + "learning_rate": 2.6580000000000002e-06, + "loss": 2.7497, + "step": 891 + }, + { + "epoch": 0.7026388341866877, + "grad_norm": 3.2154650688171387, + "learning_rate": 2.661e-06, + "loss": 2.7481, + "step": 892 + }, + { + "epoch": 0.7034265458842064, + "grad_norm": 4.393897533416748, + "learning_rate": 2.6640000000000002e-06, + "loss": 2.7325, + "step": 893 + }, + { + "epoch": 0.7042142575817251, + "grad_norm": 6.41448974609375, + "learning_rate": 2.6670000000000005e-06, + "loss": 2.7088, + "step": 894 + }, + { + "epoch": 0.7050019692792437, + "grad_norm": 4.030684947967529, + "learning_rate": 2.67e-06, + "loss": 2.7059, + "step": 895 + }, + { + "epoch": 0.7057896809767625, + "grad_norm": 5.544601917266846, + "learning_rate": 2.673e-06, + "loss": 2.7061, + "step": 896 + }, + { + "epoch": 0.7065773926742812, + "grad_norm": 2.418160915374756, + "learning_rate": 2.6760000000000003e-06, + "loss": 2.6579, + "step": 897 + }, + { + "epoch": 0.7073651043717999, + "grad_norm": 3.112351655960083, + "learning_rate": 2.679e-06, + "loss": 2.6932, + "step": 898 + }, + { + "epoch": 0.7081528160693187, + "grad_norm": 3.5547261238098145, + "learning_rate": 2.682e-06, + "loss": 2.6978, + "step": 899 + }, + { + "epoch": 0.7089405277668374, + "grad_norm": 5.3939409255981445, + "learning_rate": 2.685e-06, + "loss": 2.7246, + "step": 900 + }, + { + "epoch": 0.709728239464356, + "grad_norm": 9.814820289611816, + "learning_rate": 2.688e-06, + "loss": 2.8772, + "step": 901 + }, + { + "epoch": 0.7105159511618747, + "grad_norm": 6.419727802276611, + "learning_rate": 2.691e-06, + "loss": 2.8097, + "step": 902 + }, + { + "epoch": 0.7113036628593935, + "grad_norm": 3.515667676925659, + "learning_rate": 2.6940000000000004e-06, + "loss": 2.8043, + "step": 903 + }, + { + "epoch": 0.7120913745569122, + "grad_norm": 6.008763313293457, + "learning_rate": 2.6969999999999998e-06, + "loss": 2.7516, + "step": 904 + }, + { + "epoch": 0.7128790862544309, + "grad_norm": 3.836956024169922, + "learning_rate": 2.7e-06, + "loss": 2.7521, + "step": 905 + }, + { + "epoch": 0.7136667979519495, + "grad_norm": 3.8367886543273926, + "learning_rate": 2.703e-06, + "loss": 2.7394, + "step": 906 + }, + { + "epoch": 0.7144545096494683, + "grad_norm": 4.147394180297852, + "learning_rate": 2.706e-06, + "loss": 2.6524, + "step": 907 + }, + { + "epoch": 0.715242221346987, + "grad_norm": 5.10467529296875, + "learning_rate": 2.7090000000000002e-06, + "loss": 2.6873, + "step": 908 + }, + { + "epoch": 0.7160299330445057, + "grad_norm": 3.036102771759033, + "learning_rate": 2.712e-06, + "loss": 2.6179, + "step": 909 + }, + { + "epoch": 0.7168176447420245, + "grad_norm": 5.30342960357666, + "learning_rate": 2.715e-06, + "loss": 2.6098, + "step": 910 + }, + { + "epoch": 0.7176053564395432, + "grad_norm": 5.201896667480469, + "learning_rate": 2.718e-06, + "loss": 2.6247, + "step": 911 + }, + { + "epoch": 0.7183930681370618, + "grad_norm": 2.5018019676208496, + "learning_rate": 2.7210000000000003e-06, + "loss": 2.6353, + "step": 912 + }, + { + "epoch": 
0.7191807798345805, + "grad_norm": 3.060737371444702, + "learning_rate": 2.724e-06, + "loss": 2.6266, + "step": 913 + }, + { + "epoch": 0.7199684915320993, + "grad_norm": 8.369832992553711, + "learning_rate": 2.727e-06, + "loss": 2.6364, + "step": 914 + }, + { + "epoch": 0.720756203229618, + "grad_norm": 6.304200172424316, + "learning_rate": 2.73e-06, + "loss": 2.6486, + "step": 915 + }, + { + "epoch": 0.7215439149271367, + "grad_norm": 8.337112426757812, + "learning_rate": 2.733e-06, + "loss": 2.5993, + "step": 916 + }, + { + "epoch": 0.7223316266246553, + "grad_norm": 3.012908935546875, + "learning_rate": 2.736e-06, + "loss": 2.6251, + "step": 917 + }, + { + "epoch": 0.7231193383221741, + "grad_norm": 5.077672958374023, + "learning_rate": 2.7390000000000004e-06, + "loss": 2.6182, + "step": 918 + }, + { + "epoch": 0.7239070500196928, + "grad_norm": 6.160772800445557, + "learning_rate": 2.7419999999999998e-06, + "loss": 2.6599, + "step": 919 + }, + { + "epoch": 0.7246947617172115, + "grad_norm": 4.509033679962158, + "learning_rate": 2.745e-06, + "loss": 2.6716, + "step": 920 + }, + { + "epoch": 0.7254824734147302, + "grad_norm": 5.251223087310791, + "learning_rate": 2.748e-06, + "loss": 2.6599, + "step": 921 + }, + { + "epoch": 0.726270185112249, + "grad_norm": 6.446579933166504, + "learning_rate": 2.751e-06, + "loss": 2.6399, + "step": 922 + }, + { + "epoch": 0.7270578968097676, + "grad_norm": 7.4050703048706055, + "learning_rate": 2.7540000000000002e-06, + "loss": 2.6209, + "step": 923 + }, + { + "epoch": 0.7278456085072863, + "grad_norm": 4.355445861816406, + "learning_rate": 2.757e-06, + "loss": 2.6172, + "step": 924 + }, + { + "epoch": 0.7286333202048051, + "grad_norm": 7.446080684661865, + "learning_rate": 2.76e-06, + "loss": 2.6551, + "step": 925 + }, + { + "epoch": 0.7294210319023238, + "grad_norm": 4.662217617034912, + "learning_rate": 2.763e-06, + "loss": 2.6766, + "step": 926 + }, + { + "epoch": 0.7302087435998424, + "grad_norm": 5.835084915161133, + "learning_rate": 2.7660000000000003e-06, + "loss": 2.6336, + "step": 927 + }, + { + "epoch": 0.7309964552973611, + "grad_norm": 3.318727970123291, + "learning_rate": 2.7689999999999997e-06, + "loss": 2.6019, + "step": 928 + }, + { + "epoch": 0.7317841669948799, + "grad_norm": 4.41193962097168, + "learning_rate": 2.772e-06, + "loss": 2.6133, + "step": 929 + }, + { + "epoch": 0.7325718786923986, + "grad_norm": 3.3794174194335938, + "learning_rate": 2.775e-06, + "loss": 2.5631, + "step": 930 + }, + { + "epoch": 0.7333595903899173, + "grad_norm": 5.633148193359375, + "learning_rate": 2.778e-06, + "loss": 2.5784, + "step": 931 + }, + { + "epoch": 0.734147302087436, + "grad_norm": 7.116572380065918, + "learning_rate": 2.781e-06, + "loss": 2.6323, + "step": 932 + }, + { + "epoch": 0.7349350137849547, + "grad_norm": 4.24117374420166, + "learning_rate": 2.784e-06, + "loss": 2.6458, + "step": 933 + }, + { + "epoch": 0.7357227254824734, + "grad_norm": 3.791365623474121, + "learning_rate": 2.787e-06, + "loss": 2.606, + "step": 934 + }, + { + "epoch": 0.7365104371799921, + "grad_norm": 6.5029072761535645, + "learning_rate": 2.79e-06, + "loss": 2.6148, + "step": 935 + }, + { + "epoch": 0.7372981488775109, + "grad_norm": 8.154953956604004, + "learning_rate": 2.793e-06, + "loss": 2.6055, + "step": 936 + }, + { + "epoch": 0.7380858605750296, + "grad_norm": 5.518546104431152, + "learning_rate": 2.7960000000000004e-06, + "loss": 2.6569, + "step": 937 + }, + { + "epoch": 0.7388735722725482, + "grad_norm": 3.900184154510498, + "learning_rate": 
2.799e-06, + "loss": 2.6078, + "step": 938 + }, + { + "epoch": 0.7396612839700669, + "grad_norm": 5.085064888000488, + "learning_rate": 2.802e-06, + "loss": 2.6401, + "step": 939 + }, + { + "epoch": 0.7404489956675857, + "grad_norm": 3.3693017959594727, + "learning_rate": 2.8050000000000002e-06, + "loss": 2.6229, + "step": 940 + }, + { + "epoch": 0.7412367073651044, + "grad_norm": 3.4498684406280518, + "learning_rate": 2.808e-06, + "loss": 2.6043, + "step": 941 + }, + { + "epoch": 0.7420244190626231, + "grad_norm": 5.3393402099609375, + "learning_rate": 2.8110000000000003e-06, + "loss": 2.5811, + "step": 942 + }, + { + "epoch": 0.7428121307601417, + "grad_norm": 3.7463505268096924, + "learning_rate": 2.814e-06, + "loss": 2.6004, + "step": 943 + }, + { + "epoch": 0.7435998424576605, + "grad_norm": 4.73776388168335, + "learning_rate": 2.817e-06, + "loss": 2.6117, + "step": 944 + }, + { + "epoch": 0.7443875541551792, + "grad_norm": 11.669256210327148, + "learning_rate": 2.82e-06, + "loss": 2.5977, + "step": 945 + }, + { + "epoch": 0.7451752658526979, + "grad_norm": 11.394187927246094, + "learning_rate": 2.8230000000000003e-06, + "loss": 2.6386, + "step": 946 + }, + { + "epoch": 0.7459629775502166, + "grad_norm": 16.070693969726562, + "learning_rate": 2.826e-06, + "loss": 2.6396, + "step": 947 + }, + { + "epoch": 0.7467506892477354, + "grad_norm": 4.535414218902588, + "learning_rate": 2.829e-06, + "loss": 2.5963, + "step": 948 + }, + { + "epoch": 0.747538400945254, + "grad_norm": 5.385873317718506, + "learning_rate": 2.832e-06, + "loss": 2.5959, + "step": 949 + }, + { + "epoch": 0.7483261126427727, + "grad_norm": 4.599339962005615, + "learning_rate": 2.835e-06, + "loss": 2.5923, + "step": 950 + }, + { + "epoch": 0.7491138243402915, + "grad_norm": 5.463631629943848, + "learning_rate": 2.838e-06, + "loss": 2.8454, + "step": 951 + }, + { + "epoch": 0.7499015360378102, + "grad_norm": 3.9152190685272217, + "learning_rate": 2.8410000000000004e-06, + "loss": 2.7893, + "step": 952 + }, + { + "epoch": 0.7506892477353289, + "grad_norm": 2.9627254009246826, + "learning_rate": 2.844e-06, + "loss": 2.7139, + "step": 953 + }, + { + "epoch": 0.7514769594328475, + "grad_norm": 3.3034207820892334, + "learning_rate": 2.847e-06, + "loss": 2.6919, + "step": 954 + }, + { + "epoch": 0.7522646711303663, + "grad_norm": 3.965918779373169, + "learning_rate": 2.8500000000000002e-06, + "loss": 2.6439, + "step": 955 + }, + { + "epoch": 0.753052382827885, + "grad_norm": 4.072619915008545, + "learning_rate": 2.853e-06, + "loss": 2.6388, + "step": 956 + }, + { + "epoch": 0.7538400945254037, + "grad_norm": 5.512754917144775, + "learning_rate": 2.8560000000000003e-06, + "loss": 2.5594, + "step": 957 + }, + { + "epoch": 0.7546278062229224, + "grad_norm": 3.4037563800811768, + "learning_rate": 2.859e-06, + "loss": 2.5928, + "step": 958 + }, + { + "epoch": 0.7554155179204411, + "grad_norm": 3.1914756298065186, + "learning_rate": 2.862e-06, + "loss": 2.565, + "step": 959 + }, + { + "epoch": 0.7562032296179598, + "grad_norm": 4.397670269012451, + "learning_rate": 2.865e-06, + "loss": 2.5355, + "step": 960 + }, + { + "epoch": 0.7569909413154785, + "grad_norm": 2.868762731552124, + "learning_rate": 2.8680000000000003e-06, + "loss": 2.5483, + "step": 961 + }, + { + "epoch": 0.7577786530129973, + "grad_norm": 3.451880931854248, + "learning_rate": 2.8709999999999997e-06, + "loss": 2.5031, + "step": 962 + }, + { + "epoch": 0.758566364710516, + "grad_norm": 4.089529514312744, + "learning_rate": 2.874e-06, + "loss": 2.5254, + "step": 963 + 
}, + { + "epoch": 0.7593540764080347, + "grad_norm": 2.501394748687744, + "learning_rate": 2.877e-06, + "loss": 2.5121, + "step": 964 + }, + { + "epoch": 0.7601417881055533, + "grad_norm": 3.668564558029175, + "learning_rate": 2.88e-06, + "loss": 2.5678, + "step": 965 + }, + { + "epoch": 0.7609294998030721, + "grad_norm": 3.791240930557251, + "learning_rate": 2.883e-06, + "loss": 2.4818, + "step": 966 + }, + { + "epoch": 0.7617172115005908, + "grad_norm": 6.739199638366699, + "learning_rate": 2.886e-06, + "loss": 2.5242, + "step": 967 + }, + { + "epoch": 0.7625049231981095, + "grad_norm": 8.123620986938477, + "learning_rate": 2.8889999999999998e-06, + "loss": 2.5759, + "step": 968 + }, + { + "epoch": 0.7632926348956282, + "grad_norm": 3.4705827236175537, + "learning_rate": 2.892e-06, + "loss": 2.4753, + "step": 969 + }, + { + "epoch": 0.7640803465931469, + "grad_norm": 6.2319464683532715, + "learning_rate": 2.8950000000000002e-06, + "loss": 2.469, + "step": 970 + }, + { + "epoch": 0.7648680582906656, + "grad_norm": 5.678664684295654, + "learning_rate": 2.898e-06, + "loss": 2.4643, + "step": 971 + }, + { + "epoch": 0.7656557699881843, + "grad_norm": 3.860183000564575, + "learning_rate": 2.901e-06, + "loss": 2.5249, + "step": 972 + }, + { + "epoch": 0.7664434816857031, + "grad_norm": 4.02783203125, + "learning_rate": 2.904e-06, + "loss": 2.4197, + "step": 973 + }, + { + "epoch": 0.7672311933832218, + "grad_norm": 4.357760429382324, + "learning_rate": 2.907e-06, + "loss": 2.4897, + "step": 974 + }, + { + "epoch": 0.7680189050807404, + "grad_norm": 3.415785789489746, + "learning_rate": 2.91e-06, + "loss": 2.473, + "step": 975 + }, + { + "epoch": 0.7688066167782591, + "grad_norm": 3.6924259662628174, + "learning_rate": 2.9130000000000003e-06, + "loss": 2.4569, + "step": 976 + }, + { + "epoch": 0.7695943284757779, + "grad_norm": 15.13408374786377, + "learning_rate": 2.916e-06, + "loss": 2.4484, + "step": 977 + }, + { + "epoch": 0.7703820401732966, + "grad_norm": 5.287290096282959, + "learning_rate": 2.919e-06, + "loss": 2.4198, + "step": 978 + }, + { + "epoch": 0.7711697518708153, + "grad_norm": 4.2618560791015625, + "learning_rate": 2.922e-06, + "loss": 2.4496, + "step": 979 + }, + { + "epoch": 0.771957463568334, + "grad_norm": 17.158103942871094, + "learning_rate": 2.9250000000000004e-06, + "loss": 2.4097, + "step": 980 + }, + { + "epoch": 0.7727451752658527, + "grad_norm": 7.896697521209717, + "learning_rate": 2.928e-06, + "loss": 2.4382, + "step": 981 + }, + { + "epoch": 0.7735328869633714, + "grad_norm": 12.84074878692627, + "learning_rate": 2.931e-06, + "loss": 2.4077, + "step": 982 + }, + { + "epoch": 0.7743205986608901, + "grad_norm": 6.3725481033325195, + "learning_rate": 2.934e-06, + "loss": 2.4276, + "step": 983 + }, + { + "epoch": 0.7751083103584088, + "grad_norm": 5.798040390014648, + "learning_rate": 2.937e-06, + "loss": 2.3956, + "step": 984 + }, + { + "epoch": 0.7758960220559276, + "grad_norm": 7.800330638885498, + "learning_rate": 2.9400000000000002e-06, + "loss": 2.346, + "step": 985 + }, + { + "epoch": 0.7766837337534462, + "grad_norm": 8.115926742553711, + "learning_rate": 2.9430000000000005e-06, + "loss": 2.4013, + "step": 986 + }, + { + "epoch": 0.7774714454509649, + "grad_norm": 5.255527496337891, + "learning_rate": 2.946e-06, + "loss": 2.3087, + "step": 987 + }, + { + "epoch": 0.7782591571484837, + "grad_norm": 5.583354473114014, + "learning_rate": 2.949e-06, + "loss": 2.3236, + "step": 988 + }, + { + "epoch": 0.7790468688460024, + "grad_norm": 9.138849258422852, + 
"learning_rate": 2.9520000000000003e-06, + "loss": 2.3429, + "step": 989 + }, + { + "epoch": 0.7798345805435211, + "grad_norm": 7.863848686218262, + "learning_rate": 2.955e-06, + "loss": 2.3771, + "step": 990 + }, + { + "epoch": 0.7806222922410397, + "grad_norm": 5.0195207595825195, + "learning_rate": 2.958e-06, + "loss": 2.316, + "step": 991 + }, + { + "epoch": 0.7814100039385585, + "grad_norm": 3.2321698665618896, + "learning_rate": 2.961e-06, + "loss": 2.2962, + "step": 992 + }, + { + "epoch": 0.7821977156360772, + "grad_norm": 10.669112205505371, + "learning_rate": 2.964e-06, + "loss": 2.327, + "step": 993 + }, + { + "epoch": 0.7829854273335959, + "grad_norm": 7.035267353057861, + "learning_rate": 2.967e-06, + "loss": 2.3225, + "step": 994 + }, + { + "epoch": 0.7837731390311146, + "grad_norm": 8.379196166992188, + "learning_rate": 2.9700000000000004e-06, + "loss": 2.2827, + "step": 995 + }, + { + "epoch": 0.7845608507286334, + "grad_norm": 4.863012313842773, + "learning_rate": 2.9729999999999997e-06, + "loss": 2.232, + "step": 996 + }, + { + "epoch": 0.785348562426152, + "grad_norm": 3.9591684341430664, + "learning_rate": 2.976e-06, + "loss": 2.2503, + "step": 997 + }, + { + "epoch": 0.7861362741236707, + "grad_norm": 6.418039798736572, + "learning_rate": 2.979e-06, + "loss": 2.2365, + "step": 998 + }, + { + "epoch": 0.7869239858211895, + "grad_norm": 7.287939548492432, + "learning_rate": 2.982e-06, + "loss": 2.2527, + "step": 999 + }, + { + "epoch": 0.7877116975187082, + "grad_norm": 5.641862392425537, + "learning_rate": 2.9850000000000002e-06, + "loss": 2.2628, + "step": 1000 + }, + { + "epoch": 0.7877116975187082, + "eval_cer": 0.7664152215050418, + "eval_loss": 2.611344575881958, + "eval_runtime": 16.5269, + "eval_samples_per_second": 18.394, + "eval_steps_per_second": 0.605, + "eval_wer": 0.9980813507290868, + "step": 1000 + }, + { + "epoch": 0.7884994092162269, + "grad_norm": 7.776047229766846, + "learning_rate": 2.988e-06, + "loss": 2.5375, + "step": 1001 + }, + { + "epoch": 0.7892871209137455, + "grad_norm": 7.920994758605957, + "learning_rate": 2.991e-06, + "loss": 2.5107, + "step": 1002 + }, + { + "epoch": 0.7900748326112643, + "grad_norm": 5.05686616897583, + "learning_rate": 2.994e-06, + "loss": 2.4142, + "step": 1003 + }, + { + "epoch": 0.790862544308783, + "grad_norm": 8.361905097961426, + "learning_rate": 2.9970000000000003e-06, + "loss": 2.3424, + "step": 1004 + }, + { + "epoch": 0.7916502560063017, + "grad_norm": 4.957725524902344, + "learning_rate": 3e-06, + "loss": 2.2596, + "step": 1005 + }, + { + "epoch": 0.7924379677038204, + "grad_norm": 5.821000099182129, + "learning_rate": 3.003e-06, + "loss": 2.2211, + "step": 1006 + }, + { + "epoch": 0.7932256794013391, + "grad_norm": 4.263027667999268, + "learning_rate": 3.006e-06, + "loss": 2.1357, + "step": 1007 + }, + { + "epoch": 0.7940133910988578, + "grad_norm": 4.6923980712890625, + "learning_rate": 3.009e-06, + "loss": 2.1427, + "step": 1008 + }, + { + "epoch": 0.7948011027963765, + "grad_norm": 6.3352837562561035, + "learning_rate": 3.012e-06, + "loss": 2.1166, + "step": 1009 + }, + { + "epoch": 0.7955888144938952, + "grad_norm": 5.955663681030273, + "learning_rate": 3.0150000000000004e-06, + "loss": 2.0457, + "step": 1010 + }, + { + "epoch": 0.796376526191414, + "grad_norm": 5.609542369842529, + "learning_rate": 3.0179999999999997e-06, + "loss": 2.0087, + "step": 1011 + }, + { + "epoch": 0.7971642378889326, + "grad_norm": 4.52250862121582, + "learning_rate": 3.021e-06, + "loss": 2.0284, + "step": 1012 + }, + { + 
"epoch": 0.7979519495864513, + "grad_norm": 4.365993499755859, + "learning_rate": 3.024e-06, + "loss": 1.9318, + "step": 1013 + }, + { + "epoch": 0.7987396612839701, + "grad_norm": 4.496088981628418, + "learning_rate": 3.027e-06, + "loss": 1.9644, + "step": 1014 + }, + { + "epoch": 0.7995273729814888, + "grad_norm": 3.4390265941619873, + "learning_rate": 3.0300000000000002e-06, + "loss": 1.9521, + "step": 1015 + }, + { + "epoch": 0.8003150846790075, + "grad_norm": 6.598624229431152, + "learning_rate": 3.033e-06, + "loss": 1.9013, + "step": 1016 + }, + { + "epoch": 0.8011027963765261, + "grad_norm": 4.286230087280273, + "learning_rate": 3.036e-06, + "loss": 1.9013, + "step": 1017 + }, + { + "epoch": 0.8018905080740449, + "grad_norm": 4.926265716552734, + "learning_rate": 3.039e-06, + "loss": 1.8178, + "step": 1018 + }, + { + "epoch": 0.8026782197715636, + "grad_norm": 3.5817341804504395, + "learning_rate": 3.0420000000000003e-06, + "loss": 1.8172, + "step": 1019 + }, + { + "epoch": 0.8034659314690823, + "grad_norm": 6.955155849456787, + "learning_rate": 3.0450000000000005e-06, + "loss": 1.8349, + "step": 1020 + }, + { + "epoch": 0.804253643166601, + "grad_norm": 4.973232746124268, + "learning_rate": 3.048e-06, + "loss": 1.828, + "step": 1021 + }, + { + "epoch": 0.8050413548641198, + "grad_norm": 4.2520527839660645, + "learning_rate": 3.051e-06, + "loss": 1.7728, + "step": 1022 + }, + { + "epoch": 0.8058290665616384, + "grad_norm": 5.360554218292236, + "learning_rate": 3.0540000000000003e-06, + "loss": 1.7346, + "step": 1023 + }, + { + "epoch": 0.8066167782591571, + "grad_norm": 3.994264841079712, + "learning_rate": 3.057e-06, + "loss": 1.7357, + "step": 1024 + }, + { + "epoch": 0.8074044899566759, + "grad_norm": 3.268131971359253, + "learning_rate": 3.06e-06, + "loss": 1.6717, + "step": 1025 + }, + { + "epoch": 0.8081922016541946, + "grad_norm": 5.882508277893066, + "learning_rate": 3.063e-06, + "loss": 1.6847, + "step": 1026 + }, + { + "epoch": 0.8089799133517133, + "grad_norm": 3.9281651973724365, + "learning_rate": 3.066e-06, + "loss": 1.6345, + "step": 1027 + }, + { + "epoch": 0.8097676250492319, + "grad_norm": 3.8242311477661133, + "learning_rate": 3.069e-06, + "loss": 1.649, + "step": 1028 + }, + { + "epoch": 0.8105553367467507, + "grad_norm": 3.1716628074645996, + "learning_rate": 3.0720000000000004e-06, + "loss": 1.5795, + "step": 1029 + }, + { + "epoch": 0.8113430484442694, + "grad_norm": 4.0709452629089355, + "learning_rate": 3.0749999999999998e-06, + "loss": 1.5906, + "step": 1030 + }, + { + "epoch": 0.8121307601417881, + "grad_norm": 3.973672389984131, + "learning_rate": 3.078e-06, + "loss": 1.5635, + "step": 1031 + }, + { + "epoch": 0.8129184718393068, + "grad_norm": 3.125135660171509, + "learning_rate": 3.0810000000000002e-06, + "loss": 1.5833, + "step": 1032 + }, + { + "epoch": 0.8137061835368256, + "grad_norm": 3.751004457473755, + "learning_rate": 3.084e-06, + "loss": 1.5243, + "step": 1033 + }, + { + "epoch": 0.8144938952343442, + "grad_norm": 4.194555282592773, + "learning_rate": 3.0870000000000003e-06, + "loss": 1.4787, + "step": 1034 + }, + { + "epoch": 0.8152816069318629, + "grad_norm": 4.877456188201904, + "learning_rate": 3.09e-06, + "loss": 1.454, + "step": 1035 + }, + { + "epoch": 0.8160693186293817, + "grad_norm": 5.8408966064453125, + "learning_rate": 3.093e-06, + "loss": 1.4853, + "step": 1036 + }, + { + "epoch": 0.8168570303269004, + "grad_norm": 4.6098480224609375, + "learning_rate": 3.096e-06, + "loss": 1.3987, + "step": 1037 + }, + { + "epoch": 
0.8176447420244191, + "grad_norm": 28.30730438232422, + "learning_rate": 3.0990000000000003e-06, + "loss": 1.4065, + "step": 1038 + }, + { + "epoch": 0.8184324537219377, + "grad_norm": 3.609328508377075, + "learning_rate": 3.102e-06, + "loss": 1.3848, + "step": 1039 + }, + { + "epoch": 0.8192201654194565, + "grad_norm": 4.1485981941223145, + "learning_rate": 3.105e-06, + "loss": 1.2898, + "step": 1040 + }, + { + "epoch": 0.8200078771169752, + "grad_norm": 3.755368947982788, + "learning_rate": 3.108e-06, + "loss": 1.3154, + "step": 1041 + }, + { + "epoch": 0.8207955888144939, + "grad_norm": 6.452070713043213, + "learning_rate": 3.111e-06, + "loss": 1.3463, + "step": 1042 + }, + { + "epoch": 0.8215833005120126, + "grad_norm": 5.0185370445251465, + "learning_rate": 3.114e-06, + "loss": 1.3487, + "step": 1043 + }, + { + "epoch": 0.8223710122095313, + "grad_norm": 4.466368198394775, + "learning_rate": 3.1170000000000004e-06, + "loss": 1.2738, + "step": 1044 + }, + { + "epoch": 0.82315872390705, + "grad_norm": 4.167668342590332, + "learning_rate": 3.1199999999999998e-06, + "loss": 1.3069, + "step": 1045 + }, + { + "epoch": 0.8239464356045687, + "grad_norm": 5.197746276855469, + "learning_rate": 3.123e-06, + "loss": 1.2811, + "step": 1046 + }, + { + "epoch": 0.8247341473020874, + "grad_norm": 4.853879928588867, + "learning_rate": 3.1260000000000002e-06, + "loss": 1.2762, + "step": 1047 + }, + { + "epoch": 0.8255218589996062, + "grad_norm": 11.127102851867676, + "learning_rate": 3.129e-06, + "loss": 1.2836, + "step": 1048 + }, + { + "epoch": 0.8263095706971249, + "grad_norm": 5.074981212615967, + "learning_rate": 3.1320000000000003e-06, + "loss": 1.258, + "step": 1049 + }, + { + "epoch": 0.8270972823946435, + "grad_norm": 4.561675071716309, + "learning_rate": 3.135e-06, + "loss": 1.2269, + "step": 1050 + }, + { + "epoch": 0.8278849940921623, + "grad_norm": 7.019214630126953, + "learning_rate": 3.138e-06, + "loss": 1.939, + "step": 1051 + }, + { + "epoch": 0.828672705789681, + "grad_norm": 3.969000816345215, + "learning_rate": 3.141e-06, + "loss": 1.6244, + "step": 1052 + }, + { + "epoch": 0.8294604174871997, + "grad_norm": 3.4963326454162598, + "learning_rate": 3.1440000000000003e-06, + "loss": 1.6006, + "step": 1053 + }, + { + "epoch": 0.8302481291847184, + "grad_norm": 4.081366539001465, + "learning_rate": 3.1469999999999997e-06, + "loss": 1.5355, + "step": 1054 + }, + { + "epoch": 0.8310358408822371, + "grad_norm": 4.878650188446045, + "learning_rate": 3.15e-06, + "loss": 1.351, + "step": 1055 + }, + { + "epoch": 0.8318235525797558, + "grad_norm": 6.096035003662109, + "learning_rate": 3.153e-06, + "loss": 1.2609, + "step": 1056 + }, + { + "epoch": 0.8326112642772745, + "grad_norm": 7.754105091094971, + "learning_rate": 3.156e-06, + "loss": 1.1318, + "step": 1057 + }, + { + "epoch": 0.8333989759747932, + "grad_norm": 3.959177255630493, + "learning_rate": 3.159e-06, + "loss": 1.0371, + "step": 1058 + }, + { + "epoch": 0.834186687672312, + "grad_norm": 5.234315395355225, + "learning_rate": 3.162e-06, + "loss": 1.031, + "step": 1059 + }, + { + "epoch": 0.8349743993698306, + "grad_norm": 3.723569393157959, + "learning_rate": 3.1649999999999998e-06, + "loss": 1.0515, + "step": 1060 + }, + { + "epoch": 0.8357621110673493, + "grad_norm": 3.9514729976654053, + "learning_rate": 3.168e-06, + "loss": 1.0368, + "step": 1061 + }, + { + "epoch": 0.8365498227648681, + "grad_norm": 13.514260292053223, + "learning_rate": 3.1710000000000002e-06, + "loss": 0.9899, + "step": 1062 + }, + { + "epoch": 
0.8373375344623868, + "grad_norm": 3.866931915283203, + "learning_rate": 3.1740000000000004e-06, + "loss": 0.9827, + "step": 1063 + }, + { + "epoch": 0.8381252461599055, + "grad_norm": 5.026232719421387, + "learning_rate": 3.177e-06, + "loss": 0.9044, + "step": 1064 + }, + { + "epoch": 0.8389129578574241, + "grad_norm": 5.592519760131836, + "learning_rate": 3.18e-06, + "loss": 0.9391, + "step": 1065 + }, + { + "epoch": 0.8397006695549429, + "grad_norm": 3.269538164138794, + "learning_rate": 3.1830000000000003e-06, + "loss": 0.9037, + "step": 1066 + }, + { + "epoch": 0.8404883812524616, + "grad_norm": 4.006326675415039, + "learning_rate": 3.186e-06, + "loss": 0.9226, + "step": 1067 + }, + { + "epoch": 0.8412760929499803, + "grad_norm": 2.966592788696289, + "learning_rate": 3.1890000000000003e-06, + "loss": 0.9015, + "step": 1068 + }, + { + "epoch": 0.842063804647499, + "grad_norm": 3.4548263549804688, + "learning_rate": 3.192e-06, + "loss": 0.9244, + "step": 1069 + }, + { + "epoch": 0.8428515163450178, + "grad_norm": 2.946293354034424, + "learning_rate": 3.195e-06, + "loss": 0.8466, + "step": 1070 + }, + { + "epoch": 0.8436392280425364, + "grad_norm": 4.226003646850586, + "learning_rate": 3.198e-06, + "loss": 0.8613, + "step": 1071 + }, + { + "epoch": 0.8444269397400551, + "grad_norm": 3.7543320655822754, + "learning_rate": 3.2010000000000004e-06, + "loss": 0.7961, + "step": 1072 + }, + { + "epoch": 0.8452146514375738, + "grad_norm": 4.749732971191406, + "learning_rate": 3.204e-06, + "loss": 0.7879, + "step": 1073 + }, + { + "epoch": 0.8460023631350926, + "grad_norm": 3.8218307495117188, + "learning_rate": 3.207e-06, + "loss": 0.7886, + "step": 1074 + }, + { + "epoch": 0.8467900748326113, + "grad_norm": 6.507490158081055, + "learning_rate": 3.21e-06, + "loss": 0.8012, + "step": 1075 + }, + { + "epoch": 0.8475777865301299, + "grad_norm": 4.772679805755615, + "learning_rate": 3.213e-06, + "loss": 0.7476, + "step": 1076 + }, + { + "epoch": 0.8483654982276487, + "grad_norm": 3.493980884552002, + "learning_rate": 3.216e-06, + "loss": 0.8121, + "step": 1077 + }, + { + "epoch": 0.8491532099251674, + "grad_norm": 3.303112030029297, + "learning_rate": 3.2190000000000004e-06, + "loss": 0.8281, + "step": 1078 + }, + { + "epoch": 0.8499409216226861, + "grad_norm": 3.6007447242736816, + "learning_rate": 3.222e-06, + "loss": 0.7475, + "step": 1079 + }, + { + "epoch": 0.8507286333202048, + "grad_norm": 5.453388214111328, + "learning_rate": 3.225e-06, + "loss": 0.777, + "step": 1080 + }, + { + "epoch": 0.8515163450177236, + "grad_norm": 10.656065940856934, + "learning_rate": 3.2280000000000003e-06, + "loss": 0.7758, + "step": 1081 + }, + { + "epoch": 0.8523040567152422, + "grad_norm": 3.0773637294769287, + "learning_rate": 3.231e-06, + "loss": 0.7289, + "step": 1082 + }, + { + "epoch": 0.8530917684127609, + "grad_norm": 13.724759101867676, + "learning_rate": 3.2340000000000003e-06, + "loss": 0.7569, + "step": 1083 + }, + { + "epoch": 0.8538794801102796, + "grad_norm": 3.7682483196258545, + "learning_rate": 3.237e-06, + "loss": 0.7453, + "step": 1084 + }, + { + "epoch": 0.8546671918077984, + "grad_norm": 3.000655174255371, + "learning_rate": 3.24e-06, + "loss": 0.7408, + "step": 1085 + }, + { + "epoch": 0.855454903505317, + "grad_norm": 7.748376369476318, + "learning_rate": 3.243e-06, + "loss": 0.7145, + "step": 1086 + }, + { + "epoch": 0.8562426152028357, + "grad_norm": 5.345240592956543, + "learning_rate": 3.2460000000000003e-06, + "loss": 0.6899, + "step": 1087 + }, + { + "epoch": 0.8570303269003545, + 
"grad_norm": 10.873013496398926, + "learning_rate": 3.2489999999999997e-06, + "loss": 0.7102, + "step": 1088 + }, + { + "epoch": 0.8578180385978732, + "grad_norm": 9.619946479797363, + "learning_rate": 3.252e-06, + "loss": 0.7439, + "step": 1089 + }, + { + "epoch": 0.8586057502953919, + "grad_norm": 7.744878768920898, + "learning_rate": 3.255e-06, + "loss": 0.7081, + "step": 1090 + }, + { + "epoch": 0.8593934619929106, + "grad_norm": 5.827689170837402, + "learning_rate": 3.258e-06, + "loss": 0.7182, + "step": 1091 + }, + { + "epoch": 0.8601811736904293, + "grad_norm": 3.264641523361206, + "learning_rate": 3.261e-06, + "loss": 0.7393, + "step": 1092 + }, + { + "epoch": 0.860968885387948, + "grad_norm": 4.608630180358887, + "learning_rate": 3.264e-06, + "loss": 0.7142, + "step": 1093 + }, + { + "epoch": 0.8617565970854667, + "grad_norm": 3.6443278789520264, + "learning_rate": 3.267e-06, + "loss": 0.6989, + "step": 1094 + }, + { + "epoch": 0.8625443087829854, + "grad_norm": 7.6793951988220215, + "learning_rate": 3.27e-06, + "loss": 0.709, + "step": 1095 + }, + { + "epoch": 0.8633320204805042, + "grad_norm": 4.662232875823975, + "learning_rate": 3.2730000000000003e-06, + "loss": 0.6695, + "step": 1096 + }, + { + "epoch": 0.8641197321780228, + "grad_norm": 4.492241382598877, + "learning_rate": 3.276e-06, + "loss": 0.6782, + "step": 1097 + }, + { + "epoch": 0.8649074438755415, + "grad_norm": 17.65072250366211, + "learning_rate": 3.279e-06, + "loss": 0.658, + "step": 1098 + }, + { + "epoch": 0.8656951555730603, + "grad_norm": 6.272402763366699, + "learning_rate": 3.282e-06, + "loss": 0.6805, + "step": 1099 + }, + { + "epoch": 0.866482867270579, + "grad_norm": 9.907552719116211, + "learning_rate": 3.285e-06, + "loss": 0.8197, + "step": 1100 + }, + { + "epoch": 0.8672705789680977, + "grad_norm": 14.023088455200195, + "learning_rate": 3.288e-06, + "loss": 1.5, + "step": 1101 + }, + { + "epoch": 0.8680582906656163, + "grad_norm": 4.32036018371582, + "learning_rate": 3.2910000000000003e-06, + "loss": 1.161, + "step": 1102 + }, + { + "epoch": 0.8688460023631351, + "grad_norm": 3.92932391166687, + "learning_rate": 3.2939999999999997e-06, + "loss": 1.0367, + "step": 1103 + }, + { + "epoch": 0.8696337140606538, + "grad_norm": 19.382919311523438, + "learning_rate": 3.297e-06, + "loss": 0.9937, + "step": 1104 + }, + { + "epoch": 0.8704214257581725, + "grad_norm": 5.846754550933838, + "learning_rate": 3.3e-06, + "loss": 0.9186, + "step": 1105 + }, + { + "epoch": 0.8712091374556912, + "grad_norm": 4.098384857177734, + "learning_rate": 3.3030000000000004e-06, + "loss": 0.8399, + "step": 1106 + }, + { + "epoch": 0.87199684915321, + "grad_norm": 3.697789430618286, + "learning_rate": 3.306e-06, + "loss": 0.6737, + "step": 1107 + }, + { + "epoch": 0.8727845608507286, + "grad_norm": 4.253589153289795, + "learning_rate": 3.309e-06, + "loss": 0.6102, + "step": 1108 + }, + { + "epoch": 0.8735722725482473, + "grad_norm": 6.122470855712891, + "learning_rate": 3.3120000000000002e-06, + "loss": 0.5616, + "step": 1109 + }, + { + "epoch": 0.874359984245766, + "grad_norm": 2.4783265590667725, + "learning_rate": 3.315e-06, + "loss": 0.5954, + "step": 1110 + }, + { + "epoch": 0.8751476959432848, + "grad_norm": 3.4110374450683594, + "learning_rate": 3.3180000000000003e-06, + "loss": 0.6261, + "step": 1111 + }, + { + "epoch": 0.8759354076408035, + "grad_norm": 4.064210891723633, + "learning_rate": 3.3210000000000005e-06, + "loss": 0.5841, + "step": 1112 + }, + { + "epoch": 0.8767231193383221, + "grad_norm": 2.7486207485198975, + 
"learning_rate": 3.324e-06, + "loss": 0.6, + "step": 1113 + }, + { + "epoch": 0.8775108310358409, + "grad_norm": 3.551591634750366, + "learning_rate": 3.327e-06, + "loss": 0.5035, + "step": 1114 + }, + { + "epoch": 0.8782985427333596, + "grad_norm": 4.115954875946045, + "learning_rate": 3.3300000000000003e-06, + "loss": 0.5682, + "step": 1115 + }, + { + "epoch": 0.8790862544308783, + "grad_norm": 3.460911989212036, + "learning_rate": 3.333e-06, + "loss": 0.5232, + "step": 1116 + }, + { + "epoch": 0.879873966128397, + "grad_norm": 3.8540542125701904, + "learning_rate": 3.336e-06, + "loss": 0.5396, + "step": 1117 + }, + { + "epoch": 0.8806616778259158, + "grad_norm": 3.6944868564605713, + "learning_rate": 3.339e-06, + "loss": 0.5115, + "step": 1118 + }, + { + "epoch": 0.8814493895234344, + "grad_norm": 3.835061550140381, + "learning_rate": 3.342e-06, + "loss": 0.5507, + "step": 1119 + }, + { + "epoch": 0.8822371012209531, + "grad_norm": 4.343217372894287, + "learning_rate": 3.345e-06, + "loss": 0.5428, + "step": 1120 + }, + { + "epoch": 0.8830248129184718, + "grad_norm": 7.847365379333496, + "learning_rate": 3.3480000000000004e-06, + "loss": 0.4942, + "step": 1121 + }, + { + "epoch": 0.8838125246159906, + "grad_norm": 3.0227346420288086, + "learning_rate": 3.3509999999999998e-06, + "loss": 0.5073, + "step": 1122 + }, + { + "epoch": 0.8846002363135093, + "grad_norm": 8.99701976776123, + "learning_rate": 3.354e-06, + "loss": 0.5178, + "step": 1123 + }, + { + "epoch": 0.8853879480110279, + "grad_norm": 6.753998756408691, + "learning_rate": 3.3570000000000002e-06, + "loss": 0.5119, + "step": 1124 + }, + { + "epoch": 0.8861756597085467, + "grad_norm": 5.206209182739258, + "learning_rate": 3.36e-06, + "loss": 0.5091, + "step": 1125 + }, + { + "epoch": 0.8869633714060654, + "grad_norm": 7.466050148010254, + "learning_rate": 3.3630000000000002e-06, + "loss": 0.502, + "step": 1126 + }, + { + "epoch": 0.8877510831035841, + "grad_norm": 4.531610488891602, + "learning_rate": 3.366e-06, + "loss": 0.5358, + "step": 1127 + }, + { + "epoch": 0.8885387948011028, + "grad_norm": 2.748633861541748, + "learning_rate": 3.369e-06, + "loss": 0.5284, + "step": 1128 + }, + { + "epoch": 0.8893265064986215, + "grad_norm": 4.822118759155273, + "learning_rate": 3.372e-06, + "loss": 0.472, + "step": 1129 + }, + { + "epoch": 0.8901142181961402, + "grad_norm": 4.843677043914795, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.5163, + "step": 1130 + }, + { + "epoch": 0.8909019298936589, + "grad_norm": 6.4921488761901855, + "learning_rate": 3.378e-06, + "loss": 0.5166, + "step": 1131 + }, + { + "epoch": 0.8916896415911776, + "grad_norm": 4.164239883422852, + "learning_rate": 3.381e-06, + "loss": 0.5446, + "step": 1132 + }, + { + "epoch": 0.8924773532886964, + "grad_norm": 3.3269364833831787, + "learning_rate": 3.384e-06, + "loss": 0.5246, + "step": 1133 + }, + { + "epoch": 0.893265064986215, + "grad_norm": 4.669920444488525, + "learning_rate": 3.387e-06, + "loss": 0.5059, + "step": 1134 + }, + { + "epoch": 0.8940527766837337, + "grad_norm": 4.063310623168945, + "learning_rate": 3.39e-06, + "loss": 0.5054, + "step": 1135 + }, + { + "epoch": 0.8948404883812524, + "grad_norm": 2.8491244316101074, + "learning_rate": 3.3930000000000004e-06, + "loss": 0.4645, + "step": 1136 + }, + { + "epoch": 0.8956282000787712, + "grad_norm": 2.9744503498077393, + "learning_rate": 3.3959999999999998e-06, + "loss": 0.5225, + "step": 1137 + }, + { + "epoch": 0.8964159117762899, + "grad_norm": 6.558324813842773, + "learning_rate": 3.399e-06, 
+ "loss": 0.5386, + "step": 1138 + }, + { + "epoch": 0.8972036234738086, + "grad_norm": 7.658748149871826, + "learning_rate": 3.402e-06, + "loss": 0.5537, + "step": 1139 + }, + { + "epoch": 0.8979913351713273, + "grad_norm": 4.289274215698242, + "learning_rate": 3.405e-06, + "loss": 0.5274, + "step": 1140 + }, + { + "epoch": 0.898779046868846, + "grad_norm": 4.710799217224121, + "learning_rate": 3.4080000000000002e-06, + "loss": 0.4817, + "step": 1141 + }, + { + "epoch": 0.8995667585663647, + "grad_norm": 3.0264246463775635, + "learning_rate": 3.411e-06, + "loss": 0.531, + "step": 1142 + }, + { + "epoch": 0.9003544702638834, + "grad_norm": 6.043869495391846, + "learning_rate": 3.414e-06, + "loss": 0.5046, + "step": 1143 + }, + { + "epoch": 0.9011421819614022, + "grad_norm": 9.2271089553833, + "learning_rate": 3.417e-06, + "loss": 0.4901, + "step": 1144 + }, + { + "epoch": 0.9019298936589208, + "grad_norm": 11.42390251159668, + "learning_rate": 3.4200000000000003e-06, + "loss": 0.5273, + "step": 1145 + }, + { + "epoch": 0.9027176053564395, + "grad_norm": 3.458566427230835, + "learning_rate": 3.4229999999999997e-06, + "loss": 0.4252, + "step": 1146 + }, + { + "epoch": 0.9035053170539582, + "grad_norm": 6.166426658630371, + "learning_rate": 3.426e-06, + "loss": 0.4634, + "step": 1147 + }, + { + "epoch": 0.904293028751477, + "grad_norm": 3.6612441539764404, + "learning_rate": 3.429e-06, + "loss": 0.4329, + "step": 1148 + }, + { + "epoch": 0.9050807404489957, + "grad_norm": 4.446394920349121, + "learning_rate": 3.4320000000000003e-06, + "loss": 0.5137, + "step": 1149 + }, + { + "epoch": 0.9058684521465143, + "grad_norm": 6.211001873016357, + "learning_rate": 3.435e-06, + "loss": 0.631, + "step": 1150 + }, + { + "epoch": 0.9066561638440331, + "grad_norm": 8.367463111877441, + "learning_rate": 3.438e-06, + "loss": 1.3483, + "step": 1151 + }, + { + "epoch": 0.9074438755415518, + "grad_norm": 3.72420072555542, + "learning_rate": 3.441e-06, + "loss": 1.1415, + "step": 1152 + }, + { + "epoch": 0.9082315872390705, + "grad_norm": 3.531597852706909, + "learning_rate": 3.444e-06, + "loss": 1.0273, + "step": 1153 + }, + { + "epoch": 0.9090192989365892, + "grad_norm": 2.968470335006714, + "learning_rate": 3.447e-06, + "loss": 0.7341, + "step": 1154 + }, + { + "epoch": 0.909807010634108, + "grad_norm": 2.9107186794281006, + "learning_rate": 3.4500000000000004e-06, + "loss": 0.6421, + "step": 1155 + }, + { + "epoch": 0.9105947223316266, + "grad_norm": 12.050193786621094, + "learning_rate": 3.453e-06, + "loss": 0.6282, + "step": 1156 + }, + { + "epoch": 0.9113824340291453, + "grad_norm": 11.267151832580566, + "learning_rate": 3.456e-06, + "loss": 0.5361, + "step": 1157 + }, + { + "epoch": 0.912170145726664, + "grad_norm": 4.539441108703613, + "learning_rate": 3.4590000000000003e-06, + "loss": 0.4934, + "step": 1158 + }, + { + "epoch": 0.9129578574241828, + "grad_norm": 4.396383285522461, + "learning_rate": 3.462e-06, + "loss": 0.458, + "step": 1159 + }, + { + "epoch": 0.9137455691217015, + "grad_norm": 2.889822244644165, + "learning_rate": 3.4650000000000003e-06, + "loss": 0.4735, + "step": 1160 + }, + { + "epoch": 0.9145332808192201, + "grad_norm": 4.458437442779541, + "learning_rate": 3.468e-06, + "loss": 0.4083, + "step": 1161 + }, + { + "epoch": 0.9153209925167388, + "grad_norm": 3.48685359954834, + "learning_rate": 3.471e-06, + "loss": 0.5187, + "step": 1162 + }, + { + "epoch": 0.9161087042142576, + "grad_norm": 4.9519500732421875, + "learning_rate": 3.474e-06, + "loss": 0.4172, + "step": 1163 + }, + { + 
"epoch": 0.9168964159117763, + "grad_norm": 5.605695724487305, + "learning_rate": 3.4770000000000003e-06, + "loss": 0.4136, + "step": 1164 + }, + { + "epoch": 0.917684127609295, + "grad_norm": 5.635591506958008, + "learning_rate": 3.48e-06, + "loss": 0.4686, + "step": 1165 + }, + { + "epoch": 0.9184718393068138, + "grad_norm": 2.882098436355591, + "learning_rate": 3.483e-06, + "loss": 0.462, + "step": 1166 + }, + { + "epoch": 0.9192595510043324, + "grad_norm": 2.6808080673217773, + "learning_rate": 3.486e-06, + "loss": 0.3938, + "step": 1167 + }, + { + "epoch": 0.9200472627018511, + "grad_norm": 4.475885391235352, + "learning_rate": 3.489e-06, + "loss": 0.4678, + "step": 1168 + }, + { + "epoch": 0.9208349743993698, + "grad_norm": 3.4507811069488525, + "learning_rate": 3.492e-06, + "loss": 0.4437, + "step": 1169 + }, + { + "epoch": 0.9216226860968886, + "grad_norm": 10.463024139404297, + "learning_rate": 3.4950000000000004e-06, + "loss": 0.423, + "step": 1170 + }, + { + "epoch": 0.9224103977944073, + "grad_norm": 3.1919608116149902, + "learning_rate": 3.498e-06, + "loss": 0.419, + "step": 1171 + }, + { + "epoch": 0.9231981094919259, + "grad_norm": 2.8023018836975098, + "learning_rate": 3.501e-06, + "loss": 0.4195, + "step": 1172 + }, + { + "epoch": 0.9239858211894446, + "grad_norm": 2.7175233364105225, + "learning_rate": 3.5040000000000002e-06, + "loss": 0.4538, + "step": 1173 + }, + { + "epoch": 0.9247735328869634, + "grad_norm": 3.809610605239868, + "learning_rate": 3.507e-06, + "loss": 0.4586, + "step": 1174 + }, + { + "epoch": 0.9255612445844821, + "grad_norm": 3.9508564472198486, + "learning_rate": 3.5100000000000003e-06, + "loss": 0.4288, + "step": 1175 + }, + { + "epoch": 0.9263489562820008, + "grad_norm": 3.158886671066284, + "learning_rate": 3.513e-06, + "loss": 0.4903, + "step": 1176 + }, + { + "epoch": 0.9271366679795195, + "grad_norm": 2.8902432918548584, + "learning_rate": 3.516e-06, + "loss": 0.3832, + "step": 1177 + }, + { + "epoch": 0.9279243796770382, + "grad_norm": 8.871356964111328, + "learning_rate": 3.519e-06, + "loss": 0.4013, + "step": 1178 + }, + { + "epoch": 0.9287120913745569, + "grad_norm": 7.328275203704834, + "learning_rate": 3.5220000000000003e-06, + "loss": 0.4124, + "step": 1179 + }, + { + "epoch": 0.9294998030720756, + "grad_norm": 4.21427583694458, + "learning_rate": 3.5249999999999997e-06, + "loss": 0.4841, + "step": 1180 + }, + { + "epoch": 0.9302875147695944, + "grad_norm": 3.668189764022827, + "learning_rate": 3.528e-06, + "loss": 0.413, + "step": 1181 + }, + { + "epoch": 0.931075226467113, + "grad_norm": 5.004907608032227, + "learning_rate": 3.531e-06, + "loss": 0.3987, + "step": 1182 + }, + { + "epoch": 0.9318629381646317, + "grad_norm": 3.033874988555908, + "learning_rate": 3.534e-06, + "loss": 0.3774, + "step": 1183 + }, + { + "epoch": 0.9326506498621504, + "grad_norm": 2.6235339641571045, + "learning_rate": 3.537e-06, + "loss": 0.4237, + "step": 1184 + }, + { + "epoch": 0.9334383615596692, + "grad_norm": 2.554625988006592, + "learning_rate": 3.54e-06, + "loss": 0.407, + "step": 1185 + }, + { + "epoch": 0.9342260732571879, + "grad_norm": 2.401357889175415, + "learning_rate": 3.543e-06, + "loss": 0.4266, + "step": 1186 + }, + { + "epoch": 0.9350137849547066, + "grad_norm": 8.281052589416504, + "learning_rate": 3.546e-06, + "loss": 0.3797, + "step": 1187 + }, + { + "epoch": 0.9358014966522253, + "grad_norm": 7.606924057006836, + "learning_rate": 3.5490000000000002e-06, + "loss": 0.3961, + "step": 1188 + }, + { + "epoch": 0.936589208349744, + 
"grad_norm": 3.2093706130981445, + "learning_rate": 3.552e-06, + "loss": 0.4239, + "step": 1189 + }, + { + "epoch": 0.9373769200472627, + "grad_norm": 2.5336520671844482, + "learning_rate": 3.555e-06, + "loss": 0.3447, + "step": 1190 + }, + { + "epoch": 0.9381646317447814, + "grad_norm": 2.9097177982330322, + "learning_rate": 3.558e-06, + "loss": 0.4065, + "step": 1191 + }, + { + "epoch": 0.9389523434423002, + "grad_norm": 39.24934005737305, + "learning_rate": 3.5610000000000003e-06, + "loss": 0.402, + "step": 1192 + }, + { + "epoch": 0.9397400551398188, + "grad_norm": 4.197928428649902, + "learning_rate": 3.564e-06, + "loss": 0.4036, + "step": 1193 + }, + { + "epoch": 0.9405277668373375, + "grad_norm": 3.7279715538024902, + "learning_rate": 3.5670000000000003e-06, + "loss": 0.4383, + "step": 1194 + }, + { + "epoch": 0.9413154785348562, + "grad_norm": 7.544140815734863, + "learning_rate": 3.57e-06, + "loss": 0.4562, + "step": 1195 + }, + { + "epoch": 0.942103190232375, + "grad_norm": 3.498790740966797, + "learning_rate": 3.573e-06, + "loss": 0.4016, + "step": 1196 + }, + { + "epoch": 0.9428909019298937, + "grad_norm": 5.301417350769043, + "learning_rate": 3.576e-06, + "loss": 0.4085, + "step": 1197 + }, + { + "epoch": 0.9436786136274123, + "grad_norm": 2.7728464603424072, + "learning_rate": 3.5790000000000004e-06, + "loss": 0.4215, + "step": 1198 + }, + { + "epoch": 0.944466325324931, + "grad_norm": 11.448049545288086, + "learning_rate": 3.582e-06, + "loss": 0.4795, + "step": 1199 + }, + { + "epoch": 0.9452540370224498, + "grad_norm": 6.613025665283203, + "learning_rate": 3.585e-06, + "loss": 0.4803, + "step": 1200 + }, + { + "epoch": 0.9460417487199685, + "grad_norm": 8.056041717529297, + "learning_rate": 3.588e-06, + "loss": 1.1664, + "step": 1201 + }, + { + "epoch": 0.9468294604174872, + "grad_norm": 3.7765228748321533, + "learning_rate": 3.591e-06, + "loss": 0.8134, + "step": 1202 + }, + { + "epoch": 0.947617172115006, + "grad_norm": 3.0526349544525146, + "learning_rate": 3.5940000000000002e-06, + "loss": 0.7408, + "step": 1203 + }, + { + "epoch": 0.9484048838125246, + "grad_norm": 6.651916027069092, + "learning_rate": 3.5970000000000005e-06, + "loss": 0.7296, + "step": 1204 + }, + { + "epoch": 0.9491925955100433, + "grad_norm": 8.659046173095703, + "learning_rate": 3.6e-06, + "loss": 0.7233, + "step": 1205 + }, + { + "epoch": 0.949980307207562, + "grad_norm": 4.675987243652344, + "learning_rate": 3.603e-06, + "loss": 0.4628, + "step": 1206 + }, + { + "epoch": 0.9507680189050808, + "grad_norm": 5.705756664276123, + "learning_rate": 3.6060000000000003e-06, + "loss": 0.5057, + "step": 1207 + }, + { + "epoch": 0.9515557306025995, + "grad_norm": 3.674692392349243, + "learning_rate": 3.609e-06, + "loss": 0.4442, + "step": 1208 + }, + { + "epoch": 0.9523434423001181, + "grad_norm": 3.062688112258911, + "learning_rate": 3.612e-06, + "loss": 0.3974, + "step": 1209 + }, + { + "epoch": 0.9531311539976368, + "grad_norm": 2.6075356006622314, + "learning_rate": 3.615e-06, + "loss": 0.3625, + "step": 1210 + }, + { + "epoch": 0.9539188656951556, + "grad_norm": 2.6464169025421143, + "learning_rate": 3.618e-06, + "loss": 0.315, + "step": 1211 + }, + { + "epoch": 0.9547065773926743, + "grad_norm": 2.388126850128174, + "learning_rate": 3.621e-06, + "loss": 0.3166, + "step": 1212 + }, + { + "epoch": 0.955494289090193, + "grad_norm": 3.2879750728607178, + "learning_rate": 3.6240000000000004e-06, + "loss": 0.3577, + "step": 1213 + }, + { + "epoch": 0.9562820007877118, + "grad_norm": 2.615007162094116, + 
"learning_rate": 3.6269999999999997e-06, + "loss": 0.3752, + "step": 1214 + }, + { + "epoch": 0.9570697124852304, + "grad_norm": 2.915797233581543, + "learning_rate": 3.63e-06, + "loss": 0.3525, + "step": 1215 + }, + { + "epoch": 0.9578574241827491, + "grad_norm": 22.323450088500977, + "learning_rate": 3.633e-06, + "loss": 0.378, + "step": 1216 + }, + { + "epoch": 0.9586451358802678, + "grad_norm": 3.7136592864990234, + "learning_rate": 3.636e-06, + "loss": 0.3858, + "step": 1217 + }, + { + "epoch": 0.9594328475777866, + "grad_norm": 2.5390210151672363, + "learning_rate": 3.6390000000000002e-06, + "loss": 0.2957, + "step": 1218 + }, + { + "epoch": 0.9602205592753053, + "grad_norm": 3.7099318504333496, + "learning_rate": 3.642e-06, + "loss": 0.3973, + "step": 1219 + }, + { + "epoch": 0.9610082709728239, + "grad_norm": 7.129267692565918, + "learning_rate": 3.645e-06, + "loss": 0.3177, + "step": 1220 + }, + { + "epoch": 0.9617959826703426, + "grad_norm": 2.680284023284912, + "learning_rate": 3.648e-06, + "loss": 0.3305, + "step": 1221 + }, + { + "epoch": 0.9625836943678614, + "grad_norm": 3.1353695392608643, + "learning_rate": 3.6510000000000003e-06, + "loss": 0.3593, + "step": 1222 + }, + { + "epoch": 0.9633714060653801, + "grad_norm": 4.070639133453369, + "learning_rate": 3.654e-06, + "loss": 0.3776, + "step": 1223 + }, + { + "epoch": 0.9641591177628988, + "grad_norm": 3.5022709369659424, + "learning_rate": 3.657e-06, + "loss": 0.3326, + "step": 1224 + }, + { + "epoch": 0.9649468294604174, + "grad_norm": 2.3663454055786133, + "learning_rate": 3.66e-06, + "loss": 0.3466, + "step": 1225 + }, + { + "epoch": 0.9657345411579362, + "grad_norm": 2.428966522216797, + "learning_rate": 3.663e-06, + "loss": 0.3499, + "step": 1226 + }, + { + "epoch": 0.9665222528554549, + "grad_norm": 3.6676034927368164, + "learning_rate": 3.666e-06, + "loss": 0.3935, + "step": 1227 + }, + { + "epoch": 0.9673099645529736, + "grad_norm": 4.625428676605225, + "learning_rate": 3.6690000000000004e-06, + "loss": 0.3518, + "step": 1228 + }, + { + "epoch": 0.9680976762504924, + "grad_norm": 4.880057334899902, + "learning_rate": 3.6719999999999997e-06, + "loss": 0.3731, + "step": 1229 + }, + { + "epoch": 0.968885387948011, + "grad_norm": 3.1255228519439697, + "learning_rate": 3.675e-06, + "loss": 0.3683, + "step": 1230 + }, + { + "epoch": 0.9696730996455297, + "grad_norm": 3.32456636428833, + "learning_rate": 3.678e-06, + "loss": 0.3061, + "step": 1231 + }, + { + "epoch": 0.9704608113430484, + "grad_norm": 5.8004069328308105, + "learning_rate": 3.681e-06, + "loss": 0.3928, + "step": 1232 + }, + { + "epoch": 0.9712485230405672, + "grad_norm": 2.2535367012023926, + "learning_rate": 3.6840000000000002e-06, + "loss": 0.318, + "step": 1233 + }, + { + "epoch": 0.9720362347380859, + "grad_norm": 3.510406255722046, + "learning_rate": 3.687e-06, + "loss": 0.3348, + "step": 1234 + }, + { + "epoch": 0.9728239464356045, + "grad_norm": 1.9224129915237427, + "learning_rate": 3.6900000000000002e-06, + "loss": 0.3076, + "step": 1235 + }, + { + "epoch": 0.9736116581331232, + "grad_norm": 3.1807479858398438, + "learning_rate": 3.693e-06, + "loss": 0.3456, + "step": 1236 + }, + { + "epoch": 0.974399369830642, + "grad_norm": 3.2000956535339355, + "learning_rate": 3.6960000000000003e-06, + "loss": 0.3875, + "step": 1237 + }, + { + "epoch": 0.9751870815281607, + "grad_norm": 4.745882987976074, + "learning_rate": 3.6990000000000005e-06, + "loss": 0.3599, + "step": 1238 + }, + { + "epoch": 0.9759747932256794, + "grad_norm": 4.309238910675049, + 
"learning_rate": 3.702e-06, + "loss": 0.3918, + "step": 1239 + }, + { + "epoch": 0.9767625049231982, + "grad_norm": 4.068593978881836, + "learning_rate": 3.705e-06, + "loss": 0.3654, + "step": 1240 + }, + { + "epoch": 0.9775502166207168, + "grad_norm": 3.124537706375122, + "learning_rate": 3.7080000000000003e-06, + "loss": 0.3482, + "step": 1241 + }, + { + "epoch": 0.9783379283182355, + "grad_norm": 3.208033323287964, + "learning_rate": 3.711e-06, + "loss": 0.3725, + "step": 1242 + }, + { + "epoch": 0.9791256400157542, + "grad_norm": 3.003239870071411, + "learning_rate": 3.714e-06, + "loss": 0.3101, + "step": 1243 + }, + { + "epoch": 0.979913351713273, + "grad_norm": 13.011398315429688, + "learning_rate": 3.717e-06, + "loss": 0.3142, + "step": 1244 + }, + { + "epoch": 0.9807010634107917, + "grad_norm": 4.733878135681152, + "learning_rate": 3.72e-06, + "loss": 0.3456, + "step": 1245 + }, + { + "epoch": 0.9814887751083103, + "grad_norm": 3.7196409702301025, + "learning_rate": 3.723e-06, + "loss": 0.3859, + "step": 1246 + }, + { + "epoch": 0.982276486805829, + "grad_norm": 6.0371294021606445, + "learning_rate": 3.7260000000000004e-06, + "loss": 0.4415, + "step": 1247 + }, + { + "epoch": 0.9830641985033478, + "grad_norm": 3.4489998817443848, + "learning_rate": 3.7289999999999998e-06, + "loss": 0.3477, + "step": 1248 + }, + { + "epoch": 0.9838519102008665, + "grad_norm": 5.294630527496338, + "learning_rate": 3.732e-06, + "loss": 0.4137, + "step": 1249 + }, + { + "epoch": 0.9846396218983852, + "grad_norm": 3.080110549926758, + "learning_rate": 3.7350000000000002e-06, + "loss": 0.3911, + "step": 1250 + }, + { + "epoch": 0.985427333595904, + "grad_norm": 5.3841962814331055, + "learning_rate": 3.738e-06, + "loss": 1.0347, + "step": 1251 + }, + { + "epoch": 0.9862150452934226, + "grad_norm": 2.6730129718780518, + "learning_rate": 3.7410000000000003e-06, + "loss": 0.5279, + "step": 1252 + }, + { + "epoch": 0.9870027569909413, + "grad_norm": 2.3058762550354004, + "learning_rate": 3.744e-06, + "loss": 0.2745, + "step": 1253 + }, + { + "epoch": 0.98779046868846, + "grad_norm": 2.879927635192871, + "learning_rate": 3.747e-06, + "loss": 0.2406, + "step": 1254 + }, + { + "epoch": 0.9885781803859788, + "grad_norm": 2.425311803817749, + "learning_rate": 3.75e-06, + "loss": 0.2625, + "step": 1255 + }, + { + "epoch": 0.9893658920834975, + "grad_norm": 2.372466564178467, + "learning_rate": 3.753e-06, + "loss": 0.3108, + "step": 1256 + }, + { + "epoch": 0.9901536037810161, + "grad_norm": 3.010411262512207, + "learning_rate": 3.756e-06, + "loss": 0.2902, + "step": 1257 + }, + { + "epoch": 0.9909413154785348, + "grad_norm": 3.0602707862854004, + "learning_rate": 3.759e-06, + "loss": 0.3047, + "step": 1258 + }, + { + "epoch": 0.9917290271760536, + "grad_norm": 3.1091232299804688, + "learning_rate": 3.7620000000000006e-06, + "loss": 0.3093, + "step": 1259 + }, + { + "epoch": 0.9925167388735723, + "grad_norm": 6.6517157554626465, + "learning_rate": 3.765e-06, + "loss": 0.3034, + "step": 1260 + }, + { + "epoch": 0.993304450571091, + "grad_norm": 2.4148201942443848, + "learning_rate": 3.7679999999999998e-06, + "loss": 0.3529, + "step": 1261 + }, + { + "epoch": 0.9940921622686096, + "grad_norm": 3.7073416709899902, + "learning_rate": 3.7710000000000004e-06, + "loss": 0.324, + "step": 1262 + }, + { + "epoch": 0.9948798739661284, + "grad_norm": 4.360737323760986, + "learning_rate": 3.7739999999999998e-06, + "loss": 0.2868, + "step": 1263 + }, + { + "epoch": 0.9956675856636471, + "grad_norm": 4.828347206115723, + 
"learning_rate": 3.7770000000000004e-06, + "loss": 0.3578, + "step": 1264 + }, + { + "epoch": 0.9964552973611658, + "grad_norm": 3.4224038124084473, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.307, + "step": 1265 + }, + { + "epoch": 0.9972430090586846, + "grad_norm": 2.7103476524353027, + "learning_rate": 3.7829999999999996e-06, + "loss": 0.3204, + "step": 1266 + }, + { + "epoch": 0.9980307207562032, + "grad_norm": 2.2158963680267334, + "learning_rate": 3.7860000000000003e-06, + "loss": 0.2685, + "step": 1267 + }, + { + "epoch": 0.9988184324537219, + "grad_norm": 10.033968925476074, + "learning_rate": 3.789e-06, + "loss": 0.3249, + "step": 1268 + }, + { + "epoch": 0.9996061441512406, + "grad_norm": 3.2861170768737793, + "learning_rate": 3.7920000000000003e-06, + "loss": 0.3775, + "step": 1269 + }, + { + "epoch": 1.0, + "grad_norm": 3.7990927696228027, + "learning_rate": 3.795e-06, + "loss": 0.1639, + "step": 1270 + }, + { + "epoch": 1.0007877116975188, + "grad_norm": 5.074099063873291, + "learning_rate": 3.798e-06, + "loss": 1.1011, + "step": 1271 + }, + { + "epoch": 1.0015754233950374, + "grad_norm": 7.191810607910156, + "learning_rate": 3.801e-06, + "loss": 0.9229, + "step": 1272 + }, + { + "epoch": 1.0023631350925561, + "grad_norm": 3.9516053199768066, + "learning_rate": 3.804e-06, + "loss": 0.7964, + "step": 1273 + }, + { + "epoch": 1.003150846790075, + "grad_norm": 3.514815330505371, + "learning_rate": 3.8070000000000006e-06, + "loss": 0.6853, + "step": 1274 + }, + { + "epoch": 1.0039385584875935, + "grad_norm": 4.220038414001465, + "learning_rate": 3.81e-06, + "loss": 0.5452, + "step": 1275 + }, + { + "epoch": 1.0047262701851123, + "grad_norm": 3.2218754291534424, + "learning_rate": 3.8129999999999997e-06, + "loss": 0.5181, + "step": 1276 + }, + { + "epoch": 1.0055139818826309, + "grad_norm": 3.5068178176879883, + "learning_rate": 3.816e-06, + "loss": 0.3258, + "step": 1277 + }, + { + "epoch": 1.0063016935801496, + "grad_norm": 3.6774039268493652, + "learning_rate": 3.819e-06, + "loss": 0.3208, + "step": 1278 + }, + { + "epoch": 1.0070894052776684, + "grad_norm": 2.5485074520111084, + "learning_rate": 3.822000000000001e-06, + "loss": 0.3461, + "step": 1279 + }, + { + "epoch": 1.007877116975187, + "grad_norm": 3.5807623863220215, + "learning_rate": 3.825e-06, + "loss": 0.2671, + "step": 1280 + }, + { + "epoch": 1.0086648286727058, + "grad_norm": 3.782395839691162, + "learning_rate": 3.828e-06, + "loss": 0.2717, + "step": 1281 + }, + { + "epoch": 1.0094525403702246, + "grad_norm": 2.7233588695526123, + "learning_rate": 3.831e-06, + "loss": 0.3083, + "step": 1282 + }, + { + "epoch": 1.0102402520677431, + "grad_norm": 4.000791072845459, + "learning_rate": 3.834e-06, + "loss": 0.2735, + "step": 1283 + }, + { + "epoch": 1.011027963765262, + "grad_norm": 2.466639995574951, + "learning_rate": 3.837000000000001e-06, + "loss": 0.2986, + "step": 1284 + }, + { + "epoch": 1.0118156754627807, + "grad_norm": 6.511389255523682, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.3101, + "step": 1285 + }, + { + "epoch": 1.0126033871602993, + "grad_norm": 2.5770514011383057, + "learning_rate": 3.8429999999999995e-06, + "loss": 0.2505, + "step": 1286 + }, + { + "epoch": 1.013391098857818, + "grad_norm": 2.2380118370056152, + "learning_rate": 3.846e-06, + "loss": 0.259, + "step": 1287 + }, + { + "epoch": 1.0141788105553367, + "grad_norm": 2.917198896408081, + "learning_rate": 3.849e-06, + "loss": 0.2534, + "step": 1288 + }, + { + "epoch": 1.0149665222528554, + "grad_norm": 
2.2656519412994385, + "learning_rate": 3.852e-06, + "loss": 0.2545, + "step": 1289 + }, + { + "epoch": 1.0157542339503742, + "grad_norm": 2.572082757949829, + "learning_rate": 3.855e-06, + "loss": 0.2767, + "step": 1290 + }, + { + "epoch": 1.0165419456478928, + "grad_norm": 2.0214669704437256, + "learning_rate": 3.858e-06, + "loss": 0.2647, + "step": 1291 + }, + { + "epoch": 1.0173296573454116, + "grad_norm": 2.2449562549591064, + "learning_rate": 3.861e-06, + "loss": 0.2411, + "step": 1292 + }, + { + "epoch": 1.0181173690429304, + "grad_norm": 2.1194849014282227, + "learning_rate": 3.864e-06, + "loss": 0.2576, + "step": 1293 + }, + { + "epoch": 1.018905080740449, + "grad_norm": 2.1098968982696533, + "learning_rate": 3.8669999999999996e-06, + "loss": 0.2547, + "step": 1294 + }, + { + "epoch": 1.0196927924379677, + "grad_norm": 2.6017301082611084, + "learning_rate": 3.87e-06, + "loss": 0.254, + "step": 1295 + }, + { + "epoch": 1.0204805041354863, + "grad_norm": 4.482944965362549, + "learning_rate": 3.873e-06, + "loss": 0.3331, + "step": 1296 + }, + { + "epoch": 1.021268215833005, + "grad_norm": 7.798954963684082, + "learning_rate": 3.876000000000001e-06, + "loss": 0.2408, + "step": 1297 + }, + { + "epoch": 1.0220559275305239, + "grad_norm": 2.632870674133301, + "learning_rate": 3.8790000000000005e-06, + "loss": 0.3011, + "step": 1298 + }, + { + "epoch": 1.0228436392280424, + "grad_norm": 2.679248571395874, + "learning_rate": 3.8819999999999994e-06, + "loss": 0.3029, + "step": 1299 + }, + { + "epoch": 1.0236313509255612, + "grad_norm": 4.549932479858398, + "learning_rate": 3.885e-06, + "loss": 0.2807, + "step": 1300 + }, + { + "epoch": 1.02441906262308, + "grad_norm": 2.839916944503784, + "learning_rate": 3.888e-06, + "loss": 0.2204, + "step": 1301 + }, + { + "epoch": 1.0252067743205986, + "grad_norm": 6.154565811157227, + "learning_rate": 3.8910000000000005e-06, + "loss": 0.2825, + "step": 1302 + }, + { + "epoch": 1.0259944860181174, + "grad_norm": 3.1043379306793213, + "learning_rate": 3.894e-06, + "loss": 0.2611, + "step": 1303 + }, + { + "epoch": 1.0267821977156362, + "grad_norm": 3.4997217655181885, + "learning_rate": 3.897e-06, + "loss": 0.3415, + "step": 1304 + }, + { + "epoch": 1.0275699094131547, + "grad_norm": 3.0458970069885254, + "learning_rate": 3.9e-06, + "loss": 0.2799, + "step": 1305 + }, + { + "epoch": 1.0283576211106735, + "grad_norm": 6.000100612640381, + "learning_rate": 3.903e-06, + "loss": 0.2506, + "step": 1306 + }, + { + "epoch": 1.029145332808192, + "grad_norm": 3.340606451034546, + "learning_rate": 3.906e-06, + "loss": 0.3162, + "step": 1307 + }, + { + "epoch": 1.0299330445057109, + "grad_norm": 4.020846843719482, + "learning_rate": 3.909e-06, + "loss": 0.3353, + "step": 1308 + }, + { + "epoch": 1.0307207562032297, + "grad_norm": 2.162748336791992, + "learning_rate": 3.912e-06, + "loss": 0.2822, + "step": 1309 + }, + { + "epoch": 1.0315084679007482, + "grad_norm": 2.462007761001587, + "learning_rate": 3.915000000000001e-06, + "loss": 0.2844, + "step": 1310 + }, + { + "epoch": 1.032296179598267, + "grad_norm": 2.053989887237549, + "learning_rate": 3.918e-06, + "loss": 0.2903, + "step": 1311 + }, + { + "epoch": 1.0330838912957858, + "grad_norm": 4.402164459228516, + "learning_rate": 3.921e-06, + "loss": 0.3009, + "step": 1312 + }, + { + "epoch": 1.0338716029933044, + "grad_norm": 2.4374823570251465, + "learning_rate": 3.924e-06, + "loss": 0.3609, + "step": 1313 + }, + { + "epoch": 1.0346593146908232, + "grad_norm": 13.578911781311035, + "learning_rate": 3.927e-06, + 
"loss": 0.3242, + "step": 1314 + }, + { + "epoch": 1.035447026388342, + "grad_norm": 2.7982418537139893, + "learning_rate": 3.9300000000000005e-06, + "loss": 0.2616, + "step": 1315 + }, + { + "epoch": 1.0362347380858605, + "grad_norm": 2.6825201511383057, + "learning_rate": 3.933e-06, + "loss": 0.2613, + "step": 1316 + }, + { + "epoch": 1.0370224497833793, + "grad_norm": 2.9255318641662598, + "learning_rate": 3.936e-06, + "loss": 0.3234, + "step": 1317 + }, + { + "epoch": 1.0378101614808979, + "grad_norm": 2.5087146759033203, + "learning_rate": 3.939e-06, + "loss": 0.2986, + "step": 1318 + }, + { + "epoch": 1.0385978731784167, + "grad_norm": 3.330782651901245, + "learning_rate": 3.942e-06, + "loss": 0.3506, + "step": 1319 + }, + { + "epoch": 1.0393855848759355, + "grad_norm": 2.6149423122406006, + "learning_rate": 3.945e-06, + "loss": 0.3507, + "step": 1320 + }, + { + "epoch": 1.040173296573454, + "grad_norm": 5.548201084136963, + "learning_rate": 3.948e-06, + "loss": 1.1716, + "step": 1321 + }, + { + "epoch": 1.0409610082709728, + "grad_norm": 2.8996071815490723, + "learning_rate": 3.951000000000001e-06, + "loss": 0.7794, + "step": 1322 + }, + { + "epoch": 1.0417487199684916, + "grad_norm": 4.579773426055908, + "learning_rate": 3.954e-06, + "loss": 0.77, + "step": 1323 + }, + { + "epoch": 1.0425364316660102, + "grad_norm": 15.916885375976562, + "learning_rate": 3.9569999999999996e-06, + "loss": 0.6484, + "step": 1324 + }, + { + "epoch": 1.043324143363529, + "grad_norm": 3.112560749053955, + "learning_rate": 3.96e-06, + "loss": 0.4904, + "step": 1325 + }, + { + "epoch": 1.0441118550610478, + "grad_norm": 3.4563841819763184, + "learning_rate": 3.963e-06, + "loss": 0.3006, + "step": 1326 + }, + { + "epoch": 1.0448995667585663, + "grad_norm": 3.6241471767425537, + "learning_rate": 3.966000000000001e-06, + "loss": 0.3964, + "step": 1327 + }, + { + "epoch": 1.0456872784560851, + "grad_norm": 2.649308204650879, + "learning_rate": 3.9690000000000005e-06, + "loss": 0.2769, + "step": 1328 + }, + { + "epoch": 1.0464749901536037, + "grad_norm": 2.5044238567352295, + "learning_rate": 3.971999999999999e-06, + "loss": 0.2729, + "step": 1329 + }, + { + "epoch": 1.0472627018511225, + "grad_norm": 2.265953540802002, + "learning_rate": 3.975e-06, + "loss": 0.2752, + "step": 1330 + }, + { + "epoch": 1.0480504135486413, + "grad_norm": 2.349790334701538, + "learning_rate": 3.978e-06, + "loss": 0.2775, + "step": 1331 + }, + { + "epoch": 1.0488381252461598, + "grad_norm": 2.6589767932891846, + "learning_rate": 3.9810000000000005e-06, + "loss": 0.2729, + "step": 1332 + }, + { + "epoch": 1.0496258369436786, + "grad_norm": 2.6547484397888184, + "learning_rate": 3.984e-06, + "loss": 0.2582, + "step": 1333 + }, + { + "epoch": 1.0504135486411974, + "grad_norm": 2.23648738861084, + "learning_rate": 3.987e-06, + "loss": 0.2366, + "step": 1334 + }, + { + "epoch": 1.051201260338716, + "grad_norm": 3.132201671600342, + "learning_rate": 3.99e-06, + "loss": 0.2613, + "step": 1335 + }, + { + "epoch": 1.0519889720362348, + "grad_norm": 2.2267916202545166, + "learning_rate": 3.993e-06, + "loss": 0.2384, + "step": 1336 + }, + { + "epoch": 1.0527766837337535, + "grad_norm": 2.4653968811035156, + "learning_rate": 3.996e-06, + "loss": 0.2248, + "step": 1337 + }, + { + "epoch": 1.0535643954312721, + "grad_norm": 2.1855313777923584, + "learning_rate": 3.999e-06, + "loss": 0.2372, + "step": 1338 + }, + { + "epoch": 1.054352107128791, + "grad_norm": 2.0443639755249023, + "learning_rate": 4.002e-06, + "loss": 0.2174, + "step": 1339 + }, 
+ { + "epoch": 1.0551398188263095, + "grad_norm": 4.832343578338623, + "learning_rate": 4.005000000000001e-06, + "loss": 0.244, + "step": 1340 + }, + { + "epoch": 1.0559275305238283, + "grad_norm": 2.6075847148895264, + "learning_rate": 4.008e-06, + "loss": 0.2788, + "step": 1341 + }, + { + "epoch": 1.056715242221347, + "grad_norm": 2.276372194290161, + "learning_rate": 4.011e-06, + "loss": 0.215, + "step": 1342 + }, + { + "epoch": 1.0575029539188656, + "grad_norm": 1.9335260391235352, + "learning_rate": 4.014e-06, + "loss": 0.2299, + "step": 1343 + }, + { + "epoch": 1.0582906656163844, + "grad_norm": 2.5269041061401367, + "learning_rate": 4.017e-06, + "loss": 0.2886, + "step": 1344 + }, + { + "epoch": 1.0590783773139032, + "grad_norm": 2.701178789138794, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.2604, + "step": 1345 + }, + { + "epoch": 1.0598660890114218, + "grad_norm": 6.467953681945801, + "learning_rate": 4.023e-06, + "loss": 0.2416, + "step": 1346 + }, + { + "epoch": 1.0606538007089406, + "grad_norm": 3.2166693210601807, + "learning_rate": 4.026000000000001e-06, + "loss": 0.2626, + "step": 1347 + }, + { + "epoch": 1.0614415124064593, + "grad_norm": 2.213616132736206, + "learning_rate": 4.029e-06, + "loss": 0.2087, + "step": 1348 + }, + { + "epoch": 1.062229224103978, + "grad_norm": 5.1791582107543945, + "learning_rate": 4.032e-06, + "loss": 0.2868, + "step": 1349 + }, + { + "epoch": 1.0630169358014967, + "grad_norm": 2.0441837310791016, + "learning_rate": 4.035e-06, + "loss": 0.2127, + "step": 1350 + }, + { + "epoch": 1.0638046474990153, + "grad_norm": 2.156541347503662, + "learning_rate": 4.038e-06, + "loss": 0.2449, + "step": 1351 + }, + { + "epoch": 1.064592359196534, + "grad_norm": 2.4919064044952393, + "learning_rate": 4.041e-06, + "loss": 0.2573, + "step": 1352 + }, + { + "epoch": 1.0653800708940528, + "grad_norm": 22.387035369873047, + "learning_rate": 4.044000000000001e-06, + "loss": 0.2879, + "step": 1353 + }, + { + "epoch": 1.0661677825915714, + "grad_norm": 2.976672410964966, + "learning_rate": 4.0469999999999995e-06, + "loss": 0.2356, + "step": 1354 + }, + { + "epoch": 1.0669554942890902, + "grad_norm": 3.1048190593719482, + "learning_rate": 4.05e-06, + "loss": 0.2913, + "step": 1355 + }, + { + "epoch": 1.067743205986609, + "grad_norm": 4.051537990570068, + "learning_rate": 4.053e-06, + "loss": 0.2106, + "step": 1356 + }, + { + "epoch": 1.0685309176841276, + "grad_norm": 3.4649832248687744, + "learning_rate": 4.056e-06, + "loss": 0.1842, + "step": 1357 + }, + { + "epoch": 1.0693186293816463, + "grad_norm": 2.642807722091675, + "learning_rate": 4.0590000000000004e-06, + "loss": 0.2627, + "step": 1358 + }, + { + "epoch": 1.070106341079165, + "grad_norm": 2.6349518299102783, + "learning_rate": 4.062e-06, + "loss": 0.2597, + "step": 1359 + }, + { + "epoch": 1.0708940527766837, + "grad_norm": 2.8392624855041504, + "learning_rate": 4.065e-06, + "loss": 0.2501, + "step": 1360 + }, + { + "epoch": 1.0716817644742025, + "grad_norm": 2.953023672103882, + "learning_rate": 4.068e-06, + "loss": 0.2725, + "step": 1361 + }, + { + "epoch": 1.072469476171721, + "grad_norm": 2.525111675262451, + "learning_rate": 4.071e-06, + "loss": 0.2718, + "step": 1362 + }, + { + "epoch": 1.0732571878692398, + "grad_norm": 3.580592632293701, + "learning_rate": 4.074e-06, + "loss": 0.2703, + "step": 1363 + }, + { + "epoch": 1.0740448995667586, + "grad_norm": 2.3162922859191895, + "learning_rate": 4.077e-06, + "loss": 0.2434, + "step": 1364 + }, + { + "epoch": 1.0748326112642772, + "grad_norm": 
3.014270782470703, + "learning_rate": 4.080000000000001e-06, + "loss": 0.2931, + "step": 1365 + }, + { + "epoch": 1.075620322961796, + "grad_norm": 1.8016266822814941, + "learning_rate": 4.083e-06, + "loss": 0.1787, + "step": 1366 + }, + { + "epoch": 1.0764080346593148, + "grad_norm": 3.4548041820526123, + "learning_rate": 4.0859999999999995e-06, + "loss": 0.287, + "step": 1367 + }, + { + "epoch": 1.0771957463568334, + "grad_norm": 2.667912006378174, + "learning_rate": 4.089e-06, + "loss": 0.2602, + "step": 1368 + }, + { + "epoch": 1.0779834580543521, + "grad_norm": 2.7605912685394287, + "learning_rate": 4.092e-06, + "loss": 0.2364, + "step": 1369 + }, + { + "epoch": 1.078771169751871, + "grad_norm": 2.806605100631714, + "learning_rate": 4.095000000000001e-06, + "loss": 0.2848, + "step": 1370 + }, + { + "epoch": 1.0795588814493895, + "grad_norm": 4.618386745452881, + "learning_rate": 4.098e-06, + "loss": 0.8818, + "step": 1371 + }, + { + "epoch": 1.0803465931469083, + "grad_norm": 3.051034927368164, + "learning_rate": 4.100999999999999e-06, + "loss": 0.7928, + "step": 1372 + }, + { + "epoch": 1.0811343048444269, + "grad_norm": 2.380384683609009, + "learning_rate": 4.104e-06, + "loss": 0.6781, + "step": 1373 + }, + { + "epoch": 1.0819220165419456, + "grad_norm": 2.7943544387817383, + "learning_rate": 4.107e-06, + "loss": 0.6152, + "step": 1374 + }, + { + "epoch": 1.0827097282394644, + "grad_norm": 2.290545701980591, + "learning_rate": 4.1100000000000005e-06, + "loss": 0.4449, + "step": 1375 + }, + { + "epoch": 1.083497439936983, + "grad_norm": 3.236984968185425, + "learning_rate": 4.113e-06, + "loss": 0.3505, + "step": 1376 + }, + { + "epoch": 1.0842851516345018, + "grad_norm": 2.6649398803710938, + "learning_rate": 4.116e-06, + "loss": 0.3081, + "step": 1377 + }, + { + "epoch": 1.0850728633320206, + "grad_norm": 1.974839448928833, + "learning_rate": 4.119e-06, + "loss": 0.2553, + "step": 1378 + }, + { + "epoch": 1.0858605750295391, + "grad_norm": 2.6082816123962402, + "learning_rate": 4.122e-06, + "loss": 0.2693, + "step": 1379 + }, + { + "epoch": 1.086648286727058, + "grad_norm": 3.1818864345550537, + "learning_rate": 4.125e-06, + "loss": 0.2775, + "step": 1380 + }, + { + "epoch": 1.0874359984245765, + "grad_norm": 2.0848639011383057, + "learning_rate": 4.128e-06, + "loss": 0.2157, + "step": 1381 + }, + { + "epoch": 1.0882237101220953, + "grad_norm": 2.892906427383423, + "learning_rate": 4.131e-06, + "loss": 0.2238, + "step": 1382 + }, + { + "epoch": 1.089011421819614, + "grad_norm": 4.157930850982666, + "learning_rate": 4.1340000000000006e-06, + "loss": 0.2257, + "step": 1383 + }, + { + "epoch": 1.0897991335171326, + "grad_norm": 1.9442901611328125, + "learning_rate": 4.137e-06, + "loss": 0.2102, + "step": 1384 + }, + { + "epoch": 1.0905868452146514, + "grad_norm": 3.463264226913452, + "learning_rate": 4.14e-06, + "loss": 0.2621, + "step": 1385 + }, + { + "epoch": 1.0913745569121702, + "grad_norm": 1.9521247148513794, + "learning_rate": 4.143e-06, + "loss": 0.2439, + "step": 1386 + }, + { + "epoch": 1.0921622686096888, + "grad_norm": 1.9640698432922363, + "learning_rate": 4.146e-06, + "loss": 0.2389, + "step": 1387 + }, + { + "epoch": 1.0929499803072076, + "grad_norm": 3.201626777648926, + "learning_rate": 4.1490000000000004e-06, + "loss": 0.2089, + "step": 1388 + }, + { + "epoch": 1.0937376920047264, + "grad_norm": 2.2283473014831543, + "learning_rate": 4.152e-06, + "loss": 0.2162, + "step": 1389 + }, + { + "epoch": 1.094525403702245, + "grad_norm": 2.135795831680298, + "learning_rate": 
4.155000000000001e-06, + "loss": 0.2335, + "step": 1390 + }, + { + "epoch": 1.0953131153997637, + "grad_norm": 1.859785556793213, + "learning_rate": 4.158e-06, + "loss": 0.1748, + "step": 1391 + }, + { + "epoch": 1.0961008270972823, + "grad_norm": 2.338768243789673, + "learning_rate": 4.161e-06, + "loss": 0.2146, + "step": 1392 + }, + { + "epoch": 1.096888538794801, + "grad_norm": 2.7761342525482178, + "learning_rate": 4.164e-06, + "loss": 0.2147, + "step": 1393 + }, + { + "epoch": 1.0976762504923199, + "grad_norm": 3.5589942932128906, + "learning_rate": 4.167e-06, + "loss": 0.2129, + "step": 1394 + }, + { + "epoch": 1.0984639621898384, + "grad_norm": 2.25223970413208, + "learning_rate": 4.170000000000001e-06, + "loss": 0.2155, + "step": 1395 + }, + { + "epoch": 1.0992516738873572, + "grad_norm": 2.8203892707824707, + "learning_rate": 4.1730000000000005e-06, + "loss": 0.218, + "step": 1396 + }, + { + "epoch": 1.100039385584876, + "grad_norm": 1.7710007429122925, + "learning_rate": 4.1759999999999995e-06, + "loss": 0.2172, + "step": 1397 + }, + { + "epoch": 1.1008270972823946, + "grad_norm": 2.285829782485962, + "learning_rate": 4.179e-06, + "loss": 0.2001, + "step": 1398 + }, + { + "epoch": 1.1016148089799134, + "grad_norm": 3.2490155696868896, + "learning_rate": 4.182e-06, + "loss": 0.2336, + "step": 1399 + }, + { + "epoch": 1.1024025206774322, + "grad_norm": 2.5338916778564453, + "learning_rate": 4.185000000000001e-06, + "loss": 0.1963, + "step": 1400 + }, + { + "epoch": 1.1031902323749507, + "grad_norm": 2.219653367996216, + "learning_rate": 4.188e-06, + "loss": 0.2304, + "step": 1401 + }, + { + "epoch": 1.1039779440724695, + "grad_norm": 2.3846399784088135, + "learning_rate": 4.191e-06, + "loss": 0.1924, + "step": 1402 + }, + { + "epoch": 1.104765655769988, + "grad_norm": 2.2359139919281006, + "learning_rate": 4.194e-06, + "loss": 0.2079, + "step": 1403 + }, + { + "epoch": 1.1055533674675069, + "grad_norm": 2.4560906887054443, + "learning_rate": 4.197e-06, + "loss": 0.2213, + "step": 1404 + }, + { + "epoch": 1.1063410791650257, + "grad_norm": 4.30728816986084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.241, + "step": 1405 + }, + { + "epoch": 1.1071287908625442, + "grad_norm": 3.2709856033325195, + "learning_rate": 4.203e-06, + "loss": 0.2033, + "step": 1406 + }, + { + "epoch": 1.107916502560063, + "grad_norm": 1.785904049873352, + "learning_rate": 4.206e-06, + "loss": 0.2201, + "step": 1407 + }, + { + "epoch": 1.1087042142575818, + "grad_norm": 2.0153253078460693, + "learning_rate": 4.209000000000001e-06, + "loss": 0.2197, + "step": 1408 + }, + { + "epoch": 1.1094919259551004, + "grad_norm": 2.168145179748535, + "learning_rate": 4.212e-06, + "loss": 0.2181, + "step": 1409 + }, + { + "epoch": 1.1102796376526192, + "grad_norm": 3.1822550296783447, + "learning_rate": 4.215e-06, + "loss": 0.2463, + "step": 1410 + }, + { + "epoch": 1.111067349350138, + "grad_norm": 2.4834699630737305, + "learning_rate": 4.218e-06, + "loss": 0.2653, + "step": 1411 + }, + { + "epoch": 1.1118550610476565, + "grad_norm": 2.636509418487549, + "learning_rate": 4.221e-06, + "loss": 0.2392, + "step": 1412 + }, + { + "epoch": 1.1126427727451753, + "grad_norm": 2.2220757007598877, + "learning_rate": 4.2240000000000006e-06, + "loss": 0.2079, + "step": 1413 + }, + { + "epoch": 1.1134304844426939, + "grad_norm": 2.0934243202209473, + "learning_rate": 4.227e-06, + "loss": 0.2401, + "step": 1414 + }, + { + "epoch": 1.1142181961402127, + "grad_norm": 2.0329339504241943, + "learning_rate": 4.229999999999999e-06, 
+ "loss": 0.2383, + "step": 1415 + }, + { + "epoch": 1.1150059078377315, + "grad_norm": 2.1930184364318848, + "learning_rate": 4.233e-06, + "loss": 0.2188, + "step": 1416 + }, + { + "epoch": 1.11579361953525, + "grad_norm": 2.4248173236846924, + "learning_rate": 4.236e-06, + "loss": 0.2135, + "step": 1417 + }, + { + "epoch": 1.1165813312327688, + "grad_norm": 2.631425142288208, + "learning_rate": 4.239e-06, + "loss": 0.2497, + "step": 1418 + }, + { + "epoch": 1.1173690429302876, + "grad_norm": 4.472067356109619, + "learning_rate": 4.242e-06, + "loss": 0.2763, + "step": 1419 + }, + { + "epoch": 1.1181567546278062, + "grad_norm": 2.863323450088501, + "learning_rate": 4.245e-06, + "loss": 0.3307, + "step": 1420 + }, + { + "epoch": 1.118944466325325, + "grad_norm": 31.991539001464844, + "learning_rate": 4.248e-06, + "loss": 0.8352, + "step": 1421 + }, + { + "epoch": 1.1197321780228435, + "grad_norm": 4.249553680419922, + "learning_rate": 4.251e-06, + "loss": 0.8411, + "step": 1422 + }, + { + "epoch": 1.1205198897203623, + "grad_norm": 3.1861417293548584, + "learning_rate": 4.254e-06, + "loss": 0.7318, + "step": 1423 + }, + { + "epoch": 1.121307601417881, + "grad_norm": 5.0721435546875, + "learning_rate": 4.257e-06, + "loss": 0.5376, + "step": 1424 + }, + { + "epoch": 1.1220953131153997, + "grad_norm": 2.016747236251831, + "learning_rate": 4.26e-06, + "loss": 0.5568, + "step": 1425 + }, + { + "epoch": 1.1228830248129185, + "grad_norm": 3.5421297550201416, + "learning_rate": 4.2630000000000005e-06, + "loss": 0.4758, + "step": 1426 + }, + { + "epoch": 1.1236707365104373, + "grad_norm": 2.6883559226989746, + "learning_rate": 4.266e-06, + "loss": 0.3671, + "step": 1427 + }, + { + "epoch": 1.1244584482079558, + "grad_norm": 2.5775961875915527, + "learning_rate": 4.269e-06, + "loss": 0.2498, + "step": 1428 + }, + { + "epoch": 1.1252461599054746, + "grad_norm": 1.7765326499938965, + "learning_rate": 4.272e-06, + "loss": 0.2552, + "step": 1429 + }, + { + "epoch": 1.1260338716029934, + "grad_norm": 1.9387611150741577, + "learning_rate": 4.275e-06, + "loss": 0.1937, + "step": 1430 + }, + { + "epoch": 1.126821583300512, + "grad_norm": 2.491227388381958, + "learning_rate": 4.278e-06, + "loss": 0.2063, + "step": 1431 + }, + { + "epoch": 1.1276092949980308, + "grad_norm": 1.8124139308929443, + "learning_rate": 4.281e-06, + "loss": 0.2, + "step": 1432 + }, + { + "epoch": 1.1283970066955495, + "grad_norm": 1.6965700387954712, + "learning_rate": 4.284000000000001e-06, + "loss": 0.2214, + "step": 1433 + }, + { + "epoch": 1.129184718393068, + "grad_norm": 1.86831533908844, + "learning_rate": 4.287e-06, + "loss": 0.2321, + "step": 1434 + }, + { + "epoch": 1.129972430090587, + "grad_norm": 1.9035329818725586, + "learning_rate": 4.29e-06, + "loss": 0.176, + "step": 1435 + }, + { + "epoch": 1.1307601417881055, + "grad_norm": 2.026144504547119, + "learning_rate": 4.293e-06, + "loss": 0.1969, + "step": 1436 + }, + { + "epoch": 1.1315478534856243, + "grad_norm": 2.4393186569213867, + "learning_rate": 4.296e-06, + "loss": 0.2275, + "step": 1437 + }, + { + "epoch": 1.132335565183143, + "grad_norm": 2.1127843856811523, + "learning_rate": 4.299000000000001e-06, + "loss": 0.197, + "step": 1438 + }, + { + "epoch": 1.1331232768806616, + "grad_norm": 2.3346011638641357, + "learning_rate": 4.3020000000000005e-06, + "loss": 0.2416, + "step": 1439 + }, + { + "epoch": 1.1339109885781804, + "grad_norm": 1.8091472387313843, + "learning_rate": 4.3049999999999994e-06, + "loss": 0.1884, + "step": 1440 + }, + { + "epoch": 
1.1346987002756992, + "grad_norm": 2.442594051361084, + "learning_rate": 4.308e-06, + "loss": 0.202, + "step": 1441 + }, + { + "epoch": 1.1354864119732178, + "grad_norm": 2.612393856048584, + "learning_rate": 4.311e-06, + "loss": 0.2051, + "step": 1442 + }, + { + "epoch": 1.1362741236707365, + "grad_norm": 2.063568353652954, + "learning_rate": 4.3140000000000005e-06, + "loss": 0.2392, + "step": 1443 + }, + { + "epoch": 1.1370618353682551, + "grad_norm": 2.542156457901001, + "learning_rate": 4.317e-06, + "loss": 0.1873, + "step": 1444 + }, + { + "epoch": 1.137849547065774, + "grad_norm": 2.9491281509399414, + "learning_rate": 4.32e-06, + "loss": 0.231, + "step": 1445 + }, + { + "epoch": 1.1386372587632927, + "grad_norm": 1.907684564590454, + "learning_rate": 4.323e-06, + "loss": 0.2261, + "step": 1446 + }, + { + "epoch": 1.1394249704608113, + "grad_norm": 2.1491286754608154, + "learning_rate": 4.326e-06, + "loss": 0.1729, + "step": 1447 + }, + { + "epoch": 1.14021268215833, + "grad_norm": 2.2964494228363037, + "learning_rate": 4.329e-06, + "loss": 0.1975, + "step": 1448 + }, + { + "epoch": 1.1410003938558488, + "grad_norm": 1.9837255477905273, + "learning_rate": 4.332e-06, + "loss": 0.2054, + "step": 1449 + }, + { + "epoch": 1.1417881055533674, + "grad_norm": 3.2300140857696533, + "learning_rate": 4.335e-06, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 1.1425758172508862, + "grad_norm": 2.5738866329193115, + "learning_rate": 4.338000000000001e-06, + "loss": 0.2316, + "step": 1451 + }, + { + "epoch": 1.143363528948405, + "grad_norm": 2.156644821166992, + "learning_rate": 4.341e-06, + "loss": 0.1967, + "step": 1452 + }, + { + "epoch": 1.1441512406459236, + "grad_norm": 2.1813395023345947, + "learning_rate": 4.344e-06, + "loss": 0.1844, + "step": 1453 + }, + { + "epoch": 1.1449389523434423, + "grad_norm": 2.002578020095825, + "learning_rate": 4.347e-06, + "loss": 0.1977, + "step": 1454 + }, + { + "epoch": 1.1457266640409611, + "grad_norm": 2.0458996295928955, + "learning_rate": 4.35e-06, + "loss": 0.2184, + "step": 1455 + }, + { + "epoch": 1.1465143757384797, + "grad_norm": 2.675661325454712, + "learning_rate": 4.3530000000000005e-06, + "loss": 0.1963, + "step": 1456 + }, + { + "epoch": 1.1473020874359985, + "grad_norm": 2.153312921524048, + "learning_rate": 4.356e-06, + "loss": 0.2133, + "step": 1457 + }, + { + "epoch": 1.148089799133517, + "grad_norm": 2.052642822265625, + "learning_rate": 4.359e-06, + "loss": 0.2105, + "step": 1458 + }, + { + "epoch": 1.1488775108310358, + "grad_norm": 2.708080768585205, + "learning_rate": 4.362e-06, + "loss": 0.2706, + "step": 1459 + }, + { + "epoch": 1.1496652225285546, + "grad_norm": 3.6518948078155518, + "learning_rate": 4.365e-06, + "loss": 0.1998, + "step": 1460 + }, + { + "epoch": 1.1504529342260732, + "grad_norm": 3.3163983821868896, + "learning_rate": 4.368e-06, + "loss": 0.2278, + "step": 1461 + }, + { + "epoch": 1.151240645923592, + "grad_norm": 2.2107479572296143, + "learning_rate": 4.371e-06, + "loss": 0.2216, + "step": 1462 + }, + { + "epoch": 1.1520283576211106, + "grad_norm": 2.9090657234191895, + "learning_rate": 4.374000000000001e-06, + "loss": 0.1937, + "step": 1463 + }, + { + "epoch": 1.1528160693186293, + "grad_norm": 2.413884401321411, + "learning_rate": 4.377e-06, + "loss": 0.2186, + "step": 1464 + }, + { + "epoch": 1.1536037810161481, + "grad_norm": 3.3850107192993164, + "learning_rate": 4.3799999999999996e-06, + "loss": 0.2305, + "step": 1465 + }, + { + "epoch": 1.1543914927136667, + "grad_norm": 2.57000470161438, + 
"learning_rate": 4.383e-06, + "loss": 0.2704, + "step": 1466 + }, + { + "epoch": 1.1551792044111855, + "grad_norm": 6.091724872589111, + "learning_rate": 4.386e-06, + "loss": 0.217, + "step": 1467 + }, + { + "epoch": 1.1559669161087043, + "grad_norm": 2.978273630142212, + "learning_rate": 4.389000000000001e-06, + "loss": 0.2785, + "step": 1468 + }, + { + "epoch": 1.1567546278062228, + "grad_norm": 3.843172073364258, + "learning_rate": 4.3920000000000005e-06, + "loss": 0.2848, + "step": 1469 + }, + { + "epoch": 1.1575423395037416, + "grad_norm": 3.0450901985168457, + "learning_rate": 4.395e-06, + "loss": 0.2689, + "step": 1470 + }, + { + "epoch": 1.1583300512012604, + "grad_norm": 3.5387210845947266, + "learning_rate": 4.398e-06, + "loss": 0.8222, + "step": 1471 + }, + { + "epoch": 1.159117762898779, + "grad_norm": 4.299854755401611, + "learning_rate": 4.401e-06, + "loss": 0.7792, + "step": 1472 + }, + { + "epoch": 1.1599054745962978, + "grad_norm": 2.6716957092285156, + "learning_rate": 4.4040000000000005e-06, + "loss": 0.6196, + "step": 1473 + }, + { + "epoch": 1.1606931862938166, + "grad_norm": 2.9781932830810547, + "learning_rate": 4.407e-06, + "loss": 0.5405, + "step": 1474 + }, + { + "epoch": 1.1614808979913351, + "grad_norm": 2.7363336086273193, + "learning_rate": 4.41e-06, + "loss": 0.3827, + "step": 1475 + }, + { + "epoch": 1.162268609688854, + "grad_norm": 2.6103837490081787, + "learning_rate": 4.413000000000001e-06, + "loss": 0.4202, + "step": 1476 + }, + { + "epoch": 1.1630563213863725, + "grad_norm": 1.7130485773086548, + "learning_rate": 4.416e-06, + "loss": 0.2551, + "step": 1477 + }, + { + "epoch": 1.1638440330838913, + "grad_norm": 2.501558780670166, + "learning_rate": 4.4189999999999995e-06, + "loss": 0.2351, + "step": 1478 + }, + { + "epoch": 1.16463174478141, + "grad_norm": 1.8153505325317383, + "learning_rate": 4.422e-06, + "loss": 0.1726, + "step": 1479 + }, + { + "epoch": 1.1654194564789286, + "grad_norm": 1.9095743894577026, + "learning_rate": 4.425e-06, + "loss": 0.1849, + "step": 1480 + }, + { + "epoch": 1.1662071681764474, + "grad_norm": 1.6664682626724243, + "learning_rate": 4.428000000000001e-06, + "loss": 0.1875, + "step": 1481 + }, + { + "epoch": 1.1669948798739662, + "grad_norm": 1.7621850967407227, + "learning_rate": 4.4310000000000004e-06, + "loss": 0.2056, + "step": 1482 + }, + { + "epoch": 1.1677825915714848, + "grad_norm": 1.7339856624603271, + "learning_rate": 4.433999999999999e-06, + "loss": 0.1838, + "step": 1483 + }, + { + "epoch": 1.1685703032690036, + "grad_norm": 4.216865062713623, + "learning_rate": 4.437e-06, + "loss": 0.1857, + "step": 1484 + }, + { + "epoch": 1.1693580149665221, + "grad_norm": 6.512782573699951, + "learning_rate": 4.44e-06, + "loss": 0.2035, + "step": 1485 + }, + { + "epoch": 1.170145726664041, + "grad_norm": 2.7652268409729004, + "learning_rate": 4.4430000000000005e-06, + "loss": 0.1921, + "step": 1486 + }, + { + "epoch": 1.1709334383615597, + "grad_norm": 1.8875045776367188, + "learning_rate": 4.446e-06, + "loss": 0.2225, + "step": 1487 + }, + { + "epoch": 1.1717211500590783, + "grad_norm": 1.5609867572784424, + "learning_rate": 4.449e-06, + "loss": 0.1521, + "step": 1488 + }, + { + "epoch": 1.172508861756597, + "grad_norm": 1.4529367685317993, + "learning_rate": 4.452e-06, + "loss": 0.1764, + "step": 1489 + }, + { + "epoch": 1.1732965734541159, + "grad_norm": 1.3915789127349854, + "learning_rate": 4.455e-06, + "loss": 0.1326, + "step": 1490 + }, + { + "epoch": 1.1740842851516344, + "grad_norm": 2.289309501647949, + 
"learning_rate": 4.458e-06, + "loss": 0.1489, + "step": 1491 + }, + { + "epoch": 1.1748719968491532, + "grad_norm": 1.8142352104187012, + "learning_rate": 4.461e-06, + "loss": 0.1581, + "step": 1492 + }, + { + "epoch": 1.175659708546672, + "grad_norm": 1.6964137554168701, + "learning_rate": 4.464e-06, + "loss": 0.1915, + "step": 1493 + }, + { + "epoch": 1.1764474202441906, + "grad_norm": 2.8053553104400635, + "learning_rate": 4.467000000000001e-06, + "loss": 0.1835, + "step": 1494 + }, + { + "epoch": 1.1772351319417094, + "grad_norm": 2.3270018100738525, + "learning_rate": 4.4699999999999996e-06, + "loss": 0.2209, + "step": 1495 + }, + { + "epoch": 1.1780228436392282, + "grad_norm": 3.774376153945923, + "learning_rate": 4.473e-06, + "loss": 0.1657, + "step": 1496 + }, + { + "epoch": 1.1788105553367467, + "grad_norm": 1.6502881050109863, + "learning_rate": 4.476e-06, + "loss": 0.1209, + "step": 1497 + }, + { + "epoch": 1.1795982670342655, + "grad_norm": 1.6948647499084473, + "learning_rate": 4.479e-06, + "loss": 0.1902, + "step": 1498 + }, + { + "epoch": 1.180385978731784, + "grad_norm": 1.947313666343689, + "learning_rate": 4.4820000000000005e-06, + "loss": 0.2145, + "step": 1499 + }, + { + "epoch": 1.1811736904293029, + "grad_norm": 1.8772088289260864, + "learning_rate": 4.485e-06, + "loss": 0.1833, + "step": 1500 + }, + { + "epoch": 1.1819614021268217, + "grad_norm": 2.4377670288085938, + "learning_rate": 4.488e-06, + "loss": 0.1912, + "step": 1501 + }, + { + "epoch": 1.1827491138243402, + "grad_norm": 1.8174591064453125, + "learning_rate": 4.491e-06, + "loss": 0.1894, + "step": 1502 + }, + { + "epoch": 1.183536825521859, + "grad_norm": 10.37293529510498, + "learning_rate": 4.494e-06, + "loss": 0.198, + "step": 1503 + }, + { + "epoch": 1.1843245372193778, + "grad_norm": 2.0607078075408936, + "learning_rate": 4.497e-06, + "loss": 0.1875, + "step": 1504 + }, + { + "epoch": 1.1851122489168964, + "grad_norm": 1.977473258972168, + "learning_rate": 4.5e-06, + "loss": 0.2227, + "step": 1505 + }, + { + "epoch": 1.1858999606144152, + "grad_norm": 1.9465656280517578, + "learning_rate": 4.503000000000001e-06, + "loss": 0.1947, + "step": 1506 + }, + { + "epoch": 1.1866876723119337, + "grad_norm": 4.210458755493164, + "learning_rate": 4.506e-06, + "loss": 0.1867, + "step": 1507 + }, + { + "epoch": 1.1874753840094525, + "grad_norm": 1.9323291778564453, + "learning_rate": 4.5089999999999995e-06, + "loss": 0.2101, + "step": 1508 + }, + { + "epoch": 1.1882630957069713, + "grad_norm": 1.8548858165740967, + "learning_rate": 4.512e-06, + "loss": 0.183, + "step": 1509 + }, + { + "epoch": 1.1890508074044899, + "grad_norm": 2.0489132404327393, + "learning_rate": 4.515e-06, + "loss": 0.1794, + "step": 1510 + }, + { + "epoch": 1.1898385191020087, + "grad_norm": 2.1233701705932617, + "learning_rate": 4.518000000000001e-06, + "loss": 0.1802, + "step": 1511 + }, + { + "epoch": 1.1906262307995275, + "grad_norm": 2.4200077056884766, + "learning_rate": 4.521e-06, + "loss": 0.2171, + "step": 1512 + }, + { + "epoch": 1.191413942497046, + "grad_norm": 2.188631534576416, + "learning_rate": 4.524e-06, + "loss": 0.2105, + "step": 1513 + }, + { + "epoch": 1.1922016541945648, + "grad_norm": 3.2822625637054443, + "learning_rate": 4.527e-06, + "loss": 0.1841, + "step": 1514 + }, + { + "epoch": 1.1929893658920836, + "grad_norm": 1.7803794145584106, + "learning_rate": 4.53e-06, + "loss": 0.1978, + "step": 1515 + }, + { + "epoch": 1.1937770775896022, + "grad_norm": 2.3960065841674805, + "learning_rate": 4.5330000000000005e-06, + 
"loss": 0.2086, + "step": 1516 + }, + { + "epoch": 1.194564789287121, + "grad_norm": 2.524718761444092, + "learning_rate": 4.536e-06, + "loss": 0.2193, + "step": 1517 + }, + { + "epoch": 1.1953525009846397, + "grad_norm": 2.095709800720215, + "learning_rate": 4.539e-06, + "loss": 0.1854, + "step": 1518 + }, + { + "epoch": 1.1961402126821583, + "grad_norm": 2.439314126968384, + "learning_rate": 4.542000000000001e-06, + "loss": 0.2401, + "step": 1519 + }, + { + "epoch": 1.196927924379677, + "grad_norm": 2.5287070274353027, + "learning_rate": 4.545e-06, + "loss": 0.22, + "step": 1520 + }, + { + "epoch": 1.1977156360771957, + "grad_norm": 4.731534004211426, + "learning_rate": 4.548e-06, + "loss": 0.9526, + "step": 1521 + }, + { + "epoch": 1.1985033477747145, + "grad_norm": 3.559499502182007, + "learning_rate": 4.551e-06, + "loss": 0.8196, + "step": 1522 + }, + { + "epoch": 1.1992910594722332, + "grad_norm": 4.30386209487915, + "learning_rate": 4.554e-06, + "loss": 0.5936, + "step": 1523 + }, + { + "epoch": 1.2000787711697518, + "grad_norm": 1.9393211603164673, + "learning_rate": 4.557000000000001e-06, + "loss": 0.492, + "step": 1524 + }, + { + "epoch": 1.2008664828672706, + "grad_norm": 1.9320299625396729, + "learning_rate": 4.56e-06, + "loss": 0.3996, + "step": 1525 + }, + { + "epoch": 1.2016541945647892, + "grad_norm": 1.6681936979293823, + "learning_rate": 4.563e-06, + "loss": 0.3017, + "step": 1526 + }, + { + "epoch": 1.202441906262308, + "grad_norm": 1.784778118133545, + "learning_rate": 4.566e-06, + "loss": 0.2413, + "step": 1527 + }, + { + "epoch": 1.2032296179598267, + "grad_norm": 1.8203307390213013, + "learning_rate": 4.569e-06, + "loss": 0.1999, + "step": 1528 + }, + { + "epoch": 1.2040173296573453, + "grad_norm": 1.6379649639129639, + "learning_rate": 4.5720000000000004e-06, + "loss": 0.1829, + "step": 1529 + }, + { + "epoch": 1.204805041354864, + "grad_norm": 4.101673126220703, + "learning_rate": 4.575e-06, + "loss": 0.1764, + "step": 1530 + }, + { + "epoch": 1.205592753052383, + "grad_norm": 2.6085305213928223, + "learning_rate": 4.578000000000001e-06, + "loss": 0.2023, + "step": 1531 + }, + { + "epoch": 1.2063804647499015, + "grad_norm": 2.1599996089935303, + "learning_rate": 4.581e-06, + "loss": 0.1819, + "step": 1532 + }, + { + "epoch": 1.2071681764474202, + "grad_norm": 3.0152411460876465, + "learning_rate": 4.584e-06, + "loss": 0.1985, + "step": 1533 + }, + { + "epoch": 1.207955888144939, + "grad_norm": 1.4385250806808472, + "learning_rate": 4.587e-06, + "loss": 0.1454, + "step": 1534 + }, + { + "epoch": 1.2087435998424576, + "grad_norm": 2.8138840198516846, + "learning_rate": 4.59e-06, + "loss": 0.1713, + "step": 1535 + }, + { + "epoch": 1.2095313115399764, + "grad_norm": 1.7941782474517822, + "learning_rate": 4.593000000000001e-06, + "loss": 0.1672, + "step": 1536 + }, + { + "epoch": 1.2103190232374952, + "grad_norm": 1.7789355516433716, + "learning_rate": 4.5960000000000006e-06, + "loss": 0.1957, + "step": 1537 + }, + { + "epoch": 1.2111067349350138, + "grad_norm": 1.439636468887329, + "learning_rate": 4.5989999999999995e-06, + "loss": 0.1426, + "step": 1538 + }, + { + "epoch": 1.2118944466325325, + "grad_norm": 1.446069359779358, + "learning_rate": 4.602e-06, + "loss": 0.125, + "step": 1539 + }, + { + "epoch": 1.212682158330051, + "grad_norm": 1.6837477684020996, + "learning_rate": 4.605e-06, + "loss": 0.1721, + "step": 1540 + }, + { + "epoch": 1.21346987002757, + "grad_norm": 1.605806827545166, + "learning_rate": 4.608e-06, + "loss": 0.183, + "step": 1541 + }, + { + 
"epoch": 1.2142575817250887, + "grad_norm": 1.758195161819458, + "learning_rate": 4.611e-06, + "loss": 0.2063, + "step": 1542 + }, + { + "epoch": 1.2150452934226073, + "grad_norm": 1.6212952136993408, + "learning_rate": 4.614e-06, + "loss": 0.1772, + "step": 1543 + }, + { + "epoch": 1.215833005120126, + "grad_norm": 1.8541982173919678, + "learning_rate": 4.617e-06, + "loss": 0.1525, + "step": 1544 + }, + { + "epoch": 1.2166207168176448, + "grad_norm": 2.3207406997680664, + "learning_rate": 4.62e-06, + "loss": 0.2, + "step": 1545 + }, + { + "epoch": 1.2174084285151634, + "grad_norm": 3.823415517807007, + "learning_rate": 4.623e-06, + "loss": 0.1443, + "step": 1546 + }, + { + "epoch": 1.2181961402126822, + "grad_norm": 1.6368998289108276, + "learning_rate": 4.626e-06, + "loss": 0.1633, + "step": 1547 + }, + { + "epoch": 1.2189838519102008, + "grad_norm": 1.5862127542495728, + "learning_rate": 4.629e-06, + "loss": 0.1847, + "step": 1548 + }, + { + "epoch": 1.2197715636077195, + "grad_norm": 1.5342658758163452, + "learning_rate": 4.632000000000001e-06, + "loss": 0.1943, + "step": 1549 + }, + { + "epoch": 1.2205592753052383, + "grad_norm": 3.3747944831848145, + "learning_rate": 4.635e-06, + "loss": 0.1617, + "step": 1550 + }, + { + "epoch": 1.221346987002757, + "grad_norm": 1.6960935592651367, + "learning_rate": 4.6379999999999995e-06, + "loss": 0.1634, + "step": 1551 + }, + { + "epoch": 1.2221346987002757, + "grad_norm": 2.3394784927368164, + "learning_rate": 4.641e-06, + "loss": 0.1603, + "step": 1552 + }, + { + "epoch": 1.2229224103977945, + "grad_norm": 2.0296523571014404, + "learning_rate": 4.644e-06, + "loss": 0.187, + "step": 1553 + }, + { + "epoch": 1.223710122095313, + "grad_norm": 2.3873450756073, + "learning_rate": 4.6470000000000006e-06, + "loss": 0.1796, + "step": 1554 + }, + { + "epoch": 1.2244978337928318, + "grad_norm": 1.532723069190979, + "learning_rate": 4.65e-06, + "loss": 0.1615, + "step": 1555 + }, + { + "epoch": 1.2252855454903506, + "grad_norm": 2.0610644817352295, + "learning_rate": 4.653e-06, + "loss": 0.1533, + "step": 1556 + }, + { + "epoch": 1.2260732571878692, + "grad_norm": 2.7305448055267334, + "learning_rate": 4.656e-06, + "loss": 0.1819, + "step": 1557 + }, + { + "epoch": 1.226860968885388, + "grad_norm": 2.101029634475708, + "learning_rate": 4.659e-06, + "loss": 0.1868, + "step": 1558 + }, + { + "epoch": 1.2276486805829068, + "grad_norm": 2.6464929580688477, + "learning_rate": 4.6620000000000004e-06, + "loss": 0.1847, + "step": 1559 + }, + { + "epoch": 1.2284363922804253, + "grad_norm": 2.048093318939209, + "learning_rate": 4.665e-06, + "loss": 0.1655, + "step": 1560 + }, + { + "epoch": 1.2292241039779441, + "grad_norm": 3.851099967956543, + "learning_rate": 4.668e-06, + "loss": 0.1232, + "step": 1561 + }, + { + "epoch": 1.2300118156754627, + "grad_norm": 2.463303565979004, + "learning_rate": 4.671000000000001e-06, + "loss": 0.1907, + "step": 1562 + }, + { + "epoch": 1.2307995273729815, + "grad_norm": 1.9657155275344849, + "learning_rate": 4.674e-06, + "loss": 0.1701, + "step": 1563 + }, + { + "epoch": 1.2315872390705003, + "grad_norm": 3.7873008251190186, + "learning_rate": 4.677e-06, + "loss": 0.2283, + "step": 1564 + }, + { + "epoch": 1.2323749507680188, + "grad_norm": 2.5985753536224365, + "learning_rate": 4.68e-06, + "loss": 0.1484, + "step": 1565 + }, + { + "epoch": 1.2331626624655376, + "grad_norm": 2.637148857116699, + "learning_rate": 4.683e-06, + "loss": 0.1584, + "step": 1566 + }, + { + "epoch": 1.2339503741630564, + "grad_norm": 2.060666561126709, 
+ "learning_rate": 4.6860000000000005e-06, + "loss": 0.2081, + "step": 1567 + }, + { + "epoch": 1.234738085860575, + "grad_norm": 2.3856453895568848, + "learning_rate": 4.689e-06, + "loss": 0.1941, + "step": 1568 + }, + { + "epoch": 1.2355257975580938, + "grad_norm": 2.378973960876465, + "learning_rate": 4.692e-06, + "loss": 0.2087, + "step": 1569 + }, + { + "epoch": 1.2363135092556123, + "grad_norm": 3.3327252864837646, + "learning_rate": 4.695e-06, + "loss": 0.2328, + "step": 1570 + }, + { + "epoch": 1.2371012209531311, + "grad_norm": 4.798428058624268, + "learning_rate": 4.698e-06, + "loss": 0.8492, + "step": 1571 + }, + { + "epoch": 1.23788893265065, + "grad_norm": 2.6058645248413086, + "learning_rate": 4.701e-06, + "loss": 0.6009, + "step": 1572 + }, + { + "epoch": 1.2386766443481685, + "grad_norm": 1.7270125150680542, + "learning_rate": 4.704e-06, + "loss": 0.5389, + "step": 1573 + }, + { + "epoch": 1.2394643560456873, + "grad_norm": 2.714747190475464, + "learning_rate": 4.707000000000001e-06, + "loss": 0.4799, + "step": 1574 + }, + { + "epoch": 1.240252067743206, + "grad_norm": 2.411811351776123, + "learning_rate": 4.71e-06, + "loss": 0.3979, + "step": 1575 + }, + { + "epoch": 1.2410397794407246, + "grad_norm": 1.6519381999969482, + "learning_rate": 4.713e-06, + "loss": 0.2415, + "step": 1576 + }, + { + "epoch": 1.2418274911382434, + "grad_norm": 1.420729160308838, + "learning_rate": 4.716e-06, + "loss": 0.2101, + "step": 1577 + }, + { + "epoch": 1.2426152028357622, + "grad_norm": 1.6558588743209839, + "learning_rate": 4.719e-06, + "loss": 0.1993, + "step": 1578 + }, + { + "epoch": 1.2434029145332808, + "grad_norm": 2.327632427215576, + "learning_rate": 4.722000000000001e-06, + "loss": 0.2605, + "step": 1579 + }, + { + "epoch": 1.2441906262307996, + "grad_norm": 1.8050085306167603, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.2234, + "step": 1580 + }, + { + "epoch": 1.2449783379283184, + "grad_norm": 1.6603147983551025, + "learning_rate": 4.7279999999999995e-06, + "loss": 0.1584, + "step": 1581 + }, + { + "epoch": 1.245766049625837, + "grad_norm": 1.709001064300537, + "learning_rate": 4.731e-06, + "loss": 0.1642, + "step": 1582 + }, + { + "epoch": 1.2465537613233557, + "grad_norm": 2.2881975173950195, + "learning_rate": 4.734e-06, + "loss": 0.1572, + "step": 1583 + }, + { + "epoch": 1.2473414730208743, + "grad_norm": 1.6562846899032593, + "learning_rate": 4.7370000000000006e-06, + "loss": 0.1238, + "step": 1584 + }, + { + "epoch": 1.248129184718393, + "grad_norm": 1.0307093858718872, + "learning_rate": 4.74e-06, + "loss": 0.1124, + "step": 1585 + }, + { + "epoch": 1.2489168964159119, + "grad_norm": 1.9209516048431396, + "learning_rate": 4.743e-06, + "loss": 0.1495, + "step": 1586 + }, + { + "epoch": 1.2497046081134304, + "grad_norm": 1.5972018241882324, + "learning_rate": 4.746e-06, + "loss": 0.1824, + "step": 1587 + }, + { + "epoch": 1.2504923198109492, + "grad_norm": 5.325455188751221, + "learning_rate": 4.749e-06, + "loss": 0.1536, + "step": 1588 + }, + { + "epoch": 1.2512800315084678, + "grad_norm": 2.4524919986724854, + "learning_rate": 4.752e-06, + "loss": 0.1667, + "step": 1589 + }, + { + "epoch": 1.2520677432059866, + "grad_norm": 1.3291022777557373, + "learning_rate": 4.755e-06, + "loss": 0.1684, + "step": 1590 + }, + { + "epoch": 1.2528554549035054, + "grad_norm": 1.4880223274230957, + "learning_rate": 4.758e-06, + "loss": 0.1605, + "step": 1591 + }, + { + "epoch": 1.253643166601024, + "grad_norm": 3.0534112453460693, + "learning_rate": 4.761000000000001e-06, + 
"loss": 0.1354, + "step": 1592 + }, + { + "epoch": 1.2544308782985427, + "grad_norm": 2.4136335849761963, + "learning_rate": 4.764e-06, + "loss": 0.2972, + "step": 1593 + }, + { + "epoch": 1.2552185899960615, + "grad_norm": 1.6286073923110962, + "learning_rate": 4.767e-06, + "loss": 0.1706, + "step": 1594 + }, + { + "epoch": 1.25600630169358, + "grad_norm": 1.681939721107483, + "learning_rate": 4.77e-06, + "loss": 0.1736, + "step": 1595 + }, + { + "epoch": 1.2567940133910989, + "grad_norm": 1.7457321882247925, + "learning_rate": 4.773e-06, + "loss": 0.1442, + "step": 1596 + }, + { + "epoch": 1.2575817250886177, + "grad_norm": 2.116851806640625, + "learning_rate": 4.7760000000000005e-06, + "loss": 0.1644, + "step": 1597 + }, + { + "epoch": 1.2583694367861362, + "grad_norm": 1.7889268398284912, + "learning_rate": 4.779e-06, + "loss": 0.1624, + "step": 1598 + }, + { + "epoch": 1.259157148483655, + "grad_norm": 1.7362173795700073, + "learning_rate": 4.782e-06, + "loss": 0.1846, + "step": 1599 + }, + { + "epoch": 1.2599448601811738, + "grad_norm": 2.049318313598633, + "learning_rate": 4.785e-06, + "loss": 0.1352, + "step": 1600 + }, + { + "epoch": 1.2607325718786924, + "grad_norm": 1.4666024446487427, + "learning_rate": 4.788e-06, + "loss": 0.1663, + "step": 1601 + }, + { + "epoch": 1.2615202835762112, + "grad_norm": 3.427913188934326, + "learning_rate": 4.791e-06, + "loss": 0.1693, + "step": 1602 + }, + { + "epoch": 1.26230799527373, + "grad_norm": 1.8724650144577026, + "learning_rate": 4.794e-06, + "loss": 0.1553, + "step": 1603 + }, + { + "epoch": 1.2630957069712485, + "grad_norm": 1.7167115211486816, + "learning_rate": 4.797e-06, + "loss": 0.1958, + "step": 1604 + }, + { + "epoch": 1.2638834186687673, + "grad_norm": 2.0887343883514404, + "learning_rate": 4.800000000000001e-06, + "loss": 0.1536, + "step": 1605 + }, + { + "epoch": 1.2646711303662859, + "grad_norm": 1.6804888248443604, + "learning_rate": 4.803e-06, + "loss": 0.1472, + "step": 1606 + }, + { + "epoch": 1.2654588420638047, + "grad_norm": 6.805255889892578, + "learning_rate": 4.806e-06, + "loss": 0.1473, + "step": 1607 + }, + { + "epoch": 1.2662465537613232, + "grad_norm": 2.006321668624878, + "learning_rate": 4.809e-06, + "loss": 0.1366, + "step": 1608 + }, + { + "epoch": 1.267034265458842, + "grad_norm": 5.931089878082275, + "learning_rate": 4.812e-06, + "loss": 0.157, + "step": 1609 + }, + { + "epoch": 1.2678219771563608, + "grad_norm": 2.0124671459198, + "learning_rate": 4.8150000000000005e-06, + "loss": 0.1713, + "step": 1610 + }, + { + "epoch": 1.2686096888538794, + "grad_norm": 1.4972435235977173, + "learning_rate": 4.818e-06, + "loss": 0.1585, + "step": 1611 + }, + { + "epoch": 1.2693974005513982, + "grad_norm": 1.868863821029663, + "learning_rate": 4.821e-06, + "loss": 0.1426, + "step": 1612 + }, + { + "epoch": 1.270185112248917, + "grad_norm": 1.8184653520584106, + "learning_rate": 4.824e-06, + "loss": 0.1839, + "step": 1613 + }, + { + "epoch": 1.2709728239464355, + "grad_norm": 1.618455171585083, + "learning_rate": 4.827e-06, + "loss": 0.1767, + "step": 1614 + }, + { + "epoch": 1.2717605356439543, + "grad_norm": 1.591862440109253, + "learning_rate": 4.83e-06, + "loss": 0.1626, + "step": 1615 + }, + { + "epoch": 1.272548247341473, + "grad_norm": 1.9004156589508057, + "learning_rate": 4.833e-06, + "loss": 0.1849, + "step": 1616 + }, + { + "epoch": 1.2733359590389917, + "grad_norm": 2.5542845726013184, + "learning_rate": 4.836000000000001e-06, + "loss": 0.1398, + "step": 1617 + }, + { + "epoch": 1.2741236707365104, + 
"grad_norm": 1.7816581726074219, + "learning_rate": 4.839e-06, + "loss": 0.1687, + "step": 1618 + }, + { + "epoch": 1.2749113824340292, + "grad_norm": 3.0721724033355713, + "learning_rate": 4.8419999999999996e-06, + "loss": 0.2057, + "step": 1619 + }, + { + "epoch": 1.2756990941315478, + "grad_norm": 4.150282859802246, + "learning_rate": 4.845e-06, + "loss": 0.1886, + "step": 1620 + }, + { + "epoch": 1.2764868058290666, + "grad_norm": 2.902459144592285, + "learning_rate": 4.848e-06, + "loss": 0.7075, + "step": 1621 + }, + { + "epoch": 1.2772745175265854, + "grad_norm": 3.054168224334717, + "learning_rate": 4.851000000000001e-06, + "loss": 0.6726, + "step": 1622 + }, + { + "epoch": 1.278062229224104, + "grad_norm": 2.7200114727020264, + "learning_rate": 4.8540000000000005e-06, + "loss": 0.6115, + "step": 1623 + }, + { + "epoch": 1.2788499409216227, + "grad_norm": 3.2391045093536377, + "learning_rate": 4.856999999999999e-06, + "loss": 0.4446, + "step": 1624 + }, + { + "epoch": 1.2796376526191415, + "grad_norm": 2.8543336391448975, + "learning_rate": 4.86e-06, + "loss": 0.4526, + "step": 1625 + }, + { + "epoch": 1.28042536431666, + "grad_norm": 2.1853229999542236, + "learning_rate": 4.863e-06, + "loss": 0.404, + "step": 1626 + }, + { + "epoch": 1.2812130760141789, + "grad_norm": 1.595824122428894, + "learning_rate": 4.8660000000000005e-06, + "loss": 0.1853, + "step": 1627 + }, + { + "epoch": 1.2820007877116975, + "grad_norm": 3.195786952972412, + "learning_rate": 4.869e-06, + "loss": 0.2303, + "step": 1628 + }, + { + "epoch": 1.2827884994092162, + "grad_norm": 2.824317455291748, + "learning_rate": 4.872e-06, + "loss": 0.207, + "step": 1629 + }, + { + "epoch": 1.2835762111067348, + "grad_norm": 1.718623161315918, + "learning_rate": 4.875e-06, + "loss": 0.1486, + "step": 1630 + }, + { + "epoch": 1.2843639228042536, + "grad_norm": 1.5308372974395752, + "learning_rate": 4.878e-06, + "loss": 0.1371, + "step": 1631 + }, + { + "epoch": 1.2851516345017724, + "grad_norm": 1.655269742012024, + "learning_rate": 4.881e-06, + "loss": 0.1521, + "step": 1632 + }, + { + "epoch": 1.285939346199291, + "grad_norm": 1.379262924194336, + "learning_rate": 4.884e-06, + "loss": 0.1636, + "step": 1633 + }, + { + "epoch": 1.2867270578968097, + "grad_norm": 1.459324598312378, + "learning_rate": 4.887e-06, + "loss": 0.1136, + "step": 1634 + }, + { + "epoch": 1.2875147695943285, + "grad_norm": 1.3243412971496582, + "learning_rate": 4.890000000000001e-06, + "loss": 0.1032, + "step": 1635 + }, + { + "epoch": 1.288302481291847, + "grad_norm": 1.2065154314041138, + "learning_rate": 4.8929999999999996e-06, + "loss": 0.1445, + "step": 1636 + }, + { + "epoch": 1.289090192989366, + "grad_norm": 1.6823034286499023, + "learning_rate": 4.896e-06, + "loss": 0.2097, + "step": 1637 + }, + { + "epoch": 1.2898779046868847, + "grad_norm": 1.7095601558685303, + "learning_rate": 4.899e-06, + "loss": 0.1706, + "step": 1638 + }, + { + "epoch": 1.2906656163844032, + "grad_norm": 1.775360107421875, + "learning_rate": 4.902e-06, + "loss": 0.1402, + "step": 1639 + }, + { + "epoch": 1.291453328081922, + "grad_norm": 1.3738079071044922, + "learning_rate": 4.9050000000000005e-06, + "loss": 0.143, + "step": 1640 + }, + { + "epoch": 1.2922410397794408, + "grad_norm": 1.3610645532608032, + "learning_rate": 4.908e-06, + "loss": 0.1304, + "step": 1641 + }, + { + "epoch": 1.2930287514769594, + "grad_norm": 1.9352234601974487, + "learning_rate": 4.911e-06, + "loss": 0.1947, + "step": 1642 + }, + { + "epoch": 1.2938164631744782, + "grad_norm": 
2.0097334384918213, + "learning_rate": 4.914e-06, + "loss": 0.1469, + "step": 1643 + }, + { + "epoch": 1.294604174871997, + "grad_norm": 1.4931230545043945, + "learning_rate": 4.917e-06, + "loss": 0.1163, + "step": 1644 + }, + { + "epoch": 1.2953918865695155, + "grad_norm": 1.1945468187332153, + "learning_rate": 4.92e-06, + "loss": 0.1155, + "step": 1645 + }, + { + "epoch": 1.2961795982670343, + "grad_norm": 1.4391525983810425, + "learning_rate": 4.923e-06, + "loss": 0.1408, + "step": 1646 + }, + { + "epoch": 1.296967309964553, + "grad_norm": 1.953515648841858, + "learning_rate": 4.926000000000001e-06, + "loss": 0.1359, + "step": 1647 + }, + { + "epoch": 1.2977550216620717, + "grad_norm": 2.743753433227539, + "learning_rate": 4.929000000000001e-06, + "loss": 0.1472, + "step": 1648 + }, + { + "epoch": 1.2985427333595905, + "grad_norm": 2.9900920391082764, + "learning_rate": 4.9319999999999995e-06, + "loss": 0.1631, + "step": 1649 + }, + { + "epoch": 1.299330445057109, + "grad_norm": 1.5362787246704102, + "learning_rate": 4.935e-06, + "loss": 0.1327, + "step": 1650 + }, + { + "epoch": 1.3001181567546278, + "grad_norm": 1.345011591911316, + "learning_rate": 4.938e-06, + "loss": 0.1104, + "step": 1651 + }, + { + "epoch": 1.3009058684521464, + "grad_norm": 1.5389512777328491, + "learning_rate": 4.941000000000001e-06, + "loss": 0.1338, + "step": 1652 + }, + { + "epoch": 1.3016935801496652, + "grad_norm": 2.0760085582733154, + "learning_rate": 4.9440000000000004e-06, + "loss": 0.1479, + "step": 1653 + }, + { + "epoch": 1.302481291847184, + "grad_norm": 1.5635151863098145, + "learning_rate": 4.947e-06, + "loss": 0.1483, + "step": 1654 + }, + { + "epoch": 1.3032690035447025, + "grad_norm": 1.6753923892974854, + "learning_rate": 4.95e-06, + "loss": 0.156, + "step": 1655 + }, + { + "epoch": 1.3040567152422213, + "grad_norm": 1.7423052787780762, + "learning_rate": 4.953e-06, + "loss": 0.1343, + "step": 1656 + }, + { + "epoch": 1.3048444269397401, + "grad_norm": 2.1330244541168213, + "learning_rate": 4.9560000000000005e-06, + "loss": 0.1702, + "step": 1657 + }, + { + "epoch": 1.3056321386372587, + "grad_norm": 3.2471272945404053, + "learning_rate": 4.959e-06, + "loss": 0.1606, + "step": 1658 + }, + { + "epoch": 1.3064198503347775, + "grad_norm": 1.974599838256836, + "learning_rate": 4.962e-06, + "loss": 0.2146, + "step": 1659 + }, + { + "epoch": 1.3072075620322963, + "grad_norm": 2.207840919494629, + "learning_rate": 4.965000000000001e-06, + "loss": 0.1839, + "step": 1660 + }, + { + "epoch": 1.3079952737298148, + "grad_norm": 2.5430867671966553, + "learning_rate": 4.968e-06, + "loss": 0.1791, + "step": 1661 + }, + { + "epoch": 1.3087829854273336, + "grad_norm": 1.459302544593811, + "learning_rate": 4.9709999999999995e-06, + "loss": 0.1297, + "step": 1662 + }, + { + "epoch": 1.3095706971248524, + "grad_norm": 1.9244122505187988, + "learning_rate": 4.974e-06, + "loss": 0.1509, + "step": 1663 + }, + { + "epoch": 1.310358408822371, + "grad_norm": 2.351776123046875, + "learning_rate": 4.977e-06, + "loss": 0.1923, + "step": 1664 + }, + { + "epoch": 1.3111461205198898, + "grad_norm": 1.7056856155395508, + "learning_rate": 4.980000000000001e-06, + "loss": 0.1454, + "step": 1665 + }, + { + "epoch": 1.3119338322174086, + "grad_norm": 2.828514337539673, + "learning_rate": 4.983e-06, + "loss": 0.2273, + "step": 1666 + }, + { + "epoch": 1.3127215439149271, + "grad_norm": 2.414792060852051, + "learning_rate": 4.985999999999999e-06, + "loss": 0.1726, + "step": 1667 + }, + { + "epoch": 1.313509255612446, + 
"grad_norm": 1.9887245893478394, + "learning_rate": 4.989e-06, + "loss": 0.1552, + "step": 1668 + }, + { + "epoch": 1.3142969673099645, + "grad_norm": 3.5181517601013184, + "learning_rate": 4.992e-06, + "loss": 0.2083, + "step": 1669 + }, + { + "epoch": 1.3150846790074833, + "grad_norm": 3.1858813762664795, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.2368, + "step": 1670 + }, + { + "epoch": 1.3158723907050018, + "grad_norm": 11.263724327087402, + "learning_rate": 4.998e-06, + "loss": 0.8547, + "step": 1671 + }, + { + "epoch": 1.3166601024025206, + "grad_norm": 5.405990123748779, + "learning_rate": 5.001e-06, + "loss": 0.6086, + "step": 1672 + }, + { + "epoch": 1.3174478141000394, + "grad_norm": 3.2828001976013184, + "learning_rate": 5.004e-06, + "loss": 0.5347, + "step": 1673 + }, + { + "epoch": 1.318235525797558, + "grad_norm": 2.311082363128662, + "learning_rate": 5.007e-06, + "loss": 0.4542, + "step": 1674 + }, + { + "epoch": 1.3190232374950768, + "grad_norm": 1.8366845846176147, + "learning_rate": 5.01e-06, + "loss": 0.4381, + "step": 1675 + }, + { + "epoch": 1.3198109491925956, + "grad_norm": 2.07597279548645, + "learning_rate": 5.013e-06, + "loss": 0.3366, + "step": 1676 + }, + { + "epoch": 1.3205986608901141, + "grad_norm": 2.4022512435913086, + "learning_rate": 5.016e-06, + "loss": 0.299, + "step": 1677 + }, + { + "epoch": 1.321386372587633, + "grad_norm": 1.4143428802490234, + "learning_rate": 5.0190000000000006e-06, + "loss": 0.1697, + "step": 1678 + }, + { + "epoch": 1.3221740842851517, + "grad_norm": 1.334641456604004, + "learning_rate": 5.0219999999999995e-06, + "loss": 0.1482, + "step": 1679 + }, + { + "epoch": 1.3229617959826703, + "grad_norm": 1.2785848379135132, + "learning_rate": 5.025e-06, + "loss": 0.148, + "step": 1680 + }, + { + "epoch": 1.323749507680189, + "grad_norm": 1.1628586053848267, + "learning_rate": 5.028e-06, + "loss": 0.1606, + "step": 1681 + }, + { + "epoch": 1.3245372193777079, + "grad_norm": 1.433028221130371, + "learning_rate": 5.031e-06, + "loss": 0.1522, + "step": 1682 + }, + { + "epoch": 1.3253249310752264, + "grad_norm": 1.3536367416381836, + "learning_rate": 5.034e-06, + "loss": 0.1379, + "step": 1683 + }, + { + "epoch": 1.3261126427727452, + "grad_norm": 1.9283136129379272, + "learning_rate": 5.037e-06, + "loss": 0.1523, + "step": 1684 + }, + { + "epoch": 1.326900354470264, + "grad_norm": 2.8229174613952637, + "learning_rate": 5.04e-06, + "loss": 0.1192, + "step": 1685 + }, + { + "epoch": 1.3276880661677826, + "grad_norm": 2.099372148513794, + "learning_rate": 5.043e-06, + "loss": 0.1546, + "step": 1686 + }, + { + "epoch": 1.3284757778653014, + "grad_norm": 1.1694847345352173, + "learning_rate": 5.046e-06, + "loss": 0.1042, + "step": 1687 + }, + { + "epoch": 1.3292634895628201, + "grad_norm": 2.4388203620910645, + "learning_rate": 5.049e-06, + "loss": 0.1632, + "step": 1688 + }, + { + "epoch": 1.3300512012603387, + "grad_norm": 1.4702945947647095, + "learning_rate": 5.052e-06, + "loss": 0.1472, + "step": 1689 + }, + { + "epoch": 1.3308389129578575, + "grad_norm": 2.43125057220459, + "learning_rate": 5.055000000000001e-06, + "loss": 0.0879, + "step": 1690 + }, + { + "epoch": 1.331626624655376, + "grad_norm": 1.8442119359970093, + "learning_rate": 5.0580000000000005e-06, + "loss": 0.1471, + "step": 1691 + }, + { + "epoch": 1.3324143363528949, + "grad_norm": 1.4547405242919922, + "learning_rate": 5.0609999999999995e-06, + "loss": 0.1198, + "step": 1692 + }, + { + "epoch": 1.3332020480504134, + "grad_norm": 1.3759397268295288, + 
"learning_rate": 5.064e-06, + "loss": 0.1153, + "step": 1693 + }, + { + "epoch": 1.3339897597479322, + "grad_norm": 1.471243143081665, + "learning_rate": 5.067e-06, + "loss": 0.1496, + "step": 1694 + }, + { + "epoch": 1.334777471445451, + "grad_norm": 2.5503149032592773, + "learning_rate": 5.070000000000001e-06, + "loss": 0.2273, + "step": 1695 + }, + { + "epoch": 1.3355651831429696, + "grad_norm": 1.8046985864639282, + "learning_rate": 5.073e-06, + "loss": 0.1494, + "step": 1696 + }, + { + "epoch": 1.3363528948404884, + "grad_norm": 2.297649621963501, + "learning_rate": 5.076e-06, + "loss": 0.1682, + "step": 1697 + }, + { + "epoch": 1.3371406065380071, + "grad_norm": 1.3333885669708252, + "learning_rate": 5.079e-06, + "loss": 0.1201, + "step": 1698 + }, + { + "epoch": 1.3379283182355257, + "grad_norm": 3.134977102279663, + "learning_rate": 5.082e-06, + "loss": 0.1292, + "step": 1699 + }, + { + "epoch": 1.3387160299330445, + "grad_norm": 1.5658873319625854, + "learning_rate": 5.0850000000000004e-06, + "loss": 0.1124, + "step": 1700 + }, + { + "epoch": 1.3395037416305633, + "grad_norm": 1.453944206237793, + "learning_rate": 5.088e-06, + "loss": 0.1406, + "step": 1701 + }, + { + "epoch": 1.3402914533280819, + "grad_norm": 1.431289553642273, + "learning_rate": 5.091e-06, + "loss": 0.1014, + "step": 1702 + }, + { + "epoch": 1.3410791650256007, + "grad_norm": 3.0615150928497314, + "learning_rate": 5.094000000000001e-06, + "loss": 0.1244, + "step": 1703 + }, + { + "epoch": 1.3418668767231194, + "grad_norm": 1.706949234008789, + "learning_rate": 5.097e-06, + "loss": 0.1536, + "step": 1704 + }, + { + "epoch": 1.342654588420638, + "grad_norm": 2.001342296600342, + "learning_rate": 5.1e-06, + "loss": 0.1217, + "step": 1705 + }, + { + "epoch": 1.3434423001181568, + "grad_norm": 2.0255844593048096, + "learning_rate": 5.103e-06, + "loss": 0.1352, + "step": 1706 + }, + { + "epoch": 1.3442300118156756, + "grad_norm": 1.8782458305358887, + "learning_rate": 5.106e-06, + "loss": 0.1423, + "step": 1707 + }, + { + "epoch": 1.3450177235131942, + "grad_norm": 1.5416837930679321, + "learning_rate": 5.1090000000000006e-06, + "loss": 0.1261, + "step": 1708 + }, + { + "epoch": 1.345805435210713, + "grad_norm": 1.919107437133789, + "learning_rate": 5.112e-06, + "loss": 0.1425, + "step": 1709 + }, + { + "epoch": 1.3465931469082315, + "grad_norm": 2.100724935531616, + "learning_rate": 5.115e-06, + "loss": 0.1327, + "step": 1710 + }, + { + "epoch": 1.3473808586057503, + "grad_norm": 2.054718255996704, + "learning_rate": 5.118e-06, + "loss": 0.1562, + "step": 1711 + }, + { + "epoch": 1.3481685703032689, + "grad_norm": 1.648091197013855, + "learning_rate": 5.121e-06, + "loss": 0.1383, + "step": 1712 + }, + { + "epoch": 1.3489562820007877, + "grad_norm": 2.5630714893341064, + "learning_rate": 5.124e-06, + "loss": 0.1407, + "step": 1713 + }, + { + "epoch": 1.3497439936983064, + "grad_norm": 1.5481083393096924, + "learning_rate": 5.127e-06, + "loss": 0.1469, + "step": 1714 + }, + { + "epoch": 1.350531705395825, + "grad_norm": 4.224588871002197, + "learning_rate": 5.130000000000001e-06, + "loss": 0.1485, + "step": 1715 + }, + { + "epoch": 1.3513194170933438, + "grad_norm": 2.460171937942505, + "learning_rate": 5.133e-06, + "loss": 0.1314, + "step": 1716 + }, + { + "epoch": 1.3521071287908626, + "grad_norm": 1.7648046016693115, + "learning_rate": 5.136e-06, + "loss": 0.1895, + "step": 1717 + }, + { + "epoch": 1.3528948404883812, + "grad_norm": 2.4391889572143555, + "learning_rate": 5.139e-06, + "loss": 0.1459, + "step": 1718 
+ }, + { + "epoch": 1.3536825521859, + "grad_norm": 2.0897881984710693, + "learning_rate": 5.142e-06, + "loss": 0.1681, + "step": 1719 + }, + { + "epoch": 1.3544702638834187, + "grad_norm": 3.089017868041992, + "learning_rate": 5.145000000000001e-06, + "loss": 0.2554, + "step": 1720 + }, + { + "epoch": 1.3552579755809373, + "grad_norm": 3.865854024887085, + "learning_rate": 5.1480000000000005e-06, + "loss": 0.6139, + "step": 1721 + }, + { + "epoch": 1.356045687278456, + "grad_norm": 1.8632386922836304, + "learning_rate": 5.1509999999999995e-06, + "loss": 0.5652, + "step": 1722 + }, + { + "epoch": 1.3568333989759749, + "grad_norm": 1.9412777423858643, + "learning_rate": 5.154e-06, + "loss": 0.5165, + "step": 1723 + }, + { + "epoch": 1.3576211106734934, + "grad_norm": 2.15293550491333, + "learning_rate": 5.157e-06, + "loss": 0.4566, + "step": 1724 + }, + { + "epoch": 1.3584088223710122, + "grad_norm": 2.669376850128174, + "learning_rate": 5.16e-06, + "loss": 0.3183, + "step": 1725 + }, + { + "epoch": 1.359196534068531, + "grad_norm": 1.7431740760803223, + "learning_rate": 5.163e-06, + "loss": 0.2278, + "step": 1726 + }, + { + "epoch": 1.3599842457660496, + "grad_norm": 1.9935517311096191, + "learning_rate": 5.166e-06, + "loss": 0.1724, + "step": 1727 + }, + { + "epoch": 1.3607719574635684, + "grad_norm": 1.52494478225708, + "learning_rate": 5.169e-06, + "loss": 0.1963, + "step": 1728 + }, + { + "epoch": 1.3615596691610872, + "grad_norm": 1.2170639038085938, + "learning_rate": 5.172e-06, + "loss": 0.1244, + "step": 1729 + }, + { + "epoch": 1.3623473808586057, + "grad_norm": 13.280274391174316, + "learning_rate": 5.175e-06, + "loss": 0.1553, + "step": 1730 + }, + { + "epoch": 1.3631350925561245, + "grad_norm": 1.4090402126312256, + "learning_rate": 5.178e-06, + "loss": 0.1254, + "step": 1731 + }, + { + "epoch": 1.363922804253643, + "grad_norm": 1.2774626016616821, + "learning_rate": 5.181e-06, + "loss": 0.1405, + "step": 1732 + }, + { + "epoch": 1.3647105159511619, + "grad_norm": 1.4664357900619507, + "learning_rate": 5.184000000000001e-06, + "loss": 0.0965, + "step": 1733 + }, + { + "epoch": 1.3654982276486805, + "grad_norm": 7.053225517272949, + "learning_rate": 5.1870000000000005e-06, + "loss": 0.1331, + "step": 1734 + }, + { + "epoch": 1.3662859393461992, + "grad_norm": 1.6811739206314087, + "learning_rate": 5.1899999999999994e-06, + "loss": 0.1288, + "step": 1735 + }, + { + "epoch": 1.367073651043718, + "grad_norm": 1.5373077392578125, + "learning_rate": 5.193e-06, + "loss": 0.1301, + "step": 1736 + }, + { + "epoch": 1.3678613627412366, + "grad_norm": 1.4774342775344849, + "learning_rate": 5.196e-06, + "loss": 0.1306, + "step": 1737 + }, + { + "epoch": 1.3686490744387554, + "grad_norm": 2.109443426132202, + "learning_rate": 5.1990000000000005e-06, + "loss": 0.1286, + "step": 1738 + }, + { + "epoch": 1.3694367861362742, + "grad_norm": 1.6606024503707886, + "learning_rate": 5.202e-06, + "loss": 0.1273, + "step": 1739 + }, + { + "epoch": 1.3702244978337927, + "grad_norm": 1.5891247987747192, + "learning_rate": 5.205e-06, + "loss": 0.1329, + "step": 1740 + }, + { + "epoch": 1.3710122095313115, + "grad_norm": 2.0780344009399414, + "learning_rate": 5.208e-06, + "loss": 0.127, + "step": 1741 + }, + { + "epoch": 1.3717999212288303, + "grad_norm": 1.4167920351028442, + "learning_rate": 5.211e-06, + "loss": 0.0941, + "step": 1742 + }, + { + "epoch": 1.3725876329263489, + "grad_norm": 1.6603425741195679, + "learning_rate": 5.214e-06, + "loss": 0.1223, + "step": 1743 + }, + { + "epoch": 
1.3733753446238677, + "grad_norm": 1.3350050449371338, + "learning_rate": 5.217e-06, + "loss": 0.1473, + "step": 1744 + }, + { + "epoch": 1.3741630563213865, + "grad_norm": 1.3998534679412842, + "learning_rate": 5.22e-06, + "loss": 0.0951, + "step": 1745 + }, + { + "epoch": 1.374950768018905, + "grad_norm": 1.4839669466018677, + "learning_rate": 5.223000000000001e-06, + "loss": 0.1303, + "step": 1746 + }, + { + "epoch": 1.3757384797164238, + "grad_norm": 1.325621247291565, + "learning_rate": 5.226e-06, + "loss": 0.12, + "step": 1747 + }, + { + "epoch": 1.3765261914139426, + "grad_norm": 1.3337630033493042, + "learning_rate": 5.229e-06, + "loss": 0.1146, + "step": 1748 + }, + { + "epoch": 1.3773139031114612, + "grad_norm": 1.6508907079696655, + "learning_rate": 5.232e-06, + "loss": 0.096, + "step": 1749 + }, + { + "epoch": 1.37810161480898, + "grad_norm": 2.0200541019439697, + "learning_rate": 5.235e-06, + "loss": 0.1185, + "step": 1750 + }, + { + "epoch": 1.3788893265064988, + "grad_norm": 4.032569885253906, + "learning_rate": 5.2380000000000005e-06, + "loss": 0.1414, + "step": 1751 + }, + { + "epoch": 1.3796770382040173, + "grad_norm": 1.3888176679611206, + "learning_rate": 5.241e-06, + "loss": 0.128, + "step": 1752 + }, + { + "epoch": 1.3804647499015361, + "grad_norm": 1.8643460273742676, + "learning_rate": 5.244e-06, + "loss": 0.1583, + "step": 1753 + }, + { + "epoch": 1.3812524615990547, + "grad_norm": 1.432643175125122, + "learning_rate": 5.247e-06, + "loss": 0.1292, + "step": 1754 + }, + { + "epoch": 1.3820401732965735, + "grad_norm": 1.4902030229568481, + "learning_rate": 5.25e-06, + "loss": 0.1115, + "step": 1755 + }, + { + "epoch": 1.382827884994092, + "grad_norm": 2.3069956302642822, + "learning_rate": 5.253e-06, + "loss": 0.1392, + "step": 1756 + }, + { + "epoch": 1.3836155966916108, + "grad_norm": 1.5841898918151855, + "learning_rate": 5.256e-06, + "loss": 0.1067, + "step": 1757 + }, + { + "epoch": 1.3844033083891296, + "grad_norm": 1.8795080184936523, + "learning_rate": 5.259000000000001e-06, + "loss": 0.137, + "step": 1758 + }, + { + "epoch": 1.3851910200866482, + "grad_norm": 2.5567336082458496, + "learning_rate": 5.262e-06, + "loss": 0.1578, + "step": 1759 + }, + { + "epoch": 1.385978731784167, + "grad_norm": 1.7875360250473022, + "learning_rate": 5.2649999999999996e-06, + "loss": 0.1774, + "step": 1760 + }, + { + "epoch": 1.3867664434816858, + "grad_norm": 2.0214145183563232, + "learning_rate": 5.268e-06, + "loss": 0.1622, + "step": 1761 + }, + { + "epoch": 1.3875541551792043, + "grad_norm": 2.146591901779175, + "learning_rate": 5.271e-06, + "loss": 0.1409, + "step": 1762 + }, + { + "epoch": 1.3883418668767231, + "grad_norm": 1.6703717708587646, + "learning_rate": 5.274000000000001e-06, + "loss": 0.1558, + "step": 1763 + }, + { + "epoch": 1.389129578574242, + "grad_norm": 1.7041375637054443, + "learning_rate": 5.2770000000000005e-06, + "loss": 0.1394, + "step": 1764 + }, + { + "epoch": 1.3899172902717605, + "grad_norm": 1.6815412044525146, + "learning_rate": 5.279999999999999e-06, + "loss": 0.1298, + "step": 1765 + }, + { + "epoch": 1.3907050019692793, + "grad_norm": 1.9295676946640015, + "learning_rate": 5.283e-06, + "loss": 0.1408, + "step": 1766 + }, + { + "epoch": 1.391492713666798, + "grad_norm": 1.8058887720108032, + "learning_rate": 5.286e-06, + "loss": 0.135, + "step": 1767 + }, + { + "epoch": 1.3922804253643166, + "grad_norm": 1.791980504989624, + "learning_rate": 5.2890000000000005e-06, + "loss": 0.1653, + "step": 1768 + }, + { + "epoch": 1.3930681370618354, + 
"grad_norm": 2.326119899749756, + "learning_rate": 5.292e-06, + "loss": 0.1263, + "step": 1769 + }, + { + "epoch": 1.3938558487593542, + "grad_norm": 2.082167863845825, + "learning_rate": 5.295e-06, + "loss": 0.155, + "step": 1770 + }, + { + "epoch": 1.3946435604568728, + "grad_norm": 4.058281421661377, + "learning_rate": 5.298e-06, + "loss": 0.7414, + "step": 1771 + }, + { + "epoch": 1.3954312721543916, + "grad_norm": 4.878024578094482, + "learning_rate": 5.301e-06, + "loss": 0.5877, + "step": 1772 + }, + { + "epoch": 1.3962189838519101, + "grad_norm": 2.7080273628234863, + "learning_rate": 5.304e-06, + "loss": 0.5451, + "step": 1773 + }, + { + "epoch": 1.397006695549429, + "grad_norm": 1.8606659173965454, + "learning_rate": 5.307e-06, + "loss": 0.3531, + "step": 1774 + }, + { + "epoch": 1.3977944072469475, + "grad_norm": 1.803621768951416, + "learning_rate": 5.31e-06, + "loss": 0.2898, + "step": 1775 + }, + { + "epoch": 1.3985821189444663, + "grad_norm": 1.6789801120758057, + "learning_rate": 5.313000000000001e-06, + "loss": 0.2122, + "step": 1776 + }, + { + "epoch": 1.399369830641985, + "grad_norm": 2.1800220012664795, + "learning_rate": 5.3160000000000004e-06, + "loss": 0.1817, + "step": 1777 + }, + { + "epoch": 1.4001575423395036, + "grad_norm": 1.2713563442230225, + "learning_rate": 5.319e-06, + "loss": 0.1366, + "step": 1778 + }, + { + "epoch": 1.4009452540370224, + "grad_norm": 2.0120058059692383, + "learning_rate": 5.322e-06, + "loss": 0.1528, + "step": 1779 + }, + { + "epoch": 1.4017329657345412, + "grad_norm": 1.3649629354476929, + "learning_rate": 5.325e-06, + "loss": 0.114, + "step": 1780 + }, + { + "epoch": 1.4025206774320598, + "grad_norm": 1.123642086982727, + "learning_rate": 5.3280000000000005e-06, + "loss": 0.1129, + "step": 1781 + }, + { + "epoch": 1.4033083891295786, + "grad_norm": 1.3527052402496338, + "learning_rate": 5.331e-06, + "loss": 0.1275, + "step": 1782 + }, + { + "epoch": 1.4040961008270973, + "grad_norm": 10.70815372467041, + "learning_rate": 5.334000000000001e-06, + "loss": 0.1562, + "step": 1783 + }, + { + "epoch": 1.404883812524616, + "grad_norm": 1.1072977781295776, + "learning_rate": 5.337e-06, + "loss": 0.101, + "step": 1784 + }, + { + "epoch": 1.4056715242221347, + "grad_norm": 1.741899013519287, + "learning_rate": 5.34e-06, + "loss": 0.1445, + "step": 1785 + }, + { + "epoch": 1.4064592359196535, + "grad_norm": 1.754320502281189, + "learning_rate": 5.343e-06, + "loss": 0.1194, + "step": 1786 + }, + { + "epoch": 1.407246947617172, + "grad_norm": 1.45916748046875, + "learning_rate": 5.346e-06, + "loss": 0.1067, + "step": 1787 + }, + { + "epoch": 1.4080346593146909, + "grad_norm": 1.5633985996246338, + "learning_rate": 5.349e-06, + "loss": 0.1282, + "step": 1788 + }, + { + "epoch": 1.4088223710122096, + "grad_norm": 1.2081446647644043, + "learning_rate": 5.352000000000001e-06, + "loss": 0.1182, + "step": 1789 + }, + { + "epoch": 1.4096100827097282, + "grad_norm": 1.3807957172393799, + "learning_rate": 5.3549999999999996e-06, + "loss": 0.14, + "step": 1790 + }, + { + "epoch": 1.410397794407247, + "grad_norm": 1.1427627801895142, + "learning_rate": 5.358e-06, + "loss": 0.1257, + "step": 1791 + }, + { + "epoch": 1.4111855061047658, + "grad_norm": 1.4518381357192993, + "learning_rate": 5.361e-06, + "loss": 0.1381, + "step": 1792 + }, + { + "epoch": 1.4119732178022844, + "grad_norm": 1.1549009084701538, + "learning_rate": 5.364e-06, + "loss": 0.0898, + "step": 1793 + }, + { + "epoch": 1.4127609294998031, + "grad_norm": 1.804787516593933, + "learning_rate": 
5.3670000000000005e-06, + "loss": 0.1232, + "step": 1794 + }, + { + "epoch": 1.4135486411973217, + "grad_norm": 1.5636800527572632, + "learning_rate": 5.37e-06, + "loss": 0.1556, + "step": 1795 + }, + { + "epoch": 1.4143363528948405, + "grad_norm": 1.1978609561920166, + "learning_rate": 5.373e-06, + "loss": 0.0888, + "step": 1796 + }, + { + "epoch": 1.415124064592359, + "grad_norm": 1.2240145206451416, + "learning_rate": 5.376e-06, + "loss": 0.1345, + "step": 1797 + }, + { + "epoch": 1.4159117762898779, + "grad_norm": 2.869180202484131, + "learning_rate": 5.379e-06, + "loss": 0.1503, + "step": 1798 + }, + { + "epoch": 1.4166994879873966, + "grad_norm": 1.2905280590057373, + "learning_rate": 5.382e-06, + "loss": 0.0728, + "step": 1799 + }, + { + "epoch": 1.4174871996849152, + "grad_norm": 1.6828737258911133, + "learning_rate": 5.385e-06, + "loss": 0.1231, + "step": 1800 + }, + { + "epoch": 1.418274911382434, + "grad_norm": 1.3699661493301392, + "learning_rate": 5.388000000000001e-06, + "loss": 0.0993, + "step": 1801 + }, + { + "epoch": 1.4190626230799528, + "grad_norm": 1.443221926689148, + "learning_rate": 5.391e-06, + "loss": 0.1405, + "step": 1802 + }, + { + "epoch": 1.4198503347774714, + "grad_norm": 2.1533172130584717, + "learning_rate": 5.3939999999999995e-06, + "loss": 0.1328, + "step": 1803 + }, + { + "epoch": 1.4206380464749901, + "grad_norm": 1.4091098308563232, + "learning_rate": 5.397e-06, + "loss": 0.1496, + "step": 1804 + }, + { + "epoch": 1.421425758172509, + "grad_norm": 1.646943211555481, + "learning_rate": 5.4e-06, + "loss": 0.1136, + "step": 1805 + }, + { + "epoch": 1.4222134698700275, + "grad_norm": 2.386762857437134, + "learning_rate": 5.403000000000001e-06, + "loss": 0.0902, + "step": 1806 + }, + { + "epoch": 1.4230011815675463, + "grad_norm": 1.3278599977493286, + "learning_rate": 5.406e-06, + "loss": 0.1294, + "step": 1807 + }, + { + "epoch": 1.423788893265065, + "grad_norm": 1.820590615272522, + "learning_rate": 5.408999999999999e-06, + "loss": 0.1406, + "step": 1808 + }, + { + "epoch": 1.4245766049625836, + "grad_norm": 2.176874876022339, + "learning_rate": 5.412e-06, + "loss": 0.124, + "step": 1809 + }, + { + "epoch": 1.4253643166601024, + "grad_norm": 1.499652624130249, + "learning_rate": 5.415e-06, + "loss": 0.121, + "step": 1810 + }, + { + "epoch": 1.4261520283576212, + "grad_norm": 1.684572696685791, + "learning_rate": 5.4180000000000005e-06, + "loss": 0.1329, + "step": 1811 + }, + { + "epoch": 1.4269397400551398, + "grad_norm": 1.4488623142242432, + "learning_rate": 5.421e-06, + "loss": 0.1426, + "step": 1812 + }, + { + "epoch": 1.4277274517526586, + "grad_norm": 2.1448886394500732, + "learning_rate": 5.424e-06, + "loss": 0.1888, + "step": 1813 + }, + { + "epoch": 1.4285151634501774, + "grad_norm": 1.9195634126663208, + "learning_rate": 5.427e-06, + "loss": 0.1413, + "step": 1814 + }, + { + "epoch": 1.429302875147696, + "grad_norm": 2.1322669982910156, + "learning_rate": 5.43e-06, + "loss": 0.1019, + "step": 1815 + }, + { + "epoch": 1.4300905868452147, + "grad_norm": 1.9278087615966797, + "learning_rate": 5.433e-06, + "loss": 0.1508, + "step": 1816 + }, + { + "epoch": 1.4308782985427333, + "grad_norm": 1.3047399520874023, + "learning_rate": 5.436e-06, + "loss": 0.1254, + "step": 1817 + }, + { + "epoch": 1.431666010240252, + "grad_norm": 2.0157995223999023, + "learning_rate": 5.439e-06, + "loss": 0.157, + "step": 1818 + }, + { + "epoch": 1.4324537219377707, + "grad_norm": 2.8333988189697266, + "learning_rate": 5.442000000000001e-06, + "loss": 0.2137, + "step": 
1819 + }, + { + "epoch": 1.4332414336352894, + "grad_norm": 1.9050788879394531, + "learning_rate": 5.445e-06, + "loss": 0.1722, + "step": 1820 + }, + { + "epoch": 1.4340291453328082, + "grad_norm": 2.911777973175049, + "learning_rate": 5.448e-06, + "loss": 0.7334, + "step": 1821 + }, + { + "epoch": 1.4348168570303268, + "grad_norm": 1.8431897163391113, + "learning_rate": 5.451e-06, + "loss": 0.4373, + "step": 1822 + }, + { + "epoch": 1.4356045687278456, + "grad_norm": 2.0756711959838867, + "learning_rate": 5.454e-06, + "loss": 0.4263, + "step": 1823 + }, + { + "epoch": 1.4363922804253644, + "grad_norm": 1.5907059907913208, + "learning_rate": 5.4570000000000004e-06, + "loss": 0.3339, + "step": 1824 + }, + { + "epoch": 1.437179992122883, + "grad_norm": 1.8925427198410034, + "learning_rate": 5.46e-06, + "loss": 0.3055, + "step": 1825 + }, + { + "epoch": 1.4379677038204017, + "grad_norm": 2.303807020187378, + "learning_rate": 5.463000000000001e-06, + "loss": 0.2033, + "step": 1826 + }, + { + "epoch": 1.4387554155179205, + "grad_norm": 1.6927978992462158, + "learning_rate": 5.466e-06, + "loss": 0.1688, + "step": 1827 + }, + { + "epoch": 1.439543127215439, + "grad_norm": 1.0549391508102417, + "learning_rate": 5.469e-06, + "loss": 0.1383, + "step": 1828 + }, + { + "epoch": 1.4403308389129579, + "grad_norm": 0.9359405040740967, + "learning_rate": 5.472e-06, + "loss": 0.1165, + "step": 1829 + }, + { + "epoch": 1.4411185506104767, + "grad_norm": 1.8646037578582764, + "learning_rate": 5.475e-06, + "loss": 0.1947, + "step": 1830 + }, + { + "epoch": 1.4419062623079952, + "grad_norm": 1.3268438577651978, + "learning_rate": 5.478000000000001e-06, + "loss": 0.1457, + "step": 1831 + }, + { + "epoch": 1.442693974005514, + "grad_norm": 2.6664154529571533, + "learning_rate": 5.4810000000000005e-06, + "loss": 0.1194, + "step": 1832 + }, + { + "epoch": 1.4434816857030328, + "grad_norm": 1.624679684638977, + "learning_rate": 5.4839999999999995e-06, + "loss": 0.1274, + "step": 1833 + }, + { + "epoch": 1.4442693974005514, + "grad_norm": 1.153001308441162, + "learning_rate": 5.487e-06, + "loss": 0.0788, + "step": 1834 + }, + { + "epoch": 1.4450571090980702, + "grad_norm": 1.2284952402114868, + "learning_rate": 5.49e-06, + "loss": 0.1067, + "step": 1835 + }, + { + "epoch": 1.4458448207955887, + "grad_norm": 1.4566774368286133, + "learning_rate": 5.493000000000001e-06, + "loss": 0.1159, + "step": 1836 + }, + { + "epoch": 1.4466325324931075, + "grad_norm": 1.315289855003357, + "learning_rate": 5.496e-06, + "loss": 0.118, + "step": 1837 + }, + { + "epoch": 1.447420244190626, + "grad_norm": 1.6144418716430664, + "learning_rate": 5.499e-06, + "loss": 0.0986, + "step": 1838 + }, + { + "epoch": 1.4482079558881449, + "grad_norm": 1.6927317380905151, + "learning_rate": 5.502e-06, + "loss": 0.1485, + "step": 1839 + }, + { + "epoch": 1.4489956675856637, + "grad_norm": 1.1745178699493408, + "learning_rate": 5.505e-06, + "loss": 0.1188, + "step": 1840 + }, + { + "epoch": 1.4497833792831822, + "grad_norm": 1.3836534023284912, + "learning_rate": 5.5080000000000005e-06, + "loss": 0.1173, + "step": 1841 + }, + { + "epoch": 1.450571090980701, + "grad_norm": 1.812691569328308, + "learning_rate": 5.511e-06, + "loss": 0.1509, + "step": 1842 + }, + { + "epoch": 1.4513588026782198, + "grad_norm": 1.034318447113037, + "learning_rate": 5.514e-06, + "loss": 0.0792, + "step": 1843 + }, + { + "epoch": 1.4521465143757384, + "grad_norm": 1.8402694463729858, + "learning_rate": 5.517000000000001e-06, + "loss": 0.1319, + "step": 1844 + }, + { + 
"epoch": 1.4529342260732572, + "grad_norm": 1.4978020191192627, + "learning_rate": 5.52e-06, + "loss": 0.0892, + "step": 1845 + }, + { + "epoch": 1.453721937770776, + "grad_norm": 1.099042296409607, + "learning_rate": 5.523e-06, + "loss": 0.1068, + "step": 1846 + }, + { + "epoch": 1.4545096494682945, + "grad_norm": 1.9491219520568848, + "learning_rate": 5.526e-06, + "loss": 0.1057, + "step": 1847 + }, + { + "epoch": 1.4552973611658133, + "grad_norm": 1.5086005926132202, + "learning_rate": 5.529e-06, + "loss": 0.0955, + "step": 1848 + }, + { + "epoch": 1.456085072863332, + "grad_norm": 1.4064431190490723, + "learning_rate": 5.5320000000000006e-06, + "loss": 0.1284, + "step": 1849 + }, + { + "epoch": 1.4568727845608507, + "grad_norm": 1.515031337738037, + "learning_rate": 5.535e-06, + "loss": 0.1115, + "step": 1850 + }, + { + "epoch": 1.4576604962583695, + "grad_norm": 6.6499481201171875, + "learning_rate": 5.537999999999999e-06, + "loss": 0.1544, + "step": 1851 + }, + { + "epoch": 1.4584482079558883, + "grad_norm": 1.383749008178711, + "learning_rate": 5.541e-06, + "loss": 0.1117, + "step": 1852 + }, + { + "epoch": 1.4592359196534068, + "grad_norm": 1.3480415344238281, + "learning_rate": 5.544e-06, + "loss": 0.1271, + "step": 1853 + }, + { + "epoch": 1.4600236313509256, + "grad_norm": 1.3776739835739136, + "learning_rate": 5.547e-06, + "loss": 0.1145, + "step": 1854 + }, + { + "epoch": 1.4608113430484444, + "grad_norm": 1.1569019556045532, + "learning_rate": 5.55e-06, + "loss": 0.1234, + "step": 1855 + }, + { + "epoch": 1.461599054745963, + "grad_norm": 1.8077685832977295, + "learning_rate": 5.553e-06, + "loss": 0.137, + "step": 1856 + }, + { + "epoch": 1.4623867664434818, + "grad_norm": 1.5820025205612183, + "learning_rate": 5.556e-06, + "loss": 0.1155, + "step": 1857 + }, + { + "epoch": 1.4631744781410003, + "grad_norm": 1.3611739873886108, + "learning_rate": 5.559e-06, + "loss": 0.1175, + "step": 1858 + }, + { + "epoch": 1.4639621898385191, + "grad_norm": 1.3247723579406738, + "learning_rate": 5.562e-06, + "loss": 0.1011, + "step": 1859 + }, + { + "epoch": 1.4647499015360377, + "grad_norm": 2.0782599449157715, + "learning_rate": 5.565e-06, + "loss": 0.1369, + "step": 1860 + }, + { + "epoch": 1.4655376132335565, + "grad_norm": 3.694274425506592, + "learning_rate": 5.568e-06, + "loss": 0.1217, + "step": 1861 + }, + { + "epoch": 1.4663253249310753, + "grad_norm": 1.860387921333313, + "learning_rate": 5.5710000000000005e-06, + "loss": 0.1728, + "step": 1862 + }, + { + "epoch": 1.4671130366285938, + "grad_norm": 2.562697172164917, + "learning_rate": 5.574e-06, + "loss": 0.183, + "step": 1863 + }, + { + "epoch": 1.4679007483261126, + "grad_norm": 1.616194248199463, + "learning_rate": 5.577e-06, + "loss": 0.1507, + "step": 1864 + }, + { + "epoch": 1.4686884600236314, + "grad_norm": 7.1597161293029785, + "learning_rate": 5.58e-06, + "loss": 0.1329, + "step": 1865 + }, + { + "epoch": 1.46947617172115, + "grad_norm": 2.1697075366973877, + "learning_rate": 5.583e-06, + "loss": 0.1305, + "step": 1866 + }, + { + "epoch": 1.4702638834186688, + "grad_norm": 1.366432785987854, + "learning_rate": 5.586e-06, + "loss": 0.1312, + "step": 1867 + }, + { + "epoch": 1.4710515951161875, + "grad_norm": 1.687129259109497, + "learning_rate": 5.589e-06, + "loss": 0.114, + "step": 1868 + }, + { + "epoch": 1.4718393068137061, + "grad_norm": 1.9121204614639282, + "learning_rate": 5.592000000000001e-06, + "loss": 0.1127, + "step": 1869 + }, + { + "epoch": 1.472627018511225, + "grad_norm": 1.8062865734100342, + 
"learning_rate": 5.595e-06, + "loss": 0.1512, + "step": 1870 + }, + { + "epoch": 1.4734147302087437, + "grad_norm": 4.356456756591797, + "learning_rate": 5.598e-06, + "loss": 0.8037, + "step": 1871 + }, + { + "epoch": 1.4742024419062623, + "grad_norm": 1.9680871963500977, + "learning_rate": 5.601e-06, + "loss": 0.5122, + "step": 1872 + }, + { + "epoch": 1.474990153603781, + "grad_norm": 1.8072710037231445, + "learning_rate": 5.604e-06, + "loss": 0.5478, + "step": 1873 + }, + { + "epoch": 1.4757778653012998, + "grad_norm": 2.3635144233703613, + "learning_rate": 5.607000000000001e-06, + "loss": 0.4286, + "step": 1874 + }, + { + "epoch": 1.4765655769988184, + "grad_norm": 1.8601572513580322, + "learning_rate": 5.6100000000000005e-06, + "loss": 0.3891, + "step": 1875 + }, + { + "epoch": 1.4773532886963372, + "grad_norm": 1.8740166425704956, + "learning_rate": 5.6129999999999995e-06, + "loss": 0.2408, + "step": 1876 + }, + { + "epoch": 1.478141000393856, + "grad_norm": 2.361624240875244, + "learning_rate": 5.616e-06, + "loss": 0.152, + "step": 1877 + }, + { + "epoch": 1.4789287120913746, + "grad_norm": 1.239048957824707, + "learning_rate": 5.619e-06, + "loss": 0.1688, + "step": 1878 + }, + { + "epoch": 1.4797164237888933, + "grad_norm": 1.4285888671875, + "learning_rate": 5.6220000000000006e-06, + "loss": 0.1562, + "step": 1879 + }, + { + "epoch": 1.480504135486412, + "grad_norm": 1.1779606342315674, + "learning_rate": 5.625e-06, + "loss": 0.1382, + "step": 1880 + }, + { + "epoch": 1.4812918471839307, + "grad_norm": 1.1590780019760132, + "learning_rate": 5.628e-06, + "loss": 0.1149, + "step": 1881 + }, + { + "epoch": 1.4820795588814493, + "grad_norm": 1.5822523832321167, + "learning_rate": 5.631e-06, + "loss": 0.1171, + "step": 1882 + }, + { + "epoch": 1.482867270578968, + "grad_norm": 1.3546675443649292, + "learning_rate": 5.634e-06, + "loss": 0.0866, + "step": 1883 + }, + { + "epoch": 1.4836549822764868, + "grad_norm": 1.143267035484314, + "learning_rate": 5.637e-06, + "loss": 0.0969, + "step": 1884 + }, + { + "epoch": 1.4844426939740054, + "grad_norm": 1.450421929359436, + "learning_rate": 5.64e-06, + "loss": 0.0997, + "step": 1885 + }, + { + "epoch": 1.4852304056715242, + "grad_norm": 1.0394494533538818, + "learning_rate": 5.643e-06, + "loss": 0.0963, + "step": 1886 + }, + { + "epoch": 1.486018117369043, + "grad_norm": 2.2599053382873535, + "learning_rate": 5.646000000000001e-06, + "loss": 0.0946, + "step": 1887 + }, + { + "epoch": 1.4868058290665616, + "grad_norm": 1.3435033559799194, + "learning_rate": 5.649e-06, + "loss": 0.1203, + "step": 1888 + }, + { + "epoch": 1.4875935407640803, + "grad_norm": 1.6374821662902832, + "learning_rate": 5.652e-06, + "loss": 0.1276, + "step": 1889 + }, + { + "epoch": 1.4883812524615991, + "grad_norm": 1.3638209104537964, + "learning_rate": 5.655e-06, + "loss": 0.0952, + "step": 1890 + }, + { + "epoch": 1.4891689641591177, + "grad_norm": 2.0050747394561768, + "learning_rate": 5.658e-06, + "loss": 0.1155, + "step": 1891 + }, + { + "epoch": 1.4899566758566365, + "grad_norm": 1.0565004348754883, + "learning_rate": 5.6610000000000005e-06, + "loss": 0.0912, + "step": 1892 + }, + { + "epoch": 1.4907443875541553, + "grad_norm": 1.0568649768829346, + "learning_rate": 5.664e-06, + "loss": 0.0853, + "step": 1893 + }, + { + "epoch": 1.4915320992516738, + "grad_norm": 1.5457583665847778, + "learning_rate": 5.667e-06, + "loss": 0.1572, + "step": 1894 + }, + { + "epoch": 1.4923198109491926, + "grad_norm": 1.473497748374939, + "learning_rate": 5.67e-06, + "loss": 0.1439, 
+ "step": 1895 + }, + { + "epoch": 1.4931075226467114, + "grad_norm": 1.2247576713562012, + "learning_rate": 5.673e-06, + "loss": 0.122, + "step": 1896 + }, + { + "epoch": 1.49389523434423, + "grad_norm": 1.417435646057129, + "learning_rate": 5.676e-06, + "loss": 0.1001, + "step": 1897 + }, + { + "epoch": 1.4946829460417488, + "grad_norm": 1.162069320678711, + "learning_rate": 5.679e-06, + "loss": 0.1179, + "step": 1898 + }, + { + "epoch": 1.4954706577392674, + "grad_norm": 1.4797500371932983, + "learning_rate": 5.682000000000001e-06, + "loss": 0.0854, + "step": 1899 + }, + { + "epoch": 1.4962583694367861, + "grad_norm": 1.8799631595611572, + "learning_rate": 5.685e-06, + "loss": 0.1069, + "step": 1900 + }, + { + "epoch": 1.4970460811343047, + "grad_norm": 2.496823310852051, + "learning_rate": 5.688e-06, + "loss": 0.1381, + "step": 1901 + }, + { + "epoch": 1.4978337928318235, + "grad_norm": 1.5541388988494873, + "learning_rate": 5.691e-06, + "loss": 0.1432, + "step": 1902 + }, + { + "epoch": 1.4986215045293423, + "grad_norm": 2.2380502223968506, + "learning_rate": 5.694e-06, + "loss": 0.1253, + "step": 1903 + }, + { + "epoch": 1.4994092162268609, + "grad_norm": 1.7040034532546997, + "learning_rate": 5.697000000000001e-06, + "loss": 0.1034, + "step": 1904 + }, + { + "epoch": 1.5001969279243796, + "grad_norm": 1.3872582912445068, + "learning_rate": 5.7000000000000005e-06, + "loss": 0.1261, + "step": 1905 + }, + { + "epoch": 1.5009846396218984, + "grad_norm": 1.181511640548706, + "learning_rate": 5.703e-06, + "loss": 0.0971, + "step": 1906 + }, + { + "epoch": 1.501772351319417, + "grad_norm": 1.3914154767990112, + "learning_rate": 5.706e-06, + "loss": 0.1304, + "step": 1907 + }, + { + "epoch": 1.5025600630169358, + "grad_norm": 1.1652450561523438, + "learning_rate": 5.709e-06, + "loss": 0.1049, + "step": 1908 + }, + { + "epoch": 1.5033477747144546, + "grad_norm": 2.8601927757263184, + "learning_rate": 5.7120000000000005e-06, + "loss": 0.1133, + "step": 1909 + }, + { + "epoch": 1.5041354864119731, + "grad_norm": 1.2588605880737305, + "learning_rate": 5.715e-06, + "loss": 0.0782, + "step": 1910 + }, + { + "epoch": 1.504923198109492, + "grad_norm": 1.3048579692840576, + "learning_rate": 5.718e-06, + "loss": 0.116, + "step": 1911 + }, + { + "epoch": 1.5057109098070107, + "grad_norm": 1.721673846244812, + "learning_rate": 5.721000000000001e-06, + "loss": 0.1168, + "step": 1912 + }, + { + "epoch": 1.5064986215045293, + "grad_norm": 1.718036413192749, + "learning_rate": 5.724e-06, + "loss": 0.123, + "step": 1913 + }, + { + "epoch": 1.507286333202048, + "grad_norm": 1.7615878582000732, + "learning_rate": 5.7269999999999995e-06, + "loss": 0.1543, + "step": 1914 + }, + { + "epoch": 1.5080740448995669, + "grad_norm": 1.5736769437789917, + "learning_rate": 5.73e-06, + "loss": 0.1048, + "step": 1915 + }, + { + "epoch": 1.5088617565970854, + "grad_norm": 3.1790571212768555, + "learning_rate": 5.733e-06, + "loss": 0.1128, + "step": 1916 + }, + { + "epoch": 1.5096494682946042, + "grad_norm": 1.8944861888885498, + "learning_rate": 5.736000000000001e-06, + "loss": 0.1584, + "step": 1917 + }, + { + "epoch": 1.510437179992123, + "grad_norm": 1.7968453168869019, + "learning_rate": 5.7390000000000004e-06, + "loss": 0.1366, + "step": 1918 + }, + { + "epoch": 1.5112248916896416, + "grad_norm": 2.0971381664276123, + "learning_rate": 5.741999999999999e-06, + "loss": 0.1442, + "step": 1919 + }, + { + "epoch": 1.5120126033871601, + "grad_norm": 2.528012990951538, + "learning_rate": 5.745e-06, + "loss": 0.1872, + "step": 
1920 + }, + { + "epoch": 1.5128003150846792, + "grad_norm": 4.079789638519287, + "learning_rate": 5.748e-06, + "loss": 0.6511, + "step": 1921 + }, + { + "epoch": 1.5135880267821977, + "grad_norm": 2.356743812561035, + "learning_rate": 5.7510000000000005e-06, + "loss": 0.4889, + "step": 1922 + }, + { + "epoch": 1.5143757384797163, + "grad_norm": 1.7749271392822266, + "learning_rate": 5.754e-06, + "loss": 0.4193, + "step": 1923 + }, + { + "epoch": 1.515163450177235, + "grad_norm": 1.2615163326263428, + "learning_rate": 5.757e-06, + "loss": 0.3599, + "step": 1924 + }, + { + "epoch": 1.5159511618747539, + "grad_norm": 2.0365724563598633, + "learning_rate": 5.76e-06, + "loss": 0.3451, + "step": 1925 + }, + { + "epoch": 1.5167388735722724, + "grad_norm": 3.1115074157714844, + "learning_rate": 5.763e-06, + "loss": 0.3545, + "step": 1926 + }, + { + "epoch": 1.5175265852697912, + "grad_norm": 1.309216856956482, + "learning_rate": 5.766e-06, + "loss": 0.1321, + "step": 1927 + }, + { + "epoch": 1.51831429696731, + "grad_norm": 1.611151933670044, + "learning_rate": 5.769e-06, + "loss": 0.2061, + "step": 1928 + }, + { + "epoch": 1.5191020086648286, + "grad_norm": 1.4887337684631348, + "learning_rate": 5.772e-06, + "loss": 0.1312, + "step": 1929 + }, + { + "epoch": 1.5198897203623474, + "grad_norm": 1.4200907945632935, + "learning_rate": 5.775000000000001e-06, + "loss": 0.1247, + "step": 1930 + }, + { + "epoch": 1.5206774320598662, + "grad_norm": 1.3261045217514038, + "learning_rate": 5.7779999999999996e-06, + "loss": 0.1351, + "step": 1931 + }, + { + "epoch": 1.5214651437573847, + "grad_norm": 1.3636444807052612, + "learning_rate": 5.781e-06, + "loss": 0.1772, + "step": 1932 + }, + { + "epoch": 1.5222528554549035, + "grad_norm": 1.417580485343933, + "learning_rate": 5.784e-06, + "loss": 0.1404, + "step": 1933 + }, + { + "epoch": 1.5230405671524223, + "grad_norm": 3.305147171020508, + "learning_rate": 5.787e-06, + "loss": 0.0778, + "step": 1934 + }, + { + "epoch": 1.5238282788499409, + "grad_norm": 1.5110878944396973, + "learning_rate": 5.7900000000000005e-06, + "loss": 0.1077, + "step": 1935 + }, + { + "epoch": 1.5246159905474597, + "grad_norm": 1.4746930599212646, + "learning_rate": 5.793e-06, + "loss": 0.1104, + "step": 1936 + }, + { + "epoch": 1.5254037022449785, + "grad_norm": 1.4300026893615723, + "learning_rate": 5.796e-06, + "loss": 0.1133, + "step": 1937 + }, + { + "epoch": 1.526191413942497, + "grad_norm": 1.080856442451477, + "learning_rate": 5.799e-06, + "loss": 0.096, + "step": 1938 + }, + { + "epoch": 1.5269791256400156, + "grad_norm": 1.4429258108139038, + "learning_rate": 5.802e-06, + "loss": 0.0942, + "step": 1939 + }, + { + "epoch": 1.5277668373375346, + "grad_norm": 1.8029433488845825, + "learning_rate": 5.805e-06, + "loss": 0.1209, + "step": 1940 + }, + { + "epoch": 1.5285545490350532, + "grad_norm": 1.3922439813613892, + "learning_rate": 5.808e-06, + "loss": 0.0992, + "step": 1941 + }, + { + "epoch": 1.5293422607325717, + "grad_norm": 1.1118546724319458, + "learning_rate": 5.811000000000001e-06, + "loss": 0.0872, + "step": 1942 + }, + { + "epoch": 1.5301299724300907, + "grad_norm": 2.094820022583008, + "learning_rate": 5.814e-06, + "loss": 0.0838, + "step": 1943 + }, + { + "epoch": 1.5309176841276093, + "grad_norm": 4.067007541656494, + "learning_rate": 5.8169999999999995e-06, + "loss": 0.0918, + "step": 1944 + }, + { + "epoch": 1.5317053958251279, + "grad_norm": 1.332331657409668, + "learning_rate": 5.82e-06, + "loss": 0.1063, + "step": 1945 + }, + { + "epoch": 1.5324931075226467, 
+ "grad_norm": 1.5611976385116577, + "learning_rate": 5.823e-06, + "loss": 0.0956, + "step": 1946 + }, + { + "epoch": 1.5332808192201655, + "grad_norm": 1.3629599809646606, + "learning_rate": 5.826000000000001e-06, + "loss": 0.1341, + "step": 1947 + }, + { + "epoch": 1.534068530917684, + "grad_norm": 2.417252779006958, + "learning_rate": 5.8290000000000004e-06, + "loss": 0.1022, + "step": 1948 + }, + { + "epoch": 1.5348562426152028, + "grad_norm": 2.001582384109497, + "learning_rate": 5.832e-06, + "loss": 0.1179, + "step": 1949 + }, + { + "epoch": 1.5356439543127216, + "grad_norm": 1.3021764755249023, + "learning_rate": 5.835e-06, + "loss": 0.1033, + "step": 1950 + }, + { + "epoch": 1.5364316660102402, + "grad_norm": 1.307317852973938, + "learning_rate": 5.838e-06, + "loss": 0.1185, + "step": 1951 + }, + { + "epoch": 1.537219377707759, + "grad_norm": 1.43360435962677, + "learning_rate": 5.8410000000000005e-06, + "loss": 0.1095, + "step": 1952 + }, + { + "epoch": 1.5380070894052777, + "grad_norm": 1.3849068880081177, + "learning_rate": 5.844e-06, + "loss": 0.0745, + "step": 1953 + }, + { + "epoch": 1.5387948011027963, + "grad_norm": 1.1410309076309204, + "learning_rate": 5.847e-06, + "loss": 0.0941, + "step": 1954 + }, + { + "epoch": 1.539582512800315, + "grad_norm": 1.6447511911392212, + "learning_rate": 5.850000000000001e-06, + "loss": 0.1145, + "step": 1955 + }, + { + "epoch": 1.540370224497834, + "grad_norm": 1.0723592042922974, + "learning_rate": 5.853e-06, + "loss": 0.0833, + "step": 1956 + }, + { + "epoch": 1.5411579361953525, + "grad_norm": 1.428427815437317, + "learning_rate": 5.856e-06, + "loss": 0.1334, + "step": 1957 + }, + { + "epoch": 1.5419456478928713, + "grad_norm": 1.6843966245651245, + "learning_rate": 5.859e-06, + "loss": 0.1111, + "step": 1958 + }, + { + "epoch": 1.54273335959039, + "grad_norm": 1.5524731874465942, + "learning_rate": 5.862e-06, + "loss": 0.1377, + "step": 1959 + }, + { + "epoch": 1.5435210712879086, + "grad_norm": 1.1505389213562012, + "learning_rate": 5.865000000000001e-06, + "loss": 0.0946, + "step": 1960 + }, + { + "epoch": 1.5443087829854272, + "grad_norm": 1.515657663345337, + "learning_rate": 5.868e-06, + "loss": 0.1152, + "step": 1961 + }, + { + "epoch": 1.5450964946829462, + "grad_norm": 1.841047763824463, + "learning_rate": 5.871e-06, + "loss": 0.1705, + "step": 1962 + }, + { + "epoch": 1.5458842063804648, + "grad_norm": 1.6482090950012207, + "learning_rate": 5.874e-06, + "loss": 0.1325, + "step": 1963 + }, + { + "epoch": 1.5466719180779833, + "grad_norm": 1.4114155769348145, + "learning_rate": 5.877e-06, + "loss": 0.1178, + "step": 1964 + }, + { + "epoch": 1.5474596297755023, + "grad_norm": 4.519859790802002, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.1196, + "step": 1965 + }, + { + "epoch": 1.548247341473021, + "grad_norm": 2.1789190769195557, + "learning_rate": 5.883e-06, + "loss": 0.1193, + "step": 1966 + }, + { + "epoch": 1.5490350531705395, + "grad_norm": 1.6485240459442139, + "learning_rate": 5.886000000000001e-06, + "loss": 0.1498, + "step": 1967 + }, + { + "epoch": 1.5498227648680583, + "grad_norm": 2.095276355743408, + "learning_rate": 5.889e-06, + "loss": 0.1412, + "step": 1968 + }, + { + "epoch": 1.550610476565577, + "grad_norm": 3.2384228706359863, + "learning_rate": 5.892e-06, + "loss": 0.149, + "step": 1969 + }, + { + "epoch": 1.5513981882630956, + "grad_norm": 3.485745906829834, + "learning_rate": 5.895e-06, + "loss": 0.2113, + "step": 1970 + }, + { + "epoch": 1.5521858999606144, + "grad_norm": 2.459972858428955, + 
"learning_rate": 5.898e-06, + "loss": 0.6344, + "step": 1971 + }, + { + "epoch": 1.5529736116581332, + "grad_norm": 4.250369071960449, + "learning_rate": 5.901000000000001e-06, + "loss": 0.4517, + "step": 1972 + }, + { + "epoch": 1.5537613233556518, + "grad_norm": 1.7247180938720703, + "learning_rate": 5.9040000000000006e-06, + "loss": 0.4358, + "step": 1973 + }, + { + "epoch": 1.5545490350531705, + "grad_norm": 1.6459481716156006, + "learning_rate": 5.9069999999999995e-06, + "loss": 0.3852, + "step": 1974 + }, + { + "epoch": 1.5553367467506893, + "grad_norm": 1.8898355960845947, + "learning_rate": 5.91e-06, + "loss": 0.336, + "step": 1975 + }, + { + "epoch": 1.556124458448208, + "grad_norm": 1.781427025794983, + "learning_rate": 5.913e-06, + "loss": 0.2291, + "step": 1976 + }, + { + "epoch": 1.5569121701457267, + "grad_norm": 1.5328742265701294, + "learning_rate": 5.916e-06, + "loss": 0.1791, + "step": 1977 + }, + { + "epoch": 1.5576998818432455, + "grad_norm": 1.5085737705230713, + "learning_rate": 5.919e-06, + "loss": 0.1073, + "step": 1978 + }, + { + "epoch": 1.558487593540764, + "grad_norm": 1.319383978843689, + "learning_rate": 5.922e-06, + "loss": 0.1477, + "step": 1979 + }, + { + "epoch": 1.5592753052382828, + "grad_norm": 1.5972825288772583, + "learning_rate": 5.925e-06, + "loss": 0.0824, + "step": 1980 + }, + { + "epoch": 1.5600630169358016, + "grad_norm": 1.0425525903701782, + "learning_rate": 5.928e-06, + "loss": 0.0987, + "step": 1981 + }, + { + "epoch": 1.5608507286333202, + "grad_norm": 1.138417363166809, + "learning_rate": 5.931e-06, + "loss": 0.1211, + "step": 1982 + }, + { + "epoch": 1.5616384403308388, + "grad_norm": 1.6066434383392334, + "learning_rate": 5.934e-06, + "loss": 0.1878, + "step": 1983 + }, + { + "epoch": 1.5624261520283578, + "grad_norm": 1.0254368782043457, + "learning_rate": 5.937e-06, + "loss": 0.1163, + "step": 1984 + }, + { + "epoch": 1.5632138637258763, + "grad_norm": 1.1590912342071533, + "learning_rate": 5.940000000000001e-06, + "loss": 0.0997, + "step": 1985 + }, + { + "epoch": 1.564001575423395, + "grad_norm": 0.887489914894104, + "learning_rate": 5.943e-06, + "loss": 0.076, + "step": 1986 + }, + { + "epoch": 1.5647892871209137, + "grad_norm": 1.8064242601394653, + "learning_rate": 5.9459999999999995e-06, + "loss": 0.0653, + "step": 1987 + }, + { + "epoch": 1.5655769988184325, + "grad_norm": 2.4287116527557373, + "learning_rate": 5.949e-06, + "loss": 0.0917, + "step": 1988 + }, + { + "epoch": 1.566364710515951, + "grad_norm": 1.0747828483581543, + "learning_rate": 5.952e-06, + "loss": 0.0974, + "step": 1989 + }, + { + "epoch": 1.5671524222134698, + "grad_norm": 1.3912250995635986, + "learning_rate": 5.955000000000001e-06, + "loss": 0.0993, + "step": 1990 + }, + { + "epoch": 1.5679401339109886, + "grad_norm": 2.357205629348755, + "learning_rate": 5.958e-06, + "loss": 0.0904, + "step": 1991 + }, + { + "epoch": 1.5687278456085072, + "grad_norm": 0.9537429213523865, + "learning_rate": 5.961e-06, + "loss": 0.0849, + "step": 1992 + }, + { + "epoch": 1.569515557306026, + "grad_norm": 1.3569415807724, + "learning_rate": 5.964e-06, + "loss": 0.1076, + "step": 1993 + }, + { + "epoch": 1.5703032690035448, + "grad_norm": 0.9131097197532654, + "learning_rate": 5.967e-06, + "loss": 0.0621, + "step": 1994 + }, + { + "epoch": 1.5710909807010633, + "grad_norm": 1.933903455734253, + "learning_rate": 5.9700000000000004e-06, + "loss": 0.0995, + "step": 1995 + }, + { + "epoch": 1.5718786923985821, + "grad_norm": 1.335556983947754, + "learning_rate": 5.973e-06, + 
"loss": 0.0934, + "step": 1996 + }, + { + "epoch": 1.572666404096101, + "grad_norm": 1.1439136266708374, + "learning_rate": 5.976e-06, + "loss": 0.0854, + "step": 1997 + }, + { + "epoch": 1.5734541157936195, + "grad_norm": 1.6454391479492188, + "learning_rate": 5.979000000000001e-06, + "loss": 0.1216, + "step": 1998 + }, + { + "epoch": 1.5742418274911383, + "grad_norm": 1.7055174112319946, + "learning_rate": 5.982e-06, + "loss": 0.1264, + "step": 1999 + }, + { + "epoch": 1.575029539188657, + "grad_norm": 1.4524608850479126, + "learning_rate": 5.985e-06, + "loss": 0.1158, + "step": 2000 + }, + { + "epoch": 1.575029539188657, + "eval_cer": 0.1786297534800529, + "eval_loss": 0.7047962546348572, + "eval_runtime": 16.3882, + "eval_samples_per_second": 18.55, + "eval_steps_per_second": 0.61, + "eval_wer": 0.6110897927858787, + "step": 2000 + }, + { + "epoch": 1.5758172508861756, + "grad_norm": 1.8582065105438232, + "learning_rate": 5.988e-06, + "loss": 0.1052, + "step": 2001 + }, + { + "epoch": 1.5766049625836942, + "grad_norm": 1.3737285137176514, + "learning_rate": 5.991e-06, + "loss": 0.1085, + "step": 2002 + }, + { + "epoch": 1.5773926742812132, + "grad_norm": 2.7276010513305664, + "learning_rate": 5.9940000000000005e-06, + "loss": 0.0851, + "step": 2003 + }, + { + "epoch": 1.5781803859787318, + "grad_norm": 1.6280888319015503, + "learning_rate": 5.997e-06, + "loss": 0.1044, + "step": 2004 + }, + { + "epoch": 1.5789680976762503, + "grad_norm": 1.3149701356887817, + "learning_rate": 6e-06, + "loss": 0.0955, + "step": 2005 + }, + { + "epoch": 1.5797558093737694, + "grad_norm": 1.5746914148330688, + "learning_rate": 6.003e-06, + "loss": 0.1297, + "step": 2006 + }, + { + "epoch": 1.580543521071288, + "grad_norm": 1.4080314636230469, + "learning_rate": 6.006e-06, + "loss": 0.1129, + "step": 2007 + }, + { + "epoch": 1.5813312327688065, + "grad_norm": 1.5916858911514282, + "learning_rate": 6.009e-06, + "loss": 0.1173, + "step": 2008 + }, + { + "epoch": 1.5821189444663253, + "grad_norm": 2.281121015548706, + "learning_rate": 6.012e-06, + "loss": 0.0916, + "step": 2009 + }, + { + "epoch": 1.582906656163844, + "grad_norm": 1.2259238958358765, + "learning_rate": 6.015000000000001e-06, + "loss": 0.1062, + "step": 2010 + }, + { + "epoch": 1.5836943678613626, + "grad_norm": 1.3116024732589722, + "learning_rate": 6.018e-06, + "loss": 0.0998, + "step": 2011 + }, + { + "epoch": 1.5844820795588814, + "grad_norm": 1.2662123441696167, + "learning_rate": 6.021e-06, + "loss": 0.1209, + "step": 2012 + }, + { + "epoch": 1.5852697912564002, + "grad_norm": 1.0627256631851196, + "learning_rate": 6.024e-06, + "loss": 0.1068, + "step": 2013 + }, + { + "epoch": 1.5860575029539188, + "grad_norm": 1.1478118896484375, + "learning_rate": 6.027e-06, + "loss": 0.1105, + "step": 2014 + }, + { + "epoch": 1.5868452146514376, + "grad_norm": 2.249784231185913, + "learning_rate": 6.030000000000001e-06, + "loss": 0.1229, + "step": 2015 + }, + { + "epoch": 1.5876329263489564, + "grad_norm": 1.4060561656951904, + "learning_rate": 6.0330000000000005e-06, + "loss": 0.1143, + "step": 2016 + }, + { + "epoch": 1.588420638046475, + "grad_norm": 1.4337695837020874, + "learning_rate": 6.0359999999999995e-06, + "loss": 0.1085, + "step": 2017 + }, + { + "epoch": 1.5892083497439937, + "grad_norm": 2.603407621383667, + "learning_rate": 6.039e-06, + "loss": 0.1271, + "step": 2018 + }, + { + "epoch": 1.5899960614415125, + "grad_norm": 1.9465705156326294, + "learning_rate": 6.042e-06, + "loss": 0.1402, + "step": 2019 + }, + { + "epoch": 
1.590783773139031, + "grad_norm": 1.5506170988082886, + "learning_rate": 6.0450000000000006e-06, + "loss": 0.1364, + "step": 2020 + }, + { + "epoch": 1.5915714848365499, + "grad_norm": 2.3310651779174805, + "learning_rate": 6.048e-06, + "loss": 0.6688, + "step": 2021 + }, + { + "epoch": 1.5923591965340687, + "grad_norm": 2.3944015502929688, + "learning_rate": 6.051e-06, + "loss": 0.5934, + "step": 2022 + }, + { + "epoch": 1.5931469082315872, + "grad_norm": 1.5833103656768799, + "learning_rate": 6.054e-06, + "loss": 0.3524, + "step": 2023 + }, + { + "epoch": 1.5939346199291058, + "grad_norm": 1.2483621835708618, + "learning_rate": 6.057e-06, + "loss": 0.3585, + "step": 2024 + }, + { + "epoch": 1.5947223316266248, + "grad_norm": 1.6799554824829102, + "learning_rate": 6.0600000000000004e-06, + "loss": 0.3589, + "step": 2025 + }, + { + "epoch": 1.5955100433241434, + "grad_norm": 1.2508004903793335, + "learning_rate": 6.063e-06, + "loss": 0.2621, + "step": 2026 + }, + { + "epoch": 1.596297755021662, + "grad_norm": 0.9876098036766052, + "learning_rate": 6.066e-06, + "loss": 0.1303, + "step": 2027 + }, + { + "epoch": 1.597085466719181, + "grad_norm": 1.0982367992401123, + "learning_rate": 6.069000000000001e-06, + "loss": 0.1687, + "step": 2028 + }, + { + "epoch": 1.5978731784166995, + "grad_norm": 1.0325015783309937, + "learning_rate": 6.072e-06, + "loss": 0.1112, + "step": 2029 + }, + { + "epoch": 1.598660890114218, + "grad_norm": 1.209921956062317, + "learning_rate": 6.075e-06, + "loss": 0.108, + "step": 2030 + }, + { + "epoch": 1.5994486018117369, + "grad_norm": 1.6946083307266235, + "learning_rate": 6.078e-06, + "loss": 0.1011, + "step": 2031 + }, + { + "epoch": 1.6002363135092557, + "grad_norm": 0.7468725442886353, + "learning_rate": 6.081e-06, + "loss": 0.0904, + "step": 2032 + }, + { + "epoch": 1.6010240252067742, + "grad_norm": 1.1540709733963013, + "learning_rate": 6.0840000000000005e-06, + "loss": 0.0752, + "step": 2033 + }, + { + "epoch": 1.601811736904293, + "grad_norm": 1.018190622329712, + "learning_rate": 6.087e-06, + "loss": 0.0903, + "step": 2034 + }, + { + "epoch": 1.6025994486018118, + "grad_norm": 1.2203247547149658, + "learning_rate": 6.090000000000001e-06, + "loss": 0.0863, + "step": 2035 + }, + { + "epoch": 1.6033871602993304, + "grad_norm": 1.319166898727417, + "learning_rate": 6.093e-06, + "loss": 0.117, + "step": 2036 + }, + { + "epoch": 1.6041748719968492, + "grad_norm": 1.6532902717590332, + "learning_rate": 6.096e-06, + "loss": 0.1322, + "step": 2037 + }, + { + "epoch": 1.604962583694368, + "grad_norm": 1.2487143278121948, + "learning_rate": 6.099e-06, + "loss": 0.113, + "step": 2038 + }, + { + "epoch": 1.6057502953918865, + "grad_norm": 1.2994301319122314, + "learning_rate": 6.102e-06, + "loss": 0.1193, + "step": 2039 + }, + { + "epoch": 1.6065380070894053, + "grad_norm": 0.9619595408439636, + "learning_rate": 6.105e-06, + "loss": 0.1104, + "step": 2040 + }, + { + "epoch": 1.607325718786924, + "grad_norm": 1.1607786417007446, + "learning_rate": 6.108000000000001e-06, + "loss": 0.087, + "step": 2041 + }, + { + "epoch": 1.6081134304844427, + "grad_norm": 1.3503427505493164, + "learning_rate": 6.111e-06, + "loss": 0.1074, + "step": 2042 + }, + { + "epoch": 1.6089011421819615, + "grad_norm": 1.0785471200942993, + "learning_rate": 6.114e-06, + "loss": 0.1034, + "step": 2043 + }, + { + "epoch": 1.6096888538794802, + "grad_norm": 0.8946079015731812, + "learning_rate": 6.117e-06, + "loss": 0.0968, + "step": 2044 + }, + { + "epoch": 1.6104765655769988, + "grad_norm": 
1.7009787559509277, + "learning_rate": 6.12e-06, + "loss": 0.1248, + "step": 2045 + }, + { + "epoch": 1.6112642772745174, + "grad_norm": 1.2936779260635376, + "learning_rate": 6.1230000000000005e-06, + "loss": 0.0972, + "step": 2046 + }, + { + "epoch": 1.6120519889720364, + "grad_norm": 0.9326944351196289, + "learning_rate": 6.126e-06, + "loss": 0.0704, + "step": 2047 + }, + { + "epoch": 1.612839700669555, + "grad_norm": 1.0497071743011475, + "learning_rate": 6.129e-06, + "loss": 0.0983, + "step": 2048 + }, + { + "epoch": 1.6136274123670735, + "grad_norm": 1.2315483093261719, + "learning_rate": 6.132e-06, + "loss": 0.1125, + "step": 2049 + }, + { + "epoch": 1.6144151240645923, + "grad_norm": 1.4356440305709839, + "learning_rate": 6.135e-06, + "loss": 0.1149, + "step": 2050 + }, + { + "epoch": 1.615202835762111, + "grad_norm": 1.3301622867584229, + "learning_rate": 6.138e-06, + "loss": 0.0997, + "step": 2051 + }, + { + "epoch": 1.6159905474596297, + "grad_norm": 1.392967939376831, + "learning_rate": 6.141e-06, + "loss": 0.091, + "step": 2052 + }, + { + "epoch": 1.6167782591571485, + "grad_norm": 1.4408767223358154, + "learning_rate": 6.144000000000001e-06, + "loss": 0.1169, + "step": 2053 + }, + { + "epoch": 1.6175659708546672, + "grad_norm": 1.9372141361236572, + "learning_rate": 6.147e-06, + "loss": 0.0814, + "step": 2054 + }, + { + "epoch": 1.6183536825521858, + "grad_norm": 1.0979576110839844, + "learning_rate": 6.1499999999999996e-06, + "loss": 0.1081, + "step": 2055 + }, + { + "epoch": 1.6191413942497046, + "grad_norm": 1.8980953693389893, + "learning_rate": 6.153e-06, + "loss": 0.1114, + "step": 2056 + }, + { + "epoch": 1.6199291059472234, + "grad_norm": 1.705552339553833, + "learning_rate": 6.156e-06, + "loss": 0.1066, + "step": 2057 + }, + { + "epoch": 1.620716817644742, + "grad_norm": 1.518449306488037, + "learning_rate": 6.159000000000001e-06, + "loss": 0.1272, + "step": 2058 + }, + { + "epoch": 1.6215045293422607, + "grad_norm": 1.366895079612732, + "learning_rate": 6.1620000000000005e-06, + "loss": 0.113, + "step": 2059 + }, + { + "epoch": 1.6222922410397795, + "grad_norm": 1.5549808740615845, + "learning_rate": 6.164999999999999e-06, + "loss": 0.1245, + "step": 2060 + }, + { + "epoch": 1.623079952737298, + "grad_norm": 2.2337355613708496, + "learning_rate": 6.168e-06, + "loss": 0.0936, + "step": 2061 + }, + { + "epoch": 1.623867664434817, + "grad_norm": 1.6512267589569092, + "learning_rate": 6.171e-06, + "loss": 0.1409, + "step": 2062 + }, + { + "epoch": 1.6246553761323357, + "grad_norm": 2.6129202842712402, + "learning_rate": 6.1740000000000005e-06, + "loss": 0.0948, + "step": 2063 + }, + { + "epoch": 1.6254430878298542, + "grad_norm": 1.8567310571670532, + "learning_rate": 6.177e-06, + "loss": 0.1418, + "step": 2064 + }, + { + "epoch": 1.6262307995273728, + "grad_norm": 1.4097156524658203, + "learning_rate": 6.18e-06, + "loss": 0.1124, + "step": 2065 + }, + { + "epoch": 1.6270185112248918, + "grad_norm": 1.6893047094345093, + "learning_rate": 6.183e-06, + "loss": 0.1104, + "step": 2066 + }, + { + "epoch": 1.6278062229224104, + "grad_norm": 1.8430372476577759, + "learning_rate": 6.186e-06, + "loss": 0.1261, + "step": 2067 + }, + { + "epoch": 1.628593934619929, + "grad_norm": 1.4355581998825073, + "learning_rate": 6.189e-06, + "loss": 0.1174, + "step": 2068 + }, + { + "epoch": 1.629381646317448, + "grad_norm": 1.961177945137024, + "learning_rate": 6.192e-06, + "loss": 0.1677, + "step": 2069 + }, + { + "epoch": 1.6301693580149665, + "grad_norm": 2.547154188156128, + 
"learning_rate": 6.195e-06, + "loss": 0.1387, + "step": 2070 + }, + { + "epoch": 1.630957069712485, + "grad_norm": 1.928681492805481, + "learning_rate": 6.198000000000001e-06, + "loss": 0.5432, + "step": 2071 + }, + { + "epoch": 1.631744781410004, + "grad_norm": 2.220733642578125, + "learning_rate": 6.201e-06, + "loss": 0.4739, + "step": 2072 + }, + { + "epoch": 1.6325324931075227, + "grad_norm": 3.4729127883911133, + "learning_rate": 6.204e-06, + "loss": 0.6321, + "step": 2073 + }, + { + "epoch": 1.6333202048050413, + "grad_norm": 2.9729576110839844, + "learning_rate": 6.207e-06, + "loss": 0.3783, + "step": 2074 + }, + { + "epoch": 1.63410791650256, + "grad_norm": 1.6420705318450928, + "learning_rate": 6.21e-06, + "loss": 0.2917, + "step": 2075 + }, + { + "epoch": 1.6348956282000788, + "grad_norm": 1.2082351446151733, + "learning_rate": 6.2130000000000005e-06, + "loss": 0.1816, + "step": 2076 + }, + { + "epoch": 1.6356833398975974, + "grad_norm": 1.1329317092895508, + "learning_rate": 6.216e-06, + "loss": 0.1667, + "step": 2077 + }, + { + "epoch": 1.6364710515951162, + "grad_norm": 1.0313200950622559, + "learning_rate": 6.219000000000001e-06, + "loss": 0.1469, + "step": 2078 + }, + { + "epoch": 1.637258763292635, + "grad_norm": 1.190723180770874, + "learning_rate": 6.222e-06, + "loss": 0.1805, + "step": 2079 + }, + { + "epoch": 1.6380464749901535, + "grad_norm": 1.113817572593689, + "learning_rate": 6.225e-06, + "loss": 0.1021, + "step": 2080 + }, + { + "epoch": 1.6388341866876723, + "grad_norm": 0.9032092094421387, + "learning_rate": 6.228e-06, + "loss": 0.0983, + "step": 2081 + }, + { + "epoch": 1.6396218983851911, + "grad_norm": 0.994355320930481, + "learning_rate": 6.231e-06, + "loss": 0.0721, + "step": 2082 + }, + { + "epoch": 1.6404096100827097, + "grad_norm": 1.3908747434616089, + "learning_rate": 6.234000000000001e-06, + "loss": 0.1095, + "step": 2083 + }, + { + "epoch": 1.6411973217802285, + "grad_norm": 1.022200345993042, + "learning_rate": 6.237000000000001e-06, + "loss": 0.0909, + "step": 2084 + }, + { + "epoch": 1.6419850334777473, + "grad_norm": 1.1234228610992432, + "learning_rate": 6.2399999999999995e-06, + "loss": 0.0806, + "step": 2085 + }, + { + "epoch": 1.6427727451752658, + "grad_norm": 0.9880666136741638, + "learning_rate": 6.243e-06, + "loss": 0.0947, + "step": 2086 + }, + { + "epoch": 1.6435604568727844, + "grad_norm": 1.1043938398361206, + "learning_rate": 6.246e-06, + "loss": 0.107, + "step": 2087 + }, + { + "epoch": 1.6443481685703034, + "grad_norm": 1.282501459121704, + "learning_rate": 6.249000000000001e-06, + "loss": 0.0842, + "step": 2088 + }, + { + "epoch": 1.645135880267822, + "grad_norm": 3.414762496948242, + "learning_rate": 6.2520000000000004e-06, + "loss": 0.11, + "step": 2089 + }, + { + "epoch": 1.6459235919653405, + "grad_norm": 1.0352938175201416, + "learning_rate": 6.255e-06, + "loss": 0.0926, + "step": 2090 + }, + { + "epoch": 1.6467113036628593, + "grad_norm": 1.7186626195907593, + "learning_rate": 6.258e-06, + "loss": 0.1075, + "step": 2091 + }, + { + "epoch": 1.6474990153603781, + "grad_norm": 1.079129934310913, + "learning_rate": 6.261e-06, + "loss": 0.0774, + "step": 2092 + }, + { + "epoch": 1.6482867270578967, + "grad_norm": 1.2598578929901123, + "learning_rate": 6.2640000000000005e-06, + "loss": 0.0692, + "step": 2093 + }, + { + "epoch": 1.6490744387554155, + "grad_norm": 1.1474103927612305, + "learning_rate": 6.267e-06, + "loss": 0.0714, + "step": 2094 + }, + { + "epoch": 1.6498621504529343, + "grad_norm": 1.3369133472442627, + 
"learning_rate": 6.27e-06, + "loss": 0.1564, + "step": 2095 + }, + { + "epoch": 1.6506498621504528, + "grad_norm": 1.1265807151794434, + "learning_rate": 6.273000000000001e-06, + "loss": 0.0824, + "step": 2096 + }, + { + "epoch": 1.6514375738479716, + "grad_norm": 1.4499932527542114, + "learning_rate": 6.276e-06, + "loss": 0.1214, + "step": 2097 + }, + { + "epoch": 1.6522252855454904, + "grad_norm": 1.3029086589813232, + "learning_rate": 6.279e-06, + "loss": 0.0985, + "step": 2098 + }, + { + "epoch": 1.653012997243009, + "grad_norm": 1.5764985084533691, + "learning_rate": 6.282e-06, + "loss": 0.0824, + "step": 2099 + }, + { + "epoch": 1.6538007089405278, + "grad_norm": 1.3124969005584717, + "learning_rate": 6.285e-06, + "loss": 0.1133, + "step": 2100 + }, + { + "epoch": 1.6545884206380466, + "grad_norm": 1.48678719997406, + "learning_rate": 6.288000000000001e-06, + "loss": 0.1239, + "step": 2101 + }, + { + "epoch": 1.6553761323355651, + "grad_norm": 2.801495313644409, + "learning_rate": 6.291e-06, + "loss": 0.1073, + "step": 2102 + }, + { + "epoch": 1.656163844033084, + "grad_norm": 2.071078062057495, + "learning_rate": 6.293999999999999e-06, + "loss": 0.111, + "step": 2103 + }, + { + "epoch": 1.6569515557306027, + "grad_norm": 1.1787440776824951, + "learning_rate": 6.297e-06, + "loss": 0.0791, + "step": 2104 + }, + { + "epoch": 1.6577392674281213, + "grad_norm": 1.1216635704040527, + "learning_rate": 6.3e-06, + "loss": 0.0835, + "step": 2105 + }, + { + "epoch": 1.65852697912564, + "grad_norm": 1.571854591369629, + "learning_rate": 6.3030000000000005e-06, + "loss": 0.084, + "step": 2106 + }, + { + "epoch": 1.6593146908231589, + "grad_norm": 1.1552097797393799, + "learning_rate": 6.306e-06, + "loss": 0.0943, + "step": 2107 + }, + { + "epoch": 1.6601024025206774, + "grad_norm": 2.074686050415039, + "learning_rate": 6.309e-06, + "loss": 0.0919, + "step": 2108 + }, + { + "epoch": 1.660890114218196, + "grad_norm": 2.2527873516082764, + "learning_rate": 6.312e-06, + "loss": 0.0995, + "step": 2109 + }, + { + "epoch": 1.661677825915715, + "grad_norm": 1.308145523071289, + "learning_rate": 6.315e-06, + "loss": 0.0892, + "step": 2110 + }, + { + "epoch": 1.6624655376132336, + "grad_norm": 1.8404160737991333, + "learning_rate": 6.318e-06, + "loss": 0.1179, + "step": 2111 + }, + { + "epoch": 1.6632532493107521, + "grad_norm": 1.318767786026001, + "learning_rate": 6.321e-06, + "loss": 0.0901, + "step": 2112 + }, + { + "epoch": 1.664040961008271, + "grad_norm": 1.762204885482788, + "learning_rate": 6.324e-06, + "loss": 0.1455, + "step": 2113 + }, + { + "epoch": 1.6648286727057897, + "grad_norm": 1.4148224592208862, + "learning_rate": 6.327000000000001e-06, + "loss": 0.1132, + "step": 2114 + }, + { + "epoch": 1.6656163844033083, + "grad_norm": 1.2401576042175293, + "learning_rate": 6.3299999999999995e-06, + "loss": 0.0887, + "step": 2115 + }, + { + "epoch": 1.666404096100827, + "grad_norm": 1.6121525764465332, + "learning_rate": 6.333e-06, + "loss": 0.1269, + "step": 2116 + }, + { + "epoch": 1.6671918077983459, + "grad_norm": 1.5880990028381348, + "learning_rate": 6.336e-06, + "loss": 0.1345, + "step": 2117 + }, + { + "epoch": 1.6679795194958644, + "grad_norm": 1.7938941717147827, + "learning_rate": 6.339e-06, + "loss": 0.075, + "step": 2118 + }, + { + "epoch": 1.6687672311933832, + "grad_norm": 1.9858896732330322, + "learning_rate": 6.3420000000000004e-06, + "loss": 0.1368, + "step": 2119 + }, + { + "epoch": 1.669554942890902, + "grad_norm": 1.6454588174819946, + "learning_rate": 6.345e-06, + "loss": 
0.1227, + "step": 2120 + }, + { + "epoch": 1.6703426545884206, + "grad_norm": 4.161025047302246, + "learning_rate": 6.348000000000001e-06, + "loss": 0.7035, + "step": 2121 + }, + { + "epoch": 1.6711303662859394, + "grad_norm": 1.6259748935699463, + "learning_rate": 6.351e-06, + "loss": 0.4352, + "step": 2122 + }, + { + "epoch": 1.6719180779834582, + "grad_norm": 1.6076951026916504, + "learning_rate": 6.354e-06, + "loss": 0.31, + "step": 2123 + }, + { + "epoch": 1.6727057896809767, + "grad_norm": 1.99458646774292, + "learning_rate": 6.357e-06, + "loss": 0.5162, + "step": 2124 + }, + { + "epoch": 1.6734935013784955, + "grad_norm": 1.4265570640563965, + "learning_rate": 6.36e-06, + "loss": 0.2643, + "step": 2125 + }, + { + "epoch": 1.6742812130760143, + "grad_norm": 1.2403234243392944, + "learning_rate": 6.363000000000001e-06, + "loss": 0.1581, + "step": 2126 + }, + { + "epoch": 1.6750689247735329, + "grad_norm": 0.8488486409187317, + "learning_rate": 6.3660000000000005e-06, + "loss": 0.1107, + "step": 2127 + }, + { + "epoch": 1.6758566364710514, + "grad_norm": 1.1754924058914185, + "learning_rate": 6.3689999999999995e-06, + "loss": 0.1285, + "step": 2128 + }, + { + "epoch": 1.6766443481685704, + "grad_norm": 2.575376510620117, + "learning_rate": 6.372e-06, + "loss": 0.0866, + "step": 2129 + }, + { + "epoch": 1.677432059866089, + "grad_norm": 1.2374447584152222, + "learning_rate": 6.375e-06, + "loss": 0.1084, + "step": 2130 + }, + { + "epoch": 1.6782197715636076, + "grad_norm": 0.8922511339187622, + "learning_rate": 6.378000000000001e-06, + "loss": 0.0981, + "step": 2131 + }, + { + "epoch": 1.6790074832611266, + "grad_norm": 1.3427491188049316, + "learning_rate": 6.381e-06, + "loss": 0.0883, + "step": 2132 + }, + { + "epoch": 1.6797951949586452, + "grad_norm": 1.023176670074463, + "learning_rate": 6.384e-06, + "loss": 0.0794, + "step": 2133 + }, + { + "epoch": 1.6805829066561637, + "grad_norm": 5.002610683441162, + "learning_rate": 6.387e-06, + "loss": 0.1641, + "step": 2134 + }, + { + "epoch": 1.6813706183536825, + "grad_norm": 0.8927420377731323, + "learning_rate": 6.39e-06, + "loss": 0.0689, + "step": 2135 + }, + { + "epoch": 1.6821583300512013, + "grad_norm": 1.3570008277893066, + "learning_rate": 6.3930000000000005e-06, + "loss": 0.1546, + "step": 2136 + }, + { + "epoch": 1.6829460417487199, + "grad_norm": 1.350559949874878, + "learning_rate": 6.396e-06, + "loss": 0.063, + "step": 2137 + }, + { + "epoch": 1.6837337534462387, + "grad_norm": 1.2928104400634766, + "learning_rate": 6.399e-06, + "loss": 0.0948, + "step": 2138 + }, + { + "epoch": 1.6845214651437574, + "grad_norm": 1.2666258811950684, + "learning_rate": 6.402000000000001e-06, + "loss": 0.073, + "step": 2139 + }, + { + "epoch": 1.685309176841276, + "grad_norm": 1.3382292985916138, + "learning_rate": 6.405e-06, + "loss": 0.0707, + "step": 2140 + }, + { + "epoch": 1.6860968885387948, + "grad_norm": 1.412780523300171, + "learning_rate": 6.408e-06, + "loss": 0.1026, + "step": 2141 + }, + { + "epoch": 1.6868846002363136, + "grad_norm": 1.7521750926971436, + "learning_rate": 6.411e-06, + "loss": 0.1048, + "step": 2142 + }, + { + "epoch": 1.6876723119338322, + "grad_norm": 1.5471895933151245, + "learning_rate": 6.414e-06, + "loss": 0.1019, + "step": 2143 + }, + { + "epoch": 1.688460023631351, + "grad_norm": 4.072195053100586, + "learning_rate": 6.4170000000000006e-06, + "loss": 0.1289, + "step": 2144 + }, + { + "epoch": 1.6892477353288697, + "grad_norm": 1.1209743022918701, + "learning_rate": 6.42e-06, + "loss": 0.0901, + "step": 2145 
+ }, + { + "epoch": 1.6900354470263883, + "grad_norm": 1.1713190078735352, + "learning_rate": 6.423e-06, + "loss": 0.0753, + "step": 2146 + }, + { + "epoch": 1.690823158723907, + "grad_norm": 0.9434868693351746, + "learning_rate": 6.426e-06, + "loss": 0.085, + "step": 2147 + }, + { + "epoch": 1.6916108704214259, + "grad_norm": 1.1587438583374023, + "learning_rate": 6.429e-06, + "loss": 0.1136, + "step": 2148 + }, + { + "epoch": 1.6923985821189445, + "grad_norm": 1.2528603076934814, + "learning_rate": 6.432e-06, + "loss": 0.0911, + "step": 2149 + }, + { + "epoch": 1.693186293816463, + "grad_norm": 1.6593995094299316, + "learning_rate": 6.435e-06, + "loss": 0.0868, + "step": 2150 + }, + { + "epoch": 1.693974005513982, + "grad_norm": 1.4747881889343262, + "learning_rate": 6.438000000000001e-06, + "loss": 0.1052, + "step": 2151 + }, + { + "epoch": 1.6947617172115006, + "grad_norm": 1.764748454093933, + "learning_rate": 6.441e-06, + "loss": 0.082, + "step": 2152 + }, + { + "epoch": 1.6955494289090192, + "grad_norm": 1.1839722394943237, + "learning_rate": 6.444e-06, + "loss": 0.1094, + "step": 2153 + }, + { + "epoch": 1.696337140606538, + "grad_norm": 1.6205731630325317, + "learning_rate": 6.447e-06, + "loss": 0.0886, + "step": 2154 + }, + { + "epoch": 1.6971248523040567, + "grad_norm": 1.1627981662750244, + "learning_rate": 6.45e-06, + "loss": 0.0947, + "step": 2155 + }, + { + "epoch": 1.6979125640015753, + "grad_norm": 1.210890293121338, + "learning_rate": 6.453000000000001e-06, + "loss": 0.0812, + "step": 2156 + }, + { + "epoch": 1.698700275699094, + "grad_norm": 2.121610641479492, + "learning_rate": 6.4560000000000005e-06, + "loss": 0.0857, + "step": 2157 + }, + { + "epoch": 1.6994879873966129, + "grad_norm": 1.029860019683838, + "learning_rate": 6.4589999999999995e-06, + "loss": 0.0809, + "step": 2158 + }, + { + "epoch": 1.7002756990941315, + "grad_norm": 1.2823312282562256, + "learning_rate": 6.462e-06, + "loss": 0.0869, + "step": 2159 + }, + { + "epoch": 1.7010634107916502, + "grad_norm": 1.4983186721801758, + "learning_rate": 6.465e-06, + "loss": 0.1117, + "step": 2160 + }, + { + "epoch": 1.701851122489169, + "grad_norm": 1.0903817415237427, + "learning_rate": 6.468000000000001e-06, + "loss": 0.1036, + "step": 2161 + }, + { + "epoch": 1.7026388341866876, + "grad_norm": 2.0817043781280518, + "learning_rate": 6.471e-06, + "loss": 0.1176, + "step": 2162 + }, + { + "epoch": 1.7034265458842064, + "grad_norm": 1.1206567287445068, + "learning_rate": 6.474e-06, + "loss": 0.0792, + "step": 2163 + }, + { + "epoch": 1.7042142575817252, + "grad_norm": 1.4174076318740845, + "learning_rate": 6.477000000000001e-06, + "loss": 0.0975, + "step": 2164 + }, + { + "epoch": 1.7050019692792437, + "grad_norm": 2.0874176025390625, + "learning_rate": 6.48e-06, + "loss": 0.1, + "step": 2165 + }, + { + "epoch": 1.7057896809767625, + "grad_norm": 1.360097885131836, + "learning_rate": 6.483e-06, + "loss": 0.0908, + "step": 2166 + }, + { + "epoch": 1.7065773926742813, + "grad_norm": 1.700896978378296, + "learning_rate": 6.486e-06, + "loss": 0.1162, + "step": 2167 + }, + { + "epoch": 1.7073651043718, + "grad_norm": 2.6971216201782227, + "learning_rate": 6.489e-06, + "loss": 0.1737, + "step": 2168 + }, + { + "epoch": 1.7081528160693187, + "grad_norm": 1.7689121961593628, + "learning_rate": 6.492000000000001e-06, + "loss": 0.1264, + "step": 2169 + }, + { + "epoch": 1.7089405277668375, + "grad_norm": 2.931126356124878, + "learning_rate": 6.4950000000000005e-06, + "loss": 0.155, + "step": 2170 + }, + { + "epoch": 
1.709728239464356, + "grad_norm": 2.7281861305236816, + "learning_rate": 6.4979999999999994e-06, + "loss": 0.6333, + "step": 2171 + }, + { + "epoch": 1.7105159511618746, + "grad_norm": 1.6676089763641357, + "learning_rate": 6.501e-06, + "loss": 0.3764, + "step": 2172 + }, + { + "epoch": 1.7113036628593936, + "grad_norm": 1.5360935926437378, + "learning_rate": 6.504e-06, + "loss": 0.351, + "step": 2173 + }, + { + "epoch": 1.7120913745569122, + "grad_norm": 1.8415836095809937, + "learning_rate": 6.5070000000000005e-06, + "loss": 0.4062, + "step": 2174 + }, + { + "epoch": 1.7128790862544307, + "grad_norm": 1.4885926246643066, + "learning_rate": 6.51e-06, + "loss": 0.2115, + "step": 2175 + }, + { + "epoch": 1.7136667979519495, + "grad_norm": 3.7420589923858643, + "learning_rate": 6.513e-06, + "loss": 0.1462, + "step": 2176 + }, + { + "epoch": 1.7144545096494683, + "grad_norm": 1.33918297290802, + "learning_rate": 6.516e-06, + "loss": 0.1075, + "step": 2177 + }, + { + "epoch": 1.715242221346987, + "grad_norm": 1.196889042854309, + "learning_rate": 6.519e-06, + "loss": 0.1584, + "step": 2178 + }, + { + "epoch": 1.7160299330445057, + "grad_norm": 1.157125473022461, + "learning_rate": 6.522e-06, + "loss": 0.1, + "step": 2179 + }, + { + "epoch": 1.7168176447420245, + "grad_norm": 0.9941165447235107, + "learning_rate": 6.525e-06, + "loss": 0.0897, + "step": 2180 + }, + { + "epoch": 1.717605356439543, + "grad_norm": 1.1110663414001465, + "learning_rate": 6.528e-06, + "loss": 0.0923, + "step": 2181 + }, + { + "epoch": 1.7183930681370618, + "grad_norm": 1.2582423686981201, + "learning_rate": 6.531000000000001e-06, + "loss": 0.1322, + "step": 2182 + }, + { + "epoch": 1.7191807798345806, + "grad_norm": 0.9557735323905945, + "learning_rate": 6.534e-06, + "loss": 0.0858, + "step": 2183 + }, + { + "epoch": 1.7199684915320992, + "grad_norm": 1.058696985244751, + "learning_rate": 6.537e-06, + "loss": 0.0709, + "step": 2184 + }, + { + "epoch": 1.720756203229618, + "grad_norm": 1.0652040243148804, + "learning_rate": 6.54e-06, + "loss": 0.1108, + "step": 2185 + }, + { + "epoch": 1.7215439149271368, + "grad_norm": 1.3694576025009155, + "learning_rate": 6.543e-06, + "loss": 0.1045, + "step": 2186 + }, + { + "epoch": 1.7223316266246553, + "grad_norm": 1.2192590236663818, + "learning_rate": 6.5460000000000005e-06, + "loss": 0.0794, + "step": 2187 + }, + { + "epoch": 1.7231193383221741, + "grad_norm": 1.187859058380127, + "learning_rate": 6.549e-06, + "loss": 0.0848, + "step": 2188 + }, + { + "epoch": 1.723907050019693, + "grad_norm": 0.8584977984428406, + "learning_rate": 6.552e-06, + "loss": 0.0676, + "step": 2189 + }, + { + "epoch": 1.7246947617172115, + "grad_norm": 1.3007136583328247, + "learning_rate": 6.555e-06, + "loss": 0.0848, + "step": 2190 + }, + { + "epoch": 1.72548247341473, + "grad_norm": 1.0712189674377441, + "learning_rate": 6.558e-06, + "loss": 0.0934, + "step": 2191 + }, + { + "epoch": 1.726270185112249, + "grad_norm": 0.8357905149459839, + "learning_rate": 6.561e-06, + "loss": 0.0769, + "step": 2192 + }, + { + "epoch": 1.7270578968097676, + "grad_norm": 1.358346939086914, + "learning_rate": 6.564e-06, + "loss": 0.0798, + "step": 2193 + }, + { + "epoch": 1.7278456085072862, + "grad_norm": 1.2807371616363525, + "learning_rate": 6.567000000000001e-06, + "loss": 0.0965, + "step": 2194 + }, + { + "epoch": 1.7286333202048052, + "grad_norm": 1.3753432035446167, + "learning_rate": 6.57e-06, + "loss": 0.086, + "step": 2195 + }, + { + "epoch": 1.7294210319023238, + "grad_norm": 1.24982488155365, + 
"learning_rate": 6.573e-06, + "loss": 0.09, + "step": 2196 + }, + { + "epoch": 1.7302087435998423, + "grad_norm": 0.7988064885139465, + "learning_rate": 6.576e-06, + "loss": 0.0716, + "step": 2197 + }, + { + "epoch": 1.7309964552973611, + "grad_norm": 1.260285496711731, + "learning_rate": 6.579e-06, + "loss": 0.087, + "step": 2198 + }, + { + "epoch": 1.73178416699488, + "grad_norm": 1.3332695960998535, + "learning_rate": 6.582000000000001e-06, + "loss": 0.0914, + "step": 2199 + }, + { + "epoch": 1.7325718786923985, + "grad_norm": 1.4621288776397705, + "learning_rate": 6.5850000000000005e-06, + "loss": 0.1004, + "step": 2200 + }, + { + "epoch": 1.7333595903899173, + "grad_norm": 1.1074509620666504, + "learning_rate": 6.5879999999999994e-06, + "loss": 0.089, + "step": 2201 + }, + { + "epoch": 1.734147302087436, + "grad_norm": 1.6102280616760254, + "learning_rate": 6.591e-06, + "loss": 0.0833, + "step": 2202 + }, + { + "epoch": 1.7349350137849546, + "grad_norm": 3.7530407905578613, + "learning_rate": 6.594e-06, + "loss": 0.1337, + "step": 2203 + }, + { + "epoch": 1.7357227254824734, + "grad_norm": 2.3574225902557373, + "learning_rate": 6.5970000000000005e-06, + "loss": 0.0902, + "step": 2204 + }, + { + "epoch": 1.7365104371799922, + "grad_norm": 1.2768934965133667, + "learning_rate": 6.6e-06, + "loss": 0.093, + "step": 2205 + }, + { + "epoch": 1.7372981488775108, + "grad_norm": 0.9463791847229004, + "learning_rate": 6.603e-06, + "loss": 0.0833, + "step": 2206 + }, + { + "epoch": 1.7380858605750296, + "grad_norm": 1.3453819751739502, + "learning_rate": 6.606000000000001e-06, + "loss": 0.0705, + "step": 2207 + }, + { + "epoch": 1.7388735722725484, + "grad_norm": 1.4559175968170166, + "learning_rate": 6.609e-06, + "loss": 0.1105, + "step": 2208 + }, + { + "epoch": 1.739661283970067, + "grad_norm": 1.4592043161392212, + "learning_rate": 6.612e-06, + "loss": 0.1021, + "step": 2209 + }, + { + "epoch": 1.7404489956675857, + "grad_norm": 1.347806692123413, + "learning_rate": 6.615e-06, + "loss": 0.0836, + "step": 2210 + }, + { + "epoch": 1.7412367073651045, + "grad_norm": 1.229049563407898, + "learning_rate": 6.618e-06, + "loss": 0.1203, + "step": 2211 + }, + { + "epoch": 1.742024419062623, + "grad_norm": 3.0702390670776367, + "learning_rate": 6.621000000000001e-06, + "loss": 0.0747, + "step": 2212 + }, + { + "epoch": 1.7428121307601416, + "grad_norm": 1.2509377002716064, + "learning_rate": 6.6240000000000004e-06, + "loss": 0.0848, + "step": 2213 + }, + { + "epoch": 1.7435998424576606, + "grad_norm": 1.036439061164856, + "learning_rate": 6.627e-06, + "loss": 0.1092, + "step": 2214 + }, + { + "epoch": 1.7443875541551792, + "grad_norm": 1.7111568450927734, + "learning_rate": 6.63e-06, + "loss": 0.1103, + "step": 2215 + }, + { + "epoch": 1.7451752658526978, + "grad_norm": 1.8889037370681763, + "learning_rate": 6.633e-06, + "loss": 0.1074, + "step": 2216 + }, + { + "epoch": 1.7459629775502166, + "grad_norm": 1.512790560722351, + "learning_rate": 6.6360000000000005e-06, + "loss": 0.104, + "step": 2217 + }, + { + "epoch": 1.7467506892477354, + "grad_norm": 1.8460867404937744, + "learning_rate": 6.639e-06, + "loss": 0.1065, + "step": 2218 + }, + { + "epoch": 1.747538400945254, + "grad_norm": 1.9863982200622559, + "learning_rate": 6.642000000000001e-06, + "loss": 0.1289, + "step": 2219 + }, + { + "epoch": 1.7483261126427727, + "grad_norm": 1.870869755744934, + "learning_rate": 6.645e-06, + "loss": 0.1579, + "step": 2220 + }, + { + "epoch": 1.7491138243402915, + "grad_norm": 2.2422099113464355, + 
"learning_rate": 6.648e-06, + "loss": 0.5946, + "step": 2221 + }, + { + "epoch": 1.74990153603781, + "grad_norm": 1.2443534135818481, + "learning_rate": 6.651e-06, + "loss": 0.4007, + "step": 2222 + }, + { + "epoch": 1.7506892477353289, + "grad_norm": 3.0293171405792236, + "learning_rate": 6.654e-06, + "loss": 0.434, + "step": 2223 + }, + { + "epoch": 1.7514769594328476, + "grad_norm": 1.818339228630066, + "learning_rate": 6.657e-06, + "loss": 0.3314, + "step": 2224 + }, + { + "epoch": 1.7522646711303662, + "grad_norm": 1.468161940574646, + "learning_rate": 6.660000000000001e-06, + "loss": 0.2907, + "step": 2225 + }, + { + "epoch": 1.753052382827885, + "grad_norm": 1.4732089042663574, + "learning_rate": 6.6629999999999996e-06, + "loss": 0.1598, + "step": 2226 + }, + { + "epoch": 1.7538400945254038, + "grad_norm": 0.9923062324523926, + "learning_rate": 6.666e-06, + "loss": 0.1348, + "step": 2227 + }, + { + "epoch": 1.7546278062229224, + "grad_norm": 3.9640276432037354, + "learning_rate": 6.669e-06, + "loss": 0.1402, + "step": 2228 + }, + { + "epoch": 1.7554155179204411, + "grad_norm": 2.1042582988739014, + "learning_rate": 6.672e-06, + "loss": 0.0923, + "step": 2229 + }, + { + "epoch": 1.75620322961796, + "grad_norm": 0.7884979248046875, + "learning_rate": 6.6750000000000005e-06, + "loss": 0.0872, + "step": 2230 + }, + { + "epoch": 1.7569909413154785, + "grad_norm": 1.0030208826065063, + "learning_rate": 6.678e-06, + "loss": 0.0685, + "step": 2231 + }, + { + "epoch": 1.7577786530129973, + "grad_norm": 1.1994959115982056, + "learning_rate": 6.681e-06, + "loss": 0.0972, + "step": 2232 + }, + { + "epoch": 1.758566364710516, + "grad_norm": 1.457313895225525, + "learning_rate": 6.684e-06, + "loss": 0.065, + "step": 2233 + }, + { + "epoch": 1.7593540764080347, + "grad_norm": 1.3160024881362915, + "learning_rate": 6.687e-06, + "loss": 0.0666, + "step": 2234 + }, + { + "epoch": 1.7601417881055532, + "grad_norm": 1.282494068145752, + "learning_rate": 6.69e-06, + "loss": 0.0641, + "step": 2235 + }, + { + "epoch": 1.7609294998030722, + "grad_norm": 1.3620327711105347, + "learning_rate": 6.693e-06, + "loss": 0.0859, + "step": 2236 + }, + { + "epoch": 1.7617172115005908, + "grad_norm": 1.216788649559021, + "learning_rate": 6.696000000000001e-06, + "loss": 0.092, + "step": 2237 + }, + { + "epoch": 1.7625049231981094, + "grad_norm": 1.4080663919448853, + "learning_rate": 6.699e-06, + "loss": 0.0984, + "step": 2238 + }, + { + "epoch": 1.7632926348956282, + "grad_norm": 0.8087906837463379, + "learning_rate": 6.7019999999999995e-06, + "loss": 0.0585, + "step": 2239 + }, + { + "epoch": 1.764080346593147, + "grad_norm": 1.3690258264541626, + "learning_rate": 6.705e-06, + "loss": 0.0717, + "step": 2240 + }, + { + "epoch": 1.7648680582906655, + "grad_norm": 2.220386266708374, + "learning_rate": 6.708e-06, + "loss": 0.0935, + "step": 2241 + }, + { + "epoch": 1.7656557699881843, + "grad_norm": 1.1202882528305054, + "learning_rate": 6.711000000000001e-06, + "loss": 0.0884, + "step": 2242 + }, + { + "epoch": 1.766443481685703, + "grad_norm": 1.0813263654708862, + "learning_rate": 6.7140000000000004e-06, + "loss": 0.0937, + "step": 2243 + }, + { + "epoch": 1.7672311933832217, + "grad_norm": 2.345212936401367, + "learning_rate": 6.716999999999999e-06, + "loss": 0.0809, + "step": 2244 + }, + { + "epoch": 1.7680189050807404, + "grad_norm": 1.1679426431655884, + "learning_rate": 6.72e-06, + "loss": 0.0772, + "step": 2245 + }, + { + "epoch": 1.7688066167782592, + "grad_norm": 1.660526990890503, + "learning_rate": 
6.723e-06, + "loss": 0.1015, + "step": 2246 + }, + { + "epoch": 1.7695943284757778, + "grad_norm": 1.4427480697631836, + "learning_rate": 6.7260000000000005e-06, + "loss": 0.1688, + "step": 2247 + }, + { + "epoch": 1.7703820401732966, + "grad_norm": 1.2027627229690552, + "learning_rate": 6.729e-06, + "loss": 0.0955, + "step": 2248 + }, + { + "epoch": 1.7711697518708154, + "grad_norm": 1.10768723487854, + "learning_rate": 6.732e-06, + "loss": 0.0792, + "step": 2249 + }, + { + "epoch": 1.771957463568334, + "grad_norm": 1.6732268333435059, + "learning_rate": 6.735000000000001e-06, + "loss": 0.1152, + "step": 2250 + }, + { + "epoch": 1.7727451752658527, + "grad_norm": 1.003240942955017, + "learning_rate": 6.738e-06, + "loss": 0.1071, + "step": 2251 + }, + { + "epoch": 1.7735328869633715, + "grad_norm": 0.82473224401474, + "learning_rate": 6.741e-06, + "loss": 0.0677, + "step": 2252 + }, + { + "epoch": 1.77432059866089, + "grad_norm": 1.3015738725662231, + "learning_rate": 6.744e-06, + "loss": 0.1076, + "step": 2253 + }, + { + "epoch": 1.7751083103584087, + "grad_norm": 1.189680576324463, + "learning_rate": 6.747e-06, + "loss": 0.0727, + "step": 2254 + }, + { + "epoch": 1.7758960220559277, + "grad_norm": 1.1238970756530762, + "learning_rate": 6.750000000000001e-06, + "loss": 0.0742, + "step": 2255 + }, + { + "epoch": 1.7766837337534462, + "grad_norm": 1.281211018562317, + "learning_rate": 6.753e-06, + "loss": 0.0998, + "step": 2256 + }, + { + "epoch": 1.7774714454509648, + "grad_norm": 1.5262653827667236, + "learning_rate": 6.756e-06, + "loss": 0.1025, + "step": 2257 + }, + { + "epoch": 1.7782591571484838, + "grad_norm": 2.0132460594177246, + "learning_rate": 6.759e-06, + "loss": 0.0787, + "step": 2258 + }, + { + "epoch": 1.7790468688460024, + "grad_norm": 1.4777860641479492, + "learning_rate": 6.762e-06, + "loss": 0.1262, + "step": 2259 + }, + { + "epoch": 1.779834580543521, + "grad_norm": 1.4655942916870117, + "learning_rate": 6.7650000000000005e-06, + "loss": 0.1009, + "step": 2260 + }, + { + "epoch": 1.7806222922410397, + "grad_norm": 1.106378197669983, + "learning_rate": 6.768e-06, + "loss": 0.0673, + "step": 2261 + }, + { + "epoch": 1.7814100039385585, + "grad_norm": 0.9937753677368164, + "learning_rate": 6.771000000000001e-06, + "loss": 0.0958, + "step": 2262 + }, + { + "epoch": 1.782197715636077, + "grad_norm": 4.451657295227051, + "learning_rate": 6.774e-06, + "loss": 0.0883, + "step": 2263 + }, + { + "epoch": 1.7829854273335959, + "grad_norm": 1.214360237121582, + "learning_rate": 6.777e-06, + "loss": 0.1131, + "step": 2264 + }, + { + "epoch": 1.7837731390311147, + "grad_norm": 8.177886962890625, + "learning_rate": 6.78e-06, + "loss": 0.1376, + "step": 2265 + }, + { + "epoch": 1.7845608507286332, + "grad_norm": 1.345800518989563, + "learning_rate": 6.783e-06, + "loss": 0.1042, + "step": 2266 + }, + { + "epoch": 1.785348562426152, + "grad_norm": 1.4055911302566528, + "learning_rate": 6.786000000000001e-06, + "loss": 0.1157, + "step": 2267 + }, + { + "epoch": 1.7861362741236708, + "grad_norm": 2.599332094192505, + "learning_rate": 6.7890000000000006e-06, + "loss": 0.0886, + "step": 2268 + }, + { + "epoch": 1.7869239858211894, + "grad_norm": 1.4685543775558472, + "learning_rate": 6.7919999999999995e-06, + "loss": 0.0747, + "step": 2269 + }, + { + "epoch": 1.7877116975187082, + "grad_norm": 1.4574297666549683, + "learning_rate": 6.795e-06, + "loss": 0.1145, + "step": 2270 + }, + { + "epoch": 1.788499409216227, + "grad_norm": 2.0995826721191406, + "learning_rate": 6.798e-06, + "loss": 
0.6932, + "step": 2271 + }, + { + "epoch": 1.7892871209137455, + "grad_norm": 1.831945538520813, + "learning_rate": 6.801000000000001e-06, + "loss": 0.4657, + "step": 2272 + }, + { + "epoch": 1.7900748326112643, + "grad_norm": 1.6876851320266724, + "learning_rate": 6.804e-06, + "loss": 0.3489, + "step": 2273 + }, + { + "epoch": 1.790862544308783, + "grad_norm": 1.5144318342208862, + "learning_rate": 6.807e-06, + "loss": 0.3496, + "step": 2274 + }, + { + "epoch": 1.7916502560063017, + "grad_norm": 1.4117481708526611, + "learning_rate": 6.81e-06, + "loss": 0.2497, + "step": 2275 + }, + { + "epoch": 1.7924379677038202, + "grad_norm": 1.2755682468414307, + "learning_rate": 6.813e-06, + "loss": 0.1571, + "step": 2276 + }, + { + "epoch": 1.7932256794013393, + "grad_norm": 0.8107854723930359, + "learning_rate": 6.8160000000000005e-06, + "loss": 0.1058, + "step": 2277 + }, + { + "epoch": 1.7940133910988578, + "grad_norm": 0.9808543920516968, + "learning_rate": 6.819e-06, + "loss": 0.0936, + "step": 2278 + }, + { + "epoch": 1.7948011027963764, + "grad_norm": 1.235927700996399, + "learning_rate": 6.822e-06, + "loss": 0.1219, + "step": 2279 + }, + { + "epoch": 1.7955888144938952, + "grad_norm": 1.3765662908554077, + "learning_rate": 6.825000000000001e-06, + "loss": 0.1076, + "step": 2280 + }, + { + "epoch": 1.796376526191414, + "grad_norm": 2.6261065006256104, + "learning_rate": 6.828e-06, + "loss": 0.1417, + "step": 2281 + }, + { + "epoch": 1.7971642378889325, + "grad_norm": 2.856402635574341, + "learning_rate": 6.831e-06, + "loss": 0.0632, + "step": 2282 + }, + { + "epoch": 1.7979519495864513, + "grad_norm": 1.1244935989379883, + "learning_rate": 6.834e-06, + "loss": 0.0871, + "step": 2283 + }, + { + "epoch": 1.7987396612839701, + "grad_norm": 1.0835367441177368, + "learning_rate": 6.837e-06, + "loss": 0.0537, + "step": 2284 + }, + { + "epoch": 1.7995273729814887, + "grad_norm": 0.9542169570922852, + "learning_rate": 6.840000000000001e-06, + "loss": 0.084, + "step": 2285 + }, + { + "epoch": 1.8003150846790075, + "grad_norm": 1.4630285501480103, + "learning_rate": 6.843e-06, + "loss": 0.0961, + "step": 2286 + }, + { + "epoch": 1.8011027963765263, + "grad_norm": 1.1372740268707275, + "learning_rate": 6.845999999999999e-06, + "loss": 0.0811, + "step": 2287 + }, + { + "epoch": 1.8018905080740448, + "grad_norm": 2.055997133255005, + "learning_rate": 6.849e-06, + "loss": 0.0914, + "step": 2288 + }, + { + "epoch": 1.8026782197715636, + "grad_norm": 1.0774314403533936, + "learning_rate": 6.852e-06, + "loss": 0.0888, + "step": 2289 + }, + { + "epoch": 1.8034659314690824, + "grad_norm": 2.488233804702759, + "learning_rate": 6.8550000000000004e-06, + "loss": 0.0695, + "step": 2290 + }, + { + "epoch": 1.804253643166601, + "grad_norm": 1.4416922330856323, + "learning_rate": 6.858e-06, + "loss": 0.0924, + "step": 2291 + }, + { + "epoch": 1.8050413548641198, + "grad_norm": 1.4308656454086304, + "learning_rate": 6.861e-06, + "loss": 0.0743, + "step": 2292 + }, + { + "epoch": 1.8058290665616386, + "grad_norm": 1.1859761476516724, + "learning_rate": 6.864000000000001e-06, + "loss": 0.0931, + "step": 2293 + }, + { + "epoch": 1.8066167782591571, + "grad_norm": 1.6911596059799194, + "learning_rate": 6.867e-06, + "loss": 0.0856, + "step": 2294 + }, + { + "epoch": 1.807404489956676, + "grad_norm": 1.7367113828659058, + "learning_rate": 6.87e-06, + "loss": 0.0693, + "step": 2295 + }, + { + "epoch": 1.8081922016541947, + "grad_norm": 1.4401549100875854, + "learning_rate": 6.873e-06, + "loss": 0.0738, + "step": 2296 + }, + 
{ + "epoch": 1.8089799133517133, + "grad_norm": 6.78678035736084, + "learning_rate": 6.876e-06, + "loss": 0.0729, + "step": 2297 + }, + { + "epoch": 1.8097676250492318, + "grad_norm": 1.2749356031417847, + "learning_rate": 6.8790000000000005e-06, + "loss": 0.0999, + "step": 2298 + }, + { + "epoch": 1.8105553367467508, + "grad_norm": 1.2661285400390625, + "learning_rate": 6.882e-06, + "loss": 0.0966, + "step": 2299 + }, + { + "epoch": 1.8113430484442694, + "grad_norm": 1.4725021123886108, + "learning_rate": 6.885e-06, + "loss": 0.1042, + "step": 2300 + }, + { + "epoch": 1.812130760141788, + "grad_norm": 1.1185599565505981, + "learning_rate": 6.888e-06, + "loss": 0.0604, + "step": 2301 + }, + { + "epoch": 1.8129184718393068, + "grad_norm": 1.2115657329559326, + "learning_rate": 6.891e-06, + "loss": 0.1011, + "step": 2302 + }, + { + "epoch": 1.8137061835368256, + "grad_norm": 11.36940860748291, + "learning_rate": 6.894e-06, + "loss": 0.0893, + "step": 2303 + }, + { + "epoch": 1.8144938952343441, + "grad_norm": 1.5449585914611816, + "learning_rate": 6.897e-06, + "loss": 0.1005, + "step": 2304 + }, + { + "epoch": 1.815281606931863, + "grad_norm": 1.1090425252914429, + "learning_rate": 6.900000000000001e-06, + "loss": 0.089, + "step": 2305 + }, + { + "epoch": 1.8160693186293817, + "grad_norm": 1.5493370294570923, + "learning_rate": 6.903e-06, + "loss": 0.0925, + "step": 2306 + }, + { + "epoch": 1.8168570303269003, + "grad_norm": 1.3208552598953247, + "learning_rate": 6.906e-06, + "loss": 0.0941, + "step": 2307 + }, + { + "epoch": 1.817644742024419, + "grad_norm": 1.3215315341949463, + "learning_rate": 6.909e-06, + "loss": 0.0789, + "step": 2308 + }, + { + "epoch": 1.8184324537219378, + "grad_norm": 1.8373970985412598, + "learning_rate": 6.912e-06, + "loss": 0.0817, + "step": 2309 + }, + { + "epoch": 1.8192201654194564, + "grad_norm": 1.3661658763885498, + "learning_rate": 6.915000000000001e-06, + "loss": 0.0825, + "step": 2310 + }, + { + "epoch": 1.8200078771169752, + "grad_norm": 1.3683018684387207, + "learning_rate": 6.9180000000000005e-06, + "loss": 0.0843, + "step": 2311 + }, + { + "epoch": 1.820795588814494, + "grad_norm": 1.677095651626587, + "learning_rate": 6.9209999999999995e-06, + "loss": 0.1044, + "step": 2312 + }, + { + "epoch": 1.8215833005120126, + "grad_norm": 1.1464083194732666, + "learning_rate": 6.924e-06, + "loss": 0.083, + "step": 2313 + }, + { + "epoch": 1.8223710122095313, + "grad_norm": 1.33920156955719, + "learning_rate": 6.927e-06, + "loss": 0.1106, + "step": 2314 + }, + { + "epoch": 1.8231587239070501, + "grad_norm": 1.2188677787780762, + "learning_rate": 6.9300000000000006e-06, + "loss": 0.0727, + "step": 2315 + }, + { + "epoch": 1.8239464356045687, + "grad_norm": 1.3480157852172852, + "learning_rate": 6.933e-06, + "loss": 0.0827, + "step": 2316 + }, + { + "epoch": 1.8247341473020873, + "grad_norm": 1.3965861797332764, + "learning_rate": 6.936e-06, + "loss": 0.0878, + "step": 2317 + }, + { + "epoch": 1.8255218589996063, + "grad_norm": 1.4977917671203613, + "learning_rate": 6.939e-06, + "loss": 0.0795, + "step": 2318 + }, + { + "epoch": 1.8263095706971249, + "grad_norm": 1.373023509979248, + "learning_rate": 6.942e-06, + "loss": 0.1081, + "step": 2319 + }, + { + "epoch": 1.8270972823946434, + "grad_norm": 1.7124298810958862, + "learning_rate": 6.945e-06, + "loss": 0.1352, + "step": 2320 + }, + { + "epoch": 1.8278849940921624, + "grad_norm": 1.7239171266555786, + "learning_rate": 6.948e-06, + "loss": 0.5743, + "step": 2321 + }, + { + "epoch": 1.828672705789681, + 
"grad_norm": 1.5972846746444702, + "learning_rate": 6.951e-06, + "loss": 0.4618, + "step": 2322 + }, + { + "epoch": 1.8294604174871996, + "grad_norm": 1.212730050086975, + "learning_rate": 6.954000000000001e-06, + "loss": 0.3431, + "step": 2323 + }, + { + "epoch": 1.8302481291847184, + "grad_norm": 2.5699923038482666, + "learning_rate": 6.957e-06, + "loss": 0.2792, + "step": 2324 + }, + { + "epoch": 1.8310358408822371, + "grad_norm": 1.1015528440475464, + "learning_rate": 6.96e-06, + "loss": 0.235, + "step": 2325 + }, + { + "epoch": 1.8318235525797557, + "grad_norm": 1.6567165851593018, + "learning_rate": 6.963e-06, + "loss": 0.2407, + "step": 2326 + }, + { + "epoch": 1.8326112642772745, + "grad_norm": 0.9391953349113464, + "learning_rate": 6.966e-06, + "loss": 0.1531, + "step": 2327 + }, + { + "epoch": 1.8333989759747933, + "grad_norm": 3.0715062618255615, + "learning_rate": 6.9690000000000005e-06, + "loss": 0.1132, + "step": 2328 + }, + { + "epoch": 1.8341866876723119, + "grad_norm": 1.326112985610962, + "learning_rate": 6.972e-06, + "loss": 0.0906, + "step": 2329 + }, + { + "epoch": 1.8349743993698306, + "grad_norm": 0.8105222582817078, + "learning_rate": 6.975e-06, + "loss": 0.063, + "step": 2330 + }, + { + "epoch": 1.8357621110673494, + "grad_norm": 1.1699057817459106, + "learning_rate": 6.978e-06, + "loss": 0.0842, + "step": 2331 + }, + { + "epoch": 1.836549822764868, + "grad_norm": 1.4981995820999146, + "learning_rate": 6.981e-06, + "loss": 0.0999, + "step": 2332 + }, + { + "epoch": 1.8373375344623868, + "grad_norm": 0.8495954275131226, + "learning_rate": 6.984e-06, + "loss": 0.0747, + "step": 2333 + }, + { + "epoch": 1.8381252461599056, + "grad_norm": 2.432175397872925, + "learning_rate": 6.987e-06, + "loss": 0.0646, + "step": 2334 + }, + { + "epoch": 1.8389129578574241, + "grad_norm": 1.1353155374526978, + "learning_rate": 6.990000000000001e-06, + "loss": 0.0591, + "step": 2335 + }, + { + "epoch": 1.839700669554943, + "grad_norm": 1.0950418710708618, + "learning_rate": 6.993000000000001e-06, + "loss": 0.0612, + "step": 2336 + }, + { + "epoch": 1.8404883812524617, + "grad_norm": 0.9413748383522034, + "learning_rate": 6.996e-06, + "loss": 0.0652, + "step": 2337 + }, + { + "epoch": 1.8412760929499803, + "grad_norm": 1.1134250164031982, + "learning_rate": 6.999e-06, + "loss": 0.0709, + "step": 2338 + }, + { + "epoch": 1.8420638046474989, + "grad_norm": 1.314326286315918, + "learning_rate": 7.002e-06, + "loss": 0.0776, + "step": 2339 + }, + { + "epoch": 1.8428515163450179, + "grad_norm": 1.9093085527420044, + "learning_rate": 7.005000000000001e-06, + "loss": 0.0873, + "step": 2340 + }, + { + "epoch": 1.8436392280425364, + "grad_norm": 1.2333621978759766, + "learning_rate": 7.0080000000000005e-06, + "loss": 0.13, + "step": 2341 + }, + { + "epoch": 1.844426939740055, + "grad_norm": 2.443028450012207, + "learning_rate": 7.011e-06, + "loss": 0.0727, + "step": 2342 + }, + { + "epoch": 1.8452146514375738, + "grad_norm": 1.0587581396102905, + "learning_rate": 7.014e-06, + "loss": 0.0757, + "step": 2343 + }, + { + "epoch": 1.8460023631350926, + "grad_norm": 2.381621837615967, + "learning_rate": 7.017e-06, + "loss": 0.106, + "step": 2344 + }, + { + "epoch": 1.8467900748326112, + "grad_norm": 1.6551380157470703, + "learning_rate": 7.0200000000000006e-06, + "loss": 0.0928, + "step": 2345 + }, + { + "epoch": 1.84757778653013, + "grad_norm": 0.987643837928772, + "learning_rate": 7.023e-06, + "loss": 0.0686, + "step": 2346 + }, + { + "epoch": 1.8483654982276487, + "grad_norm": 1.1361150741577148, + 
"learning_rate": 7.026e-06, + "loss": 0.0652, + "step": 2347 + }, + { + "epoch": 1.8491532099251673, + "grad_norm": 1.511370062828064, + "learning_rate": 7.029000000000001e-06, + "loss": 0.0881, + "step": 2348 + }, + { + "epoch": 1.849940921622686, + "grad_norm": 1.2687046527862549, + "learning_rate": 7.032e-06, + "loss": 0.0822, + "step": 2349 + }, + { + "epoch": 1.8507286333202049, + "grad_norm": 1.2503372430801392, + "learning_rate": 7.0349999999999996e-06, + "loss": 0.0807, + "step": 2350 + }, + { + "epoch": 1.8515163450177234, + "grad_norm": 1.013397216796875, + "learning_rate": 7.038e-06, + "loss": 0.0818, + "step": 2351 + }, + { + "epoch": 1.8523040567152422, + "grad_norm": 2.2515087127685547, + "learning_rate": 7.041e-06, + "loss": 0.1041, + "step": 2352 + }, + { + "epoch": 1.853091768412761, + "grad_norm": 2.465461492538452, + "learning_rate": 7.044000000000001e-06, + "loss": 0.0769, + "step": 2353 + }, + { + "epoch": 1.8538794801102796, + "grad_norm": 1.8335009813308716, + "learning_rate": 7.0470000000000005e-06, + "loss": 0.0947, + "step": 2354 + }, + { + "epoch": 1.8546671918077984, + "grad_norm": 1.1207234859466553, + "learning_rate": 7.049999999999999e-06, + "loss": 0.0726, + "step": 2355 + }, + { + "epoch": 1.8554549035053172, + "grad_norm": 0.8722518086433411, + "learning_rate": 7.053e-06, + "loss": 0.0599, + "step": 2356 + }, + { + "epoch": 1.8562426152028357, + "grad_norm": 0.8749350309371948, + "learning_rate": 7.056e-06, + "loss": 0.0598, + "step": 2357 + }, + { + "epoch": 1.8570303269003545, + "grad_norm": 1.321251392364502, + "learning_rate": 7.0590000000000005e-06, + "loss": 0.1064, + "step": 2358 + }, + { + "epoch": 1.8578180385978733, + "grad_norm": 1.09516179561615, + "learning_rate": 7.062e-06, + "loss": 0.0831, + "step": 2359 + }, + { + "epoch": 1.8586057502953919, + "grad_norm": 1.3215864896774292, + "learning_rate": 7.065e-06, + "loss": 0.0742, + "step": 2360 + }, + { + "epoch": 1.8593934619929104, + "grad_norm": 1.7988461256027222, + "learning_rate": 7.068e-06, + "loss": 0.1012, + "step": 2361 + }, + { + "epoch": 1.8601811736904295, + "grad_norm": 1.578973412513733, + "learning_rate": 7.071e-06, + "loss": 0.0542, + "step": 2362 + }, + { + "epoch": 1.860968885387948, + "grad_norm": 1.2310906648635864, + "learning_rate": 7.074e-06, + "loss": 0.0803, + "step": 2363 + }, + { + "epoch": 1.8617565970854666, + "grad_norm": 0.9192394614219666, + "learning_rate": 7.077e-06, + "loss": 0.0606, + "step": 2364 + }, + { + "epoch": 1.8625443087829854, + "grad_norm": 1.179871916770935, + "learning_rate": 7.08e-06, + "loss": 0.1024, + "step": 2365 + }, + { + "epoch": 1.8633320204805042, + "grad_norm": 1.3365929126739502, + "learning_rate": 7.083000000000001e-06, + "loss": 0.1006, + "step": 2366 + }, + { + "epoch": 1.8641197321780227, + "grad_norm": 1.1872913837432861, + "learning_rate": 7.086e-06, + "loss": 0.0981, + "step": 2367 + }, + { + "epoch": 1.8649074438755415, + "grad_norm": 1.512018084526062, + "learning_rate": 7.089e-06, + "loss": 0.1184, + "step": 2368 + }, + { + "epoch": 1.8656951555730603, + "grad_norm": 1.7119585275650024, + "learning_rate": 7.092e-06, + "loss": 0.1023, + "step": 2369 + }, + { + "epoch": 1.8664828672705789, + "grad_norm": 1.398518681526184, + "learning_rate": 7.095e-06, + "loss": 0.0756, + "step": 2370 + }, + { + "epoch": 1.8672705789680977, + "grad_norm": 1.656965732574463, + "learning_rate": 7.0980000000000005e-06, + "loss": 0.5198, + "step": 2371 + }, + { + "epoch": 1.8680582906656165, + "grad_norm": 3.1380882263183594, + "learning_rate": 
7.101e-06, + "loss": 0.495, + "step": 2372 + }, + { + "epoch": 1.868846002363135, + "grad_norm": 2.4885120391845703, + "learning_rate": 7.104e-06, + "loss": 0.3568, + "step": 2373 + }, + { + "epoch": 1.8696337140606538, + "grad_norm": 2.063580274581909, + "learning_rate": 7.107e-06, + "loss": 0.5426, + "step": 2374 + }, + { + "epoch": 1.8704214257581726, + "grad_norm": 1.5587180852890015, + "learning_rate": 7.11e-06, + "loss": 0.3155, + "step": 2375 + }, + { + "epoch": 1.8712091374556912, + "grad_norm": 1.557894229888916, + "learning_rate": 7.113e-06, + "loss": 0.2092, + "step": 2376 + }, + { + "epoch": 1.87199684915321, + "grad_norm": 1.1897342205047607, + "learning_rate": 7.116e-06, + "loss": 0.1588, + "step": 2377 + }, + { + "epoch": 1.8727845608507288, + "grad_norm": 2.857800245285034, + "learning_rate": 7.119000000000001e-06, + "loss": 0.1071, + "step": 2378 + }, + { + "epoch": 1.8735722725482473, + "grad_norm": 1.0115677118301392, + "learning_rate": 7.122000000000001e-06, + "loss": 0.0592, + "step": 2379 + }, + { + "epoch": 1.8743599842457659, + "grad_norm": 1.031233787536621, + "learning_rate": 7.1249999999999995e-06, + "loss": 0.1153, + "step": 2380 + }, + { + "epoch": 1.875147695943285, + "grad_norm": 1.1051340103149414, + "learning_rate": 7.128e-06, + "loss": 0.0868, + "step": 2381 + }, + { + "epoch": 1.8759354076408035, + "grad_norm": 0.9452806711196899, + "learning_rate": 7.131e-06, + "loss": 0.1172, + "step": 2382 + }, + { + "epoch": 1.876723119338322, + "grad_norm": 1.1102943420410156, + "learning_rate": 7.134000000000001e-06, + "loss": 0.0712, + "step": 2383 + }, + { + "epoch": 1.877510831035841, + "grad_norm": 1.1910580396652222, + "learning_rate": 7.1370000000000004e-06, + "loss": 0.0635, + "step": 2384 + }, + { + "epoch": 1.8782985427333596, + "grad_norm": 9.866786003112793, + "learning_rate": 7.14e-06, + "loss": 0.1325, + "step": 2385 + }, + { + "epoch": 1.8790862544308782, + "grad_norm": 1.174317479133606, + "learning_rate": 7.143e-06, + "loss": 0.101, + "step": 2386 + }, + { + "epoch": 1.879873966128397, + "grad_norm": 0.9403659105300903, + "learning_rate": 7.146e-06, + "loss": 0.0572, + "step": 2387 + }, + { + "epoch": 1.8806616778259158, + "grad_norm": 1.2554125785827637, + "learning_rate": 7.1490000000000005e-06, + "loss": 0.0919, + "step": 2388 + }, + { + "epoch": 1.8814493895234343, + "grad_norm": 1.0038586854934692, + "learning_rate": 7.152e-06, + "loss": 0.0548, + "step": 2389 + }, + { + "epoch": 1.8822371012209531, + "grad_norm": 1.1641740798950195, + "learning_rate": 7.155e-06, + "loss": 0.1261, + "step": 2390 + }, + { + "epoch": 1.883024812918472, + "grad_norm": 1.044503092765808, + "learning_rate": 7.158000000000001e-06, + "loss": 0.0689, + "step": 2391 + }, + { + "epoch": 1.8838125246159905, + "grad_norm": 1.0742853879928589, + "learning_rate": 7.161e-06, + "loss": 0.0723, + "step": 2392 + }, + { + "epoch": 1.8846002363135093, + "grad_norm": 1.1698330640792847, + "learning_rate": 7.164e-06, + "loss": 0.0968, + "step": 2393 + }, + { + "epoch": 1.885387948011028, + "grad_norm": 0.9047726392745972, + "learning_rate": 7.167e-06, + "loss": 0.0615, + "step": 2394 + }, + { + "epoch": 1.8861756597085466, + "grad_norm": 1.0957460403442383, + "learning_rate": 7.17e-06, + "loss": 0.0996, + "step": 2395 + }, + { + "epoch": 1.8869633714060654, + "grad_norm": 0.8436603546142578, + "learning_rate": 7.173000000000001e-06, + "loss": 0.0859, + "step": 2396 + }, + { + "epoch": 1.8877510831035842, + "grad_norm": 0.935435950756073, + "learning_rate": 7.176e-06, + "loss": 
0.0776, + "step": 2397 + }, + { + "epoch": 1.8885387948011028, + "grad_norm": 1.1229884624481201, + "learning_rate": 7.179e-06, + "loss": 0.0643, + "step": 2398 + }, + { + "epoch": 1.8893265064986215, + "grad_norm": 1.8542412519454956, + "learning_rate": 7.182e-06, + "loss": 0.0888, + "step": 2399 + }, + { + "epoch": 1.8901142181961403, + "grad_norm": 1.4055230617523193, + "learning_rate": 7.185e-06, + "loss": 0.0729, + "step": 2400 + }, + { + "epoch": 1.890901929893659, + "grad_norm": 0.9444662928581238, + "learning_rate": 7.1880000000000005e-06, + "loss": 0.078, + "step": 2401 + }, + { + "epoch": 1.8916896415911775, + "grad_norm": 1.1550884246826172, + "learning_rate": 7.191e-06, + "loss": 0.0832, + "step": 2402 + }, + { + "epoch": 1.8924773532886965, + "grad_norm": 1.0036667585372925, + "learning_rate": 7.194000000000001e-06, + "loss": 0.0754, + "step": 2403 + }, + { + "epoch": 1.893265064986215, + "grad_norm": 1.275443434715271, + "learning_rate": 7.197e-06, + "loss": 0.1685, + "step": 2404 + }, + { + "epoch": 1.8940527766837336, + "grad_norm": 1.648330569267273, + "learning_rate": 7.2e-06, + "loss": 0.0915, + "step": 2405 + }, + { + "epoch": 1.8948404883812524, + "grad_norm": 0.9148444533348083, + "learning_rate": 7.203e-06, + "loss": 0.0684, + "step": 2406 + }, + { + "epoch": 1.8956282000787712, + "grad_norm": 1.4576137065887451, + "learning_rate": 7.206e-06, + "loss": 0.089, + "step": 2407 + }, + { + "epoch": 1.8964159117762898, + "grad_norm": 1.0953822135925293, + "learning_rate": 7.209000000000001e-06, + "loss": 0.1009, + "step": 2408 + }, + { + "epoch": 1.8972036234738086, + "grad_norm": 1.4042826890945435, + "learning_rate": 7.2120000000000006e-06, + "loss": 0.0894, + "step": 2409 + }, + { + "epoch": 1.8979913351713273, + "grad_norm": 0.8985242247581482, + "learning_rate": 7.2149999999999995e-06, + "loss": 0.0828, + "step": 2410 + }, + { + "epoch": 1.898779046868846, + "grad_norm": 1.472658634185791, + "learning_rate": 7.218e-06, + "loss": 0.0962, + "step": 2411 + }, + { + "epoch": 1.8995667585663647, + "grad_norm": 1.052592158317566, + "learning_rate": 7.221e-06, + "loss": 0.0853, + "step": 2412 + }, + { + "epoch": 1.9003544702638835, + "grad_norm": 1.3963826894760132, + "learning_rate": 7.224e-06, + "loss": 0.0872, + "step": 2413 + }, + { + "epoch": 1.901142181961402, + "grad_norm": 1.5797268152236938, + "learning_rate": 7.2270000000000004e-06, + "loss": 0.1316, + "step": 2414 + }, + { + "epoch": 1.9019298936589208, + "grad_norm": 1.185547947883606, + "learning_rate": 7.23e-06, + "loss": 0.1239, + "step": 2415 + }, + { + "epoch": 1.9027176053564396, + "grad_norm": 1.4884356260299683, + "learning_rate": 7.233e-06, + "loss": 0.0734, + "step": 2416 + }, + { + "epoch": 1.9035053170539582, + "grad_norm": 1.4511324167251587, + "learning_rate": 7.236e-06, + "loss": 0.1215, + "step": 2417 + }, + { + "epoch": 1.904293028751477, + "grad_norm": 1.7161309719085693, + "learning_rate": 7.239e-06, + "loss": 0.1163, + "step": 2418 + }, + { + "epoch": 1.9050807404489958, + "grad_norm": 3.1640498638153076, + "learning_rate": 7.242e-06, + "loss": 0.1347, + "step": 2419 + }, + { + "epoch": 1.9058684521465143, + "grad_norm": 1.7377873659133911, + "learning_rate": 7.245e-06, + "loss": 0.0984, + "step": 2420 + }, + { + "epoch": 1.9066561638440331, + "grad_norm": 4.054500579833984, + "learning_rate": 7.248000000000001e-06, + "loss": 0.5895, + "step": 2421 + }, + { + "epoch": 1.907443875541552, + "grad_norm": 1.4954684972763062, + "learning_rate": 7.2510000000000005e-06, + "loss": 0.4938, + "step": 
2422 + }, + { + "epoch": 1.9082315872390705, + "grad_norm": 1.7728630304336548, + "learning_rate": 7.2539999999999995e-06, + "loss": 0.3923, + "step": 2423 + }, + { + "epoch": 1.909019298936589, + "grad_norm": 1.6581871509552002, + "learning_rate": 7.257e-06, + "loss": 0.3342, + "step": 2424 + }, + { + "epoch": 1.909807010634108, + "grad_norm": 1.405200719833374, + "learning_rate": 7.26e-06, + "loss": 0.2396, + "step": 2425 + }, + { + "epoch": 1.9105947223316266, + "grad_norm": 1.3511065244674683, + "learning_rate": 7.263000000000001e-06, + "loss": 0.2071, + "step": 2426 + }, + { + "epoch": 1.9113824340291452, + "grad_norm": 1.0291736125946045, + "learning_rate": 7.266e-06, + "loss": 0.1811, + "step": 2427 + }, + { + "epoch": 1.912170145726664, + "grad_norm": 1.001240849494934, + "learning_rate": 7.269e-06, + "loss": 0.0967, + "step": 2428 + }, + { + "epoch": 1.9129578574241828, + "grad_norm": 1.7612204551696777, + "learning_rate": 7.272e-06, + "loss": 0.1082, + "step": 2429 + }, + { + "epoch": 1.9137455691217014, + "grad_norm": 0.8958449959754944, + "learning_rate": 7.275e-06, + "loss": 0.0875, + "step": 2430 + }, + { + "epoch": 1.9145332808192201, + "grad_norm": 1.0800648927688599, + "learning_rate": 7.2780000000000005e-06, + "loss": 0.0833, + "step": 2431 + }, + { + "epoch": 1.915320992516739, + "grad_norm": 1.3300710916519165, + "learning_rate": 7.281e-06, + "loss": 0.1266, + "step": 2432 + }, + { + "epoch": 1.9161087042142575, + "grad_norm": 1.030004620552063, + "learning_rate": 7.284e-06, + "loss": 0.1243, + "step": 2433 + }, + { + "epoch": 1.9168964159117763, + "grad_norm": 0.9529794454574585, + "learning_rate": 7.287000000000001e-06, + "loss": 0.0831, + "step": 2434 + }, + { + "epoch": 1.917684127609295, + "grad_norm": 1.0315678119659424, + "learning_rate": 7.29e-06, + "loss": 0.1004, + "step": 2435 + }, + { + "epoch": 1.9184718393068136, + "grad_norm": 0.8262644410133362, + "learning_rate": 7.293e-06, + "loss": 0.0694, + "step": 2436 + }, + { + "epoch": 1.9192595510043324, + "grad_norm": 0.9622556567192078, + "learning_rate": 7.296e-06, + "loss": 0.0892, + "step": 2437 + }, + { + "epoch": 1.9200472627018512, + "grad_norm": 0.713692843914032, + "learning_rate": 7.299e-06, + "loss": 0.0566, + "step": 2438 + }, + { + "epoch": 1.9208349743993698, + "grad_norm": 1.3861018419265747, + "learning_rate": 7.3020000000000006e-06, + "loss": 0.1046, + "step": 2439 + }, + { + "epoch": 1.9216226860968886, + "grad_norm": 1.109031081199646, + "learning_rate": 7.305e-06, + "loss": 0.0604, + "step": 2440 + }, + { + "epoch": 1.9224103977944074, + "grad_norm": 1.4496233463287354, + "learning_rate": 7.308e-06, + "loss": 0.1049, + "step": 2441 + }, + { + "epoch": 1.923198109491926, + "grad_norm": 0.9514570236206055, + "learning_rate": 7.311e-06, + "loss": 0.0663, + "step": 2442 + }, + { + "epoch": 1.9239858211894445, + "grad_norm": 1.3272591829299927, + "learning_rate": 7.314e-06, + "loss": 0.1008, + "step": 2443 + }, + { + "epoch": 1.9247735328869635, + "grad_norm": 1.397579550743103, + "learning_rate": 7.317e-06, + "loss": 0.0718, + "step": 2444 + }, + { + "epoch": 1.925561244584482, + "grad_norm": 1.4139755964279175, + "learning_rate": 7.32e-06, + "loss": 0.1092, + "step": 2445 + }, + { + "epoch": 1.9263489562820006, + "grad_norm": 0.7916646003723145, + "learning_rate": 7.323000000000001e-06, + "loss": 0.0749, + "step": 2446 + }, + { + "epoch": 1.9271366679795197, + "grad_norm": 1.2515170574188232, + "learning_rate": 7.326e-06, + "loss": 0.0936, + "step": 2447 + }, + { + "epoch": 1.9279243796770382, 
+ "grad_norm": 1.2264881134033203, + "learning_rate": 7.329e-06, + "loss": 0.0823, + "step": 2448 + }, + { + "epoch": 1.9287120913745568, + "grad_norm": 1.1842010021209717, + "learning_rate": 7.332e-06, + "loss": 0.0754, + "step": 2449 + }, + { + "epoch": 1.9294998030720756, + "grad_norm": 1.557063102722168, + "learning_rate": 7.335e-06, + "loss": 0.0852, + "step": 2450 + }, + { + "epoch": 1.9302875147695944, + "grad_norm": 1.4590787887573242, + "learning_rate": 7.338000000000001e-06, + "loss": 0.081, + "step": 2451 + }, + { + "epoch": 1.931075226467113, + "grad_norm": 1.505107045173645, + "learning_rate": 7.3410000000000005e-06, + "loss": 0.0722, + "step": 2452 + }, + { + "epoch": 1.9318629381646317, + "grad_norm": 1.2234190702438354, + "learning_rate": 7.3439999999999995e-06, + "loss": 0.0962, + "step": 2453 + }, + { + "epoch": 1.9326506498621505, + "grad_norm": 1.1784995794296265, + "learning_rate": 7.347e-06, + "loss": 0.0974, + "step": 2454 + }, + { + "epoch": 1.933438361559669, + "grad_norm": 1.5480560064315796, + "learning_rate": 7.35e-06, + "loss": 0.08, + "step": 2455 + }, + { + "epoch": 1.9342260732571879, + "grad_norm": 1.7365670204162598, + "learning_rate": 7.353000000000001e-06, + "loss": 0.06, + "step": 2456 + }, + { + "epoch": 1.9350137849547067, + "grad_norm": 1.0641436576843262, + "learning_rate": 7.356e-06, + "loss": 0.0745, + "step": 2457 + }, + { + "epoch": 1.9358014966522252, + "grad_norm": 0.9826862812042236, + "learning_rate": 7.359e-06, + "loss": 0.0998, + "step": 2458 + }, + { + "epoch": 1.936589208349744, + "grad_norm": 0.9168862700462341, + "learning_rate": 7.362e-06, + "loss": 0.0556, + "step": 2459 + }, + { + "epoch": 1.9373769200472628, + "grad_norm": 1.4913614988327026, + "learning_rate": 7.365e-06, + "loss": 0.0739, + "step": 2460 + }, + { + "epoch": 1.9381646317447814, + "grad_norm": 1.2527379989624023, + "learning_rate": 7.3680000000000004e-06, + "loss": 0.0989, + "step": 2461 + }, + { + "epoch": 1.9389523434423002, + "grad_norm": 1.4893895387649536, + "learning_rate": 7.371e-06, + "loss": 0.1011, + "step": 2462 + }, + { + "epoch": 1.939740055139819, + "grad_norm": 1.0395698547363281, + "learning_rate": 7.374e-06, + "loss": 0.0928, + "step": 2463 + }, + { + "epoch": 1.9405277668373375, + "grad_norm": 1.130310297012329, + "learning_rate": 7.377000000000001e-06, + "loss": 0.0953, + "step": 2464 + }, + { + "epoch": 1.941315478534856, + "grad_norm": 1.3030033111572266, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.084, + "step": 2465 + }, + { + "epoch": 1.942103190232375, + "grad_norm": 1.4040298461914062, + "learning_rate": 7.383e-06, + "loss": 0.0765, + "step": 2466 + }, + { + "epoch": 1.9428909019298937, + "grad_norm": 1.427956223487854, + "learning_rate": 7.386e-06, + "loss": 0.1088, + "step": 2467 + }, + { + "epoch": 1.9436786136274122, + "grad_norm": 1.3250823020935059, + "learning_rate": 7.389e-06, + "loss": 0.0895, + "step": 2468 + }, + { + "epoch": 1.944466325324931, + "grad_norm": 2.4890096187591553, + "learning_rate": 7.3920000000000005e-06, + "loss": 0.1577, + "step": 2469 + }, + { + "epoch": 1.9452540370224498, + "grad_norm": 1.4859809875488281, + "learning_rate": 7.395e-06, + "loss": 0.1431, + "step": 2470 + }, + { + "epoch": 1.9460417487199684, + "grad_norm": 3.198568820953369, + "learning_rate": 7.398000000000001e-06, + "loss": 0.4932, + "step": 2471 + }, + { + "epoch": 1.9468294604174872, + "grad_norm": 1.7046631574630737, + "learning_rate": 7.401e-06, + "loss": 0.3691, + "step": 2472 + }, + { + "epoch": 1.947617172115006, + 
"grad_norm": 1.4765695333480835, + "learning_rate": 7.404e-06, + "loss": 0.3818, + "step": 2473 + }, + { + "epoch": 1.9484048838125245, + "grad_norm": 1.5763041973114014, + "learning_rate": 7.407e-06, + "loss": 0.2447, + "step": 2474 + }, + { + "epoch": 1.9491925955100433, + "grad_norm": 2.0244405269622803, + "learning_rate": 7.41e-06, + "loss": 0.1639, + "step": 2475 + }, + { + "epoch": 1.949980307207562, + "grad_norm": 0.8694676756858826, + "learning_rate": 7.413e-06, + "loss": 0.1139, + "step": 2476 + }, + { + "epoch": 1.9507680189050807, + "grad_norm": 0.9451751708984375, + "learning_rate": 7.416000000000001e-06, + "loss": 0.094, + "step": 2477 + }, + { + "epoch": 1.9515557306025995, + "grad_norm": 1.5246028900146484, + "learning_rate": 7.419e-06, + "loss": 0.1295, + "step": 2478 + }, + { + "epoch": 1.9523434423001182, + "grad_norm": 0.9684695601463318, + "learning_rate": 7.422e-06, + "loss": 0.0621, + "step": 2479 + }, + { + "epoch": 1.9531311539976368, + "grad_norm": 1.1976838111877441, + "learning_rate": 7.425e-06, + "loss": 0.0713, + "step": 2480 + }, + { + "epoch": 1.9539188656951556, + "grad_norm": 1.2158141136169434, + "learning_rate": 7.428e-06, + "loss": 0.0776, + "step": 2481 + }, + { + "epoch": 1.9547065773926744, + "grad_norm": 1.0428968667984009, + "learning_rate": 7.4310000000000005e-06, + "loss": 0.0746, + "step": 2482 + }, + { + "epoch": 1.955494289090193, + "grad_norm": 2.1629104614257812, + "learning_rate": 7.434e-06, + "loss": 0.0655, + "step": 2483 + }, + { + "epoch": 1.9562820007877118, + "grad_norm": 0.8360955715179443, + "learning_rate": 7.437e-06, + "loss": 0.0674, + "step": 2484 + }, + { + "epoch": 1.9570697124852305, + "grad_norm": 0.8115749359130859, + "learning_rate": 7.44e-06, + "loss": 0.053, + "step": 2485 + }, + { + "epoch": 1.957857424182749, + "grad_norm": 1.0057227611541748, + "learning_rate": 7.443e-06, + "loss": 0.0834, + "step": 2486 + }, + { + "epoch": 1.9586451358802677, + "grad_norm": 1.1168603897094727, + "learning_rate": 7.446e-06, + "loss": 0.073, + "step": 2487 + }, + { + "epoch": 1.9594328475777867, + "grad_norm": 1.4318805932998657, + "learning_rate": 7.449e-06, + "loss": 0.0808, + "step": 2488 + }, + { + "epoch": 1.9602205592753053, + "grad_norm": 1.0981156826019287, + "learning_rate": 7.452000000000001e-06, + "loss": 0.0715, + "step": 2489 + }, + { + "epoch": 1.9610082709728238, + "grad_norm": 0.88680100440979, + "learning_rate": 7.455e-06, + "loss": 0.0598, + "step": 2490 + }, + { + "epoch": 1.9617959826703426, + "grad_norm": 2.1376707553863525, + "learning_rate": 7.4579999999999996e-06, + "loss": 0.0851, + "step": 2491 + }, + { + "epoch": 1.9625836943678614, + "grad_norm": 2.1176846027374268, + "learning_rate": 7.461e-06, + "loss": 0.056, + "step": 2492 + }, + { + "epoch": 1.96337140606538, + "grad_norm": 0.6373579502105713, + "learning_rate": 7.464e-06, + "loss": 0.0441, + "step": 2493 + }, + { + "epoch": 1.9641591177628988, + "grad_norm": 1.2402887344360352, + "learning_rate": 7.467000000000001e-06, + "loss": 0.0741, + "step": 2494 + }, + { + "epoch": 1.9649468294604175, + "grad_norm": 1.0458636283874512, + "learning_rate": 7.4700000000000005e-06, + "loss": 0.0831, + "step": 2495 + }, + { + "epoch": 1.965734541157936, + "grad_norm": 1.0122058391571045, + "learning_rate": 7.4729999999999994e-06, + "loss": 0.0789, + "step": 2496 + }, + { + "epoch": 1.966522252855455, + "grad_norm": 0.9607613682746887, + "learning_rate": 7.476e-06, + "loss": 0.0557, + "step": 2497 + }, + { + "epoch": 1.9673099645529737, + "grad_norm": 
1.2137937545776367, + "learning_rate": 7.479e-06, + "loss": 0.0721, + "step": 2498 + }, + { + "epoch": 1.9680976762504923, + "grad_norm": 2.4481618404388428, + "learning_rate": 7.4820000000000005e-06, + "loss": 0.0871, + "step": 2499 + }, + { + "epoch": 1.968885387948011, + "grad_norm": 1.0201526880264282, + "learning_rate": 7.485e-06, + "loss": 0.0494, + "step": 2500 + }, + { + "epoch": 1.9696730996455298, + "grad_norm": 0.8557842373847961, + "learning_rate": 7.488e-06, + "loss": 0.0548, + "step": 2501 + }, + { + "epoch": 1.9704608113430484, + "grad_norm": 0.8991197943687439, + "learning_rate": 7.491e-06, + "loss": 0.0631, + "step": 2502 + }, + { + "epoch": 1.9712485230405672, + "grad_norm": 1.6090779304504395, + "learning_rate": 7.494e-06, + "loss": 0.0615, + "step": 2503 + }, + { + "epoch": 1.972036234738086, + "grad_norm": 1.182194709777832, + "learning_rate": 7.497e-06, + "loss": 0.0932, + "step": 2504 + }, + { + "epoch": 1.9728239464356045, + "grad_norm": 1.2606788873672485, + "learning_rate": 7.5e-06, + "loss": 0.0719, + "step": 2505 + }, + { + "epoch": 1.9736116581331231, + "grad_norm": 1.0251432657241821, + "learning_rate": 7.503e-06, + "loss": 0.0651, + "step": 2506 + }, + { + "epoch": 1.9743993698306421, + "grad_norm": 1.3400393724441528, + "learning_rate": 7.506e-06, + "loss": 0.0936, + "step": 2507 + }, + { + "epoch": 1.9751870815281607, + "grad_norm": 1.1995320320129395, + "learning_rate": 7.5090000000000004e-06, + "loss": 0.0719, + "step": 2508 + }, + { + "epoch": 1.9759747932256793, + "grad_norm": 1.4134206771850586, + "learning_rate": 7.512e-06, + "loss": 0.0755, + "step": 2509 + }, + { + "epoch": 1.9767625049231983, + "grad_norm": 1.4019547700881958, + "learning_rate": 7.515e-06, + "loss": 0.0868, + "step": 2510 + }, + { + "epoch": 1.9775502166207168, + "grad_norm": 1.243980884552002, + "learning_rate": 7.518e-06, + "loss": 0.0854, + "step": 2511 + }, + { + "epoch": 1.9783379283182354, + "grad_norm": 2.9713923931121826, + "learning_rate": 7.521e-06, + "loss": 0.0802, + "step": 2512 + }, + { + "epoch": 1.9791256400157542, + "grad_norm": 1.7451715469360352, + "learning_rate": 7.524000000000001e-06, + "loss": 0.0835, + "step": 2513 + }, + { + "epoch": 1.979913351713273, + "grad_norm": 1.28589928150177, + "learning_rate": 7.527000000000001e-06, + "loss": 0.0911, + "step": 2514 + }, + { + "epoch": 1.9807010634107916, + "grad_norm": 1.186657428741455, + "learning_rate": 7.53e-06, + "loss": 0.0692, + "step": 2515 + }, + { + "epoch": 1.9814887751083103, + "grad_norm": 1.518235683441162, + "learning_rate": 7.533e-06, + "loss": 0.1013, + "step": 2516 + }, + { + "epoch": 1.9822764868058291, + "grad_norm": 1.2442052364349365, + "learning_rate": 7.5359999999999995e-06, + "loss": 0.109, + "step": 2517 + }, + { + "epoch": 1.9830641985033477, + "grad_norm": 1.4171098470687866, + "learning_rate": 7.539000000000001e-06, + "loss": 0.0817, + "step": 2518 + }, + { + "epoch": 1.9838519102008665, + "grad_norm": 2.431872606277466, + "learning_rate": 7.542000000000001e-06, + "loss": 0.0766, + "step": 2519 + }, + { + "epoch": 1.9846396218983853, + "grad_norm": 1.4022802114486694, + "learning_rate": 7.545000000000001e-06, + "loss": 0.096, + "step": 2520 + }, + { + "epoch": 1.9854273335959038, + "grad_norm": 2.5004189014434814, + "learning_rate": 7.5479999999999996e-06, + "loss": 0.5307, + "step": 2521 + }, + { + "epoch": 1.9862150452934226, + "grad_norm": 1.4771417379379272, + "learning_rate": 7.550999999999999e-06, + "loss": 0.2897, + "step": 2522 + }, + { + "epoch": 1.9870027569909414, + 
"grad_norm": 1.1503280401229858, + "learning_rate": 7.554000000000001e-06, + "loss": 0.1641, + "step": 2523 + }, + { + "epoch": 1.98779046868846, + "grad_norm": 0.7578390836715698, + "learning_rate": 7.557000000000001e-06, + "loss": 0.0856, + "step": 2524 + }, + { + "epoch": 1.9885781803859788, + "grad_norm": 0.8048362731933594, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.0757, + "step": 2525 + }, + { + "epoch": 1.9893658920834976, + "grad_norm": 0.8337692022323608, + "learning_rate": 7.563e-06, + "loss": 0.0678, + "step": 2526 + }, + { + "epoch": 1.9901536037810161, + "grad_norm": 0.77403724193573, + "learning_rate": 7.565999999999999e-06, + "loss": 0.0672, + "step": 2527 + }, + { + "epoch": 1.9909413154785347, + "grad_norm": 0.9111225008964539, + "learning_rate": 7.569000000000001e-06, + "loss": 0.0539, + "step": 2528 + }, + { + "epoch": 1.9917290271760537, + "grad_norm": 1.1829249858856201, + "learning_rate": 7.5720000000000005e-06, + "loss": 0.0483, + "step": 2529 + }, + { + "epoch": 1.9925167388735723, + "grad_norm": 0.8014804124832153, + "learning_rate": 7.575e-06, + "loss": 0.0643, + "step": 2530 + }, + { + "epoch": 1.9933044505710908, + "grad_norm": 1.466456651687622, + "learning_rate": 7.578e-06, + "loss": 0.0809, + "step": 2531 + }, + { + "epoch": 1.9940921622686096, + "grad_norm": 1.8865771293640137, + "learning_rate": 7.581e-06, + "loss": 0.0769, + "step": 2532 + }, + { + "epoch": 1.9948798739661284, + "grad_norm": 1.5091946125030518, + "learning_rate": 7.5840000000000006e-06, + "loss": 0.1056, + "step": 2533 + }, + { + "epoch": 1.995667585663647, + "grad_norm": 2.532489061355591, + "learning_rate": 7.587e-06, + "loss": 0.1109, + "step": 2534 + }, + { + "epoch": 1.9964552973611658, + "grad_norm": 1.2436119318008423, + "learning_rate": 7.59e-06, + "loss": 0.0637, + "step": 2535 + }, + { + "epoch": 1.9972430090586846, + "grad_norm": 1.2818872928619385, + "learning_rate": 7.593e-06, + "loss": 0.1033, + "step": 2536 + }, + { + "epoch": 1.9980307207562031, + "grad_norm": 1.2747893333435059, + "learning_rate": 7.596e-06, + "loss": 0.0866, + "step": 2537 + }, + { + "epoch": 1.998818432453722, + "grad_norm": 1.5915292501449585, + "learning_rate": 7.599000000000001e-06, + "loss": 0.0641, + "step": 2538 + }, + { + "epoch": 1.9996061441512407, + "grad_norm": 1.2853131294250488, + "learning_rate": 7.602e-06, + "loss": 0.0684, + "step": 2539 + }, + { + "epoch": 2.0, + "grad_norm": 4.802477836608887, + "learning_rate": 7.605e-06, + "loss": 0.1012, + "step": 2540 + }, + { + "epoch": 2.0007877116975186, + "grad_norm": 4.842461109161377, + "learning_rate": 7.608e-06, + "loss": 0.5248, + "step": 2541 + }, + { + "epoch": 2.0015754233950376, + "grad_norm": 2.401002883911133, + "learning_rate": 7.611e-06, + "loss": 0.4502, + "step": 2542 + }, + { + "epoch": 2.002363135092556, + "grad_norm": 2.222205400466919, + "learning_rate": 7.614000000000001e-06, + "loss": 0.3643, + "step": 2543 + }, + { + "epoch": 2.0031508467900747, + "grad_norm": 3.3895583152770996, + "learning_rate": 7.617000000000001e-06, + "loss": 0.3532, + "step": 2544 + }, + { + "epoch": 2.0039385584875937, + "grad_norm": 1.6866862773895264, + "learning_rate": 7.62e-06, + "loss": 0.2989, + "step": 2545 + }, + { + "epoch": 2.0047262701851123, + "grad_norm": 1.2517750263214111, + "learning_rate": 7.623e-06, + "loss": 0.1669, + "step": 2546 + }, + { + "epoch": 2.005513981882631, + "grad_norm": 0.889205813407898, + "learning_rate": 7.6259999999999995e-06, + "loss": 0.1027, + "step": 2547 + }, + { + "epoch": 2.00630169358015, + 
"grad_norm": 0.8697429299354553, + "learning_rate": 7.629000000000001e-06, + "loss": 0.0952, + "step": 2548 + }, + { + "epoch": 2.0070894052776684, + "grad_norm": 0.8823580145835876, + "learning_rate": 7.632e-06, + "loss": 0.0717, + "step": 2549 + }, + { + "epoch": 2.007877116975187, + "grad_norm": 0.9503212571144104, + "learning_rate": 7.635e-06, + "loss": 0.0613, + "step": 2550 + }, + { + "epoch": 2.0086648286727056, + "grad_norm": 1.1391247510910034, + "learning_rate": 7.638e-06, + "loss": 0.0669, + "step": 2551 + }, + { + "epoch": 2.0094525403702246, + "grad_norm": 1.217852234840393, + "learning_rate": 7.641e-06, + "loss": 0.0564, + "step": 2552 + }, + { + "epoch": 2.010240252067743, + "grad_norm": 6.007042407989502, + "learning_rate": 7.644000000000002e-06, + "loss": 0.0862, + "step": 2553 + }, + { + "epoch": 2.0110279637652617, + "grad_norm": 0.9431589245796204, + "learning_rate": 7.647000000000001e-06, + "loss": 0.0647, + "step": 2554 + }, + { + "epoch": 2.0118156754627807, + "grad_norm": 0.8239688277244568, + "learning_rate": 7.65e-06, + "loss": 0.063, + "step": 2555 + }, + { + "epoch": 2.0126033871602993, + "grad_norm": 0.8479486703872681, + "learning_rate": 7.653e-06, + "loss": 0.0675, + "step": 2556 + }, + { + "epoch": 2.013391098857818, + "grad_norm": 0.9845707416534424, + "learning_rate": 7.656e-06, + "loss": 0.0628, + "step": 2557 + }, + { + "epoch": 2.014178810555337, + "grad_norm": 1.0466948747634888, + "learning_rate": 7.659e-06, + "loss": 0.0707, + "step": 2558 + }, + { + "epoch": 2.0149665222528554, + "grad_norm": 1.72593092918396, + "learning_rate": 7.662e-06, + "loss": 0.0689, + "step": 2559 + }, + { + "epoch": 2.015754233950374, + "grad_norm": 0.777014434337616, + "learning_rate": 7.665e-06, + "loss": 0.0551, + "step": 2560 + }, + { + "epoch": 2.016541945647893, + "grad_norm": 8.32462215423584, + "learning_rate": 7.668e-06, + "loss": 0.0604, + "step": 2561 + }, + { + "epoch": 2.0173296573454116, + "grad_norm": 1.0936064720153809, + "learning_rate": 7.671e-06, + "loss": 0.0493, + "step": 2562 + }, + { + "epoch": 2.01811736904293, + "grad_norm": 0.8729085326194763, + "learning_rate": 7.674000000000001e-06, + "loss": 0.0626, + "step": 2563 + }, + { + "epoch": 2.018905080740449, + "grad_norm": 0.7894073128700256, + "learning_rate": 7.677000000000001e-06, + "loss": 0.0696, + "step": 2564 + }, + { + "epoch": 2.0196927924379677, + "grad_norm": 0.8519641757011414, + "learning_rate": 7.680000000000001e-06, + "loss": 0.1338, + "step": 2565 + }, + { + "epoch": 2.0204805041354863, + "grad_norm": 0.8034432530403137, + "learning_rate": 7.683e-06, + "loss": 0.0544, + "step": 2566 + }, + { + "epoch": 2.0212682158330053, + "grad_norm": 0.9006732106208801, + "learning_rate": 7.685999999999999e-06, + "loss": 0.0645, + "step": 2567 + }, + { + "epoch": 2.022055927530524, + "grad_norm": 1.1045329570770264, + "learning_rate": 7.688999999999999e-06, + "loss": 0.0725, + "step": 2568 + }, + { + "epoch": 2.0228436392280424, + "grad_norm": 1.3452540636062622, + "learning_rate": 7.692e-06, + "loss": 0.0522, + "step": 2569 + }, + { + "epoch": 2.0236313509255615, + "grad_norm": 0.8418027758598328, + "learning_rate": 7.695e-06, + "loss": 0.0599, + "step": 2570 + }, + { + "epoch": 2.02441906262308, + "grad_norm": 1.2666722536087036, + "learning_rate": 7.698e-06, + "loss": 0.0692, + "step": 2571 + }, + { + "epoch": 2.0252067743205986, + "grad_norm": 1.66085946559906, + "learning_rate": 7.701e-06, + "loss": 0.0697, + "step": 2572 + }, + { + "epoch": 2.025994486018117, + "grad_norm": 0.9680288434028625, 
+ "learning_rate": 7.704e-06, + "loss": 0.067, + "step": 2573 + }, + { + "epoch": 2.026782197715636, + "grad_norm": 1.0958698987960815, + "learning_rate": 7.707000000000001e-06, + "loss": 0.0484, + "step": 2574 + }, + { + "epoch": 2.0275699094131547, + "grad_norm": 1.017435908317566, + "learning_rate": 7.71e-06, + "loss": 0.0625, + "step": 2575 + }, + { + "epoch": 2.0283576211106733, + "grad_norm": 1.1608210802078247, + "learning_rate": 7.713e-06, + "loss": 0.0662, + "step": 2576 + }, + { + "epoch": 2.0291453328081923, + "grad_norm": 1.1511636972427368, + "learning_rate": 7.716e-06, + "loss": 0.0731, + "step": 2577 + }, + { + "epoch": 2.029933044505711, + "grad_norm": 0.8167786598205566, + "learning_rate": 7.719e-06, + "loss": 0.0702, + "step": 2578 + }, + { + "epoch": 2.0307207562032294, + "grad_norm": 0.8636009693145752, + "learning_rate": 7.722e-06, + "loss": 0.0637, + "step": 2579 + }, + { + "epoch": 2.0315084679007485, + "grad_norm": 1.7523142099380493, + "learning_rate": 7.725e-06, + "loss": 0.0578, + "step": 2580 + }, + { + "epoch": 2.032296179598267, + "grad_norm": 0.8487754464149475, + "learning_rate": 7.728e-06, + "loss": 0.0621, + "step": 2581 + }, + { + "epoch": 2.0330838912957856, + "grad_norm": 1.5293444395065308, + "learning_rate": 7.731e-06, + "loss": 0.0878, + "step": 2582 + }, + { + "epoch": 2.0338716029933046, + "grad_norm": 1.1942033767700195, + "learning_rate": 7.733999999999999e-06, + "loss": 0.0791, + "step": 2583 + }, + { + "epoch": 2.034659314690823, + "grad_norm": 1.7914608716964722, + "learning_rate": 7.737e-06, + "loss": 0.0911, + "step": 2584 + }, + { + "epoch": 2.0354470263883417, + "grad_norm": 1.003233551979065, + "learning_rate": 7.74e-06, + "loss": 0.0733, + "step": 2585 + }, + { + "epoch": 2.0362347380858608, + "grad_norm": 1.1346207857131958, + "learning_rate": 7.743e-06, + "loss": 0.0574, + "step": 2586 + }, + { + "epoch": 2.0370224497833793, + "grad_norm": 1.3265892267227173, + "learning_rate": 7.746e-06, + "loss": 0.091, + "step": 2587 + }, + { + "epoch": 2.037810161480898, + "grad_norm": 1.760952115058899, + "learning_rate": 7.749e-06, + "loss": 0.0995, + "step": 2588 + }, + { + "epoch": 2.038597873178417, + "grad_norm": 3.3742423057556152, + "learning_rate": 7.752000000000001e-06, + "loss": 0.1011, + "step": 2589 + }, + { + "epoch": 2.0393855848759355, + "grad_norm": 1.770318627357483, + "learning_rate": 7.755000000000001e-06, + "loss": 0.1026, + "step": 2590 + }, + { + "epoch": 2.040173296573454, + "grad_norm": 2.0459213256835938, + "learning_rate": 7.758000000000001e-06, + "loss": 0.5887, + "step": 2591 + }, + { + "epoch": 2.0409610082709726, + "grad_norm": 1.4506429433822632, + "learning_rate": 7.760999999999999e-06, + "loss": 0.4537, + "step": 2592 + }, + { + "epoch": 2.0417487199684916, + "grad_norm": 1.8141508102416992, + "learning_rate": 7.763999999999999e-06, + "loss": 0.4385, + "step": 2593 + }, + { + "epoch": 2.04253643166601, + "grad_norm": 1.4442614316940308, + "learning_rate": 7.767e-06, + "loss": 0.2801, + "step": 2594 + }, + { + "epoch": 2.0433241433635287, + "grad_norm": 1.5527526140213013, + "learning_rate": 7.77e-06, + "loss": 0.224, + "step": 2595 + }, + { + "epoch": 2.0441118550610478, + "grad_norm": 1.1800810098648071, + "learning_rate": 7.773e-06, + "loss": 0.1765, + "step": 2596 + }, + { + "epoch": 2.0448995667585663, + "grad_norm": 1.3431988954544067, + "learning_rate": 7.776e-06, + "loss": 0.1085, + "step": 2597 + }, + { + "epoch": 2.045687278456085, + "grad_norm": 0.9251765012741089, + "learning_rate": 7.779e-06, + "loss": 
0.1213, + "step": 2598 + }, + { + "epoch": 2.046474990153604, + "grad_norm": 0.828347384929657, + "learning_rate": 7.782000000000001e-06, + "loss": 0.0722, + "step": 2599 + }, + { + "epoch": 2.0472627018511225, + "grad_norm": 1.0454021692276, + "learning_rate": 7.785000000000001e-06, + "loss": 0.0924, + "step": 2600 + }, + { + "epoch": 2.048050413548641, + "grad_norm": 0.9096385836601257, + "learning_rate": 7.788e-06, + "loss": 0.0687, + "step": 2601 + }, + { + "epoch": 2.04883812524616, + "grad_norm": 1.2608880996704102, + "learning_rate": 7.791e-06, + "loss": 0.0683, + "step": 2602 + }, + { + "epoch": 2.0496258369436786, + "grad_norm": 0.8743247985839844, + "learning_rate": 7.794e-06, + "loss": 0.0756, + "step": 2603 + }, + { + "epoch": 2.050413548641197, + "grad_norm": 0.9776881337165833, + "learning_rate": 7.797e-06, + "loss": 0.0534, + "step": 2604 + }, + { + "epoch": 2.051201260338716, + "grad_norm": 1.1400154829025269, + "learning_rate": 7.8e-06, + "loss": 0.0701, + "step": 2605 + }, + { + "epoch": 2.0519889720362348, + "grad_norm": 0.8439685702323914, + "learning_rate": 7.803e-06, + "loss": 0.0526, + "step": 2606 + }, + { + "epoch": 2.0527766837337533, + "grad_norm": 1.1860524415969849, + "learning_rate": 7.806e-06, + "loss": 0.07, + "step": 2607 + }, + { + "epoch": 2.0535643954312723, + "grad_norm": 0.7421343922615051, + "learning_rate": 7.809e-06, + "loss": 0.0519, + "step": 2608 + }, + { + "epoch": 2.054352107128791, + "grad_norm": 1.1580297946929932, + "learning_rate": 7.812e-06, + "loss": 0.0658, + "step": 2609 + }, + { + "epoch": 2.0551398188263095, + "grad_norm": 1.5507792234420776, + "learning_rate": 7.815e-06, + "loss": 0.079, + "step": 2610 + }, + { + "epoch": 2.0559275305238285, + "grad_norm": 1.0513571500778198, + "learning_rate": 7.818e-06, + "loss": 0.1003, + "step": 2611 + }, + { + "epoch": 2.056715242221347, + "grad_norm": 1.0110679864883423, + "learning_rate": 7.821e-06, + "loss": 0.0541, + "step": 2612 + }, + { + "epoch": 2.0575029539188656, + "grad_norm": 0.7496577501296997, + "learning_rate": 7.824e-06, + "loss": 0.0506, + "step": 2613 + }, + { + "epoch": 2.058290665616384, + "grad_norm": 0.9589276909828186, + "learning_rate": 7.827000000000001e-06, + "loss": 0.0561, + "step": 2614 + }, + { + "epoch": 2.059078377313903, + "grad_norm": 1.0383049249649048, + "learning_rate": 7.830000000000001e-06, + "loss": 0.0694, + "step": 2615 + }, + { + "epoch": 2.0598660890114218, + "grad_norm": 1.4069181680679321, + "learning_rate": 7.833e-06, + "loss": 0.0926, + "step": 2616 + }, + { + "epoch": 2.0606538007089403, + "grad_norm": 0.8573128581047058, + "learning_rate": 7.836e-06, + "loss": 0.0535, + "step": 2617 + }, + { + "epoch": 2.0614415124064593, + "grad_norm": 0.9917516112327576, + "learning_rate": 7.838999999999999e-06, + "loss": 0.0698, + "step": 2618 + }, + { + "epoch": 2.062229224103978, + "grad_norm": 0.969744861125946, + "learning_rate": 7.842e-06, + "loss": 0.0496, + "step": 2619 + }, + { + "epoch": 2.0630169358014965, + "grad_norm": 0.8770263195037842, + "learning_rate": 7.845e-06, + "loss": 0.0592, + "step": 2620 + }, + { + "epoch": 2.0638046474990155, + "grad_norm": 1.5929876565933228, + "learning_rate": 7.848e-06, + "loss": 0.0784, + "step": 2621 + }, + { + "epoch": 2.064592359196534, + "grad_norm": 1.254394292831421, + "learning_rate": 7.851e-06, + "loss": 0.0694, + "step": 2622 + }, + { + "epoch": 2.0653800708940526, + "grad_norm": 1.0960133075714111, + "learning_rate": 7.854e-06, + "loss": 0.1002, + "step": 2623 + }, + { + "epoch": 2.0661677825915716, + 
"grad_norm": 0.7240921854972839, + "learning_rate": 7.857000000000001e-06, + "loss": 0.0549, + "step": 2624 + }, + { + "epoch": 2.06695549428909, + "grad_norm": 1.0217902660369873, + "learning_rate": 7.860000000000001e-06, + "loss": 0.0606, + "step": 2625 + }, + { + "epoch": 2.0677432059866088, + "grad_norm": 1.034948468208313, + "learning_rate": 7.863e-06, + "loss": 0.0712, + "step": 2626 + }, + { + "epoch": 2.068530917684128, + "grad_norm": 1.231775164604187, + "learning_rate": 7.866e-06, + "loss": 0.0693, + "step": 2627 + }, + { + "epoch": 2.0693186293816463, + "grad_norm": 1.7613251209259033, + "learning_rate": 7.868999999999999e-06, + "loss": 0.0883, + "step": 2628 + }, + { + "epoch": 2.070106341079165, + "grad_norm": 0.9193368554115295, + "learning_rate": 7.872e-06, + "loss": 0.074, + "step": 2629 + }, + { + "epoch": 2.070894052776684, + "grad_norm": 3.0530951023101807, + "learning_rate": 7.875e-06, + "loss": 0.0851, + "step": 2630 + }, + { + "epoch": 2.0716817644742025, + "grad_norm": 0.8381359577178955, + "learning_rate": 7.878e-06, + "loss": 0.0497, + "step": 2631 + }, + { + "epoch": 2.072469476171721, + "grad_norm": 1.075322151184082, + "learning_rate": 7.881e-06, + "loss": 0.08, + "step": 2632 + }, + { + "epoch": 2.07325718786924, + "grad_norm": 1.386710286140442, + "learning_rate": 7.884e-06, + "loss": 0.0689, + "step": 2633 + }, + { + "epoch": 2.0740448995667586, + "grad_norm": 1.1593276262283325, + "learning_rate": 7.887000000000001e-06, + "loss": 0.113, + "step": 2634 + }, + { + "epoch": 2.074832611264277, + "grad_norm": 1.605992078781128, + "learning_rate": 7.89e-06, + "loss": 0.1024, + "step": 2635 + }, + { + "epoch": 2.0756203229617958, + "grad_norm": 2.776090145111084, + "learning_rate": 7.893e-06, + "loss": 0.1111, + "step": 2636 + }, + { + "epoch": 2.076408034659315, + "grad_norm": 1.3665746450424194, + "learning_rate": 7.896e-06, + "loss": 0.0918, + "step": 2637 + }, + { + "epoch": 2.0771957463568334, + "grad_norm": 1.2126588821411133, + "learning_rate": 7.899e-06, + "loss": 0.0797, + "step": 2638 + }, + { + "epoch": 2.077983458054352, + "grad_norm": 1.5724395513534546, + "learning_rate": 7.902000000000002e-06, + "loss": 0.1052, + "step": 2639 + }, + { + "epoch": 2.078771169751871, + "grad_norm": 1.73506498336792, + "learning_rate": 7.905000000000001e-06, + "loss": 0.1, + "step": 2640 + }, + { + "epoch": 2.0795588814493895, + "grad_norm": 1.8650051355361938, + "learning_rate": 7.908e-06, + "loss": 0.5169, + "step": 2641 + }, + { + "epoch": 2.080346593146908, + "grad_norm": 8.980195999145508, + "learning_rate": 7.911e-06, + "loss": 0.4309, + "step": 2642 + }, + { + "epoch": 2.081134304844427, + "grad_norm": 1.9408119916915894, + "learning_rate": 7.913999999999999e-06, + "loss": 0.4098, + "step": 2643 + }, + { + "epoch": 2.0819220165419456, + "grad_norm": 1.8114919662475586, + "learning_rate": 7.917e-06, + "loss": 0.4028, + "step": 2644 + }, + { + "epoch": 2.082709728239464, + "grad_norm": 1.7498717308044434, + "learning_rate": 7.92e-06, + "loss": 0.2921, + "step": 2645 + }, + { + "epoch": 2.083497439936983, + "grad_norm": 0.7408292889595032, + "learning_rate": 7.923e-06, + "loss": 0.0936, + "step": 2646 + }, + { + "epoch": 2.084285151634502, + "grad_norm": 0.7337812781333923, + "learning_rate": 7.926e-06, + "loss": 0.0823, + "step": 2647 + }, + { + "epoch": 2.0850728633320204, + "grad_norm": 1.4787307977676392, + "learning_rate": 7.929e-06, + "loss": 0.1067, + "step": 2648 + }, + { + "epoch": 2.0858605750295394, + "grad_norm": 0.9419681429862976, + "learning_rate": 
7.932000000000001e-06, + "loss": 0.1421, + "step": 2649 + }, + { + "epoch": 2.086648286727058, + "grad_norm": 1.1552437543869019, + "learning_rate": 7.935000000000001e-06, + "loss": 0.0602, + "step": 2650 + }, + { + "epoch": 2.0874359984245765, + "grad_norm": 0.6302905082702637, + "learning_rate": 7.938000000000001e-06, + "loss": 0.0591, + "step": 2651 + }, + { + "epoch": 2.0882237101220955, + "grad_norm": 0.7266610860824585, + "learning_rate": 7.941e-06, + "loss": 0.0745, + "step": 2652 + }, + { + "epoch": 2.089011421819614, + "grad_norm": 0.7345747947692871, + "learning_rate": 7.943999999999999e-06, + "loss": 0.0492, + "step": 2653 + }, + { + "epoch": 2.0897991335171326, + "grad_norm": 0.80438631772995, + "learning_rate": 7.947e-06, + "loss": 0.037, + "step": 2654 + }, + { + "epoch": 2.090586845214651, + "grad_norm": 0.8588997721672058, + "learning_rate": 7.95e-06, + "loss": 0.062, + "step": 2655 + }, + { + "epoch": 2.0913745569121702, + "grad_norm": 0.6899651885032654, + "learning_rate": 7.953e-06, + "loss": 0.0524, + "step": 2656 + }, + { + "epoch": 2.092162268609689, + "grad_norm": 0.6761516332626343, + "learning_rate": 7.956e-06, + "loss": 0.0774, + "step": 2657 + }, + { + "epoch": 2.0929499803072074, + "grad_norm": 1.22991943359375, + "learning_rate": 7.959e-06, + "loss": 0.0818, + "step": 2658 + }, + { + "epoch": 2.0937376920047264, + "grad_norm": 1.2739388942718506, + "learning_rate": 7.962000000000001e-06, + "loss": 0.0671, + "step": 2659 + }, + { + "epoch": 2.094525403702245, + "grad_norm": 1.1749393939971924, + "learning_rate": 7.965e-06, + "loss": 0.0524, + "step": 2660 + }, + { + "epoch": 2.0953131153997635, + "grad_norm": 0.9988558888435364, + "learning_rate": 7.968e-06, + "loss": 0.0639, + "step": 2661 + }, + { + "epoch": 2.0961008270972825, + "grad_norm": 0.8179292678833008, + "learning_rate": 7.971e-06, + "loss": 0.0601, + "step": 2662 + }, + { + "epoch": 2.096888538794801, + "grad_norm": 0.9701697826385498, + "learning_rate": 7.974e-06, + "loss": 0.0583, + "step": 2663 + }, + { + "epoch": 2.0976762504923196, + "grad_norm": 0.8374482989311218, + "learning_rate": 7.977000000000002e-06, + "loss": 0.0577, + "step": 2664 + }, + { + "epoch": 2.0984639621898387, + "grad_norm": 0.932188868522644, + "learning_rate": 7.98e-06, + "loss": 0.0615, + "step": 2665 + }, + { + "epoch": 2.0992516738873572, + "grad_norm": 0.8556964993476868, + "learning_rate": 7.983e-06, + "loss": 0.0794, + "step": 2666 + }, + { + "epoch": 2.100039385584876, + "grad_norm": 0.8919934630393982, + "learning_rate": 7.986e-06, + "loss": 0.059, + "step": 2667 + }, + { + "epoch": 2.100827097282395, + "grad_norm": 0.6644260287284851, + "learning_rate": 7.989e-06, + "loss": 0.0438, + "step": 2668 + }, + { + "epoch": 2.1016148089799134, + "grad_norm": 0.8310590982437134, + "learning_rate": 7.992e-06, + "loss": 0.0539, + "step": 2669 + }, + { + "epoch": 2.102402520677432, + "grad_norm": 0.6966724991798401, + "learning_rate": 7.995e-06, + "loss": 0.057, + "step": 2670 + }, + { + "epoch": 2.103190232374951, + "grad_norm": 0.8466404676437378, + "learning_rate": 7.998e-06, + "loss": 0.0535, + "step": 2671 + }, + { + "epoch": 2.1039779440724695, + "grad_norm": 0.6984305381774902, + "learning_rate": 8.001e-06, + "loss": 0.0392, + "step": 2672 + }, + { + "epoch": 2.104765655769988, + "grad_norm": 0.9720793962478638, + "learning_rate": 8.004e-06, + "loss": 0.0963, + "step": 2673 + }, + { + "epoch": 2.105553367467507, + "grad_norm": 0.9002019166946411, + "learning_rate": 8.007000000000001e-06, + "loss": 0.051, + "step": 2674 
+ }, + { + "epoch": 2.1063410791650257, + "grad_norm": 1.0461755990982056, + "learning_rate": 8.010000000000001e-06, + "loss": 0.0452, + "step": 2675 + }, + { + "epoch": 2.1071287908625442, + "grad_norm": 0.8720105886459351, + "learning_rate": 8.013000000000001e-06, + "loss": 0.0548, + "step": 2676 + }, + { + "epoch": 2.1079165025600632, + "grad_norm": 0.8540534377098083, + "learning_rate": 8.016e-06, + "loss": 0.0707, + "step": 2677 + }, + { + "epoch": 2.108704214257582, + "grad_norm": 0.8751107454299927, + "learning_rate": 8.018999999999999e-06, + "loss": 0.0572, + "step": 2678 + }, + { + "epoch": 2.1094919259551004, + "grad_norm": 1.1128594875335693, + "learning_rate": 8.022e-06, + "loss": 0.0969, + "step": 2679 + }, + { + "epoch": 2.110279637652619, + "grad_norm": 1.3640397787094116, + "learning_rate": 8.025e-06, + "loss": 0.0949, + "step": 2680 + }, + { + "epoch": 2.111067349350138, + "grad_norm": 0.9528173804283142, + "learning_rate": 8.028e-06, + "loss": 0.0608, + "step": 2681 + }, + { + "epoch": 2.1118550610476565, + "grad_norm": 1.4233872890472412, + "learning_rate": 8.031e-06, + "loss": 0.0732, + "step": 2682 + }, + { + "epoch": 2.112642772745175, + "grad_norm": 1.2400612831115723, + "learning_rate": 8.034e-06, + "loss": 0.1136, + "step": 2683 + }, + { + "epoch": 2.113430484442694, + "grad_norm": 1.349261999130249, + "learning_rate": 8.037000000000001e-06, + "loss": 0.0988, + "step": 2684 + }, + { + "epoch": 2.1142181961402127, + "grad_norm": 3.2679386138916016, + "learning_rate": 8.040000000000001e-06, + "loss": 0.0927, + "step": 2685 + }, + { + "epoch": 2.1150059078377312, + "grad_norm": 1.160031795501709, + "learning_rate": 8.043e-06, + "loss": 0.0832, + "step": 2686 + }, + { + "epoch": 2.1157936195352502, + "grad_norm": 1.586512565612793, + "learning_rate": 8.046e-06, + "loss": 0.0656, + "step": 2687 + }, + { + "epoch": 2.116581331232769, + "grad_norm": 1.9871876239776611, + "learning_rate": 8.049e-06, + "loss": 0.0883, + "step": 2688 + }, + { + "epoch": 2.1173690429302874, + "grad_norm": 2.6265275478363037, + "learning_rate": 8.052000000000002e-06, + "loss": 0.1034, + "step": 2689 + }, + { + "epoch": 2.1181567546278064, + "grad_norm": 1.1153569221496582, + "learning_rate": 8.055e-06, + "loss": 0.0858, + "step": 2690 + }, + { + "epoch": 2.118944466325325, + "grad_norm": 2.759939432144165, + "learning_rate": 8.058e-06, + "loss": 0.5048, + "step": 2691 + }, + { + "epoch": 2.1197321780228435, + "grad_norm": 1.2955858707427979, + "learning_rate": 8.061e-06, + "loss": 0.3712, + "step": 2692 + }, + { + "epoch": 2.1205198897203625, + "grad_norm": 1.652376651763916, + "learning_rate": 8.064e-06, + "loss": 0.39, + "step": 2693 + }, + { + "epoch": 2.121307601417881, + "grad_norm": 2.1535897254943848, + "learning_rate": 8.067e-06, + "loss": 0.3735, + "step": 2694 + }, + { + "epoch": 2.1220953131153997, + "grad_norm": 2.028573751449585, + "learning_rate": 8.07e-06, + "loss": 0.3022, + "step": 2695 + }, + { + "epoch": 2.1228830248129187, + "grad_norm": 1.1278220415115356, + "learning_rate": 8.073e-06, + "loss": 0.1774, + "step": 2696 + }, + { + "epoch": 2.1236707365104373, + "grad_norm": 0.9761858582496643, + "learning_rate": 8.076e-06, + "loss": 0.0916, + "step": 2697 + }, + { + "epoch": 2.124458448207956, + "grad_norm": 0.648078441619873, + "learning_rate": 8.079e-06, + "loss": 0.0665, + "step": 2698 + }, + { + "epoch": 2.1252461599054744, + "grad_norm": 1.2319501638412476, + "learning_rate": 8.082e-06, + "loss": 0.0751, + "step": 2699 + }, + { + "epoch": 2.1260338716029934, + 
"grad_norm": 0.7692868709564209, + "learning_rate": 8.085000000000001e-06, + "loss": 0.0824, + "step": 2700 + }, + { + "epoch": 2.126821583300512, + "grad_norm": 1.218576192855835, + "learning_rate": 8.088000000000001e-06, + "loss": 0.0701, + "step": 2701 + }, + { + "epoch": 2.1276092949980305, + "grad_norm": 0.8075659871101379, + "learning_rate": 8.091e-06, + "loss": 0.0538, + "step": 2702 + }, + { + "epoch": 2.1283970066955495, + "grad_norm": 0.8182926177978516, + "learning_rate": 8.093999999999999e-06, + "loss": 0.0725, + "step": 2703 + }, + { + "epoch": 2.129184718393068, + "grad_norm": 0.6147918701171875, + "learning_rate": 8.096999999999999e-06, + "loss": 0.0446, + "step": 2704 + }, + { + "epoch": 2.1299724300905867, + "grad_norm": 0.7520703077316284, + "learning_rate": 8.1e-06, + "loss": 0.0514, + "step": 2705 + }, + { + "epoch": 2.1307601417881057, + "grad_norm": 0.7944921255111694, + "learning_rate": 8.103e-06, + "loss": 0.0654, + "step": 2706 + }, + { + "epoch": 2.1315478534856243, + "grad_norm": 0.8337500095367432, + "learning_rate": 8.106e-06, + "loss": 0.0551, + "step": 2707 + }, + { + "epoch": 2.132335565183143, + "grad_norm": 1.353086233139038, + "learning_rate": 8.109e-06, + "loss": 0.0714, + "step": 2708 + }, + { + "epoch": 2.133123276880662, + "grad_norm": 0.9069942235946655, + "learning_rate": 8.112e-06, + "loss": 0.0665, + "step": 2709 + }, + { + "epoch": 2.1339109885781804, + "grad_norm": 1.0326476097106934, + "learning_rate": 8.115000000000001e-06, + "loss": 0.0628, + "step": 2710 + }, + { + "epoch": 2.134698700275699, + "grad_norm": 1.241828203201294, + "learning_rate": 8.118000000000001e-06, + "loss": 0.0798, + "step": 2711 + }, + { + "epoch": 2.135486411973218, + "grad_norm": 0.8040323853492737, + "learning_rate": 8.121e-06, + "loss": 0.0723, + "step": 2712 + }, + { + "epoch": 2.1362741236707365, + "grad_norm": 0.8648563027381897, + "learning_rate": 8.124e-06, + "loss": 0.0478, + "step": 2713 + }, + { + "epoch": 2.137061835368255, + "grad_norm": 1.1646199226379395, + "learning_rate": 8.126999999999999e-06, + "loss": 0.0872, + "step": 2714 + }, + { + "epoch": 2.137849547065774, + "grad_norm": 0.9676786065101624, + "learning_rate": 8.13e-06, + "loss": 0.0684, + "step": 2715 + }, + { + "epoch": 2.1386372587632927, + "grad_norm": 1.0093342065811157, + "learning_rate": 8.133e-06, + "loss": 0.0598, + "step": 2716 + }, + { + "epoch": 2.1394249704608113, + "grad_norm": 0.9303480982780457, + "learning_rate": 8.136e-06, + "loss": 0.0847, + "step": 2717 + }, + { + "epoch": 2.14021268215833, + "grad_norm": 1.4784173965454102, + "learning_rate": 8.139e-06, + "loss": 0.0853, + "step": 2718 + }, + { + "epoch": 2.141000393855849, + "grad_norm": 0.9636557102203369, + "learning_rate": 8.142e-06, + "loss": 0.0439, + "step": 2719 + }, + { + "epoch": 2.1417881055533674, + "grad_norm": 1.3623881340026855, + "learning_rate": 8.145e-06, + "loss": 0.104, + "step": 2720 + }, + { + "epoch": 2.142575817250886, + "grad_norm": 1.4317779541015625, + "learning_rate": 8.148e-06, + "loss": 0.0818, + "step": 2721 + }, + { + "epoch": 2.143363528948405, + "grad_norm": 0.9398097395896912, + "learning_rate": 8.151e-06, + "loss": 0.0539, + "step": 2722 + }, + { + "epoch": 2.1441512406459236, + "grad_norm": 1.2533085346221924, + "learning_rate": 8.154e-06, + "loss": 0.0777, + "step": 2723 + }, + { + "epoch": 2.144938952343442, + "grad_norm": 1.361243486404419, + "learning_rate": 8.157e-06, + "loss": 0.0812, + "step": 2724 + }, + { + "epoch": 2.145726664040961, + "grad_norm": 0.8773424625396729, + 
"learning_rate": 8.160000000000001e-06, + "loss": 0.0514, + "step": 2725 + }, + { + "epoch": 2.1465143757384797, + "grad_norm": 0.8638513088226318, + "learning_rate": 8.163000000000001e-06, + "loss": 0.0694, + "step": 2726 + }, + { + "epoch": 2.1473020874359983, + "grad_norm": 1.1062428951263428, + "learning_rate": 8.166e-06, + "loss": 0.0522, + "step": 2727 + }, + { + "epoch": 2.1480897991335173, + "grad_norm": 1.6875967979431152, + "learning_rate": 8.169e-06, + "loss": 0.0753, + "step": 2728 + }, + { + "epoch": 2.148877510831036, + "grad_norm": 1.1032086610794067, + "learning_rate": 8.171999999999999e-06, + "loss": 0.08, + "step": 2729 + }, + { + "epoch": 2.1496652225285544, + "grad_norm": 1.0499505996704102, + "learning_rate": 8.175e-06, + "loss": 0.059, + "step": 2730 + }, + { + "epoch": 2.1504529342260734, + "grad_norm": 1.1621694564819336, + "learning_rate": 8.178e-06, + "loss": 0.0774, + "step": 2731 + }, + { + "epoch": 2.151240645923592, + "grad_norm": 1.1806812286376953, + "learning_rate": 8.181e-06, + "loss": 0.0706, + "step": 2732 + }, + { + "epoch": 2.1520283576211106, + "grad_norm": 1.2233983278274536, + "learning_rate": 8.184e-06, + "loss": 0.0923, + "step": 2733 + }, + { + "epoch": 2.1528160693186296, + "grad_norm": 0.9194537401199341, + "learning_rate": 8.187e-06, + "loss": 0.0511, + "step": 2734 + }, + { + "epoch": 2.153603781016148, + "grad_norm": 1.3426377773284912, + "learning_rate": 8.190000000000001e-06, + "loss": 0.0744, + "step": 2735 + }, + { + "epoch": 2.1543914927136667, + "grad_norm": 1.0333696603775024, + "learning_rate": 8.193000000000001e-06, + "loss": 0.0605, + "step": 2736 + }, + { + "epoch": 2.1551792044111857, + "grad_norm": 1.4155981540679932, + "learning_rate": 8.196e-06, + "loss": 0.0794, + "step": 2737 + }, + { + "epoch": 2.1559669161087043, + "grad_norm": 1.7944855690002441, + "learning_rate": 8.199e-06, + "loss": 0.0919, + "step": 2738 + }, + { + "epoch": 2.156754627806223, + "grad_norm": 1.6244930028915405, + "learning_rate": 8.201999999999999e-06, + "loss": 0.0829, + "step": 2739 + }, + { + "epoch": 2.157542339503742, + "grad_norm": 1.4052249193191528, + "learning_rate": 8.205e-06, + "loss": 0.1098, + "step": 2740 + }, + { + "epoch": 2.1583300512012604, + "grad_norm": 1.6827404499053955, + "learning_rate": 8.208e-06, + "loss": 0.4249, + "step": 2741 + }, + { + "epoch": 2.159117762898779, + "grad_norm": 3.470747232437134, + "learning_rate": 8.211e-06, + "loss": 0.5259, + "step": 2742 + }, + { + "epoch": 2.1599054745962976, + "grad_norm": 1.3400150537490845, + "learning_rate": 8.214e-06, + "loss": 0.3902, + "step": 2743 + }, + { + "epoch": 2.1606931862938166, + "grad_norm": 1.6395691633224487, + "learning_rate": 8.217e-06, + "loss": 0.3168, + "step": 2744 + }, + { + "epoch": 2.161480897991335, + "grad_norm": 1.0541765689849854, + "learning_rate": 8.220000000000001e-06, + "loss": 0.2594, + "step": 2745 + }, + { + "epoch": 2.1622686096888537, + "grad_norm": 1.430782437324524, + "learning_rate": 8.223e-06, + "loss": 0.1806, + "step": 2746 + }, + { + "epoch": 2.1630563213863727, + "grad_norm": 1.5412311553955078, + "learning_rate": 8.226e-06, + "loss": 0.1647, + "step": 2747 + }, + { + "epoch": 2.1638440330838913, + "grad_norm": 0.8563548922538757, + "learning_rate": 8.229e-06, + "loss": 0.0678, + "step": 2748 + }, + { + "epoch": 2.16463174478141, + "grad_norm": 0.5419901609420776, + "learning_rate": 8.232e-06, + "loss": 0.0438, + "step": 2749 + }, + { + "epoch": 2.165419456478929, + "grad_norm": 0.7546130418777466, + "learning_rate": 
8.235000000000002e-06, + "loss": 0.0646, + "step": 2750 + }, + { + "epoch": 2.1662071681764474, + "grad_norm": 0.8312780857086182, + "learning_rate": 8.238e-06, + "loss": 0.0654, + "step": 2751 + }, + { + "epoch": 2.166994879873966, + "grad_norm": 0.9542033672332764, + "learning_rate": 8.241e-06, + "loss": 0.0664, + "step": 2752 + }, + { + "epoch": 2.167782591571485, + "grad_norm": 1.0543663501739502, + "learning_rate": 8.244e-06, + "loss": 0.0618, + "step": 2753 + }, + { + "epoch": 2.1685703032690036, + "grad_norm": 2.716050386428833, + "learning_rate": 8.246999999999999e-06, + "loss": 0.0752, + "step": 2754 + }, + { + "epoch": 2.169358014966522, + "grad_norm": 1.0386989116668701, + "learning_rate": 8.25e-06, + "loss": 0.0606, + "step": 2755 + }, + { + "epoch": 2.170145726664041, + "grad_norm": 1.0338029861450195, + "learning_rate": 8.253e-06, + "loss": 0.0678, + "step": 2756 + }, + { + "epoch": 2.1709334383615597, + "grad_norm": 1.0955588817596436, + "learning_rate": 8.256e-06, + "loss": 0.0865, + "step": 2757 + }, + { + "epoch": 2.1717211500590783, + "grad_norm": 0.6279095411300659, + "learning_rate": 8.259e-06, + "loss": 0.0451, + "step": 2758 + }, + { + "epoch": 2.1725088617565973, + "grad_norm": 0.9866594672203064, + "learning_rate": 8.262e-06, + "loss": 0.0679, + "step": 2759 + }, + { + "epoch": 2.173296573454116, + "grad_norm": 0.8426399230957031, + "learning_rate": 8.265000000000001e-06, + "loss": 0.0529, + "step": 2760 + }, + { + "epoch": 2.1740842851516344, + "grad_norm": 0.85450679063797, + "learning_rate": 8.268000000000001e-06, + "loss": 0.0421, + "step": 2761 + }, + { + "epoch": 2.174871996849153, + "grad_norm": 0.6792966723442078, + "learning_rate": 8.271000000000001e-06, + "loss": 0.0414, + "step": 2762 + }, + { + "epoch": 2.175659708546672, + "grad_norm": 0.9802865386009216, + "learning_rate": 8.274e-06, + "loss": 0.0885, + "step": 2763 + }, + { + "epoch": 2.1764474202441906, + "grad_norm": 1.63649582862854, + "learning_rate": 8.276999999999999e-06, + "loss": 0.0807, + "step": 2764 + }, + { + "epoch": 2.177235131941709, + "grad_norm": 0.5575306415557861, + "learning_rate": 8.28e-06, + "loss": 0.0446, + "step": 2765 + }, + { + "epoch": 2.178022843639228, + "grad_norm": 0.8751840591430664, + "learning_rate": 8.283e-06, + "loss": 0.0452, + "step": 2766 + }, + { + "epoch": 2.1788105553367467, + "grad_norm": 0.8770869374275208, + "learning_rate": 8.286e-06, + "loss": 0.0441, + "step": 2767 + }, + { + "epoch": 2.1795982670342653, + "grad_norm": 1.769504189491272, + "learning_rate": 8.289e-06, + "loss": 0.0578, + "step": 2768 + }, + { + "epoch": 2.1803859787317843, + "grad_norm": 0.9232383966445923, + "learning_rate": 8.292e-06, + "loss": 0.0549, + "step": 2769 + }, + { + "epoch": 2.181173690429303, + "grad_norm": 0.8957457542419434, + "learning_rate": 8.295000000000001e-06, + "loss": 0.057, + "step": 2770 + }, + { + "epoch": 2.1819614021268214, + "grad_norm": 1.9102816581726074, + "learning_rate": 8.298000000000001e-06, + "loss": 0.0791, + "step": 2771 + }, + { + "epoch": 2.1827491138243404, + "grad_norm": 0.9389538764953613, + "learning_rate": 8.301e-06, + "loss": 0.0737, + "step": 2772 + }, + { + "epoch": 2.183536825521859, + "grad_norm": 1.0436465740203857, + "learning_rate": 8.304e-06, + "loss": 0.0724, + "step": 2773 + }, + { + "epoch": 2.1843245372193776, + "grad_norm": 0.914740264415741, + "learning_rate": 8.307e-06, + "loss": 0.0665, + "step": 2774 + }, + { + "epoch": 2.1851122489168966, + "grad_norm": 0.864811897277832, + "learning_rate": 8.310000000000002e-06, + 
"loss": 0.049, + "step": 2775 + }, + { + "epoch": 2.185899960614415, + "grad_norm": 0.837357223033905, + "learning_rate": 8.313e-06, + "loss": 0.0509, + "step": 2776 + }, + { + "epoch": 2.1866876723119337, + "grad_norm": 0.9986791014671326, + "learning_rate": 8.316e-06, + "loss": 0.0616, + "step": 2777 + }, + { + "epoch": 2.1874753840094527, + "grad_norm": 1.0373071432113647, + "learning_rate": 8.319e-06, + "loss": 0.0583, + "step": 2778 + }, + { + "epoch": 2.1882630957069713, + "grad_norm": 1.3228683471679688, + "learning_rate": 8.322e-06, + "loss": 0.0766, + "step": 2779 + }, + { + "epoch": 2.18905080740449, + "grad_norm": 1.308043122291565, + "learning_rate": 8.325e-06, + "loss": 0.0924, + "step": 2780 + }, + { + "epoch": 2.1898385191020084, + "grad_norm": 0.9195970892906189, + "learning_rate": 8.328e-06, + "loss": 0.0491, + "step": 2781 + }, + { + "epoch": 2.1906262307995275, + "grad_norm": 0.8458104729652405, + "learning_rate": 8.331e-06, + "loss": 0.0747, + "step": 2782 + }, + { + "epoch": 2.191413942497046, + "grad_norm": 0.8717568516731262, + "learning_rate": 8.334e-06, + "loss": 0.0591, + "step": 2783 + }, + { + "epoch": 2.1922016541945646, + "grad_norm": 1.5180416107177734, + "learning_rate": 8.337e-06, + "loss": 0.0888, + "step": 2784 + }, + { + "epoch": 2.1929893658920836, + "grad_norm": 1.499204158782959, + "learning_rate": 8.340000000000001e-06, + "loss": 0.1036, + "step": 2785 + }, + { + "epoch": 2.193777077589602, + "grad_norm": 1.210983157157898, + "learning_rate": 8.343000000000001e-06, + "loss": 0.0707, + "step": 2786 + }, + { + "epoch": 2.1945647892871207, + "grad_norm": 1.4202133417129517, + "learning_rate": 8.346000000000001e-06, + "loss": 0.0919, + "step": 2787 + }, + { + "epoch": 2.1953525009846397, + "grad_norm": 1.0770410299301147, + "learning_rate": 8.349e-06, + "loss": 0.0906, + "step": 2788 + }, + { + "epoch": 2.1961402126821583, + "grad_norm": 1.8296581506729126, + "learning_rate": 8.351999999999999e-06, + "loss": 0.1053, + "step": 2789 + }, + { + "epoch": 2.196927924379677, + "grad_norm": 1.2658050060272217, + "learning_rate": 8.355e-06, + "loss": 0.08, + "step": 2790 + }, + { + "epoch": 2.197715636077196, + "grad_norm": 2.874025344848633, + "learning_rate": 8.358e-06, + "loss": 0.5186, + "step": 2791 + }, + { + "epoch": 2.1985033477747145, + "grad_norm": 2.6562225818634033, + "learning_rate": 8.361e-06, + "loss": 0.4003, + "step": 2792 + }, + { + "epoch": 2.199291059472233, + "grad_norm": 1.1748536825180054, + "learning_rate": 8.364e-06, + "loss": 0.31, + "step": 2793 + }, + { + "epoch": 2.200078771169752, + "grad_norm": 1.5098488330841064, + "learning_rate": 8.367e-06, + "loss": 0.301, + "step": 2794 + }, + { + "epoch": 2.2008664828672706, + "grad_norm": 1.9651256799697876, + "learning_rate": 8.370000000000001e-06, + "loss": 0.1825, + "step": 2795 + }, + { + "epoch": 2.201654194564789, + "grad_norm": 1.4201771020889282, + "learning_rate": 8.373000000000001e-06, + "loss": 0.159, + "step": 2796 + }, + { + "epoch": 2.202441906262308, + "grad_norm": 0.9556218981742859, + "learning_rate": 8.376e-06, + "loss": 0.125, + "step": 2797 + }, + { + "epoch": 2.2032296179598267, + "grad_norm": 0.7150432467460632, + "learning_rate": 8.379e-06, + "loss": 0.0748, + "step": 2798 + }, + { + "epoch": 2.2040173296573453, + "grad_norm": 0.6548647880554199, + "learning_rate": 8.382e-06, + "loss": 0.0629, + "step": 2799 + }, + { + "epoch": 2.2048050413548643, + "grad_norm": 0.5689389109611511, + "learning_rate": 8.385e-06, + "loss": 0.0554, + "step": 2800 + }, + { + "epoch": 
2.205592753052383, + "grad_norm": 0.7499937415122986, + "learning_rate": 8.388e-06, + "loss": 0.0648, + "step": 2801 + }, + { + "epoch": 2.2063804647499015, + "grad_norm": 0.7150487303733826, + "learning_rate": 8.391e-06, + "loss": 0.0392, + "step": 2802 + }, + { + "epoch": 2.2071681764474205, + "grad_norm": 1.066130518913269, + "learning_rate": 8.394e-06, + "loss": 0.0803, + "step": 2803 + }, + { + "epoch": 2.207955888144939, + "grad_norm": 0.6216344237327576, + "learning_rate": 8.397e-06, + "loss": 0.0437, + "step": 2804 + }, + { + "epoch": 2.2087435998424576, + "grad_norm": 0.7242721915245056, + "learning_rate": 8.400000000000001e-06, + "loss": 0.053, + "step": 2805 + }, + { + "epoch": 2.209531311539976, + "grad_norm": 0.5743241310119629, + "learning_rate": 8.403e-06, + "loss": 0.0396, + "step": 2806 + }, + { + "epoch": 2.210319023237495, + "grad_norm": 0.7907276749610901, + "learning_rate": 8.406e-06, + "loss": 0.0689, + "step": 2807 + }, + { + "epoch": 2.2111067349350138, + "grad_norm": 0.7166036367416382, + "learning_rate": 8.409e-06, + "loss": 0.0656, + "step": 2808 + }, + { + "epoch": 2.2118944466325323, + "grad_norm": 0.9081157445907593, + "learning_rate": 8.412e-06, + "loss": 0.0736, + "step": 2809 + }, + { + "epoch": 2.2126821583300513, + "grad_norm": 0.9860913753509521, + "learning_rate": 8.415000000000002e-06, + "loss": 0.0679, + "step": 2810 + }, + { + "epoch": 2.21346987002757, + "grad_norm": 0.9268185496330261, + "learning_rate": 8.418000000000001e-06, + "loss": 0.0935, + "step": 2811 + }, + { + "epoch": 2.2142575817250885, + "grad_norm": 0.9260664582252502, + "learning_rate": 8.421000000000001e-06, + "loss": 0.0534, + "step": 2812 + }, + { + "epoch": 2.2150452934226075, + "grad_norm": 0.9269130825996399, + "learning_rate": 8.424e-06, + "loss": 0.0595, + "step": 2813 + }, + { + "epoch": 2.215833005120126, + "grad_norm": 1.0543512105941772, + "learning_rate": 8.426999999999999e-06, + "loss": 0.0634, + "step": 2814 + }, + { + "epoch": 2.2166207168176446, + "grad_norm": 0.9138286113739014, + "learning_rate": 8.43e-06, + "loss": 0.0645, + "step": 2815 + }, + { + "epoch": 2.2174084285151636, + "grad_norm": 0.7580428123474121, + "learning_rate": 8.433e-06, + "loss": 0.0579, + "step": 2816 + }, + { + "epoch": 2.218196140212682, + "grad_norm": 1.043462872505188, + "learning_rate": 8.436e-06, + "loss": 0.0668, + "step": 2817 + }, + { + "epoch": 2.2189838519102008, + "grad_norm": 1.0172855854034424, + "learning_rate": 8.439e-06, + "loss": 0.0567, + "step": 2818 + }, + { + "epoch": 2.2197715636077198, + "grad_norm": 0.778802216053009, + "learning_rate": 8.442e-06, + "loss": 0.0488, + "step": 2819 + }, + { + "epoch": 2.2205592753052383, + "grad_norm": 0.9989569783210754, + "learning_rate": 8.445e-06, + "loss": 0.0788, + "step": 2820 + }, + { + "epoch": 2.221346987002757, + "grad_norm": 2.5344040393829346, + "learning_rate": 8.448000000000001e-06, + "loss": 0.0587, + "step": 2821 + }, + { + "epoch": 2.222134698700276, + "grad_norm": 0.8177671432495117, + "learning_rate": 8.451000000000001e-06, + "loss": 0.0614, + "step": 2822 + }, + { + "epoch": 2.2229224103977945, + "grad_norm": 1.0277976989746094, + "learning_rate": 8.454e-06, + "loss": 0.0676, + "step": 2823 + }, + { + "epoch": 2.223710122095313, + "grad_norm": 1.2268544435501099, + "learning_rate": 8.457e-06, + "loss": 0.0706, + "step": 2824 + }, + { + "epoch": 2.2244978337928316, + "grad_norm": 1.0597141981124878, + "learning_rate": 8.459999999999999e-06, + "loss": 0.0925, + "step": 2825 + }, + { + "epoch": 2.2252855454903506, + 
"grad_norm": 1.652482271194458, + "learning_rate": 8.463e-06, + "loss": 0.066, + "step": 2826 + }, + { + "epoch": 2.226073257187869, + "grad_norm": 1.0702295303344727, + "learning_rate": 8.466e-06, + "loss": 0.0516, + "step": 2827 + }, + { + "epoch": 2.2268609688853878, + "grad_norm": 1.729331135749817, + "learning_rate": 8.469e-06, + "loss": 0.0517, + "step": 2828 + }, + { + "epoch": 2.2276486805829068, + "grad_norm": 0.9343040585517883, + "learning_rate": 8.472e-06, + "loss": 0.0888, + "step": 2829 + }, + { + "epoch": 2.2284363922804253, + "grad_norm": 0.9381545186042786, + "learning_rate": 8.475e-06, + "loss": 0.0647, + "step": 2830 + }, + { + "epoch": 2.229224103977944, + "grad_norm": 0.9838034510612488, + "learning_rate": 8.478e-06, + "loss": 0.0619, + "step": 2831 + }, + { + "epoch": 2.230011815675463, + "grad_norm": 1.3501578569412231, + "learning_rate": 8.481e-06, + "loss": 0.0813, + "step": 2832 + }, + { + "epoch": 2.2307995273729815, + "grad_norm": 1.0925205945968628, + "learning_rate": 8.484e-06, + "loss": 0.0817, + "step": 2833 + }, + { + "epoch": 2.2315872390705, + "grad_norm": 1.630101203918457, + "learning_rate": 8.487e-06, + "loss": 0.0824, + "step": 2834 + }, + { + "epoch": 2.232374950768019, + "grad_norm": 0.9261672496795654, + "learning_rate": 8.49e-06, + "loss": 0.0888, + "step": 2835 + }, + { + "epoch": 2.2331626624655376, + "grad_norm": 0.9688372015953064, + "learning_rate": 8.493000000000002e-06, + "loss": 0.0571, + "step": 2836 + }, + { + "epoch": 2.233950374163056, + "grad_norm": 1.0870685577392578, + "learning_rate": 8.496e-06, + "loss": 0.0831, + "step": 2837 + }, + { + "epoch": 2.234738085860575, + "grad_norm": 1.1456652879714966, + "learning_rate": 8.499e-06, + "loss": 0.0753, + "step": 2838 + }, + { + "epoch": 2.2355257975580938, + "grad_norm": 1.1523545980453491, + "learning_rate": 8.502e-06, + "loss": 0.0698, + "step": 2839 + }, + { + "epoch": 2.2363135092556123, + "grad_norm": 1.5618045330047607, + "learning_rate": 8.504999999999999e-06, + "loss": 0.1492, + "step": 2840 + }, + { + "epoch": 2.2371012209531314, + "grad_norm": 2.080993890762329, + "learning_rate": 8.508e-06, + "loss": 0.5219, + "step": 2841 + }, + { + "epoch": 2.23788893265065, + "grad_norm": 1.3249588012695312, + "learning_rate": 8.511e-06, + "loss": 0.3303, + "step": 2842 + }, + { + "epoch": 2.2386766443481685, + "grad_norm": 1.0929884910583496, + "learning_rate": 8.514e-06, + "loss": 0.3004, + "step": 2843 + }, + { + "epoch": 2.239464356045687, + "grad_norm": 1.5082778930664062, + "learning_rate": 8.517e-06, + "loss": 0.3039, + "step": 2844 + }, + { + "epoch": 2.240252067743206, + "grad_norm": 1.0981252193450928, + "learning_rate": 8.52e-06, + "loss": 0.1651, + "step": 2845 + }, + { + "epoch": 2.2410397794407246, + "grad_norm": 0.986420214176178, + "learning_rate": 8.523000000000001e-06, + "loss": 0.0914, + "step": 2846 + }, + { + "epoch": 2.241827491138243, + "grad_norm": 0.6618751287460327, + "learning_rate": 8.526000000000001e-06, + "loss": 0.0817, + "step": 2847 + }, + { + "epoch": 2.242615202835762, + "grad_norm": 0.6665956377983093, + "learning_rate": 8.529e-06, + "loss": 0.0653, + "step": 2848 + }, + { + "epoch": 2.243402914533281, + "grad_norm": 0.8760743141174316, + "learning_rate": 8.532e-06, + "loss": 0.1331, + "step": 2849 + }, + { + "epoch": 2.2441906262307993, + "grad_norm": 0.7259612083435059, + "learning_rate": 8.534999999999999e-06, + "loss": 0.0829, + "step": 2850 + }, + { + "epoch": 2.2449783379283184, + "grad_norm": 0.9193028211593628, + "learning_rate": 8.538e-06, + 
"loss": 0.0597, + "step": 2851 + }, + { + "epoch": 2.245766049625837, + "grad_norm": 0.8897410035133362, + "learning_rate": 8.541e-06, + "loss": 0.0861, + "step": 2852 + }, + { + "epoch": 2.2465537613233555, + "grad_norm": 0.7837010622024536, + "learning_rate": 8.544e-06, + "loss": 0.0632, + "step": 2853 + }, + { + "epoch": 2.2473414730208745, + "grad_norm": 0.7787255644798279, + "learning_rate": 8.547e-06, + "loss": 0.0456, + "step": 2854 + }, + { + "epoch": 2.248129184718393, + "grad_norm": 0.7575989365577698, + "learning_rate": 8.55e-06, + "loss": 0.0515, + "step": 2855 + }, + { + "epoch": 2.2489168964159116, + "grad_norm": 1.7042847871780396, + "learning_rate": 8.553000000000001e-06, + "loss": 0.0629, + "step": 2856 + }, + { + "epoch": 2.2497046081134306, + "grad_norm": NaN, + "learning_rate": 8.553000000000001e-06, + "loss": 0.115, + "step": 2857 + }, + { + "epoch": 2.250492319810949, + "grad_norm": 0.7914428114891052, + "learning_rate": 8.556e-06, + "loss": 0.0588, + "step": 2858 + }, + { + "epoch": 2.251280031508468, + "grad_norm": 1.2330331802368164, + "learning_rate": 8.559e-06, + "loss": 0.0618, + "step": 2859 + }, + { + "epoch": 2.252067743205987, + "grad_norm": 0.7161920666694641, + "learning_rate": 8.562e-06, + "loss": 0.0596, + "step": 2860 + }, + { + "epoch": 2.2528554549035054, + "grad_norm": 0.9440436363220215, + "learning_rate": 8.565e-06, + "loss": 0.0648, + "step": 2861 + }, + { + "epoch": 2.253643166601024, + "grad_norm": 1.539494514465332, + "learning_rate": 8.568000000000002e-06, + "loss": 0.0462, + "step": 2862 + }, + { + "epoch": 2.2544308782985425, + "grad_norm": 0.5187697410583496, + "learning_rate": 8.571e-06, + "loss": 0.0568, + "step": 2863 + }, + { + "epoch": 2.2552185899960615, + "grad_norm": 0.7797425389289856, + "learning_rate": 8.574e-06, + "loss": 0.0424, + "step": 2864 + }, + { + "epoch": 2.25600630169358, + "grad_norm": 1.0658470392227173, + "learning_rate": 8.577e-06, + "loss": 0.0677, + "step": 2865 + }, + { + "epoch": 2.256794013391099, + "grad_norm": 1.1813499927520752, + "learning_rate": 8.58e-06, + "loss": 0.0953, + "step": 2866 + }, + { + "epoch": 2.2575817250886177, + "grad_norm": 0.8486014008522034, + "learning_rate": 8.583e-06, + "loss": 0.054, + "step": 2867 + }, + { + "epoch": 2.258369436786136, + "grad_norm": 1.943264365196228, + "learning_rate": 8.586e-06, + "loss": 0.0757, + "step": 2868 + }, + { + "epoch": 2.259157148483655, + "grad_norm": 1.501364827156067, + "learning_rate": 8.589e-06, + "loss": 0.0984, + "step": 2869 + }, + { + "epoch": 2.259944860181174, + "grad_norm": 0.9510977268218994, + "learning_rate": 8.592e-06, + "loss": 0.0645, + "step": 2870 + }, + { + "epoch": 2.2607325718786924, + "grad_norm": 1.1557252407073975, + "learning_rate": 8.595e-06, + "loss": 0.0521, + "step": 2871 + }, + { + "epoch": 2.261520283576211, + "grad_norm": 0.9498692750930786, + "learning_rate": 8.598000000000001e-06, + "loss": 0.0433, + "step": 2872 + }, + { + "epoch": 2.26230799527373, + "grad_norm": 1.2669347524642944, + "learning_rate": 8.601000000000001e-06, + "loss": 0.0778, + "step": 2873 + }, + { + "epoch": 2.2630957069712485, + "grad_norm": 0.7270307540893555, + "learning_rate": 8.604000000000001e-06, + "loss": 0.0397, + "step": 2874 + }, + { + "epoch": 2.263883418668767, + "grad_norm": 1.0097954273223877, + "learning_rate": 8.606999999999999e-06, + "loss": 0.0522, + "step": 2875 + }, + { + "epoch": 2.264671130366286, + "grad_norm": 1.7841973304748535, + "learning_rate": 8.609999999999999e-06, + "loss": 0.0728, + "step": 2876 + }, + { + 
"epoch": 2.2654588420638047, + "grad_norm": 0.823127269744873, + "learning_rate": 8.613e-06, + "loss": 0.051, + "step": 2877 + }, + { + "epoch": 2.2662465537613232, + "grad_norm": 0.946051299571991, + "learning_rate": 8.616e-06, + "loss": 0.0606, + "step": 2878 + }, + { + "epoch": 2.2670342654588422, + "grad_norm": 0.9447850584983826, + "learning_rate": 8.619e-06, + "loss": 0.0689, + "step": 2879 + }, + { + "epoch": 2.267821977156361, + "grad_norm": 1.2099223136901855, + "learning_rate": 8.622e-06, + "loss": 0.0682, + "step": 2880 + }, + { + "epoch": 2.2686096888538794, + "grad_norm": 0.8108013272285461, + "learning_rate": 8.625e-06, + "loss": 0.0533, + "step": 2881 + }, + { + "epoch": 2.2693974005513984, + "grad_norm": 1.144171118736267, + "learning_rate": 8.628000000000001e-06, + "loss": 0.1137, + "step": 2882 + }, + { + "epoch": 2.270185112248917, + "grad_norm": 1.228449821472168, + "learning_rate": 8.631000000000001e-06, + "loss": 0.0746, + "step": 2883 + }, + { + "epoch": 2.2709728239464355, + "grad_norm": 1.059554934501648, + "learning_rate": 8.634e-06, + "loss": 0.0933, + "step": 2884 + }, + { + "epoch": 2.2717605356439545, + "grad_norm": 1.2645103931427002, + "learning_rate": 8.637e-06, + "loss": 0.0601, + "step": 2885 + }, + { + "epoch": 2.272548247341473, + "grad_norm": 1.156613826751709, + "learning_rate": 8.64e-06, + "loss": 0.0624, + "step": 2886 + }, + { + "epoch": 2.2733359590389917, + "grad_norm": 1.025482177734375, + "learning_rate": 8.643e-06, + "loss": 0.0801, + "step": 2887 + }, + { + "epoch": 2.2741236707365102, + "grad_norm": 1.130159854888916, + "learning_rate": 8.646e-06, + "loss": 0.1118, + "step": 2888 + }, + { + "epoch": 2.2749113824340292, + "grad_norm": 1.3859217166900635, + "learning_rate": 8.649e-06, + "loss": 0.0958, + "step": 2889 + }, + { + "epoch": 2.275699094131548, + "grad_norm": 1.5403800010681152, + "learning_rate": 8.652e-06, + "loss": 0.0879, + "step": 2890 + }, + { + "epoch": 2.2764868058290664, + "grad_norm": 2.8886847496032715, + "learning_rate": 8.655e-06, + "loss": 0.6172, + "step": 2891 + }, + { + "epoch": 2.2772745175265854, + "grad_norm": 1.3381179571151733, + "learning_rate": 8.658e-06, + "loss": 0.3426, + "step": 2892 + }, + { + "epoch": 2.278062229224104, + "grad_norm": 1.889336109161377, + "learning_rate": 8.661e-06, + "loss": 0.31, + "step": 2893 + }, + { + "epoch": 2.2788499409216225, + "grad_norm": 1.3580032587051392, + "learning_rate": 8.664e-06, + "loss": 0.2476, + "step": 2894 + }, + { + "epoch": 2.2796376526191415, + "grad_norm": 1.7771437168121338, + "learning_rate": 8.667e-06, + "loss": 0.2218, + "step": 2895 + }, + { + "epoch": 2.28042536431666, + "grad_norm": 1.0605305433273315, + "learning_rate": 8.67e-06, + "loss": 0.1483, + "step": 2896 + }, + { + "epoch": 2.2812130760141787, + "grad_norm": 1.0464668273925781, + "learning_rate": 8.673000000000001e-06, + "loss": 0.1029, + "step": 2897 + }, + { + "epoch": 2.2820007877116977, + "grad_norm": 0.6598726511001587, + "learning_rate": 8.676000000000001e-06, + "loss": 0.0754, + "step": 2898 + }, + { + "epoch": 2.2827884994092162, + "grad_norm": 0.7358702421188354, + "learning_rate": 8.679000000000001e-06, + "loss": 0.0834, + "step": 2899 + }, + { + "epoch": 2.283576211106735, + "grad_norm": 1.2057932615280151, + "learning_rate": 8.682e-06, + "loss": 0.0933, + "step": 2900 + }, + { + "epoch": 2.284363922804254, + "grad_norm": 0.6695127487182617, + "learning_rate": 8.684999999999999e-06, + "loss": 0.1022, + "step": 2901 + }, + { + "epoch": 2.2851516345017724, + "grad_norm": 
0.8671555519104004, + "learning_rate": 8.688e-06, + "loss": 0.0721, + "step": 2902 + }, + { + "epoch": 2.285939346199291, + "grad_norm": 0.8981437683105469, + "learning_rate": 8.691e-06, + "loss": 0.1272, + "step": 2903 + }, + { + "epoch": 2.28672705789681, + "grad_norm": 2.6012141704559326, + "learning_rate": 8.694e-06, + "loss": 0.0603, + "step": 2904 + }, + { + "epoch": 2.2875147695943285, + "grad_norm": 0.62962806224823, + "learning_rate": 8.697e-06, + "loss": 0.0459, + "step": 2905 + }, + { + "epoch": 2.288302481291847, + "grad_norm": 0.9434187412261963, + "learning_rate": 8.7e-06, + "loss": 0.1012, + "step": 2906 + }, + { + "epoch": 2.2890901929893657, + "grad_norm": 0.8898925185203552, + "learning_rate": 8.703000000000001e-06, + "loss": 0.084, + "step": 2907 + }, + { + "epoch": 2.2898779046868847, + "grad_norm": 0.8489217758178711, + "learning_rate": 8.706000000000001e-06, + "loss": 0.0525, + "step": 2908 + }, + { + "epoch": 2.2906656163844032, + "grad_norm": 0.8785389065742493, + "learning_rate": 8.709e-06, + "loss": 0.0487, + "step": 2909 + }, + { + "epoch": 2.2914533280819223, + "grad_norm": 1.148979663848877, + "learning_rate": 8.712e-06, + "loss": 0.0549, + "step": 2910 + }, + { + "epoch": 2.292241039779441, + "grad_norm": 0.9654780030250549, + "learning_rate": 8.715e-06, + "loss": 0.0598, + "step": 2911 + }, + { + "epoch": 2.2930287514769594, + "grad_norm": 1.1434483528137207, + "learning_rate": 8.718e-06, + "loss": 0.063, + "step": 2912 + }, + { + "epoch": 2.293816463174478, + "grad_norm": 0.7556212544441223, + "learning_rate": 8.721e-06, + "loss": 0.0679, + "step": 2913 + }, + { + "epoch": 2.294604174871997, + "grad_norm": 1.1790220737457275, + "learning_rate": 8.724e-06, + "loss": 0.0853, + "step": 2914 + }, + { + "epoch": 2.2953918865695155, + "grad_norm": 1.1285104751586914, + "learning_rate": 8.727e-06, + "loss": 0.0634, + "step": 2915 + }, + { + "epoch": 2.296179598267034, + "grad_norm": 0.9144028425216675, + "learning_rate": 8.73e-06, + "loss": 0.0578, + "step": 2916 + }, + { + "epoch": 2.296967309964553, + "grad_norm": 1.0750116109848022, + "learning_rate": 8.733000000000001e-06, + "loss": 0.0583, + "step": 2917 + }, + { + "epoch": 2.2977550216620717, + "grad_norm": 0.8734289407730103, + "learning_rate": 8.736e-06, + "loss": 0.068, + "step": 2918 + }, + { + "epoch": 2.2985427333595903, + "grad_norm": 0.9044342637062073, + "learning_rate": 8.739e-06, + "loss": 0.0785, + "step": 2919 + }, + { + "epoch": 2.2993304450571093, + "grad_norm": 1.7640599012374878, + "learning_rate": 8.742e-06, + "loss": 0.07, + "step": 2920 + }, + { + "epoch": 2.300118156754628, + "grad_norm": 0.818756639957428, + "learning_rate": 8.745e-06, + "loss": 0.038, + "step": 2921 + }, + { + "epoch": 2.3009058684521464, + "grad_norm": 1.1686100959777832, + "learning_rate": 8.748000000000002e-06, + "loss": 0.0598, + "step": 2922 + }, + { + "epoch": 2.3016935801496654, + "grad_norm": 0.9032220244407654, + "learning_rate": 8.751000000000001e-06, + "loss": 0.0429, + "step": 2923 + }, + { + "epoch": 2.302481291847184, + "grad_norm": 0.919197142124176, + "learning_rate": 8.754e-06, + "loss": 0.0574, + "step": 2924 + }, + { + "epoch": 2.3032690035447025, + "grad_norm": 1.0902929306030273, + "learning_rate": 8.757e-06, + "loss": 0.0736, + "step": 2925 + }, + { + "epoch": 2.304056715242221, + "grad_norm": 1.0069503784179688, + "learning_rate": 8.759999999999999e-06, + "loss": 0.0653, + "step": 2926 + }, + { + "epoch": 2.30484442693974, + "grad_norm": 0.996650218963623, + "learning_rate": 8.763e-06, + "loss": 
0.0579, + "step": 2927 + }, + { + "epoch": 2.3056321386372587, + "grad_norm": 1.0636260509490967, + "learning_rate": 8.766e-06, + "loss": 0.0533, + "step": 2928 + }, + { + "epoch": 2.3064198503347777, + "grad_norm": 1.0276801586151123, + "learning_rate": 8.769e-06, + "loss": 0.0767, + "step": 2929 + }, + { + "epoch": 2.3072075620322963, + "grad_norm": 0.821722149848938, + "learning_rate": 8.772e-06, + "loss": 0.0651, + "step": 2930 + }, + { + "epoch": 2.307995273729815, + "grad_norm": 0.907588005065918, + "learning_rate": 8.775e-06, + "loss": 0.0521, + "step": 2931 + }, + { + "epoch": 2.3087829854273334, + "grad_norm": 1.4148343801498413, + "learning_rate": 8.778000000000001e-06, + "loss": 0.0774, + "step": 2932 + }, + { + "epoch": 2.3095706971248524, + "grad_norm": 1.3105273246765137, + "learning_rate": 8.781000000000001e-06, + "loss": 0.0711, + "step": 2933 + }, + { + "epoch": 2.310358408822371, + "grad_norm": 1.019483208656311, + "learning_rate": 8.784000000000001e-06, + "loss": 0.0742, + "step": 2934 + }, + { + "epoch": 2.3111461205198895, + "grad_norm": 1.201540470123291, + "learning_rate": 8.787e-06, + "loss": 0.0793, + "step": 2935 + }, + { + "epoch": 2.3119338322174086, + "grad_norm": 0.9786596298217773, + "learning_rate": 8.79e-06, + "loss": 0.0566, + "step": 2936 + }, + { + "epoch": 2.312721543914927, + "grad_norm": 0.9759734869003296, + "learning_rate": 8.793e-06, + "loss": 0.0824, + "step": 2937 + }, + { + "epoch": 2.3135092556124457, + "grad_norm": 1.351144552230835, + "learning_rate": 8.796e-06, + "loss": 0.1136, + "step": 2938 + }, + { + "epoch": 2.3142969673099647, + "grad_norm": 1.5099897384643555, + "learning_rate": 8.799e-06, + "loss": 0.0903, + "step": 2939 + }, + { + "epoch": 2.3150846790074833, + "grad_norm": 2.333015203475952, + "learning_rate": 8.802e-06, + "loss": 0.1337, + "step": 2940 + }, + { + "epoch": 2.315872390705002, + "grad_norm": 2.134493589401245, + "learning_rate": 8.805e-06, + "loss": 0.4522, + "step": 2941 + }, + { + "epoch": 2.316660102402521, + "grad_norm": 1.2971796989440918, + "learning_rate": 8.808000000000001e-06, + "loss": 0.4454, + "step": 2942 + }, + { + "epoch": 2.3174478141000394, + "grad_norm": 0.9624576568603516, + "learning_rate": 8.811000000000001e-06, + "loss": 0.3177, + "step": 2943 + }, + { + "epoch": 2.318235525797558, + "grad_norm": 1.462650179862976, + "learning_rate": 8.814e-06, + "loss": 0.3416, + "step": 2944 + }, + { + "epoch": 2.319023237495077, + "grad_norm": 1.3318222761154175, + "learning_rate": 8.817e-06, + "loss": 0.3251, + "step": 2945 + }, + { + "epoch": 2.3198109491925956, + "grad_norm": 0.8322545289993286, + "learning_rate": 8.82e-06, + "loss": 0.1702, + "step": 2946 + }, + { + "epoch": 2.320598660890114, + "grad_norm": 0.7841878533363342, + "learning_rate": 8.823e-06, + "loss": 0.1512, + "step": 2947 + }, + { + "epoch": 2.321386372587633, + "grad_norm": 1.2730971574783325, + "learning_rate": 8.826000000000002e-06, + "loss": 0.066, + "step": 2948 + }, + { + "epoch": 2.3221740842851517, + "grad_norm": 0.8433002233505249, + "learning_rate": 8.829e-06, + "loss": 0.1112, + "step": 2949 + }, + { + "epoch": 2.3229617959826703, + "grad_norm": 0.9561911225318909, + "learning_rate": 8.832e-06, + "loss": 0.0645, + "step": 2950 + }, + { + "epoch": 2.323749507680189, + "grad_norm": 0.8379849791526794, + "learning_rate": 8.835e-06, + "loss": 0.081, + "step": 2951 + }, + { + "epoch": 2.324537219377708, + "grad_norm": 1.073012351989746, + "learning_rate": 8.837999999999999e-06, + "loss": 0.0761, + "step": 2952 + }, + { + "epoch": 
2.3253249310752264, + "grad_norm": 0.6579760313034058, + "learning_rate": 8.841e-06, + "loss": 0.0688, + "step": 2953 + }, + { + "epoch": 2.326112642772745, + "grad_norm": 0.8093737959861755, + "learning_rate": 8.844e-06, + "loss": 0.0952, + "step": 2954 + }, + { + "epoch": 2.326900354470264, + "grad_norm": 0.8657782673835754, + "learning_rate": 8.847e-06, + "loss": 0.062, + "step": 2955 + }, + { + "epoch": 2.3276880661677826, + "grad_norm": 0.940170168876648, + "learning_rate": 8.85e-06, + "loss": 0.0763, + "step": 2956 + }, + { + "epoch": 2.328475777865301, + "grad_norm": 1.002376675605774, + "learning_rate": 8.853e-06, + "loss": 0.0451, + "step": 2957 + }, + { + "epoch": 2.32926348956282, + "grad_norm": 0.9195945858955383, + "learning_rate": 8.856000000000001e-06, + "loss": 0.0539, + "step": 2958 + }, + { + "epoch": 2.3300512012603387, + "grad_norm": 0.7610181570053101, + "learning_rate": 8.859000000000001e-06, + "loss": 0.0679, + "step": 2959 + }, + { + "epoch": 2.3308389129578573, + "grad_norm": 0.9617300033569336, + "learning_rate": 8.862000000000001e-06, + "loss": 0.0516, + "step": 2960 + }, + { + "epoch": 2.3316266246553763, + "grad_norm": 0.844501256942749, + "learning_rate": 8.864999999999999e-06, + "loss": 0.0671, + "step": 2961 + }, + { + "epoch": 2.332414336352895, + "grad_norm": 1.1521010398864746, + "learning_rate": 8.867999999999999e-06, + "loss": 0.0475, + "step": 2962 + }, + { + "epoch": 2.3332020480504134, + "grad_norm": 0.6943371891975403, + "learning_rate": 8.871e-06, + "loss": 0.0499, + "step": 2963 + }, + { + "epoch": 2.3339897597479324, + "grad_norm": 1.0618159770965576, + "learning_rate": 8.874e-06, + "loss": 0.0557, + "step": 2964 + }, + { + "epoch": 2.334777471445451, + "grad_norm": 0.698199987411499, + "learning_rate": 8.877e-06, + "loss": 0.0528, + "step": 2965 + }, + { + "epoch": 2.3355651831429696, + "grad_norm": 0.8588815927505493, + "learning_rate": 8.88e-06, + "loss": 0.0454, + "step": 2966 + }, + { + "epoch": 2.3363528948404886, + "grad_norm": 0.747677206993103, + "learning_rate": 8.883e-06, + "loss": 0.042, + "step": 2967 + }, + { + "epoch": 2.337140606538007, + "grad_norm": 1.153573989868164, + "learning_rate": 8.886000000000001e-06, + "loss": 0.0659, + "step": 2968 + }, + { + "epoch": 2.3379283182355257, + "grad_norm": 0.7350477576255798, + "learning_rate": 8.889e-06, + "loss": 0.0492, + "step": 2969 + }, + { + "epoch": 2.3387160299330443, + "grad_norm": 0.914330244064331, + "learning_rate": 8.892e-06, + "loss": 0.0647, + "step": 2970 + }, + { + "epoch": 2.3395037416305633, + "grad_norm": 0.7514603137969971, + "learning_rate": 8.895e-06, + "loss": 0.0609, + "step": 2971 + }, + { + "epoch": 2.340291453328082, + "grad_norm": 0.9559147953987122, + "learning_rate": 8.898e-06, + "loss": 0.0591, + "step": 2972 + }, + { + "epoch": 2.341079165025601, + "grad_norm": 0.7661526203155518, + "learning_rate": 8.901e-06, + "loss": 0.0622, + "step": 2973 + }, + { + "epoch": 2.3418668767231194, + "grad_norm": 0.7852794528007507, + "learning_rate": 8.904e-06, + "loss": 0.0558, + "step": 2974 + }, + { + "epoch": 2.342654588420638, + "grad_norm": 0.9247592687606812, + "learning_rate": 8.907e-06, + "loss": 0.0651, + "step": 2975 + }, + { + "epoch": 2.3434423001181566, + "grad_norm": 1.0748704671859741, + "learning_rate": 8.91e-06, + "loss": 0.0731, + "step": 2976 + }, + { + "epoch": 2.3442300118156756, + "grad_norm": 1.6138511896133423, + "learning_rate": 8.913e-06, + "loss": 0.0721, + "step": 2977 + }, + { + "epoch": 2.345017723513194, + "grad_norm": 0.7620524764060974, + 
"learning_rate": 8.916e-06, + "loss": 0.0606, + "step": 2978 + }, + { + "epoch": 2.3458054352107127, + "grad_norm": 1.0408949851989746, + "learning_rate": 8.919e-06, + "loss": 0.0668, + "step": 2979 + }, + { + "epoch": 2.3465931469082317, + "grad_norm": 0.8602389693260193, + "learning_rate": 8.922e-06, + "loss": 0.0536, + "step": 2980 + }, + { + "epoch": 2.3473808586057503, + "grad_norm": 0.8467727899551392, + "learning_rate": 8.925e-06, + "loss": 0.0536, + "step": 2981 + }, + { + "epoch": 2.348168570303269, + "grad_norm": 0.9800115823745728, + "learning_rate": 8.928e-06, + "loss": 0.0708, + "step": 2982 + }, + { + "epoch": 2.348956282000788, + "grad_norm": 0.9519461989402771, + "learning_rate": 8.931000000000001e-06, + "loss": 0.0503, + "step": 2983 + }, + { + "epoch": 2.3497439936983064, + "grad_norm": 1.045619249343872, + "learning_rate": 8.934000000000001e-06, + "loss": 0.0861, + "step": 2984 + }, + { + "epoch": 2.350531705395825, + "grad_norm": 0.9982858300209045, + "learning_rate": 8.937000000000001e-06, + "loss": 0.0684, + "step": 2985 + }, + { + "epoch": 2.351319417093344, + "grad_norm": 1.2407587766647339, + "learning_rate": 8.939999999999999e-06, + "loss": 0.0676, + "step": 2986 + }, + { + "epoch": 2.3521071287908626, + "grad_norm": 0.8760013580322266, + "learning_rate": 8.942999999999999e-06, + "loss": 0.0734, + "step": 2987 + }, + { + "epoch": 2.352894840488381, + "grad_norm": 1.210986614227295, + "learning_rate": 8.946e-06, + "loss": 0.0769, + "step": 2988 + }, + { + "epoch": 2.3536825521858997, + "grad_norm": 0.903477132320404, + "learning_rate": 8.949e-06, + "loss": 0.0591, + "step": 2989 + }, + { + "epoch": 2.3544702638834187, + "grad_norm": 1.141359806060791, + "learning_rate": 8.952e-06, + "loss": 0.0895, + "step": 2990 + }, + { + "epoch": 2.3552579755809373, + "grad_norm": 1.6942750215530396, + "learning_rate": 8.955e-06, + "loss": 0.4018, + "step": 2991 + }, + { + "epoch": 2.3560456872784563, + "grad_norm": 2.1516377925872803, + "learning_rate": 8.958e-06, + "loss": 0.5387, + "step": 2992 + }, + { + "epoch": 2.356833398975975, + "grad_norm": 2.678067445755005, + "learning_rate": 8.961000000000001e-06, + "loss": 0.3577, + "step": 2993 + }, + { + "epoch": 2.3576211106734934, + "grad_norm": 1.4790288209915161, + "learning_rate": 8.964000000000001e-06, + "loss": 0.2226, + "step": 2994 + }, + { + "epoch": 2.358408822371012, + "grad_norm": 1.2741718292236328, + "learning_rate": 8.967e-06, + "loss": 0.2269, + "step": 2995 + }, + { + "epoch": 2.359196534068531, + "grad_norm": 0.7175904512405396, + "learning_rate": 8.97e-06, + "loss": 0.0844, + "step": 2996 + }, + { + "epoch": 2.3599842457660496, + "grad_norm": 0.8424521684646606, + "learning_rate": 8.973e-06, + "loss": 0.1262, + "step": 2997 + }, + { + "epoch": 2.360771957463568, + "grad_norm": 0.6793601512908936, + "learning_rate": 8.976e-06, + "loss": 0.0778, + "step": 2998 + }, + { + "epoch": 2.361559669161087, + "grad_norm": 0.7677938342094421, + "learning_rate": 8.979e-06, + "loss": 0.0625, + "step": 2999 + }, + { + "epoch": 2.3623473808586057, + "grad_norm": 0.5280177593231201, + "learning_rate": 8.982e-06, + "loss": 0.0535, + "step": 3000 + }, + { + "epoch": 2.3623473808586057, + "eval_cer": 0.1526816496876377, + "eval_loss": 0.5161155462265015, + "eval_runtime": 16.19, + "eval_samples_per_second": 18.777, + "eval_steps_per_second": 0.618, + "eval_wer": 0.5306983883346125, + "step": 3000 + }, + { + "epoch": 2.3631350925561243, + "grad_norm": 1.1541178226470947, + "learning_rate": 8.985e-06, + "loss": 0.0634, + "step": 
3001 + }, + { + "epoch": 2.3639228042536433, + "grad_norm": 0.8760367035865784, + "learning_rate": 8.988e-06, + "loss": 0.0585, + "step": 3002 + }, + { + "epoch": 2.364710515951162, + "grad_norm": 1.2273774147033691, + "learning_rate": 8.991e-06, + "loss": 0.0839, + "step": 3003 + }, + { + "epoch": 2.3654982276486805, + "grad_norm": 0.6769484877586365, + "learning_rate": 8.994e-06, + "loss": 0.0425, + "step": 3004 + }, + { + "epoch": 2.3662859393461995, + "grad_norm": 0.5609738230705261, + "learning_rate": 8.997e-06, + "loss": 0.0323, + "step": 3005 + }, + { + "epoch": 2.367073651043718, + "grad_norm": 0.9436789155006409, + "learning_rate": 9e-06, + "loss": 0.0745, + "step": 3006 + }, + { + "epoch": 2.3678613627412366, + "grad_norm": 0.6809965968132019, + "learning_rate": 9.003e-06, + "loss": 0.0391, + "step": 3007 + }, + { + "epoch": 2.3686490744387556, + "grad_norm": 0.8017938733100891, + "learning_rate": 9.006000000000002e-06, + "loss": 0.0668, + "step": 3008 + }, + { + "epoch": 2.369436786136274, + "grad_norm": 1.5344326496124268, + "learning_rate": 9.009000000000001e-06, + "loss": 0.0455, + "step": 3009 + }, + { + "epoch": 2.3702244978337927, + "grad_norm": 0.8540695905685425, + "learning_rate": 9.012e-06, + "loss": 0.0394, + "step": 3010 + }, + { + "epoch": 2.3710122095313118, + "grad_norm": 0.6834715008735657, + "learning_rate": 9.015e-06, + "loss": 0.0553, + "step": 3011 + }, + { + "epoch": 2.3717999212288303, + "grad_norm": 0.6704806685447693, + "learning_rate": 9.017999999999999e-06, + "loss": 0.0423, + "step": 3012 + }, + { + "epoch": 2.372587632926349, + "grad_norm": 1.300304651260376, + "learning_rate": 9.021e-06, + "loss": 0.0609, + "step": 3013 + }, + { + "epoch": 2.3733753446238675, + "grad_norm": 0.7948070168495178, + "learning_rate": 9.024e-06, + "loss": 0.0539, + "step": 3014 + }, + { + "epoch": 2.3741630563213865, + "grad_norm": 0.6205891966819763, + "learning_rate": 9.027e-06, + "loss": 0.0391, + "step": 3015 + }, + { + "epoch": 2.374950768018905, + "grad_norm": 1.0571225881576538, + "learning_rate": 9.03e-06, + "loss": 0.0761, + "step": 3016 + }, + { + "epoch": 2.3757384797164236, + "grad_norm": 0.9091776609420776, + "learning_rate": 9.033e-06, + "loss": 0.0606, + "step": 3017 + }, + { + "epoch": 2.3765261914139426, + "grad_norm": 1.3437137603759766, + "learning_rate": 9.036000000000001e-06, + "loss": 0.0658, + "step": 3018 + }, + { + "epoch": 2.377313903111461, + "grad_norm": 0.8095684051513672, + "learning_rate": 9.039000000000001e-06, + "loss": 0.0512, + "step": 3019 + }, + { + "epoch": 2.3781016148089797, + "grad_norm": 0.8075119256973267, + "learning_rate": 9.042e-06, + "loss": 0.0758, + "step": 3020 + }, + { + "epoch": 2.3788893265064988, + "grad_norm": 1.008732795715332, + "learning_rate": 9.045e-06, + "loss": 0.0616, + "step": 3021 + }, + { + "epoch": 2.3796770382040173, + "grad_norm": 1.1364188194274902, + "learning_rate": 9.048e-06, + "loss": 0.0916, + "step": 3022 + }, + { + "epoch": 2.380464749901536, + "grad_norm": 0.8755109310150146, + "learning_rate": 9.051e-06, + "loss": 0.0838, + "step": 3023 + }, + { + "epoch": 2.381252461599055, + "grad_norm": 0.6044226884841919, + "learning_rate": 9.054e-06, + "loss": 0.0449, + "step": 3024 + }, + { + "epoch": 2.3820401732965735, + "grad_norm": 0.4918907582759857, + "learning_rate": 9.057e-06, + "loss": 0.0309, + "step": 3025 + }, + { + "epoch": 2.382827884994092, + "grad_norm": 0.8613329529762268, + "learning_rate": 9.06e-06, + "loss": 0.0509, + "step": 3026 + }, + { + "epoch": 2.383615596691611, + "grad_norm": 
0.5140036344528198, + "learning_rate": 9.063e-06, + "loss": 0.0456, + "step": 3027 + }, + { + "epoch": 2.3844033083891296, + "grad_norm": 1.1358615159988403, + "learning_rate": 9.066000000000001e-06, + "loss": 0.074, + "step": 3028 + }, + { + "epoch": 2.385191020086648, + "grad_norm": 1.0677088499069214, + "learning_rate": 9.069e-06, + "loss": 0.0945, + "step": 3029 + }, + { + "epoch": 2.385978731784167, + "grad_norm": 0.7824407815933228, + "learning_rate": 9.072e-06, + "loss": 0.07, + "step": 3030 + }, + { + "epoch": 2.3867664434816858, + "grad_norm": 1.0309025049209595, + "learning_rate": 9.075e-06, + "loss": 0.0792, + "step": 3031 + }, + { + "epoch": 2.3875541551792043, + "grad_norm": 1.5304142236709595, + "learning_rate": 9.078e-06, + "loss": 0.0706, + "step": 3032 + }, + { + "epoch": 2.388341866876723, + "grad_norm": 0.7446516156196594, + "learning_rate": 9.081000000000002e-06, + "loss": 0.0424, + "step": 3033 + }, + { + "epoch": 2.389129578574242, + "grad_norm": 0.9528713226318359, + "learning_rate": 9.084000000000001e-06, + "loss": 0.0725, + "step": 3034 + }, + { + "epoch": 2.3899172902717605, + "grad_norm": 0.8526288866996765, + "learning_rate": 9.087e-06, + "loss": 0.0848, + "step": 3035 + }, + { + "epoch": 2.3907050019692795, + "grad_norm": 1.1096042394638062, + "learning_rate": 9.09e-06, + "loss": 0.0579, + "step": 3036 + }, + { + "epoch": 2.391492713666798, + "grad_norm": 0.7890580296516418, + "learning_rate": 9.093e-06, + "loss": 0.0538, + "step": 3037 + }, + { + "epoch": 2.3922804253643166, + "grad_norm": 1.132791519165039, + "learning_rate": 9.096e-06, + "loss": 0.0491, + "step": 3038 + }, + { + "epoch": 2.393068137061835, + "grad_norm": 1.4711873531341553, + "learning_rate": 9.099e-06, + "loss": 0.0555, + "step": 3039 + }, + { + "epoch": 2.393855848759354, + "grad_norm": 1.1643989086151123, + "learning_rate": 9.102e-06, + "loss": 0.0882, + "step": 3040 + }, + { + "epoch": 2.3946435604568728, + "grad_norm": 3.098935604095459, + "learning_rate": 9.105e-06, + "loss": 0.5275, + "step": 3041 + }, + { + "epoch": 2.3954312721543913, + "grad_norm": 1.472331166267395, + "learning_rate": 9.108e-06, + "loss": 0.4025, + "step": 3042 + }, + { + "epoch": 2.3962189838519103, + "grad_norm": 1.398056149482727, + "learning_rate": 9.111000000000001e-06, + "loss": 0.305, + "step": 3043 + }, + { + "epoch": 2.397006695549429, + "grad_norm": 1.6374099254608154, + "learning_rate": 9.114000000000001e-06, + "loss": 0.2805, + "step": 3044 + }, + { + "epoch": 2.3977944072469475, + "grad_norm": 1.068313479423523, + "learning_rate": 9.117000000000001e-06, + "loss": 0.2032, + "step": 3045 + }, + { + "epoch": 2.3985821189444665, + "grad_norm": 0.8685638904571533, + "learning_rate": 9.12e-06, + "loss": 0.1794, + "step": 3046 + }, + { + "epoch": 2.399369830641985, + "grad_norm": 0.8378549814224243, + "learning_rate": 9.122999999999999e-06, + "loss": 0.0855, + "step": 3047 + }, + { + "epoch": 2.4001575423395036, + "grad_norm": 3.4619383811950684, + "learning_rate": 9.126e-06, + "loss": 0.1045, + "step": 3048 + }, + { + "epoch": 2.4009452540370226, + "grad_norm": 1.0611604452133179, + "learning_rate": 9.129e-06, + "loss": 0.0693, + "step": 3049 + }, + { + "epoch": 2.401732965734541, + "grad_norm": 0.5671749114990234, + "learning_rate": 9.132e-06, + "loss": 0.0516, + "step": 3050 + }, + { + "epoch": 2.4025206774320598, + "grad_norm": 0.938235342502594, + "learning_rate": 9.135e-06, + "loss": 0.0728, + "step": 3051 + }, + { + "epoch": 2.4033083891295783, + "grad_norm": 0.6478285789489746, + "learning_rate": 
9.138e-06, + "loss": 0.0338, + "step": 3052 + }, + { + "epoch": 2.4040961008270973, + "grad_norm": 1.1059685945510864, + "learning_rate": 9.141000000000001e-06, + "loss": 0.0621, + "step": 3053 + }, + { + "epoch": 2.404883812524616, + "grad_norm": 1.164021611213684, + "learning_rate": 9.144000000000001e-06, + "loss": 0.1101, + "step": 3054 + }, + { + "epoch": 2.405671524222135, + "grad_norm": 2.0019948482513428, + "learning_rate": 9.147e-06, + "loss": 0.0261, + "step": 3055 + }, + { + "epoch": 2.4064592359196535, + "grad_norm": 0.7680150866508484, + "learning_rate": 9.15e-06, + "loss": 0.0801, + "step": 3056 + }, + { + "epoch": 2.407246947617172, + "grad_norm": 0.6200305819511414, + "learning_rate": 9.153e-06, + "loss": 0.043, + "step": 3057 + }, + { + "epoch": 2.4080346593146906, + "grad_norm": 0.6194590330123901, + "learning_rate": 9.156000000000002e-06, + "loss": 0.0437, + "step": 3058 + }, + { + "epoch": 2.4088223710122096, + "grad_norm": 0.6079630255699158, + "learning_rate": 9.159e-06, + "loss": 0.05, + "step": 3059 + }, + { + "epoch": 2.409610082709728, + "grad_norm": 0.7600833177566528, + "learning_rate": 9.162e-06, + "loss": 0.0596, + "step": 3060 + }, + { + "epoch": 2.4103977944072468, + "grad_norm": 1.0771533250808716, + "learning_rate": 9.165e-06, + "loss": 0.0663, + "step": 3061 + }, + { + "epoch": 2.411185506104766, + "grad_norm": 0.9908783435821533, + "learning_rate": 9.168e-06, + "loss": 0.0738, + "step": 3062 + }, + { + "epoch": 2.4119732178022844, + "grad_norm": 0.6541309952735901, + "learning_rate": 9.171e-06, + "loss": 0.0504, + "step": 3063 + }, + { + "epoch": 2.412760929499803, + "grad_norm": 0.7665324807167053, + "learning_rate": 9.174e-06, + "loss": 0.0373, + "step": 3064 + }, + { + "epoch": 2.413548641197322, + "grad_norm": 0.7121755480766296, + "learning_rate": 9.177e-06, + "loss": 0.0393, + "step": 3065 + }, + { + "epoch": 2.4143363528948405, + "grad_norm": 0.7700909972190857, + "learning_rate": 9.18e-06, + "loss": 0.0793, + "step": 3066 + }, + { + "epoch": 2.415124064592359, + "grad_norm": 0.7378225922584534, + "learning_rate": 9.183e-06, + "loss": 0.0468, + "step": 3067 + }, + { + "epoch": 2.415911776289878, + "grad_norm": 0.8586748242378235, + "learning_rate": 9.186000000000001e-06, + "loss": 0.0637, + "step": 3068 + }, + { + "epoch": 2.4166994879873966, + "grad_norm": 0.7564337849617004, + "learning_rate": 9.189000000000001e-06, + "loss": 0.0393, + "step": 3069 + }, + { + "epoch": 2.417487199684915, + "grad_norm": 0.890916109085083, + "learning_rate": 9.192000000000001e-06, + "loss": 0.0744, + "step": 3070 + }, + { + "epoch": 2.4182749113824342, + "grad_norm": 0.7669765949249268, + "learning_rate": 9.195000000000001e-06, + "loss": 0.0595, + "step": 3071 + }, + { + "epoch": 2.419062623079953, + "grad_norm": 3.1680996417999268, + "learning_rate": 9.197999999999999e-06, + "loss": 0.0626, + "step": 3072 + }, + { + "epoch": 2.4198503347774714, + "grad_norm": 0.8436568975448608, + "learning_rate": 9.200999999999999e-06, + "loss": 0.0454, + "step": 3073 + }, + { + "epoch": 2.4206380464749904, + "grad_norm": 0.8686376810073853, + "learning_rate": 9.204e-06, + "loss": 0.0574, + "step": 3074 + }, + { + "epoch": 2.421425758172509, + "grad_norm": 0.7635902166366577, + "learning_rate": 9.207e-06, + "loss": 0.0569, + "step": 3075 + }, + { + "epoch": 2.4222134698700275, + "grad_norm": 0.6473199129104614, + "learning_rate": 9.21e-06, + "loss": 0.0338, + "step": 3076 + }, + { + "epoch": 2.423001181567546, + "grad_norm": 1.0477513074874878, + "learning_rate": 9.213e-06, + 
"loss": 0.0488, + "step": 3077 + }, + { + "epoch": 2.423788893265065, + "grad_norm": 0.8694229125976562, + "learning_rate": 9.216e-06, + "loss": 0.0602, + "step": 3078 + }, + { + "epoch": 2.4245766049625836, + "grad_norm": 0.9349133372306824, + "learning_rate": 9.219000000000001e-06, + "loss": 0.0624, + "step": 3079 + }, + { + "epoch": 2.425364316660102, + "grad_norm": 1.0650988817214966, + "learning_rate": 9.222e-06, + "loss": 0.075, + "step": 3080 + }, + { + "epoch": 2.4261520283576212, + "grad_norm": 1.0734843015670776, + "learning_rate": 9.225e-06, + "loss": 0.0662, + "step": 3081 + }, + { + "epoch": 2.42693974005514, + "grad_norm": 1.031404972076416, + "learning_rate": 9.228e-06, + "loss": 0.068, + "step": 3082 + }, + { + "epoch": 2.4277274517526584, + "grad_norm": 0.858614444732666, + "learning_rate": 9.231e-06, + "loss": 0.0471, + "step": 3083 + }, + { + "epoch": 2.4285151634501774, + "grad_norm": 1.3938530683517456, + "learning_rate": 9.234e-06, + "loss": 0.0832, + "step": 3084 + }, + { + "epoch": 2.429302875147696, + "grad_norm": 0.9488372802734375, + "learning_rate": 9.237e-06, + "loss": 0.0742, + "step": 3085 + }, + { + "epoch": 2.4300905868452145, + "grad_norm": 1.066763997077942, + "learning_rate": 9.24e-06, + "loss": 0.0769, + "step": 3086 + }, + { + "epoch": 2.4308782985427335, + "grad_norm": 1.1571918725967407, + "learning_rate": 9.243e-06, + "loss": 0.067, + "step": 3087 + }, + { + "epoch": 2.431666010240252, + "grad_norm": 0.9818512201309204, + "learning_rate": 9.246e-06, + "loss": 0.0814, + "step": 3088 + }, + { + "epoch": 2.4324537219377707, + "grad_norm": 1.3713635206222534, + "learning_rate": 9.249e-06, + "loss": 0.0795, + "step": 3089 + }, + { + "epoch": 2.4332414336352897, + "grad_norm": 1.163221001625061, + "learning_rate": 9.252e-06, + "loss": 0.078, + "step": 3090 + }, + { + "epoch": 2.4340291453328082, + "grad_norm": 2.0462045669555664, + "learning_rate": 9.255e-06, + "loss": 0.5197, + "step": 3091 + }, + { + "epoch": 2.434816857030327, + "grad_norm": 1.660933494567871, + "learning_rate": 9.258e-06, + "loss": 0.3507, + "step": 3092 + }, + { + "epoch": 2.435604568727846, + "grad_norm": 1.2933388948440552, + "learning_rate": 9.261e-06, + "loss": 0.3292, + "step": 3093 + }, + { + "epoch": 2.4363922804253644, + "grad_norm": 1.2006975412368774, + "learning_rate": 9.264000000000001e-06, + "loss": 0.2631, + "step": 3094 + }, + { + "epoch": 2.437179992122883, + "grad_norm": 1.1192691326141357, + "learning_rate": 9.267000000000001e-06, + "loss": 0.136, + "step": 3095 + }, + { + "epoch": 2.4379677038204015, + "grad_norm": 1.3931841850280762, + "learning_rate": 9.27e-06, + "loss": 0.1371, + "step": 3096 + }, + { + "epoch": 2.4387554155179205, + "grad_norm": 1.1042442321777344, + "learning_rate": 9.272999999999999e-06, + "loss": 0.1526, + "step": 3097 + }, + { + "epoch": 2.439543127215439, + "grad_norm": 0.8724532127380371, + "learning_rate": 9.275999999999999e-06, + "loss": 0.0805, + "step": 3098 + }, + { + "epoch": 2.440330838912958, + "grad_norm": 1.3614318370819092, + "learning_rate": 9.279e-06, + "loss": 0.0922, + "step": 3099 + }, + { + "epoch": 2.4411185506104767, + "grad_norm": 0.6742561459541321, + "learning_rate": 9.282e-06, + "loss": 0.0639, + "step": 3100 + }, + { + "epoch": 2.4419062623079952, + "grad_norm": 0.6578989028930664, + "learning_rate": 9.285e-06, + "loss": 0.0521, + "step": 3101 + }, + { + "epoch": 2.442693974005514, + "grad_norm": 0.6058350801467896, + "learning_rate": 9.288e-06, + "loss": 0.047, + "step": 3102 + }, + { + "epoch": 2.443481685703033, 
+ "grad_norm": 0.6419582366943359, + "learning_rate": 9.291e-06, + "loss": 0.0391, + "step": 3103 + }, + { + "epoch": 2.4442693974005514, + "grad_norm": 1.1609959602355957, + "learning_rate": 9.294000000000001e-06, + "loss": 0.0599, + "step": 3104 + }, + { + "epoch": 2.44505710909807, + "grad_norm": 1.289551854133606, + "learning_rate": 9.297000000000001e-06, + "loss": 0.0827, + "step": 3105 + }, + { + "epoch": 2.445844820795589, + "grad_norm": 0.6183859705924988, + "learning_rate": 9.3e-06, + "loss": 0.0601, + "step": 3106 + }, + { + "epoch": 2.4466325324931075, + "grad_norm": 0.8438467979431152, + "learning_rate": 9.303e-06, + "loss": 0.0653, + "step": 3107 + }, + { + "epoch": 2.447420244190626, + "grad_norm": 0.654680073261261, + "learning_rate": 9.306e-06, + "loss": 0.0505, + "step": 3108 + }, + { + "epoch": 2.448207955888145, + "grad_norm": 0.8292509913444519, + "learning_rate": 9.309e-06, + "loss": 0.0771, + "step": 3109 + }, + { + "epoch": 2.4489956675856637, + "grad_norm": 0.6243188977241516, + "learning_rate": 9.312e-06, + "loss": 0.0439, + "step": 3110 + }, + { + "epoch": 2.4497833792831822, + "grad_norm": 0.7241324782371521, + "learning_rate": 9.315e-06, + "loss": 0.0353, + "step": 3111 + }, + { + "epoch": 2.4505710909807012, + "grad_norm": 0.8823056817054749, + "learning_rate": 9.318e-06, + "loss": 0.0563, + "step": 3112 + }, + { + "epoch": 2.45135880267822, + "grad_norm": 0.9396480917930603, + "learning_rate": 9.321e-06, + "loss": 0.0553, + "step": 3113 + }, + { + "epoch": 2.4521465143757384, + "grad_norm": 0.567619264125824, + "learning_rate": 9.324000000000001e-06, + "loss": 0.0566, + "step": 3114 + }, + { + "epoch": 2.452934226073257, + "grad_norm": 0.6539912223815918, + "learning_rate": 9.327e-06, + "loss": 0.0463, + "step": 3115 + }, + { + "epoch": 2.453721937770776, + "grad_norm": 0.8332438468933105, + "learning_rate": 9.33e-06, + "loss": 0.0621, + "step": 3116 + }, + { + "epoch": 2.4545096494682945, + "grad_norm": 3.329563617706299, + "learning_rate": 9.333e-06, + "loss": 0.0469, + "step": 3117 + }, + { + "epoch": 2.4552973611658135, + "grad_norm": 0.6509931683540344, + "learning_rate": 9.336e-06, + "loss": 0.0507, + "step": 3118 + }, + { + "epoch": 2.456085072863332, + "grad_norm": 1.6599453687667847, + "learning_rate": 9.339000000000002e-06, + "loss": 0.0818, + "step": 3119 + }, + { + "epoch": 2.4568727845608507, + "grad_norm": 1.473266839981079, + "learning_rate": 9.342000000000001e-06, + "loss": 0.0476, + "step": 3120 + }, + { + "epoch": 2.4576604962583692, + "grad_norm": 0.9408569931983948, + "learning_rate": 9.345e-06, + "loss": 0.0643, + "step": 3121 + }, + { + "epoch": 2.4584482079558883, + "grad_norm": 0.9867995381355286, + "learning_rate": 9.348e-06, + "loss": 0.064, + "step": 3122 + }, + { + "epoch": 2.459235919653407, + "grad_norm": 1.0626220703125, + "learning_rate": 9.350999999999999e-06, + "loss": 0.0764, + "step": 3123 + }, + { + "epoch": 2.4600236313509254, + "grad_norm": 2.2123894691467285, + "learning_rate": 9.354e-06, + "loss": 0.0603, + "step": 3124 + }, + { + "epoch": 2.4608113430484444, + "grad_norm": 1.0069835186004639, + "learning_rate": 9.357e-06, + "loss": 0.0384, + "step": 3125 + }, + { + "epoch": 2.461599054745963, + "grad_norm": 0.8609867095947266, + "learning_rate": 9.36e-06, + "loss": 0.0713, + "step": 3126 + }, + { + "epoch": 2.4623867664434815, + "grad_norm": 1.2873907089233398, + "learning_rate": 9.363e-06, + "loss": 0.0559, + "step": 3127 + }, + { + "epoch": 2.4631744781410005, + "grad_norm": 0.635764479637146, + "learning_rate": 
9.366e-06, + "loss": 0.0439, + "step": 3128 + }, + { + "epoch": 2.463962189838519, + "grad_norm": 0.7027595639228821, + "learning_rate": 9.369000000000001e-06, + "loss": 0.0459, + "step": 3129 + }, + { + "epoch": 2.4647499015360377, + "grad_norm": 1.4145197868347168, + "learning_rate": 9.372000000000001e-06, + "loss": 0.0471, + "step": 3130 + }, + { + "epoch": 2.4655376132335567, + "grad_norm": 0.9010178446769714, + "learning_rate": 9.375000000000001e-06, + "loss": 0.0716, + "step": 3131 + }, + { + "epoch": 2.4663253249310753, + "grad_norm": 1.1553481817245483, + "learning_rate": 9.378e-06, + "loss": 0.0736, + "step": 3132 + }, + { + "epoch": 2.467113036628594, + "grad_norm": 1.4294474124908447, + "learning_rate": 9.380999999999999e-06, + "loss": 0.0676, + "step": 3133 + }, + { + "epoch": 2.467900748326113, + "grad_norm": 0.9781564474105835, + "learning_rate": 9.384e-06, + "loss": 0.0624, + "step": 3134 + }, + { + "epoch": 2.4686884600236314, + "grad_norm": 1.1056909561157227, + "learning_rate": 9.387e-06, + "loss": 0.0755, + "step": 3135 + }, + { + "epoch": 2.46947617172115, + "grad_norm": 1.192288875579834, + "learning_rate": 9.39e-06, + "loss": 0.061, + "step": 3136 + }, + { + "epoch": 2.470263883418669, + "grad_norm": 0.9913935661315918, + "learning_rate": 9.393e-06, + "loss": 0.0683, + "step": 3137 + }, + { + "epoch": 2.4710515951161875, + "grad_norm": 1.4410735368728638, + "learning_rate": 9.396e-06, + "loss": 0.1042, + "step": 3138 + }, + { + "epoch": 2.471839306813706, + "grad_norm": 0.9679030179977417, + "learning_rate": 9.399000000000001e-06, + "loss": 0.0668, + "step": 3139 + }, + { + "epoch": 2.4726270185112247, + "grad_norm": 0.9493259191513062, + "learning_rate": 9.402e-06, + "loss": 0.0605, + "step": 3140 + }, + { + "epoch": 2.4734147302087437, + "grad_norm": 3.8414793014526367, + "learning_rate": 9.405e-06, + "loss": 0.5482, + "step": 3141 + }, + { + "epoch": 2.4742024419062623, + "grad_norm": 1.9431872367858887, + "learning_rate": 9.408e-06, + "loss": 0.3712, + "step": 3142 + }, + { + "epoch": 2.474990153603781, + "grad_norm": 2.6786513328552246, + "learning_rate": 9.411e-06, + "loss": 0.3271, + "step": 3143 + }, + { + "epoch": 2.4757778653013, + "grad_norm": 1.0781153440475464, + "learning_rate": 9.414000000000002e-06, + "loss": 0.2297, + "step": 3144 + }, + { + "epoch": 2.4765655769988184, + "grad_norm": 1.165866494178772, + "learning_rate": 9.417e-06, + "loss": 0.1857, + "step": 3145 + }, + { + "epoch": 2.477353288696337, + "grad_norm": 0.9702721238136292, + "learning_rate": 9.42e-06, + "loss": 0.1428, + "step": 3146 + }, + { + "epoch": 2.478141000393856, + "grad_norm": 1.089490532875061, + "learning_rate": 9.423e-06, + "loss": 0.1073, + "step": 3147 + }, + { + "epoch": 2.4789287120913746, + "grad_norm": 1.1116654872894287, + "learning_rate": 9.426e-06, + "loss": 0.0547, + "step": 3148 + }, + { + "epoch": 2.479716423788893, + "grad_norm": 1.2119755744934082, + "learning_rate": 9.429e-06, + "loss": 0.051, + "step": 3149 + }, + { + "epoch": 2.480504135486412, + "grad_norm": 0.623970627784729, + "learning_rate": 9.432e-06, + "loss": 0.047, + "step": 3150 + }, + { + "epoch": 2.4812918471839307, + "grad_norm": 0.8831971287727356, + "learning_rate": 9.435e-06, + "loss": 0.0701, + "step": 3151 + }, + { + "epoch": 2.4820795588814493, + "grad_norm": 1.3085379600524902, + "learning_rate": 9.438e-06, + "loss": 0.0657, + "step": 3152 + }, + { + "epoch": 2.4828672705789683, + "grad_norm": 0.7186987400054932, + "learning_rate": 9.441e-06, + "loss": 0.0501, + "step": 3153 + }, + { + 
"epoch": 2.483654982276487, + "grad_norm": 0.5653578639030457, + "learning_rate": 9.444000000000001e-06, + "loss": 0.043, + "step": 3154 + }, + { + "epoch": 2.4844426939740054, + "grad_norm": 0.6910883188247681, + "learning_rate": 9.447000000000001e-06, + "loss": 0.0502, + "step": 3155 + }, + { + "epoch": 2.4852304056715244, + "grad_norm": 0.7400258183479309, + "learning_rate": 9.450000000000001e-06, + "loss": 0.0635, + "step": 3156 + }, + { + "epoch": 2.486018117369043, + "grad_norm": 0.6995779871940613, + "learning_rate": 9.453e-06, + "loss": 0.0511, + "step": 3157 + }, + { + "epoch": 2.4868058290665616, + "grad_norm": 0.6509313583374023, + "learning_rate": 9.455999999999999e-06, + "loss": 0.0514, + "step": 3158 + }, + { + "epoch": 2.48759354076408, + "grad_norm": 0.6968459486961365, + "learning_rate": 9.459e-06, + "loss": 0.0652, + "step": 3159 + }, + { + "epoch": 2.488381252461599, + "grad_norm": 0.9522707462310791, + "learning_rate": 9.462e-06, + "loss": 0.0429, + "step": 3160 + }, + { + "epoch": 2.4891689641591177, + "grad_norm": 0.9145280718803406, + "learning_rate": 9.465e-06, + "loss": 0.0825, + "step": 3161 + }, + { + "epoch": 2.4899566758566367, + "grad_norm": 0.721043586730957, + "learning_rate": 9.468e-06, + "loss": 0.0611, + "step": 3162 + }, + { + "epoch": 2.4907443875541553, + "grad_norm": 0.9298194050788879, + "learning_rate": 9.471e-06, + "loss": 0.0479, + "step": 3163 + }, + { + "epoch": 2.491532099251674, + "grad_norm": 0.9136679172515869, + "learning_rate": 9.474000000000001e-06, + "loss": 0.0659, + "step": 3164 + }, + { + "epoch": 2.4923198109491924, + "grad_norm": 0.9741732478141785, + "learning_rate": 9.477000000000001e-06, + "loss": 0.0791, + "step": 3165 + }, + { + "epoch": 2.4931075226467114, + "grad_norm": 0.5379950404167175, + "learning_rate": 9.48e-06, + "loss": 0.0387, + "step": 3166 + }, + { + "epoch": 2.49389523434423, + "grad_norm": 0.7733938694000244, + "learning_rate": 9.483e-06, + "loss": 0.0636, + "step": 3167 + }, + { + "epoch": 2.4946829460417486, + "grad_norm": 0.8574127554893494, + "learning_rate": 9.486e-06, + "loss": 0.0566, + "step": 3168 + }, + { + "epoch": 2.4954706577392676, + "grad_norm": 0.6524454355239868, + "learning_rate": 9.489000000000002e-06, + "loss": 0.0388, + "step": 3169 + }, + { + "epoch": 2.496258369436786, + "grad_norm": 1.0465351343154907, + "learning_rate": 9.492e-06, + "loss": 0.0861, + "step": 3170 + }, + { + "epoch": 2.4970460811343047, + "grad_norm": 0.8765840530395508, + "learning_rate": 9.495e-06, + "loss": 0.0493, + "step": 3171 + }, + { + "epoch": 2.4978337928318237, + "grad_norm": 0.794861912727356, + "learning_rate": 9.498e-06, + "loss": 0.0371, + "step": 3172 + }, + { + "epoch": 2.4986215045293423, + "grad_norm": 0.7541015148162842, + "learning_rate": 9.501e-06, + "loss": 0.0712, + "step": 3173 + }, + { + "epoch": 2.499409216226861, + "grad_norm": 0.9722827672958374, + "learning_rate": 9.504e-06, + "loss": 0.076, + "step": 3174 + }, + { + "epoch": 2.50019692792438, + "grad_norm": 1.699580192565918, + "learning_rate": 9.507e-06, + "loss": 0.0692, + "step": 3175 + }, + { + "epoch": 2.5009846396218984, + "grad_norm": 0.9367459416389465, + "learning_rate": 9.51e-06, + "loss": 0.0752, + "step": 3176 + }, + { + "epoch": 2.501772351319417, + "grad_norm": 0.7310293912887573, + "learning_rate": 9.513e-06, + "loss": 0.0614, + "step": 3177 + }, + { + "epoch": 2.5025600630169356, + "grad_norm": 0.9517685770988464, + "learning_rate": 9.516e-06, + "loss": 0.0649, + "step": 3178 + }, + { + "epoch": 2.5033477747144546, + 
"grad_norm": 0.9124424457550049, + "learning_rate": 9.519000000000002e-06, + "loss": 0.0683, + "step": 3179 + }, + { + "epoch": 2.504135486411973, + "grad_norm": 0.8367537260055542, + "learning_rate": 9.522000000000001e-06, + "loss": 0.0376, + "step": 3180 + }, + { + "epoch": 2.504923198109492, + "grad_norm": 0.7289870381355286, + "learning_rate": 9.525000000000001e-06, + "loss": 0.041, + "step": 3181 + }, + { + "epoch": 2.5057109098070107, + "grad_norm": 0.6980565786361694, + "learning_rate": 9.528e-06, + "loss": 0.0535, + "step": 3182 + }, + { + "epoch": 2.5064986215045293, + "grad_norm": 0.920069694519043, + "learning_rate": 9.530999999999999e-06, + "loss": 0.059, + "step": 3183 + }, + { + "epoch": 2.507286333202048, + "grad_norm": 1.3053630590438843, + "learning_rate": 9.534e-06, + "loss": 0.0822, + "step": 3184 + }, + { + "epoch": 2.508074044899567, + "grad_norm": 0.8703436255455017, + "learning_rate": 9.537e-06, + "loss": 0.0432, + "step": 3185 + }, + { + "epoch": 2.5088617565970854, + "grad_norm": 0.9391535520553589, + "learning_rate": 9.54e-06, + "loss": 0.0759, + "step": 3186 + }, + { + "epoch": 2.5096494682946044, + "grad_norm": 2.074306011199951, + "learning_rate": 9.543e-06, + "loss": 0.0582, + "step": 3187 + }, + { + "epoch": 2.510437179992123, + "grad_norm": 1.3816817998886108, + "learning_rate": 9.546e-06, + "loss": 0.0654, + "step": 3188 + }, + { + "epoch": 2.5112248916896416, + "grad_norm": 1.4371263980865479, + "learning_rate": 9.549000000000001e-06, + "loss": 0.1107, + "step": 3189 + }, + { + "epoch": 2.51201260338716, + "grad_norm": 1.1481564044952393, + "learning_rate": 9.552000000000001e-06, + "loss": 0.1049, + "step": 3190 + }, + { + "epoch": 2.512800315084679, + "grad_norm": 2.6874566078186035, + "learning_rate": 9.555e-06, + "loss": 0.4545, + "step": 3191 + }, + { + "epoch": 2.5135880267821977, + "grad_norm": 1.3766263723373413, + "learning_rate": 9.558e-06, + "loss": 0.3516, + "step": 3192 + }, + { + "epoch": 2.5143757384797163, + "grad_norm": 1.8333991765975952, + "learning_rate": 9.561e-06, + "loss": 0.3419, + "step": 3193 + }, + { + "epoch": 2.5151634501772353, + "grad_norm": 1.444656491279602, + "learning_rate": 9.564e-06, + "loss": 0.2465, + "step": 3194 + }, + { + "epoch": 2.515951161874754, + "grad_norm": 1.328439712524414, + "learning_rate": 9.567e-06, + "loss": 0.2209, + "step": 3195 + }, + { + "epoch": 2.5167388735722724, + "grad_norm": 1.0386226177215576, + "learning_rate": 9.57e-06, + "loss": 0.1579, + "step": 3196 + }, + { + "epoch": 2.517526585269791, + "grad_norm": 0.6444233655929565, + "learning_rate": 9.573e-06, + "loss": 0.0682, + "step": 3197 + }, + { + "epoch": 2.51831429696731, + "grad_norm": 0.8245398998260498, + "learning_rate": 9.576e-06, + "loss": 0.066, + "step": 3198 + }, + { + "epoch": 2.5191020086648286, + "grad_norm": 0.49941006302833557, + "learning_rate": 9.579e-06, + "loss": 0.0573, + "step": 3199 + }, + { + "epoch": 2.5198897203623476, + "grad_norm": 0.7401144504547119, + "learning_rate": 9.582e-06, + "loss": 0.0631, + "step": 3200 + }, + { + "epoch": 2.520677432059866, + "grad_norm": 0.9699612855911255, + "learning_rate": 9.585e-06, + "loss": 0.0592, + "step": 3201 + }, + { + "epoch": 2.5214651437573847, + "grad_norm": 0.6723110675811768, + "learning_rate": 9.588e-06, + "loss": 0.0456, + "step": 3202 + }, + { + "epoch": 2.5222528554549033, + "grad_norm": 0.6172309517860413, + "learning_rate": 9.591e-06, + "loss": 0.0404, + "step": 3203 + }, + { + "epoch": 2.5230405671524223, + "grad_norm": 1.0618759393692017, + "learning_rate": 
9.594e-06, + "loss": 0.101, + "step": 3204 + }, + { + "epoch": 2.523828278849941, + "grad_norm": 0.6760218739509583, + "learning_rate": 9.597000000000001e-06, + "loss": 0.0377, + "step": 3205 + }, + { + "epoch": 2.52461599054746, + "grad_norm": 0.6051573157310486, + "learning_rate": 9.600000000000001e-06, + "loss": 0.047, + "step": 3206 + }, + { + "epoch": 2.5254037022449785, + "grad_norm": 0.8181388974189758, + "learning_rate": 9.603e-06, + "loss": 0.048, + "step": 3207 + }, + { + "epoch": 2.526191413942497, + "grad_norm": 0.6084415912628174, + "learning_rate": 9.606e-06, + "loss": 0.0378, + "step": 3208 + }, + { + "epoch": 2.5269791256400156, + "grad_norm": 0.9091440439224243, + "learning_rate": 9.608999999999999e-06, + "loss": 0.067, + "step": 3209 + }, + { + "epoch": 2.5277668373375346, + "grad_norm": 0.9218552112579346, + "learning_rate": 9.612e-06, + "loss": 0.0701, + "step": 3210 + }, + { + "epoch": 2.528554549035053, + "grad_norm": 0.7692875862121582, + "learning_rate": 9.615e-06, + "loss": 0.0567, + "step": 3211 + }, + { + "epoch": 2.5293422607325717, + "grad_norm": 1.2032420635223389, + "learning_rate": 9.618e-06, + "loss": 0.0741, + "step": 3212 + }, + { + "epoch": 2.5301299724300907, + "grad_norm": 0.7686079144477844, + "learning_rate": 9.621e-06, + "loss": 0.0479, + "step": 3213 + }, + { + "epoch": 2.5309176841276093, + "grad_norm": 1.3506935834884644, + "learning_rate": 9.624e-06, + "loss": 0.0625, + "step": 3214 + }, + { + "epoch": 2.531705395825128, + "grad_norm": 0.8657990097999573, + "learning_rate": 9.627000000000001e-06, + "loss": 0.0486, + "step": 3215 + }, + { + "epoch": 2.5324931075226464, + "grad_norm": 1.4434412717819214, + "learning_rate": 9.630000000000001e-06, + "loss": 0.1173, + "step": 3216 + }, + { + "epoch": 2.5332808192201655, + "grad_norm": 0.7987033724784851, + "learning_rate": 9.633e-06, + "loss": 0.0524, + "step": 3217 + }, + { + "epoch": 2.534068530917684, + "grad_norm": 0.8009648323059082, + "learning_rate": 9.636e-06, + "loss": 0.064, + "step": 3218 + }, + { + "epoch": 2.534856242615203, + "grad_norm": 1.1496731042861938, + "learning_rate": 9.638999999999999e-06, + "loss": 0.048, + "step": 3219 + }, + { + "epoch": 2.5356439543127216, + "grad_norm": 0.8440859913825989, + "learning_rate": 9.642e-06, + "loss": 0.0494, + "step": 3220 + }, + { + "epoch": 2.53643166601024, + "grad_norm": 0.6411259770393372, + "learning_rate": 9.645e-06, + "loss": 0.0418, + "step": 3221 + }, + { + "epoch": 2.5372193777077587, + "grad_norm": 0.8051779270172119, + "learning_rate": 9.648e-06, + "loss": 0.0887, + "step": 3222 + }, + { + "epoch": 2.5380070894052777, + "grad_norm": 0.9283203482627869, + "learning_rate": 9.651e-06, + "loss": 0.0566, + "step": 3223 + }, + { + "epoch": 2.5387948011027963, + "grad_norm": 1.2768712043762207, + "learning_rate": 9.654e-06, + "loss": 0.0697, + "step": 3224 + }, + { + "epoch": 2.5395825128003153, + "grad_norm": 0.940818727016449, + "learning_rate": 9.657000000000001e-06, + "loss": 0.0651, + "step": 3225 + }, + { + "epoch": 2.540370224497834, + "grad_norm": 0.8459259271621704, + "learning_rate": 9.66e-06, + "loss": 0.0516, + "step": 3226 + }, + { + "epoch": 2.5411579361953525, + "grad_norm": 0.8761997222900391, + "learning_rate": 9.663e-06, + "loss": 0.061, + "step": 3227 + }, + { + "epoch": 2.541945647892871, + "grad_norm": 1.026444911956787, + "learning_rate": 9.666e-06, + "loss": 0.0775, + "step": 3228 + }, + { + "epoch": 2.54273335959039, + "grad_norm": 0.8146826028823853, + "learning_rate": 9.669e-06, + "loss": 0.0623, + "step": 3229 
+ }, + { + "epoch": 2.5435210712879086, + "grad_norm": 1.1236741542816162, + "learning_rate": 9.672000000000002e-06, + "loss": 0.098, + "step": 3230 + }, + { + "epoch": 2.544308782985427, + "grad_norm": 0.8138452172279358, + "learning_rate": 9.675e-06, + "loss": 0.04, + "step": 3231 + }, + { + "epoch": 2.545096494682946, + "grad_norm": 1.3208869695663452, + "learning_rate": 9.678e-06, + "loss": 0.0659, + "step": 3232 + }, + { + "epoch": 2.5458842063804648, + "grad_norm": 1.1409987211227417, + "learning_rate": 9.681e-06, + "loss": 0.0618, + "step": 3233 + }, + { + "epoch": 2.5466719180779833, + "grad_norm": 0.9283557534217834, + "learning_rate": 9.683999999999999e-06, + "loss": 0.0897, + "step": 3234 + }, + { + "epoch": 2.5474596297755023, + "grad_norm": 1.0588496923446655, + "learning_rate": 9.687e-06, + "loss": 0.0658, + "step": 3235 + }, + { + "epoch": 2.548247341473021, + "grad_norm": 1.5560694932937622, + "learning_rate": 9.69e-06, + "loss": 0.0457, + "step": 3236 + }, + { + "epoch": 2.5490350531705395, + "grad_norm": 0.7299632430076599, + "learning_rate": 9.693e-06, + "loss": 0.0548, + "step": 3237 + }, + { + "epoch": 2.5498227648680585, + "grad_norm": 1.4535977840423584, + "learning_rate": 9.696e-06, + "loss": 0.077, + "step": 3238 + }, + { + "epoch": 2.550610476565577, + "grad_norm": 0.963916540145874, + "learning_rate": 9.699e-06, + "loss": 0.0665, + "step": 3239 + }, + { + "epoch": 2.5513981882630956, + "grad_norm": 1.3450709581375122, + "learning_rate": 9.702000000000001e-06, + "loss": 0.0781, + "step": 3240 + }, + { + "epoch": 2.552185899960614, + "grad_norm": 1.9104821681976318, + "learning_rate": 9.705000000000001e-06, + "loss": 0.4929, + "step": 3241 + }, + { + "epoch": 2.552973611658133, + "grad_norm": 1.3318085670471191, + "learning_rate": 9.708000000000001e-06, + "loss": 0.3251, + "step": 3242 + }, + { + "epoch": 2.5537613233556518, + "grad_norm": 1.5396497249603271, + "learning_rate": 9.711e-06, + "loss": 0.2977, + "step": 3243 + }, + { + "epoch": 2.5545490350531708, + "grad_norm": 1.2719594240188599, + "learning_rate": 9.713999999999999e-06, + "loss": 0.2512, + "step": 3244 + }, + { + "epoch": 2.5553367467506893, + "grad_norm": 1.339210867881775, + "learning_rate": 9.717e-06, + "loss": 0.2246, + "step": 3245 + }, + { + "epoch": 2.556124458448208, + "grad_norm": 1.4785572290420532, + "learning_rate": 9.72e-06, + "loss": 0.2705, + "step": 3246 + }, + { + "epoch": 2.5569121701457265, + "grad_norm": 0.8419133424758911, + "learning_rate": 9.723e-06, + "loss": 0.1005, + "step": 3247 + }, + { + "epoch": 2.5576998818432455, + "grad_norm": 0.6350347399711609, + "learning_rate": 9.726e-06, + "loss": 0.0826, + "step": 3248 + }, + { + "epoch": 2.558487593540764, + "grad_norm": 1.8362984657287598, + "learning_rate": 9.729e-06, + "loss": 0.0723, + "step": 3249 + }, + { + "epoch": 2.559275305238283, + "grad_norm": 0.8910056352615356, + "learning_rate": 9.732000000000001e-06, + "loss": 0.0828, + "step": 3250 + }, + { + "epoch": 2.5600630169358016, + "grad_norm": 0.8350694179534912, + "learning_rate": 9.735e-06, + "loss": 0.1071, + "step": 3251 + }, + { + "epoch": 2.56085072863332, + "grad_norm": 0.6870247721672058, + "learning_rate": 9.738e-06, + "loss": 0.0484, + "step": 3252 + }, + { + "epoch": 2.5616384403308388, + "grad_norm": 0.6591034531593323, + "learning_rate": 9.741e-06, + "loss": 0.0487, + "step": 3253 + }, + { + "epoch": 2.5624261520283578, + "grad_norm": 1.0851905345916748, + "learning_rate": 9.744e-06, + "loss": 0.0772, + "step": 3254 + }, + { + "epoch": 2.5632138637258763, 
+ "grad_norm": 0.808826208114624, + "learning_rate": 9.747000000000002e-06, + "loss": 0.0436, + "step": 3255 + }, + { + "epoch": 2.564001575423395, + "grad_norm": 0.6380029916763306, + "learning_rate": 9.75e-06, + "loss": 0.0602, + "step": 3256 + }, + { + "epoch": 2.564789287120914, + "grad_norm": 0.9779039025306702, + "learning_rate": 9.753e-06, + "loss": 0.0723, + "step": 3257 + }, + { + "epoch": 2.5655769988184325, + "grad_norm": 1.3131029605865479, + "learning_rate": 9.756e-06, + "loss": 0.046, + "step": 3258 + }, + { + "epoch": 2.566364710515951, + "grad_norm": 1.07974374294281, + "learning_rate": 9.759e-06, + "loss": 0.0492, + "step": 3259 + }, + { + "epoch": 2.5671524222134696, + "grad_norm": 0.9026055932044983, + "learning_rate": 9.762e-06, + "loss": 0.07, + "step": 3260 + }, + { + "epoch": 2.5679401339109886, + "grad_norm": 0.7699682712554932, + "learning_rate": 9.765e-06, + "loss": 0.0573, + "step": 3261 + }, + { + "epoch": 2.568727845608507, + "grad_norm": 1.099617600440979, + "learning_rate": 9.768e-06, + "loss": 0.0625, + "step": 3262 + }, + { + "epoch": 2.569515557306026, + "grad_norm": 0.7061864137649536, + "learning_rate": 9.771e-06, + "loss": 0.0666, + "step": 3263 + }, + { + "epoch": 2.5703032690035448, + "grad_norm": 0.7595364451408386, + "learning_rate": 9.774e-06, + "loss": 0.0645, + "step": 3264 + }, + { + "epoch": 2.5710909807010633, + "grad_norm": 0.887251079082489, + "learning_rate": 9.777000000000001e-06, + "loss": 0.0548, + "step": 3265 + }, + { + "epoch": 2.571878692398582, + "grad_norm": 0.6293666362762451, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0499, + "step": 3266 + }, + { + "epoch": 2.572666404096101, + "grad_norm": 0.9288830757141113, + "learning_rate": 9.783000000000001e-06, + "loss": 0.0466, + "step": 3267 + }, + { + "epoch": 2.5734541157936195, + "grad_norm": 0.9691036343574524, + "learning_rate": 9.785999999999999e-06, + "loss": 0.051, + "step": 3268 + }, + { + "epoch": 2.5742418274911385, + "grad_norm": 0.8454388380050659, + "learning_rate": 9.788999999999999e-06, + "loss": 0.0547, + "step": 3269 + }, + { + "epoch": 2.575029539188657, + "grad_norm": 0.8828182816505432, + "learning_rate": 9.792e-06, + "loss": 0.0548, + "step": 3270 + }, + { + "epoch": 2.5758172508861756, + "grad_norm": 0.7429821491241455, + "learning_rate": 9.795e-06, + "loss": 0.0642, + "step": 3271 + }, + { + "epoch": 2.576604962583694, + "grad_norm": 1.210669994354248, + "learning_rate": 9.798e-06, + "loss": 0.056, + "step": 3272 + }, + { + "epoch": 2.577392674281213, + "grad_norm": 0.8095968961715698, + "learning_rate": 9.801e-06, + "loss": 0.0728, + "step": 3273 + }, + { + "epoch": 2.578180385978732, + "grad_norm": 0.9571141600608826, + "learning_rate": 9.804e-06, + "loss": 0.064, + "step": 3274 + }, + { + "epoch": 2.5789680976762503, + "grad_norm": 0.9299687147140503, + "learning_rate": 9.807000000000001e-06, + "loss": 0.0439, + "step": 3275 + }, + { + "epoch": 2.5797558093737694, + "grad_norm": 1.6793906688690186, + "learning_rate": 9.810000000000001e-06, + "loss": 0.0741, + "step": 3276 + }, + { + "epoch": 2.580543521071288, + "grad_norm": 0.8778285980224609, + "learning_rate": 9.813e-06, + "loss": 0.0585, + "step": 3277 + }, + { + "epoch": 2.5813312327688065, + "grad_norm": 0.8445032835006714, + "learning_rate": 9.816e-06, + "loss": 0.0631, + "step": 3278 + }, + { + "epoch": 2.582118944466325, + "grad_norm": 0.9527825117111206, + "learning_rate": 9.819e-06, + "loss": 0.0483, + "step": 3279 + }, + { + "epoch": 2.582906656163844, + "grad_norm": 0.895846962928772, + 
"learning_rate": 9.822e-06, + "loss": 0.0488, + "step": 3280 + }, + { + "epoch": 2.5836943678613626, + "grad_norm": 0.5189657807350159, + "learning_rate": 9.825e-06, + "loss": 0.0423, + "step": 3281 + }, + { + "epoch": 2.5844820795588817, + "grad_norm": 1.0097817182540894, + "learning_rate": 9.828e-06, + "loss": 0.0687, + "step": 3282 + }, + { + "epoch": 2.5852697912564, + "grad_norm": 1.0326424837112427, + "learning_rate": 9.831e-06, + "loss": 0.0635, + "step": 3283 + }, + { + "epoch": 2.586057502953919, + "grad_norm": 2.20532488822937, + "learning_rate": 9.834e-06, + "loss": 0.0918, + "step": 3284 + }, + { + "epoch": 2.5868452146514374, + "grad_norm": 1.0740625858306885, + "learning_rate": 9.837000000000001e-06, + "loss": 0.0675, + "step": 3285 + }, + { + "epoch": 2.5876329263489564, + "grad_norm": 1.0807982683181763, + "learning_rate": 9.84e-06, + "loss": 0.0785, + "step": 3286 + }, + { + "epoch": 2.588420638046475, + "grad_norm": 0.8373476266860962, + "learning_rate": 9.843e-06, + "loss": 0.0756, + "step": 3287 + }, + { + "epoch": 2.589208349743994, + "grad_norm": 1.1760064363479614, + "learning_rate": 9.846e-06, + "loss": 0.0616, + "step": 3288 + }, + { + "epoch": 2.5899960614415125, + "grad_norm": 0.8817402720451355, + "learning_rate": 9.849e-06, + "loss": 0.0578, + "step": 3289 + }, + { + "epoch": 2.590783773139031, + "grad_norm": 0.8741480708122253, + "learning_rate": 9.852000000000002e-06, + "loss": 0.0806, + "step": 3290 + }, + { + "epoch": 2.5915714848365496, + "grad_norm": 1.453019618988037, + "learning_rate": 9.855000000000001e-06, + "loss": 0.4546, + "step": 3291 + }, + { + "epoch": 2.5923591965340687, + "grad_norm": 1.1099966764450073, + "learning_rate": 9.858000000000001e-06, + "loss": 0.3475, + "step": 3292 + }, + { + "epoch": 2.5931469082315872, + "grad_norm": 1.091080904006958, + "learning_rate": 9.861e-06, + "loss": 0.2723, + "step": 3293 + }, + { + "epoch": 2.593934619929106, + "grad_norm": 1.0943639278411865, + "learning_rate": 9.863999999999999e-06, + "loss": 0.2565, + "step": 3294 + }, + { + "epoch": 2.594722331626625, + "grad_norm": 1.3921599388122559, + "learning_rate": 9.867e-06, + "loss": 0.2306, + "step": 3295 + }, + { + "epoch": 2.5955100433241434, + "grad_norm": 1.3042426109313965, + "learning_rate": 9.87e-06, + "loss": 0.2004, + "step": 3296 + }, + { + "epoch": 2.596297755021662, + "grad_norm": 0.894403338432312, + "learning_rate": 9.873e-06, + "loss": 0.1147, + "step": 3297 + }, + { + "epoch": 2.597085466719181, + "grad_norm": 0.5888485908508301, + "learning_rate": 9.876e-06, + "loss": 0.0816, + "step": 3298 + }, + { + "epoch": 2.5978731784166995, + "grad_norm": 0.911336362361908, + "learning_rate": 9.879e-06, + "loss": 0.0562, + "step": 3299 + }, + { + "epoch": 2.598660890114218, + "grad_norm": 0.6452270746231079, + "learning_rate": 9.882000000000001e-06, + "loss": 0.0604, + "step": 3300 + }, + { + "epoch": 2.599448601811737, + "grad_norm": 1.0603901147842407, + "learning_rate": 9.885000000000001e-06, + "loss": 0.0493, + "step": 3301 + }, + { + "epoch": 2.6002363135092557, + "grad_norm": 1.0922757387161255, + "learning_rate": 9.888000000000001e-06, + "loss": 0.0601, + "step": 3302 + }, + { + "epoch": 2.6010240252067742, + "grad_norm": 1.3445228338241577, + "learning_rate": 9.891e-06, + "loss": 0.1257, + "step": 3303 + }, + { + "epoch": 2.601811736904293, + "grad_norm": 0.5468413829803467, + "learning_rate": 9.894e-06, + "loss": 0.0472, + "step": 3304 + }, + { + "epoch": 2.602599448601812, + "grad_norm": 0.865929126739502, + "learning_rate": 9.897e-06, + 
"loss": 0.0814, + "step": 3305 + }, + { + "epoch": 2.6033871602993304, + "grad_norm": 1.2254453897476196, + "learning_rate": 9.9e-06, + "loss": 0.0454, + "step": 3306 + }, + { + "epoch": 2.6041748719968494, + "grad_norm": 0.8563942909240723, + "learning_rate": 9.903e-06, + "loss": 0.0649, + "step": 3307 + }, + { + "epoch": 2.604962583694368, + "grad_norm": 0.834405243396759, + "learning_rate": 9.906e-06, + "loss": 0.0371, + "step": 3308 + }, + { + "epoch": 2.6057502953918865, + "grad_norm": 0.7713406682014465, + "learning_rate": 9.909e-06, + "loss": 0.0524, + "step": 3309 + }, + { + "epoch": 2.606538007089405, + "grad_norm": 0.9873881936073303, + "learning_rate": 9.912000000000001e-06, + "loss": 0.0538, + "step": 3310 + }, + { + "epoch": 2.607325718786924, + "grad_norm": 0.5971899628639221, + "learning_rate": 9.915e-06, + "loss": 0.0522, + "step": 3311 + }, + { + "epoch": 2.6081134304844427, + "grad_norm": 1.0720466375350952, + "learning_rate": 9.918e-06, + "loss": 0.1222, + "step": 3312 + }, + { + "epoch": 2.6089011421819617, + "grad_norm": 0.7697622179985046, + "learning_rate": 9.921e-06, + "loss": 0.0563, + "step": 3313 + }, + { + "epoch": 2.6096888538794802, + "grad_norm": 1.4750694036483765, + "learning_rate": 9.924e-06, + "loss": 0.0485, + "step": 3314 + }, + { + "epoch": 2.610476565576999, + "grad_norm": 0.6177639961242676, + "learning_rate": 9.927000000000002e-06, + "loss": 0.0425, + "step": 3315 + }, + { + "epoch": 2.6112642772745174, + "grad_norm": 0.8804740905761719, + "learning_rate": 9.930000000000001e-06, + "loss": 0.0634, + "step": 3316 + }, + { + "epoch": 2.6120519889720364, + "grad_norm": 0.8840813636779785, + "learning_rate": 9.933e-06, + "loss": 0.0605, + "step": 3317 + }, + { + "epoch": 2.612839700669555, + "grad_norm": 0.707872748374939, + "learning_rate": 9.936e-06, + "loss": 0.062, + "step": 3318 + }, + { + "epoch": 2.6136274123670735, + "grad_norm": 1.035508155822754, + "learning_rate": 9.939e-06, + "loss": 0.0282, + "step": 3319 + }, + { + "epoch": 2.6144151240645925, + "grad_norm": 0.8650664687156677, + "learning_rate": 9.941999999999999e-06, + "loss": 0.0447, + "step": 3320 + }, + { + "epoch": 2.615202835762111, + "grad_norm": 1.2268298864364624, + "learning_rate": 9.945e-06, + "loss": 0.0476, + "step": 3321 + }, + { + "epoch": 2.6159905474596297, + "grad_norm": 0.6931474804878235, + "learning_rate": 9.948e-06, + "loss": 0.0485, + "step": 3322 + }, + { + "epoch": 2.6167782591571482, + "grad_norm": 0.7042391300201416, + "learning_rate": 9.951e-06, + "loss": 0.044, + "step": 3323 + }, + { + "epoch": 2.6175659708546672, + "grad_norm": 1.7097562551498413, + "learning_rate": 9.954e-06, + "loss": 0.0686, + "step": 3324 + }, + { + "epoch": 2.618353682552186, + "grad_norm": 0.9838547110557556, + "learning_rate": 9.957e-06, + "loss": 0.077, + "step": 3325 + }, + { + "epoch": 2.619141394249705, + "grad_norm": 1.225113034248352, + "learning_rate": 9.960000000000001e-06, + "loss": 0.0573, + "step": 3326 + }, + { + "epoch": 2.6199291059472234, + "grad_norm": 2.274324655532837, + "learning_rate": 9.963000000000001e-06, + "loss": 0.0492, + "step": 3327 + }, + { + "epoch": 2.620716817644742, + "grad_norm": 1.1510201692581177, + "learning_rate": 9.966e-06, + "loss": 0.0402, + "step": 3328 + }, + { + "epoch": 2.6215045293422605, + "grad_norm": 0.9810526371002197, + "learning_rate": 9.969e-06, + "loss": 0.0674, + "step": 3329 + }, + { + "epoch": 2.6222922410397795, + "grad_norm": 0.911199152469635, + "learning_rate": 9.971999999999999e-06, + "loss": 0.0438, + "step": 3330 + }, + { 
+ "epoch": 2.623079952737298, + "grad_norm": 0.7814428210258484, + "learning_rate": 9.975e-06, + "loss": 0.0577, + "step": 3331 + }, + { + "epoch": 2.623867664434817, + "grad_norm": 0.8862691521644592, + "learning_rate": 9.978e-06, + "loss": 0.0647, + "step": 3332 + }, + { + "epoch": 2.6246553761323357, + "grad_norm": 1.023545742034912, + "learning_rate": 9.981e-06, + "loss": 0.0565, + "step": 3333 + }, + { + "epoch": 2.6254430878298542, + "grad_norm": 1.3880853652954102, + "learning_rate": 9.984e-06, + "loss": 0.0705, + "step": 3334 + }, + { + "epoch": 2.626230799527373, + "grad_norm": 0.9327532649040222, + "learning_rate": 9.987e-06, + "loss": 0.0554, + "step": 3335 + }, + { + "epoch": 2.627018511224892, + "grad_norm": 0.8379401564598083, + "learning_rate": 9.990000000000001e-06, + "loss": 0.049, + "step": 3336 + }, + { + "epoch": 2.6278062229224104, + "grad_norm": 0.889546275138855, + "learning_rate": 9.993e-06, + "loss": 0.0821, + "step": 3337 + }, + { + "epoch": 2.628593934619929, + "grad_norm": 1.2955516576766968, + "learning_rate": 9.996e-06, + "loss": 0.0714, + "step": 3338 + }, + { + "epoch": 2.629381646317448, + "grad_norm": 1.0717504024505615, + "learning_rate": 9.999e-06, + "loss": 0.0891, + "step": 3339 + }, + { + "epoch": 2.6301693580149665, + "grad_norm": 0.8130077123641968, + "learning_rate": 1.0002e-05, + "loss": 0.0761, + "step": 3340 + }, + { + "epoch": 2.630957069712485, + "grad_norm": 6.007847785949707, + "learning_rate": 1.0005000000000002e-05, + "loss": 0.4972, + "step": 3341 + }, + { + "epoch": 2.6317447814100037, + "grad_norm": 3.004225969314575, + "learning_rate": 1.0008e-05, + "loss": 0.3317, + "step": 3342 + }, + { + "epoch": 2.6325324931075227, + "grad_norm": 2.0162041187286377, + "learning_rate": 1.0011e-05, + "loss": 0.3318, + "step": 3343 + }, + { + "epoch": 2.6333202048050413, + "grad_norm": 1.2296967506408691, + "learning_rate": 1.0014e-05, + "loss": 0.2012, + "step": 3344 + }, + { + "epoch": 2.6341079165025603, + "grad_norm": 1.5154330730438232, + "learning_rate": 1.0016999999999999e-05, + "loss": 0.1946, + "step": 3345 + }, + { + "epoch": 2.634895628200079, + "grad_norm": 1.2914382219314575, + "learning_rate": 1.002e-05, + "loss": 0.1579, + "step": 3346 + }, + { + "epoch": 2.6356833398975974, + "grad_norm": 6.19645357131958, + "learning_rate": 1.0023e-05, + "loss": 0.0784, + "step": 3347 + }, + { + "epoch": 2.636471051595116, + "grad_norm": 0.7964847683906555, + "learning_rate": 1.0026e-05, + "loss": 0.0749, + "step": 3348 + }, + { + "epoch": 2.637258763292635, + "grad_norm": 0.8645462989807129, + "learning_rate": 1.0029e-05, + "loss": 0.0541, + "step": 3349 + }, + { + "epoch": 2.6380464749901535, + "grad_norm": 0.8342092037200928, + "learning_rate": 1.0032e-05, + "loss": 0.0466, + "step": 3350 + }, + { + "epoch": 2.6388341866876726, + "grad_norm": 0.6656183004379272, + "learning_rate": 1.0035000000000001e-05, + "loss": 0.0445, + "step": 3351 + }, + { + "epoch": 2.639621898385191, + "grad_norm": 0.7836894989013672, + "learning_rate": 1.0038000000000001e-05, + "loss": 0.0388, + "step": 3352 + }, + { + "epoch": 2.6404096100827097, + "grad_norm": 0.7577874064445496, + "learning_rate": 1.0041000000000001e-05, + "loss": 0.0593, + "step": 3353 + }, + { + "epoch": 2.6411973217802283, + "grad_norm": 0.6995496153831482, + "learning_rate": 1.0043999999999999e-05, + "loss": 0.0391, + "step": 3354 + }, + { + "epoch": 2.6419850334777473, + "grad_norm": 1.107413411140442, + "learning_rate": 1.0046999999999999e-05, + "loss": 0.056, + "step": 3355 + }, + { + "epoch": 
2.642772745175266, + "grad_norm": 0.5534799098968506, + "learning_rate": 1.005e-05, + "loss": 0.046, + "step": 3356 + }, + { + "epoch": 2.6435604568727844, + "grad_norm": 0.9960932731628418, + "learning_rate": 1.0053e-05, + "loss": 0.0482, + "step": 3357 + }, + { + "epoch": 2.6443481685703034, + "grad_norm": 0.7379774451255798, + "learning_rate": 1.0056e-05, + "loss": 0.0493, + "step": 3358 + }, + { + "epoch": 2.645135880267822, + "grad_norm": 0.9676980376243591, + "learning_rate": 1.0059e-05, + "loss": 0.0548, + "step": 3359 + }, + { + "epoch": 2.6459235919653405, + "grad_norm": 0.7551872134208679, + "learning_rate": 1.0062e-05, + "loss": 0.0642, + "step": 3360 + }, + { + "epoch": 2.646711303662859, + "grad_norm": 0.6342753767967224, + "learning_rate": 1.0065000000000001e-05, + "loss": 0.0446, + "step": 3361 + }, + { + "epoch": 2.647499015360378, + "grad_norm": 0.6036708354949951, + "learning_rate": 1.0068e-05, + "loss": 0.0565, + "step": 3362 + }, + { + "epoch": 2.6482867270578967, + "grad_norm": 0.7688912153244019, + "learning_rate": 1.0071e-05, + "loss": 0.0542, + "step": 3363 + }, + { + "epoch": 2.6490744387554157, + "grad_norm": 0.766703188419342, + "learning_rate": 1.0074e-05, + "loss": 0.0485, + "step": 3364 + }, + { + "epoch": 2.6498621504529343, + "grad_norm": 0.7543346285820007, + "learning_rate": 1.0077e-05, + "loss": 0.0792, + "step": 3365 + }, + { + "epoch": 2.650649862150453, + "grad_norm": 0.8449914455413818, + "learning_rate": 1.008e-05, + "loss": 0.0481, + "step": 3366 + }, + { + "epoch": 2.6514375738479714, + "grad_norm": 1.6684544086456299, + "learning_rate": 1.0083e-05, + "loss": 0.0498, + "step": 3367 + }, + { + "epoch": 2.6522252855454904, + "grad_norm": 1.0284647941589355, + "learning_rate": 1.0086e-05, + "loss": 0.057, + "step": 3368 + }, + { + "epoch": 2.653012997243009, + "grad_norm": 0.7286827564239502, + "learning_rate": 1.0089e-05, + "loss": 0.0465, + "step": 3369 + }, + { + "epoch": 2.653800708940528, + "grad_norm": 0.9510892629623413, + "learning_rate": 1.0092e-05, + "loss": 0.0496, + "step": 3370 + }, + { + "epoch": 2.6545884206380466, + "grad_norm": 1.1069221496582031, + "learning_rate": 1.0095e-05, + "loss": 0.0614, + "step": 3371 + }, + { + "epoch": 2.655376132335565, + "grad_norm": 0.8072028160095215, + "learning_rate": 1.0098e-05, + "loss": 0.0451, + "step": 3372 + }, + { + "epoch": 2.6561638440330837, + "grad_norm": 0.8584066033363342, + "learning_rate": 1.0101e-05, + "loss": 0.0674, + "step": 3373 + }, + { + "epoch": 2.6569515557306027, + "grad_norm": 0.9408462643623352, + "learning_rate": 1.0104e-05, + "loss": 0.0369, + "step": 3374 + }, + { + "epoch": 2.6577392674281213, + "grad_norm": 0.7698652148246765, + "learning_rate": 1.0107e-05, + "loss": 0.0453, + "step": 3375 + }, + { + "epoch": 2.6585269791256403, + "grad_norm": 0.8686133623123169, + "learning_rate": 1.0110000000000001e-05, + "loss": 0.0387, + "step": 3376 + }, + { + "epoch": 2.659314690823159, + "grad_norm": 0.663946807384491, + "learning_rate": 1.0113000000000001e-05, + "loss": 0.0501, + "step": 3377 + }, + { + "epoch": 2.6601024025206774, + "grad_norm": 24.854026794433594, + "learning_rate": 1.0116000000000001e-05, + "loss": 0.0594, + "step": 3378 + }, + { + "epoch": 2.660890114218196, + "grad_norm": 0.8124898672103882, + "learning_rate": 1.0119e-05, + "loss": 0.0448, + "step": 3379 + }, + { + "epoch": 2.661677825915715, + "grad_norm": 1.1324762105941772, + "learning_rate": 1.0121999999999999e-05, + "loss": 0.0691, + "step": 3380 + }, + { + "epoch": 2.6624655376132336, + "grad_norm": 
0.9663019180297852, + "learning_rate": 1.0125e-05, + "loss": 0.0555, + "step": 3381 + }, + { + "epoch": 2.663253249310752, + "grad_norm": 0.8708319664001465, + "learning_rate": 1.0128e-05, + "loss": 0.0843, + "step": 3382 + }, + { + "epoch": 2.664040961008271, + "grad_norm": 0.7190468311309814, + "learning_rate": 1.0131e-05, + "loss": 0.0566, + "step": 3383 + }, + { + "epoch": 2.6648286727057897, + "grad_norm": 0.8022722601890564, + "learning_rate": 1.0134e-05, + "loss": 0.0457, + "step": 3384 + }, + { + "epoch": 2.6656163844033083, + "grad_norm": 1.0078336000442505, + "learning_rate": 1.0137e-05, + "loss": 0.0857, + "step": 3385 + }, + { + "epoch": 2.666404096100827, + "grad_norm": 1.2727857828140259, + "learning_rate": 1.0140000000000001e-05, + "loss": 0.0898, + "step": 3386 + }, + { + "epoch": 2.667191807798346, + "grad_norm": 1.6282949447631836, + "learning_rate": 1.0143000000000001e-05, + "loss": 0.0562, + "step": 3387 + }, + { + "epoch": 2.6679795194958644, + "grad_norm": 0.8061880469322205, + "learning_rate": 1.0146e-05, + "loss": 0.0477, + "step": 3388 + }, + { + "epoch": 2.6687672311933834, + "grad_norm": 1.1655447483062744, + "learning_rate": 1.0149e-05, + "loss": 0.077, + "step": 3389 + }, + { + "epoch": 2.669554942890902, + "grad_norm": 1.3137863874435425, + "learning_rate": 1.0152e-05, + "loss": 0.089, + "step": 3390 + }, + { + "epoch": 2.6703426545884206, + "grad_norm": 1.5968042612075806, + "learning_rate": 1.0155e-05, + "loss": 0.5234, + "step": 3391 + }, + { + "epoch": 2.671130366285939, + "grad_norm": 1.1141399145126343, + "learning_rate": 1.0158e-05, + "loss": 0.3429, + "step": 3392 + }, + { + "epoch": 2.671918077983458, + "grad_norm": 0.9841530919075012, + "learning_rate": 1.0161e-05, + "loss": 0.229, + "step": 3393 + }, + { + "epoch": 2.6727057896809767, + "grad_norm": 1.6923469305038452, + "learning_rate": 1.0164e-05, + "loss": 0.2405, + "step": 3394 + }, + { + "epoch": 2.6734935013784957, + "grad_norm": 1.9644912481307983, + "learning_rate": 1.0167e-05, + "loss": 0.172, + "step": 3395 + }, + { + "epoch": 2.6742812130760143, + "grad_norm": 2.0005455017089844, + "learning_rate": 1.0170000000000001e-05, + "loss": 0.1469, + "step": 3396 + }, + { + "epoch": 2.675068924773533, + "grad_norm": 1.521203875541687, + "learning_rate": 1.0173e-05, + "loss": 0.1755, + "step": 3397 + }, + { + "epoch": 2.6758566364710514, + "grad_norm": 0.876253604888916, + "learning_rate": 1.0176e-05, + "loss": 0.0869, + "step": 3398 + }, + { + "epoch": 2.6766443481685704, + "grad_norm": 0.6467610001564026, + "learning_rate": 1.0179e-05, + "loss": 0.0599, + "step": 3399 + }, + { + "epoch": 2.677432059866089, + "grad_norm": 0.6148637533187866, + "learning_rate": 1.0182e-05, + "loss": 0.0397, + "step": 3400 + }, + { + "epoch": 2.6782197715636076, + "grad_norm": 0.9379889965057373, + "learning_rate": 1.0185000000000002e-05, + "loss": 0.0721, + "step": 3401 + }, + { + "epoch": 2.6790074832611266, + "grad_norm": 0.6542046666145325, + "learning_rate": 1.0188000000000001e-05, + "loss": 0.0581, + "step": 3402 + }, + { + "epoch": 2.679795194958645, + "grad_norm": 0.8632768988609314, + "learning_rate": 1.0191e-05, + "loss": 0.0377, + "step": 3403 + }, + { + "epoch": 2.6805829066561637, + "grad_norm": 0.7022622227668762, + "learning_rate": 1.0194e-05, + "loss": 0.1127, + "step": 3404 + }, + { + "epoch": 2.6813706183536823, + "grad_norm": 0.9020134210586548, + "learning_rate": 1.0196999999999999e-05, + "loss": 0.0447, + "step": 3405 + }, + { + "epoch": 2.6821583300512013, + "grad_norm": 0.5379353761672974, + 
"learning_rate": 1.02e-05, + "loss": 0.0316, + "step": 3406 + }, + { + "epoch": 2.68294604174872, + "grad_norm": 0.6946240067481995, + "learning_rate": 1.0203e-05, + "loss": 0.0345, + "step": 3407 + }, + { + "epoch": 2.683733753446239, + "grad_norm": 0.8468925356864929, + "learning_rate": 1.0206e-05, + "loss": 0.0327, + "step": 3408 + }, + { + "epoch": 2.6845214651437574, + "grad_norm": 1.0652180910110474, + "learning_rate": 1.0209e-05, + "loss": 0.0551, + "step": 3409 + }, + { + "epoch": 2.685309176841276, + "grad_norm": 0.6694315671920776, + "learning_rate": 1.0212e-05, + "loss": 0.0663, + "step": 3410 + }, + { + "epoch": 2.6860968885387946, + "grad_norm": 0.8048431277275085, + "learning_rate": 1.0215000000000001e-05, + "loss": 0.0504, + "step": 3411 + }, + { + "epoch": 2.6868846002363136, + "grad_norm": 0.8454904556274414, + "learning_rate": 1.0218000000000001e-05, + "loss": 0.0348, + "step": 3412 + }, + { + "epoch": 2.687672311933832, + "grad_norm": 0.7000889778137207, + "learning_rate": 1.0221000000000001e-05, + "loss": 0.0467, + "step": 3413 + }, + { + "epoch": 2.688460023631351, + "grad_norm": 0.6050423979759216, + "learning_rate": 1.0224e-05, + "loss": 0.0531, + "step": 3414 + }, + { + "epoch": 2.6892477353288697, + "grad_norm": 0.4866451025009155, + "learning_rate": 1.0227e-05, + "loss": 0.0376, + "step": 3415 + }, + { + "epoch": 2.6900354470263883, + "grad_norm": 0.7995560169219971, + "learning_rate": 1.023e-05, + "loss": 0.0728, + "step": 3416 + }, + { + "epoch": 2.690823158723907, + "grad_norm": 0.6483689546585083, + "learning_rate": 1.0233e-05, + "loss": 0.0397, + "step": 3417 + }, + { + "epoch": 2.691610870421426, + "grad_norm": 0.8275576829910278, + "learning_rate": 1.0236e-05, + "loss": 0.0468, + "step": 3418 + }, + { + "epoch": 2.6923985821189445, + "grad_norm": 1.0921387672424316, + "learning_rate": 1.0239e-05, + "loss": 0.0505, + "step": 3419 + }, + { + "epoch": 2.693186293816463, + "grad_norm": 0.664218544960022, + "learning_rate": 1.0242e-05, + "loss": 0.0448, + "step": 3420 + }, + { + "epoch": 2.693974005513982, + "grad_norm": 0.7297108173370361, + "learning_rate": 1.0245000000000001e-05, + "loss": 0.0351, + "step": 3421 + }, + { + "epoch": 2.6947617172115006, + "grad_norm": 0.5755402445793152, + "learning_rate": 1.0248e-05, + "loss": 0.0313, + "step": 3422 + }, + { + "epoch": 2.695549428909019, + "grad_norm": 0.8394693732261658, + "learning_rate": 1.0251e-05, + "loss": 0.0494, + "step": 3423 + }, + { + "epoch": 2.6963371406065377, + "grad_norm": 0.8139951229095459, + "learning_rate": 1.0254e-05, + "loss": 0.047, + "step": 3424 + }, + { + "epoch": 2.6971248523040567, + "grad_norm": 0.8393800258636475, + "learning_rate": 1.0257e-05, + "loss": 0.0521, + "step": 3425 + }, + { + "epoch": 2.6979125640015753, + "grad_norm": 0.9040412902832031, + "learning_rate": 1.0260000000000002e-05, + "loss": 0.0511, + "step": 3426 + }, + { + "epoch": 2.6987002756990943, + "grad_norm": 0.8374390602111816, + "learning_rate": 1.0263000000000002e-05, + "loss": 0.0444, + "step": 3427 + }, + { + "epoch": 2.699487987396613, + "grad_norm": 0.9384106397628784, + "learning_rate": 1.0266e-05, + "loss": 0.0579, + "step": 3428 + }, + { + "epoch": 2.7002756990941315, + "grad_norm": 0.6834551095962524, + "learning_rate": 1.0269e-05, + "loss": 0.0378, + "step": 3429 + }, + { + "epoch": 2.70106341079165, + "grad_norm": 0.6966255903244019, + "learning_rate": 1.0272e-05, + "loss": 0.0514, + "step": 3430 + }, + { + "epoch": 2.701851122489169, + "grad_norm": 0.6415438652038574, + "learning_rate": 
1.0275e-05, + "loss": 0.0337, + "step": 3431 + }, + { + "epoch": 2.7026388341866876, + "grad_norm": 0.9720473289489746, + "learning_rate": 1.0278e-05, + "loss": 0.054, + "step": 3432 + }, + { + "epoch": 2.7034265458842066, + "grad_norm": 1.4357080459594727, + "learning_rate": 1.0281e-05, + "loss": 0.0717, + "step": 3433 + }, + { + "epoch": 2.704214257581725, + "grad_norm": 0.9656221866607666, + "learning_rate": 1.0284e-05, + "loss": 0.0363, + "step": 3434 + }, + { + "epoch": 2.7050019692792437, + "grad_norm": 0.9809225797653198, + "learning_rate": 1.0287e-05, + "loss": 0.0647, + "step": 3435 + }, + { + "epoch": 2.7057896809767623, + "grad_norm": 0.8821759223937988, + "learning_rate": 1.0290000000000001e-05, + "loss": 0.066, + "step": 3436 + }, + { + "epoch": 2.7065773926742813, + "grad_norm": 0.7881132960319519, + "learning_rate": 1.0293000000000001e-05, + "loss": 0.0436, + "step": 3437 + }, + { + "epoch": 2.7073651043718, + "grad_norm": 0.5701141357421875, + "learning_rate": 1.0296000000000001e-05, + "loss": 0.0518, + "step": 3438 + }, + { + "epoch": 2.708152816069319, + "grad_norm": 0.9060848951339722, + "learning_rate": 1.0299e-05, + "loss": 0.0695, + "step": 3439 + }, + { + "epoch": 2.7089405277668375, + "grad_norm": 1.65829598903656, + "learning_rate": 1.0301999999999999e-05, + "loss": 0.0907, + "step": 3440 + }, + { + "epoch": 2.709728239464356, + "grad_norm": 1.9302226305007935, + "learning_rate": 1.0305e-05, + "loss": 0.4445, + "step": 3441 + }, + { + "epoch": 2.7105159511618746, + "grad_norm": 1.1166019439697266, + "learning_rate": 1.0308e-05, + "loss": 0.3839, + "step": 3442 + }, + { + "epoch": 2.7113036628593936, + "grad_norm": 1.0213555097579956, + "learning_rate": 1.0311e-05, + "loss": 0.2591, + "step": 3443 + }, + { + "epoch": 2.712091374556912, + "grad_norm": 1.0834143161773682, + "learning_rate": 1.0314e-05, + "loss": 0.2806, + "step": 3444 + }, + { + "epoch": 2.7128790862544307, + "grad_norm": 1.4031925201416016, + "learning_rate": 1.0317e-05, + "loss": 0.1509, + "step": 3445 + }, + { + "epoch": 2.7136667979519498, + "grad_norm": 0.6303246021270752, + "learning_rate": 1.032e-05, + "loss": 0.1019, + "step": 3446 + }, + { + "epoch": 2.7144545096494683, + "grad_norm": 0.8426051139831543, + "learning_rate": 1.0323000000000001e-05, + "loss": 0.1119, + "step": 3447 + }, + { + "epoch": 2.715242221346987, + "grad_norm": 0.7119936943054199, + "learning_rate": 1.0326e-05, + "loss": 0.12, + "step": 3448 + }, + { + "epoch": 2.7160299330445055, + "grad_norm": 1.0293490886688232, + "learning_rate": 1.0329e-05, + "loss": 0.0668, + "step": 3449 + }, + { + "epoch": 2.7168176447420245, + "grad_norm": 2.012956380844116, + "learning_rate": 1.0332e-05, + "loss": 0.0465, + "step": 3450 + }, + { + "epoch": 2.717605356439543, + "grad_norm": 0.9643163084983826, + "learning_rate": 1.0335e-05, + "loss": 0.07, + "step": 3451 + }, + { + "epoch": 2.718393068137062, + "grad_norm": 0.6503215432167053, + "learning_rate": 1.0338e-05, + "loss": 0.057, + "step": 3452 + }, + { + "epoch": 2.7191807798345806, + "grad_norm": 0.9603964686393738, + "learning_rate": 1.0341e-05, + "loss": 0.1076, + "step": 3453 + }, + { + "epoch": 2.719968491532099, + "grad_norm": 0.6902780532836914, + "learning_rate": 1.0344e-05, + "loss": 0.0474, + "step": 3454 + }, + { + "epoch": 2.7207562032296178, + "grad_norm": 0.6046273708343506, + "learning_rate": 1.0347e-05, + "loss": 0.0461, + "step": 3455 + }, + { + "epoch": 2.7215439149271368, + "grad_norm": 1.267113447189331, + "learning_rate": 1.035e-05, + "loss": 0.0452, + "step": 
3456 + }, + { + "epoch": 2.7223316266246553, + "grad_norm": 0.638418436050415, + "learning_rate": 1.0353e-05, + "loss": 0.0593, + "step": 3457 + }, + { + "epoch": 2.7231193383221743, + "grad_norm": 0.6008505821228027, + "learning_rate": 1.0356e-05, + "loss": 0.0506, + "step": 3458 + }, + { + "epoch": 2.723907050019693, + "grad_norm": 0.49217355251312256, + "learning_rate": 1.0359e-05, + "loss": 0.0395, + "step": 3459 + }, + { + "epoch": 2.7246947617172115, + "grad_norm": 0.6710898876190186, + "learning_rate": 1.0362e-05, + "loss": 0.0472, + "step": 3460 + }, + { + "epoch": 2.72548247341473, + "grad_norm": 0.836137056350708, + "learning_rate": 1.0365e-05, + "loss": 0.0503, + "step": 3461 + }, + { + "epoch": 2.726270185112249, + "grad_norm": 0.7355411052703857, + "learning_rate": 1.0368000000000001e-05, + "loss": 0.0538, + "step": 3462 + }, + { + "epoch": 2.7270578968097676, + "grad_norm": 1.0206594467163086, + "learning_rate": 1.0371000000000001e-05, + "loss": 0.0506, + "step": 3463 + }, + { + "epoch": 2.727845608507286, + "grad_norm": 0.9039499759674072, + "learning_rate": 1.0374000000000001e-05, + "loss": 0.0556, + "step": 3464 + }, + { + "epoch": 2.728633320204805, + "grad_norm": 0.6972000598907471, + "learning_rate": 1.0376999999999999e-05, + "loss": 0.0595, + "step": 3465 + }, + { + "epoch": 2.7294210319023238, + "grad_norm": 0.6322767734527588, + "learning_rate": 1.0379999999999999e-05, + "loss": 0.0469, + "step": 3466 + }, + { + "epoch": 2.7302087435998423, + "grad_norm": 0.8511699438095093, + "learning_rate": 1.0383e-05, + "loss": 0.0367, + "step": 3467 + }, + { + "epoch": 2.730996455297361, + "grad_norm": 0.7741568088531494, + "learning_rate": 1.0386e-05, + "loss": 0.0522, + "step": 3468 + }, + { + "epoch": 2.73178416699488, + "grad_norm": 0.8266224265098572, + "learning_rate": 1.0389e-05, + "loss": 0.0511, + "step": 3469 + }, + { + "epoch": 2.7325718786923985, + "grad_norm": 0.7434104681015015, + "learning_rate": 1.0392e-05, + "loss": 0.0644, + "step": 3470 + }, + { + "epoch": 2.7333595903899175, + "grad_norm": 1.1103456020355225, + "learning_rate": 1.0395e-05, + "loss": 0.0731, + "step": 3471 + }, + { + "epoch": 2.734147302087436, + "grad_norm": 1.2941734790802002, + "learning_rate": 1.0398000000000001e-05, + "loss": 0.0635, + "step": 3472 + }, + { + "epoch": 2.7349350137849546, + "grad_norm": 0.7773082852363586, + "learning_rate": 1.0401000000000001e-05, + "loss": 0.0561, + "step": 3473 + }, + { + "epoch": 2.735722725482473, + "grad_norm": 0.7059159874916077, + "learning_rate": 1.0404e-05, + "loss": 0.0431, + "step": 3474 + }, + { + "epoch": 2.736510437179992, + "grad_norm": 3.839061737060547, + "learning_rate": 1.0407e-05, + "loss": 0.0669, + "step": 3475 + }, + { + "epoch": 2.7372981488775108, + "grad_norm": 0.7671451568603516, + "learning_rate": 1.041e-05, + "loss": 0.046, + "step": 3476 + }, + { + "epoch": 2.73808586057503, + "grad_norm": 0.9379280805587769, + "learning_rate": 1.0413e-05, + "loss": 0.0538, + "step": 3477 + }, + { + "epoch": 2.7388735722725484, + "grad_norm": 0.8830292820930481, + "learning_rate": 1.0416e-05, + "loss": 0.0538, + "step": 3478 + }, + { + "epoch": 2.739661283970067, + "grad_norm": 1.0949389934539795, + "learning_rate": 1.0419e-05, + "loss": 0.0644, + "step": 3479 + }, + { + "epoch": 2.7404489956675855, + "grad_norm": 1.2394987344741821, + "learning_rate": 1.0422e-05, + "loss": 0.0393, + "step": 3480 + }, + { + "epoch": 2.7412367073651045, + "grad_norm": 0.6929782032966614, + "learning_rate": 1.0425e-05, + "loss": 0.0446, + "step": 3481 + }, + { 
+ "epoch": 2.742024419062623, + "grad_norm": 1.1009178161621094, + "learning_rate": 1.0428e-05, + "loss": 0.0795, + "step": 3482 + }, + { + "epoch": 2.7428121307601416, + "grad_norm": 0.8217458724975586, + "learning_rate": 1.0431e-05, + "loss": 0.0477, + "step": 3483 + }, + { + "epoch": 2.7435998424576606, + "grad_norm": 0.9929971694946289, + "learning_rate": 1.0434e-05, + "loss": 0.0634, + "step": 3484 + }, + { + "epoch": 2.744387554155179, + "grad_norm": 1.2469369173049927, + "learning_rate": 1.0437e-05, + "loss": 0.0445, + "step": 3485 + }, + { + "epoch": 2.7451752658526978, + "grad_norm": 0.9315780997276306, + "learning_rate": 1.044e-05, + "loss": 0.0657, + "step": 3486 + }, + { + "epoch": 2.7459629775502163, + "grad_norm": 1.3086861371994019, + "learning_rate": 1.0443000000000001e-05, + "loss": 0.0712, + "step": 3487 + }, + { + "epoch": 2.7467506892477354, + "grad_norm": 1.4419200420379639, + "learning_rate": 1.0446000000000001e-05, + "loss": 0.0809, + "step": 3488 + }, + { + "epoch": 2.747538400945254, + "grad_norm": 0.863852858543396, + "learning_rate": 1.0449e-05, + "loss": 0.0735, + "step": 3489 + }, + { + "epoch": 2.748326112642773, + "grad_norm": 1.1002132892608643, + "learning_rate": 1.0452e-05, + "loss": 0.0706, + "step": 3490 + }, + { + "epoch": 2.7491138243402915, + "grad_norm": 1.6314409971237183, + "learning_rate": 1.0454999999999999e-05, + "loss": 0.4034, + "step": 3491 + }, + { + "epoch": 2.74990153603781, + "grad_norm": 0.9085620641708374, + "learning_rate": 1.0458e-05, + "loss": 0.2903, + "step": 3492 + }, + { + "epoch": 2.7506892477353286, + "grad_norm": 1.1604522466659546, + "learning_rate": 1.0461e-05, + "loss": 0.2679, + "step": 3493 + }, + { + "epoch": 2.7514769594328476, + "grad_norm": 1.807093858718872, + "learning_rate": 1.0464e-05, + "loss": 0.2972, + "step": 3494 + }, + { + "epoch": 2.752264671130366, + "grad_norm": 0.9378382563591003, + "learning_rate": 1.0467e-05, + "loss": 0.1682, + "step": 3495 + }, + { + "epoch": 2.7530523828278852, + "grad_norm": 0.759148359298706, + "learning_rate": 1.047e-05, + "loss": 0.1119, + "step": 3496 + }, + { + "epoch": 2.753840094525404, + "grad_norm": 0.8327105045318604, + "learning_rate": 1.0473000000000001e-05, + "loss": 0.0988, + "step": 3497 + }, + { + "epoch": 2.7546278062229224, + "grad_norm": 0.5978240370750427, + "learning_rate": 1.0476000000000001e-05, + "loss": 0.0781, + "step": 3498 + }, + { + "epoch": 2.755415517920441, + "grad_norm": 1.9759199619293213, + "learning_rate": 1.0479e-05, + "loss": 0.1348, + "step": 3499 + }, + { + "epoch": 2.75620322961796, + "grad_norm": 0.5174894332885742, + "learning_rate": 1.0482e-05, + "loss": 0.0527, + "step": 3500 + }, + { + "epoch": 2.7569909413154785, + "grad_norm": 0.5812801718711853, + "learning_rate": 1.0485e-05, + "loss": 0.0449, + "step": 3501 + }, + { + "epoch": 2.7577786530129975, + "grad_norm": 0.7663119435310364, + "learning_rate": 1.0488e-05, + "loss": 0.057, + "step": 3502 + }, + { + "epoch": 2.758566364710516, + "grad_norm": 0.5529369711875916, + "learning_rate": 1.0491e-05, + "loss": 0.0535, + "step": 3503 + }, + { + "epoch": 2.7593540764080347, + "grad_norm": 0.5430825352668762, + "learning_rate": 1.0494e-05, + "loss": 0.0445, + "step": 3504 + }, + { + "epoch": 2.760141788105553, + "grad_norm": 0.6203799843788147, + "learning_rate": 1.0497e-05, + "loss": 0.0364, + "step": 3505 + }, + { + "epoch": 2.7609294998030722, + "grad_norm": 0.8333918452262878, + "learning_rate": 1.05e-05, + "loss": 0.0409, + "step": 3506 + }, + { + "epoch": 2.761717211500591, + 
"grad_norm": 0.6302753686904907, + "learning_rate": 1.0503000000000001e-05, + "loss": 0.0354, + "step": 3507 + }, + { + "epoch": 2.7625049231981094, + "grad_norm": 0.7088421583175659, + "learning_rate": 1.0506e-05, + "loss": 0.0385, + "step": 3508 + }, + { + "epoch": 2.7632926348956284, + "grad_norm": 0.5905536413192749, + "learning_rate": 1.0509e-05, + "loss": 0.0555, + "step": 3509 + }, + { + "epoch": 2.764080346593147, + "grad_norm": 0.5228442549705505, + "learning_rate": 1.0512e-05, + "loss": 0.0391, + "step": 3510 + }, + { + "epoch": 2.7648680582906655, + "grad_norm": 0.5111157298088074, + "learning_rate": 1.0515e-05, + "loss": 0.0315, + "step": 3511 + }, + { + "epoch": 2.765655769988184, + "grad_norm": 0.8303258419036865, + "learning_rate": 1.0518000000000002e-05, + "loss": 0.0512, + "step": 3512 + }, + { + "epoch": 2.766443481685703, + "grad_norm": 0.7664098739624023, + "learning_rate": 1.0521000000000001e-05, + "loss": 0.0425, + "step": 3513 + }, + { + "epoch": 2.7672311933832217, + "grad_norm": 0.9943049550056458, + "learning_rate": 1.0524e-05, + "loss": 0.0602, + "step": 3514 + }, + { + "epoch": 2.7680189050807407, + "grad_norm": 0.4806714653968811, + "learning_rate": 1.0527e-05, + "loss": 0.0294, + "step": 3515 + }, + { + "epoch": 2.7688066167782592, + "grad_norm": 0.674990177154541, + "learning_rate": 1.0529999999999999e-05, + "loss": 0.0543, + "step": 3516 + }, + { + "epoch": 2.769594328475778, + "grad_norm": 0.7520889639854431, + "learning_rate": 1.0533e-05, + "loss": 0.0586, + "step": 3517 + }, + { + "epoch": 2.7703820401732964, + "grad_norm": 0.8609735369682312, + "learning_rate": 1.0536e-05, + "loss": 0.0351, + "step": 3518 + }, + { + "epoch": 2.7711697518708154, + "grad_norm": 0.7567917108535767, + "learning_rate": 1.0539e-05, + "loss": 0.0496, + "step": 3519 + }, + { + "epoch": 2.771957463568334, + "grad_norm": 0.8650139570236206, + "learning_rate": 1.0542e-05, + "loss": 0.0432, + "step": 3520 + }, + { + "epoch": 2.772745175265853, + "grad_norm": 0.7039094567298889, + "learning_rate": 1.0545e-05, + "loss": 0.0405, + "step": 3521 + }, + { + "epoch": 2.7735328869633715, + "grad_norm": 1.0714330673217773, + "learning_rate": 1.0548000000000001e-05, + "loss": 0.0611, + "step": 3522 + }, + { + "epoch": 2.77432059866089, + "grad_norm": 0.6939718723297119, + "learning_rate": 1.0551000000000001e-05, + "loss": 0.0468, + "step": 3523 + }, + { + "epoch": 2.7751083103584087, + "grad_norm": 0.6754922270774841, + "learning_rate": 1.0554000000000001e-05, + "loss": 0.0534, + "step": 3524 + }, + { + "epoch": 2.7758960220559277, + "grad_norm": 0.8597479462623596, + "learning_rate": 1.0557e-05, + "loss": 0.0676, + "step": 3525 + }, + { + "epoch": 2.7766837337534462, + "grad_norm": 0.6142721176147461, + "learning_rate": 1.0559999999999999e-05, + "loss": 0.0435, + "step": 3526 + }, + { + "epoch": 2.777471445450965, + "grad_norm": 0.9322033524513245, + "learning_rate": 1.0563e-05, + "loss": 0.0444, + "step": 3527 + }, + { + "epoch": 2.778259157148484, + "grad_norm": 0.7122319936752319, + "learning_rate": 1.0566e-05, + "loss": 0.0526, + "step": 3528 + }, + { + "epoch": 2.7790468688460024, + "grad_norm": 1.5143001079559326, + "learning_rate": 1.0569e-05, + "loss": 0.0795, + "step": 3529 + }, + { + "epoch": 2.779834580543521, + "grad_norm": 1.077987551689148, + "learning_rate": 1.0572e-05, + "loss": 0.0682, + "step": 3530 + }, + { + "epoch": 2.7806222922410395, + "grad_norm": 0.7651817202568054, + "learning_rate": 1.0575e-05, + "loss": 0.0578, + "step": 3531 + }, + { + "epoch": 
2.7814100039385585, + "grad_norm": 0.8242251873016357, + "learning_rate": 1.0578000000000001e-05, + "loss": 0.078, + "step": 3532 + }, + { + "epoch": 2.782197715636077, + "grad_norm": 0.8540482521057129, + "learning_rate": 1.0581e-05, + "loss": 0.0474, + "step": 3533 + }, + { + "epoch": 2.782985427333596, + "grad_norm": 0.6916796565055847, + "learning_rate": 1.0584e-05, + "loss": 0.0503, + "step": 3534 + }, + { + "epoch": 2.7837731390311147, + "grad_norm": 1.2476695775985718, + "learning_rate": 1.0587e-05, + "loss": 0.0612, + "step": 3535 + }, + { + "epoch": 2.7845608507286332, + "grad_norm": 1.1510001420974731, + "learning_rate": 1.059e-05, + "loss": 0.1156, + "step": 3536 + }, + { + "epoch": 2.785348562426152, + "grad_norm": 0.8708344101905823, + "learning_rate": 1.0593000000000002e-05, + "loss": 0.0511, + "step": 3537 + }, + { + "epoch": 2.786136274123671, + "grad_norm": 1.0303641557693481, + "learning_rate": 1.0596e-05, + "loss": 0.0593, + "step": 3538 + }, + { + "epoch": 2.7869239858211894, + "grad_norm": 0.7878034114837646, + "learning_rate": 1.0599e-05, + "loss": 0.0641, + "step": 3539 + }, + { + "epoch": 2.7877116975187084, + "grad_norm": 0.9050739407539368, + "learning_rate": 1.0602e-05, + "loss": 0.0865, + "step": 3540 + }, + { + "epoch": 2.788499409216227, + "grad_norm": 1.5895326137542725, + "learning_rate": 1.0605e-05, + "loss": 0.3669, + "step": 3541 + }, + { + "epoch": 2.7892871209137455, + "grad_norm": 1.719907283782959, + "learning_rate": 1.0608e-05, + "loss": 0.5019, + "step": 3542 + }, + { + "epoch": 2.790074832611264, + "grad_norm": 1.0588042736053467, + "learning_rate": 1.0611e-05, + "loss": 0.286, + "step": 3543 + }, + { + "epoch": 2.790862544308783, + "grad_norm": 1.337920904159546, + "learning_rate": 1.0614e-05, + "loss": 0.2794, + "step": 3544 + }, + { + "epoch": 2.7916502560063017, + "grad_norm": 1.2683900594711304, + "learning_rate": 1.0617e-05, + "loss": 0.1987, + "step": 3545 + }, + { + "epoch": 2.7924379677038202, + "grad_norm": 0.8680461645126343, + "learning_rate": 1.062e-05, + "loss": 0.1231, + "step": 3546 + }, + { + "epoch": 2.7932256794013393, + "grad_norm": 0.6550981402397156, + "learning_rate": 1.0623000000000001e-05, + "loss": 0.0638, + "step": 3547 + }, + { + "epoch": 2.794013391098858, + "grad_norm": 0.8711990714073181, + "learning_rate": 1.0626000000000001e-05, + "loss": 0.0782, + "step": 3548 + }, + { + "epoch": 2.7948011027963764, + "grad_norm": 0.9834581613540649, + "learning_rate": 1.0629000000000001e-05, + "loss": 0.0851, + "step": 3549 + }, + { + "epoch": 2.795588814493895, + "grad_norm": 0.6198612451553345, + "learning_rate": 1.0632000000000001e-05, + "loss": 0.0636, + "step": 3550 + }, + { + "epoch": 2.796376526191414, + "grad_norm": 0.7066558599472046, + "learning_rate": 1.0634999999999999e-05, + "loss": 0.0467, + "step": 3551 + }, + { + "epoch": 2.7971642378889325, + "grad_norm": 1.588624358177185, + "learning_rate": 1.0638e-05, + "loss": 0.0497, + "step": 3552 + }, + { + "epoch": 2.7979519495864515, + "grad_norm": 0.758544385433197, + "learning_rate": 1.0641e-05, + "loss": 0.0415, + "step": 3553 + }, + { + "epoch": 2.79873966128397, + "grad_norm": 0.7105105519294739, + "learning_rate": 1.0644e-05, + "loss": 0.042, + "step": 3554 + }, + { + "epoch": 2.7995273729814887, + "grad_norm": 0.5774279236793518, + "learning_rate": 1.0647e-05, + "loss": 0.0285, + "step": 3555 + }, + { + "epoch": 2.8003150846790072, + "grad_norm": 0.7145833969116211, + "learning_rate": 1.065e-05, + "loss": 0.0345, + "step": 3556 + }, + { + "epoch": 
2.8011027963765263, + "grad_norm": 0.9378234148025513, + "learning_rate": 1.0653000000000001e-05, + "loss": 0.0713, + "step": 3557 + }, + { + "epoch": 2.801890508074045, + "grad_norm": 0.8468196392059326, + "learning_rate": 1.0656000000000001e-05, + "loss": 0.0376, + "step": 3558 + }, + { + "epoch": 2.802678219771564, + "grad_norm": 0.9001160860061646, + "learning_rate": 1.0659e-05, + "loss": 0.057, + "step": 3559 + }, + { + "epoch": 2.8034659314690824, + "grad_norm": 0.6725720763206482, + "learning_rate": 1.0662e-05, + "loss": 0.0459, + "step": 3560 + }, + { + "epoch": 2.804253643166601, + "grad_norm": 0.9972389936447144, + "learning_rate": 1.0665e-05, + "loss": 0.0472, + "step": 3561 + }, + { + "epoch": 2.8050413548641195, + "grad_norm": 0.7293881773948669, + "learning_rate": 1.0668000000000002e-05, + "loss": 0.0666, + "step": 3562 + }, + { + "epoch": 2.8058290665616386, + "grad_norm": 0.8790149688720703, + "learning_rate": 1.0671e-05, + "loss": 0.0643, + "step": 3563 + }, + { + "epoch": 2.806616778259157, + "grad_norm": 3.5484509468078613, + "learning_rate": 1.0674e-05, + "loss": 0.1158, + "step": 3564 + }, + { + "epoch": 2.807404489956676, + "grad_norm": 0.7707997560501099, + "learning_rate": 1.0677e-05, + "loss": 0.0388, + "step": 3565 + }, + { + "epoch": 2.8081922016541947, + "grad_norm": 0.9691988229751587, + "learning_rate": 1.068e-05, + "loss": 0.0963, + "step": 3566 + }, + { + "epoch": 2.8089799133517133, + "grad_norm": 0.7075831890106201, + "learning_rate": 1.0683000000000001e-05, + "loss": 0.0421, + "step": 3567 + }, + { + "epoch": 2.809767625049232, + "grad_norm": 0.5678495168685913, + "learning_rate": 1.0686e-05, + "loss": 0.0368, + "step": 3568 + }, + { + "epoch": 2.810555336746751, + "grad_norm": 0.7075474858283997, + "learning_rate": 1.0689e-05, + "loss": 0.0573, + "step": 3569 + }, + { + "epoch": 2.8113430484442694, + "grad_norm": 0.9349903464317322, + "learning_rate": 1.0692e-05, + "loss": 0.076, + "step": 3570 + }, + { + "epoch": 2.812130760141788, + "grad_norm": 1.4977939128875732, + "learning_rate": 1.0695e-05, + "loss": 0.0631, + "step": 3571 + }, + { + "epoch": 2.812918471839307, + "grad_norm": 0.9488173723220825, + "learning_rate": 1.0698e-05, + "loss": 0.0397, + "step": 3572 + }, + { + "epoch": 2.8137061835368256, + "grad_norm": 0.7911178469657898, + "learning_rate": 1.0701000000000001e-05, + "loss": 0.0424, + "step": 3573 + }, + { + "epoch": 2.814493895234344, + "grad_norm": 0.8640322089195251, + "learning_rate": 1.0704000000000001e-05, + "loss": 0.0409, + "step": 3574 + }, + { + "epoch": 2.8152816069318627, + "grad_norm": 0.7399913668632507, + "learning_rate": 1.0707e-05, + "loss": 0.0522, + "step": 3575 + }, + { + "epoch": 2.8160693186293817, + "grad_norm": 0.7528223991394043, + "learning_rate": 1.0709999999999999e-05, + "loss": 0.0568, + "step": 3576 + }, + { + "epoch": 2.8168570303269003, + "grad_norm": 0.8680115938186646, + "learning_rate": 1.0712999999999999e-05, + "loss": 0.053, + "step": 3577 + }, + { + "epoch": 2.8176447420244193, + "grad_norm": 0.6392773389816284, + "learning_rate": 1.0716e-05, + "loss": 0.0421, + "step": 3578 + }, + { + "epoch": 2.818432453721938, + "grad_norm": 0.9025692343711853, + "learning_rate": 1.0719e-05, + "loss": 0.0673, + "step": 3579 + }, + { + "epoch": 2.8192201654194564, + "grad_norm": 1.4268642663955688, + "learning_rate": 1.0722e-05, + "loss": 0.0688, + "step": 3580 + }, + { + "epoch": 2.820007877116975, + "grad_norm": 1.1900739669799805, + "learning_rate": 1.0725e-05, + "loss": 0.063, + "step": 3581 + }, + { + "epoch": 
2.820795588814494, + "grad_norm": 0.7884998321533203, + "learning_rate": 1.0728e-05, + "loss": 0.0554, + "step": 3582 + }, + { + "epoch": 2.8215833005120126, + "grad_norm": 1.75670325756073, + "learning_rate": 1.0731000000000001e-05, + "loss": 0.0773, + "step": 3583 + }, + { + "epoch": 2.8223710122095316, + "grad_norm": 1.2618552446365356, + "learning_rate": 1.0734000000000001e-05, + "loss": 0.0666, + "step": 3584 + }, + { + "epoch": 2.82315872390705, + "grad_norm": 1.04254949092865, + "learning_rate": 1.0737e-05, + "loss": 0.0729, + "step": 3585 + }, + { + "epoch": 2.8239464356045687, + "grad_norm": 0.7886636853218079, + "learning_rate": 1.074e-05, + "loss": 0.0546, + "step": 3586 + }, + { + "epoch": 2.8247341473020873, + "grad_norm": 0.6988055109977722, + "learning_rate": 1.0743e-05, + "loss": 0.0441, + "step": 3587 + }, + { + "epoch": 2.8255218589996063, + "grad_norm": 1.2442854642868042, + "learning_rate": 1.0746e-05, + "loss": 0.0487, + "step": 3588 + }, + { + "epoch": 2.826309570697125, + "grad_norm": 1.2364819049835205, + "learning_rate": 1.0749e-05, + "loss": 0.094, + "step": 3589 + }, + { + "epoch": 2.8270972823946434, + "grad_norm": 1.1242170333862305, + "learning_rate": 1.0752e-05, + "loss": 0.112, + "step": 3590 + }, + { + "epoch": 2.8278849940921624, + "grad_norm": 2.937713623046875, + "learning_rate": 1.0755e-05, + "loss": 0.4378, + "step": 3591 + }, + { + "epoch": 2.828672705789681, + "grad_norm": 1.7272197008132935, + "learning_rate": 1.0758e-05, + "loss": 0.2949, + "step": 3592 + }, + { + "epoch": 2.8294604174871996, + "grad_norm": 1.108683466911316, + "learning_rate": 1.0761e-05, + "loss": 0.2739, + "step": 3593 + }, + { + "epoch": 2.830248129184718, + "grad_norm": 1.1056160926818848, + "learning_rate": 1.0764e-05, + "loss": 0.2635, + "step": 3594 + }, + { + "epoch": 2.831035840882237, + "grad_norm": 1.6632988452911377, + "learning_rate": 1.0767e-05, + "loss": 0.2028, + "step": 3595 + }, + { + "epoch": 2.8318235525797557, + "grad_norm": 0.8882690072059631, + "learning_rate": 1.077e-05, + "loss": 0.132, + "step": 3596 + }, + { + "epoch": 2.8326112642772747, + "grad_norm": 0.9419432282447815, + "learning_rate": 1.0773e-05, + "loss": 0.1049, + "step": 3597 + }, + { + "epoch": 2.8333989759747933, + "grad_norm": 0.8126984238624573, + "learning_rate": 1.0776000000000002e-05, + "loss": 0.1034, + "step": 3598 + }, + { + "epoch": 2.834186687672312, + "grad_norm": 1.85343337059021, + "learning_rate": 1.0779000000000001e-05, + "loss": 0.0687, + "step": 3599 + }, + { + "epoch": 2.8349743993698304, + "grad_norm": 0.6269285678863525, + "learning_rate": 1.0782e-05, + "loss": 0.0588, + "step": 3600 + }, + { + "epoch": 2.8357621110673494, + "grad_norm": 0.7239330410957336, + "learning_rate": 1.0785e-05, + "loss": 0.0562, + "step": 3601 + }, + { + "epoch": 2.836549822764868, + "grad_norm": 0.44406837224960327, + "learning_rate": 1.0787999999999999e-05, + "loss": 0.0455, + "step": 3602 + }, + { + "epoch": 2.837337534462387, + "grad_norm": 1.2945556640625, + "learning_rate": 1.0791e-05, + "loss": 0.063, + "step": 3603 + }, + { + "epoch": 2.8381252461599056, + "grad_norm": 0.8343498110771179, + "learning_rate": 1.0794e-05, + "loss": 0.0421, + "step": 3604 + }, + { + "epoch": 2.838912957857424, + "grad_norm": 0.49050667881965637, + "learning_rate": 1.0797e-05, + "loss": 0.0475, + "step": 3605 + }, + { + "epoch": 2.8397006695549427, + "grad_norm": 0.6495568752288818, + "learning_rate": 1.08e-05, + "loss": 0.0387, + "step": 3606 + }, + { + "epoch": 2.8404883812524617, + "grad_norm": 
0.528252124786377, + "learning_rate": 1.0803e-05, + "loss": 0.0364, + "step": 3607 + }, + { + "epoch": 2.8412760929499803, + "grad_norm": 0.6648236513137817, + "learning_rate": 1.0806000000000001e-05, + "loss": 0.0357, + "step": 3608 + }, + { + "epoch": 2.842063804647499, + "grad_norm": 0.7837843894958496, + "learning_rate": 1.0809000000000001e-05, + "loss": 0.056, + "step": 3609 + }, + { + "epoch": 2.842851516345018, + "grad_norm": 1.3186322450637817, + "learning_rate": 1.0812e-05, + "loss": 0.0321, + "step": 3610 + }, + { + "epoch": 2.8436392280425364, + "grad_norm": 0.9147319793701172, + "learning_rate": 1.0815e-05, + "loss": 0.0524, + "step": 3611 + }, + { + "epoch": 2.844426939740055, + "grad_norm": 0.6185140013694763, + "learning_rate": 1.0817999999999999e-05, + "loss": 0.0313, + "step": 3612 + }, + { + "epoch": 2.8452146514375736, + "grad_norm": 0.6407333016395569, + "learning_rate": 1.0821e-05, + "loss": 0.0662, + "step": 3613 + }, + { + "epoch": 2.8460023631350926, + "grad_norm": 0.8251403570175171, + "learning_rate": 1.0824e-05, + "loss": 0.029, + "step": 3614 + }, + { + "epoch": 2.846790074832611, + "grad_norm": 1.029017448425293, + "learning_rate": 1.0827e-05, + "loss": 0.0537, + "step": 3615 + }, + { + "epoch": 2.84757778653013, + "grad_norm": 1.3660205602645874, + "learning_rate": 1.083e-05, + "loss": 0.0552, + "step": 3616 + }, + { + "epoch": 2.8483654982276487, + "grad_norm": 0.6393278241157532, + "learning_rate": 1.0833e-05, + "loss": 0.0465, + "step": 3617 + }, + { + "epoch": 2.8491532099251673, + "grad_norm": 0.6455153226852417, + "learning_rate": 1.0836000000000001e-05, + "loss": 0.0399, + "step": 3618 + }, + { + "epoch": 2.849940921622686, + "grad_norm": 0.6756976246833801, + "learning_rate": 1.0839e-05, + "loss": 0.0496, + "step": 3619 + }, + { + "epoch": 2.850728633320205, + "grad_norm": 0.7412790060043335, + "learning_rate": 1.0842e-05, + "loss": 0.0614, + "step": 3620 + }, + { + "epoch": 2.8515163450177234, + "grad_norm": 0.7900129556655884, + "learning_rate": 1.0845e-05, + "loss": 0.045, + "step": 3621 + }, + { + "epoch": 2.8523040567152425, + "grad_norm": 0.9854816198348999, + "learning_rate": 1.0848e-05, + "loss": 0.0494, + "step": 3622 + }, + { + "epoch": 2.853091768412761, + "grad_norm": 0.7579402327537537, + "learning_rate": 1.0851000000000002e-05, + "loss": 0.0479, + "step": 3623 + }, + { + "epoch": 2.8538794801102796, + "grad_norm": 0.5546042323112488, + "learning_rate": 1.0854e-05, + "loss": 0.0442, + "step": 3624 + }, + { + "epoch": 2.854667191807798, + "grad_norm": 0.7342181205749512, + "learning_rate": 1.0857e-05, + "loss": 0.0499, + "step": 3625 + }, + { + "epoch": 2.855454903505317, + "grad_norm": 0.6170284748077393, + "learning_rate": 1.086e-05, + "loss": 0.0409, + "step": 3626 + }, + { + "epoch": 2.8562426152028357, + "grad_norm": 0.7306555509567261, + "learning_rate": 1.0863e-05, + "loss": 0.049, + "step": 3627 + }, + { + "epoch": 2.8570303269003547, + "grad_norm": 0.6740602850914001, + "learning_rate": 1.0866e-05, + "loss": 0.0458, + "step": 3628 + }, + { + "epoch": 2.8578180385978733, + "grad_norm": 0.7339708805084229, + "learning_rate": 1.0869e-05, + "loss": 0.0631, + "step": 3629 + }, + { + "epoch": 2.858605750295392, + "grad_norm": 1.4403775930404663, + "learning_rate": 1.0872e-05, + "loss": 0.0515, + "step": 3630 + }, + { + "epoch": 2.8593934619929104, + "grad_norm": 0.827873170375824, + "learning_rate": 1.0875e-05, + "loss": 0.065, + "step": 3631 + }, + { + "epoch": 2.8601811736904295, + "grad_norm": 0.5904473662376404, + "learning_rate": 
1.0878e-05, + "loss": 0.0435, + "step": 3632 + }, + { + "epoch": 2.860968885387948, + "grad_norm": 0.7882388234138489, + "learning_rate": 1.0881000000000001e-05, + "loss": 0.0564, + "step": 3633 + }, + { + "epoch": 2.8617565970854666, + "grad_norm": 0.7782049179077148, + "learning_rate": 1.0884000000000001e-05, + "loss": 0.0568, + "step": 3634 + }, + { + "epoch": 2.8625443087829856, + "grad_norm": 0.6268551349639893, + "learning_rate": 1.0887000000000001e-05, + "loss": 0.0521, + "step": 3635 + }, + { + "epoch": 2.863332020480504, + "grad_norm": 0.6755161881446838, + "learning_rate": 1.089e-05, + "loss": 0.0288, + "step": 3636 + }, + { + "epoch": 2.8641197321780227, + "grad_norm": 1.2719224691390991, + "learning_rate": 1.0892999999999999e-05, + "loss": 0.0587, + "step": 3637 + }, + { + "epoch": 2.8649074438755413, + "grad_norm": 0.8065553307533264, + "learning_rate": 1.0896e-05, + "loss": 0.0632, + "step": 3638 + }, + { + "epoch": 2.8656951555730603, + "grad_norm": 1.563761830329895, + "learning_rate": 1.0899e-05, + "loss": 0.0947, + "step": 3639 + }, + { + "epoch": 2.866482867270579, + "grad_norm": 1.151876449584961, + "learning_rate": 1.0902e-05, + "loss": 0.0803, + "step": 3640 + }, + { + "epoch": 2.867270578968098, + "grad_norm": 1.7392410039901733, + "learning_rate": 1.0905e-05, + "loss": 0.428, + "step": 3641 + }, + { + "epoch": 2.8680582906656165, + "grad_norm": 1.5692968368530273, + "learning_rate": 1.0908e-05, + "loss": 0.3138, + "step": 3642 + }, + { + "epoch": 2.868846002363135, + "grad_norm": 1.3026809692382812, + "learning_rate": 1.0911000000000001e-05, + "loss": 0.3384, + "step": 3643 + }, + { + "epoch": 2.8696337140606536, + "grad_norm": 1.3222867250442505, + "learning_rate": 1.0914000000000001e-05, + "loss": 0.2803, + "step": 3644 + }, + { + "epoch": 2.8704214257581726, + "grad_norm": 1.384346842765808, + "learning_rate": 1.0917e-05, + "loss": 0.2383, + "step": 3645 + }, + { + "epoch": 2.871209137455691, + "grad_norm": 0.738409698009491, + "learning_rate": 1.092e-05, + "loss": 0.0772, + "step": 3646 + }, + { + "epoch": 2.87199684915321, + "grad_norm": 0.6449632048606873, + "learning_rate": 1.0923e-05, + "loss": 0.0975, + "step": 3647 + }, + { + "epoch": 2.8727845608507288, + "grad_norm": 0.643817126750946, + "learning_rate": 1.0926000000000002e-05, + "loss": 0.0472, + "step": 3648 + }, + { + "epoch": 2.8735722725482473, + "grad_norm": 3.262319564819336, + "learning_rate": 1.0929e-05, + "loss": 0.0648, + "step": 3649 + }, + { + "epoch": 2.874359984245766, + "grad_norm": 0.6909857988357544, + "learning_rate": 1.0932e-05, + "loss": 0.0484, + "step": 3650 + }, + { + "epoch": 2.875147695943285, + "grad_norm": 0.755171537399292, + "learning_rate": 1.0935e-05, + "loss": 0.0556, + "step": 3651 + }, + { + "epoch": 2.8759354076408035, + "grad_norm": 0.5417126417160034, + "learning_rate": 1.0938e-05, + "loss": 0.0391, + "step": 3652 + }, + { + "epoch": 2.876723119338322, + "grad_norm": 0.592180609703064, + "learning_rate": 1.0941e-05, + "loss": 0.0473, + "step": 3653 + }, + { + "epoch": 2.877510831035841, + "grad_norm": 1.0549081563949585, + "learning_rate": 1.0944e-05, + "loss": 0.0545, + "step": 3654 + }, + { + "epoch": 2.8782985427333596, + "grad_norm": 1.030050277709961, + "learning_rate": 1.0947e-05, + "loss": 0.0572, + "step": 3655 + }, + { + "epoch": 2.879086254430878, + "grad_norm": 0.8966221213340759, + "learning_rate": 1.095e-05, + "loss": 0.0476, + "step": 3656 + }, + { + "epoch": 2.8798739661283967, + "grad_norm": 0.5439420342445374, + "learning_rate": 1.0953e-05, + "loss": 
0.0365, + "step": 3657 + }, + { + "epoch": 2.8806616778259158, + "grad_norm": 0.5381730794906616, + "learning_rate": 1.0956000000000001e-05, + "loss": 0.037, + "step": 3658 + }, + { + "epoch": 2.8814493895234343, + "grad_norm": 0.5756615400314331, + "learning_rate": 1.0959000000000001e-05, + "loss": 0.0358, + "step": 3659 + }, + { + "epoch": 2.8822371012209533, + "grad_norm": 0.7410528063774109, + "learning_rate": 1.0962000000000001e-05, + "loss": 0.038, + "step": 3660 + }, + { + "epoch": 2.883024812918472, + "grad_norm": 0.7628379464149475, + "learning_rate": 1.0965e-05, + "loss": 0.0594, + "step": 3661 + }, + { + "epoch": 2.8838125246159905, + "grad_norm": 1.7778240442276, + "learning_rate": 1.0967999999999999e-05, + "loss": 0.0417, + "step": 3662 + }, + { + "epoch": 2.884600236313509, + "grad_norm": 0.6810964345932007, + "learning_rate": 1.0971e-05, + "loss": 0.0638, + "step": 3663 + }, + { + "epoch": 2.885387948011028, + "grad_norm": 0.8246665000915527, + "learning_rate": 1.0974e-05, + "loss": 0.048, + "step": 3664 + }, + { + "epoch": 2.8861756597085466, + "grad_norm": 0.9427855014801025, + "learning_rate": 1.0977e-05, + "loss": 0.0404, + "step": 3665 + }, + { + "epoch": 2.8869633714060656, + "grad_norm": 0.7295368909835815, + "learning_rate": 1.098e-05, + "loss": 0.0562, + "step": 3666 + }, + { + "epoch": 2.887751083103584, + "grad_norm": 0.6737035512924194, + "learning_rate": 1.0983e-05, + "loss": 0.0416, + "step": 3667 + }, + { + "epoch": 2.8885387948011028, + "grad_norm": 0.6745599508285522, + "learning_rate": 1.0986000000000001e-05, + "loss": 0.0405, + "step": 3668 + }, + { + "epoch": 2.8893265064986213, + "grad_norm": 0.7005133628845215, + "learning_rate": 1.0989000000000001e-05, + "loss": 0.0536, + "step": 3669 + }, + { + "epoch": 2.8901142181961403, + "grad_norm": 0.9476985335350037, + "learning_rate": 1.0992e-05, + "loss": 0.0507, + "step": 3670 + }, + { + "epoch": 2.890901929893659, + "grad_norm": 0.8930561542510986, + "learning_rate": 1.0995e-05, + "loss": 0.0452, + "step": 3671 + }, + { + "epoch": 2.8916896415911775, + "grad_norm": 0.8356891870498657, + "learning_rate": 1.0998e-05, + "loss": 0.0393, + "step": 3672 + }, + { + "epoch": 2.8924773532886965, + "grad_norm": 0.8407155871391296, + "learning_rate": 1.1001e-05, + "loss": 0.0465, + "step": 3673 + }, + { + "epoch": 2.893265064986215, + "grad_norm": 1.227873682975769, + "learning_rate": 1.1004e-05, + "loss": 0.0561, + "step": 3674 + }, + { + "epoch": 2.8940527766837336, + "grad_norm": 0.6145022511482239, + "learning_rate": 1.1007e-05, + "loss": 0.0402, + "step": 3675 + }, + { + "epoch": 2.894840488381252, + "grad_norm": 0.6552572250366211, + "learning_rate": 1.101e-05, + "loss": 0.0255, + "step": 3676 + }, + { + "epoch": 2.895628200078771, + "grad_norm": 1.412091612815857, + "learning_rate": 1.1013e-05, + "loss": 0.0609, + "step": 3677 + }, + { + "epoch": 2.8964159117762898, + "grad_norm": 1.191643238067627, + "learning_rate": 1.1016000000000001e-05, + "loss": 0.0706, + "step": 3678 + }, + { + "epoch": 2.8972036234738088, + "grad_norm": 1.0758239030838013, + "learning_rate": 1.1019e-05, + "loss": 0.0568, + "step": 3679 + }, + { + "epoch": 2.8979913351713273, + "grad_norm": 0.7536303400993347, + "learning_rate": 1.1022e-05, + "loss": 0.0635, + "step": 3680 + }, + { + "epoch": 2.898779046868846, + "grad_norm": 1.4903273582458496, + "learning_rate": 1.1025e-05, + "loss": 0.0554, + "step": 3681 + }, + { + "epoch": 2.8995667585663645, + "grad_norm": 0.80381178855896, + "learning_rate": 1.1028e-05, + "loss": 0.0512, + "step": 
3682 + }, + { + "epoch": 2.9003544702638835, + "grad_norm": 0.6405297517776489, + "learning_rate": 1.1031000000000002e-05, + "loss": 0.041, + "step": 3683 + }, + { + "epoch": 2.901142181961402, + "grad_norm": 1.2124146223068237, + "learning_rate": 1.1034000000000001e-05, + "loss": 0.0464, + "step": 3684 + }, + { + "epoch": 2.901929893658921, + "grad_norm": 1.1569575071334839, + "learning_rate": 1.1037000000000001e-05, + "loss": 0.0371, + "step": 3685 + }, + { + "epoch": 2.9027176053564396, + "grad_norm": 0.8721880912780762, + "learning_rate": 1.104e-05, + "loss": 0.0376, + "step": 3686 + }, + { + "epoch": 2.903505317053958, + "grad_norm": 0.9669612646102905, + "learning_rate": 1.1042999999999999e-05, + "loss": 0.0692, + "step": 3687 + }, + { + "epoch": 2.9042930287514768, + "grad_norm": 1.0832438468933105, + "learning_rate": 1.1046e-05, + "loss": 0.0642, + "step": 3688 + }, + { + "epoch": 2.905080740448996, + "grad_norm": 2.113205671310425, + "learning_rate": 1.1049e-05, + "loss": 0.0751, + "step": 3689 + }, + { + "epoch": 2.9058684521465143, + "grad_norm": 1.2483606338500977, + "learning_rate": 1.1052e-05, + "loss": 0.1071, + "step": 3690 + }, + { + "epoch": 2.9066561638440334, + "grad_norm": 1.714341402053833, + "learning_rate": 1.1055e-05, + "loss": 0.354, + "step": 3691 + }, + { + "epoch": 2.907443875541552, + "grad_norm": 1.1096880435943604, + "learning_rate": 1.1058e-05, + "loss": 0.3156, + "step": 3692 + }, + { + "epoch": 2.9082315872390705, + "grad_norm": 0.9585646986961365, + "learning_rate": 1.1061000000000001e-05, + "loss": 0.2269, + "step": 3693 + }, + { + "epoch": 2.909019298936589, + "grad_norm": 0.8690913319587708, + "learning_rate": 1.1064000000000001e-05, + "loss": 0.2204, + "step": 3694 + }, + { + "epoch": 2.909807010634108, + "grad_norm": 1.5624891519546509, + "learning_rate": 1.1067000000000001e-05, + "loss": 0.2039, + "step": 3695 + }, + { + "epoch": 2.9105947223316266, + "grad_norm": 0.7936228513717651, + "learning_rate": 1.107e-05, + "loss": 0.1216, + "step": 3696 + }, + { + "epoch": 2.911382434029145, + "grad_norm": 0.8008772134780884, + "learning_rate": 1.1073e-05, + "loss": 0.0991, + "step": 3697 + }, + { + "epoch": 2.912170145726664, + "grad_norm": 0.6311633586883545, + "learning_rate": 1.1075999999999999e-05, + "loss": 0.0893, + "step": 3698 + }, + { + "epoch": 2.912957857424183, + "grad_norm": 0.45913273096084595, + "learning_rate": 1.1079e-05, + "loss": 0.0475, + "step": 3699 + }, + { + "epoch": 2.9137455691217014, + "grad_norm": 0.8750367760658264, + "learning_rate": 1.1082e-05, + "loss": 0.091, + "step": 3700 + }, + { + "epoch": 2.91453328081922, + "grad_norm": 0.7952347993850708, + "learning_rate": 1.1085e-05, + "loss": 0.0844, + "step": 3701 + }, + { + "epoch": 2.915320992516739, + "grad_norm": 0.832523763179779, + "learning_rate": 1.1088e-05, + "loss": 0.0583, + "step": 3702 + }, + { + "epoch": 2.9161087042142575, + "grad_norm": 0.6843067407608032, + "learning_rate": 1.1091e-05, + "loss": 0.0472, + "step": 3703 + }, + { + "epoch": 2.9168964159117765, + "grad_norm": 0.6147898435592651, + "learning_rate": 1.1094e-05, + "loss": 0.0299, + "step": 3704 + }, + { + "epoch": 2.917684127609295, + "grad_norm": 0.49170997738838196, + "learning_rate": 1.1097e-05, + "loss": 0.0405, + "step": 3705 + }, + { + "epoch": 2.9184718393068136, + "grad_norm": 0.5154930949211121, + "learning_rate": 1.11e-05, + "loss": 0.0377, + "step": 3706 + }, + { + "epoch": 2.919259551004332, + "grad_norm": 0.6790882349014282, + "learning_rate": 1.1103e-05, + "loss": 0.0354, + "step": 3707 + 
}, + { + "epoch": 2.920047262701851, + "grad_norm": 0.609961748123169, + "learning_rate": 1.1106e-05, + "loss": 0.0494, + "step": 3708 + }, + { + "epoch": 2.92083497439937, + "grad_norm": 0.7282396554946899, + "learning_rate": 1.1109000000000002e-05, + "loss": 0.0363, + "step": 3709 + }, + { + "epoch": 2.921622686096889, + "grad_norm": 0.7470404505729675, + "learning_rate": 1.1112e-05, + "loss": 0.0568, + "step": 3710 + }, + { + "epoch": 2.9224103977944074, + "grad_norm": 0.6181232929229736, + "learning_rate": 1.1115e-05, + "loss": 0.0431, + "step": 3711 + }, + { + "epoch": 2.923198109491926, + "grad_norm": 0.5685896277427673, + "learning_rate": 1.1118e-05, + "loss": 0.0386, + "step": 3712 + }, + { + "epoch": 2.9239858211894445, + "grad_norm": 0.7808270454406738, + "learning_rate": 1.1120999999999999e-05, + "loss": 0.0409, + "step": 3713 + }, + { + "epoch": 2.9247735328869635, + "grad_norm": 0.8034542798995972, + "learning_rate": 1.1124e-05, + "loss": 0.0396, + "step": 3714 + }, + { + "epoch": 2.925561244584482, + "grad_norm": 0.9482354521751404, + "learning_rate": 1.1127e-05, + "loss": 0.0595, + "step": 3715 + }, + { + "epoch": 2.9263489562820006, + "grad_norm": 0.7243106365203857, + "learning_rate": 1.113e-05, + "loss": 0.0412, + "step": 3716 + }, + { + "epoch": 2.9271366679795197, + "grad_norm": 0.9536169171333313, + "learning_rate": 1.1133e-05, + "loss": 0.0349, + "step": 3717 + }, + { + "epoch": 2.9279243796770382, + "grad_norm": 0.6926130652427673, + "learning_rate": 1.1136e-05, + "loss": 0.0316, + "step": 3718 + }, + { + "epoch": 2.928712091374557, + "grad_norm": 1.1933585405349731, + "learning_rate": 1.1139000000000001e-05, + "loss": 0.0693, + "step": 3719 + }, + { + "epoch": 2.9294998030720754, + "grad_norm": 1.0394293069839478, + "learning_rate": 1.1142000000000001e-05, + "loss": 0.0448, + "step": 3720 + }, + { + "epoch": 2.9302875147695944, + "grad_norm": 1.2001590728759766, + "learning_rate": 1.1145000000000001e-05, + "loss": 0.0482, + "step": 3721 + }, + { + "epoch": 2.931075226467113, + "grad_norm": 0.677554726600647, + "learning_rate": 1.1148e-05, + "loss": 0.0367, + "step": 3722 + }, + { + "epoch": 2.931862938164632, + "grad_norm": 0.5543152689933777, + "learning_rate": 1.1150999999999999e-05, + "loss": 0.0516, + "step": 3723 + }, + { + "epoch": 2.9326506498621505, + "grad_norm": 1.6593797206878662, + "learning_rate": 1.1154e-05, + "loss": 0.1533, + "step": 3724 + }, + { + "epoch": 2.933438361559669, + "grad_norm": 0.6501101851463318, + "learning_rate": 1.1157e-05, + "loss": 0.0573, + "step": 3725 + }, + { + "epoch": 2.9342260732571877, + "grad_norm": 0.5197108387947083, + "learning_rate": 1.116e-05, + "loss": 0.0317, + "step": 3726 + }, + { + "epoch": 2.9350137849547067, + "grad_norm": 1.0746392011642456, + "learning_rate": 1.1163e-05, + "loss": 0.0445, + "step": 3727 + }, + { + "epoch": 2.9358014966522252, + "grad_norm": 0.9661144018173218, + "learning_rate": 1.1166e-05, + "loss": 0.0804, + "step": 3728 + }, + { + "epoch": 2.9365892083497442, + "grad_norm": 1.0487761497497559, + "learning_rate": 1.1169000000000001e-05, + "loss": 0.0484, + "step": 3729 + }, + { + "epoch": 2.937376920047263, + "grad_norm": 0.8640915155410767, + "learning_rate": 1.1172e-05, + "loss": 0.0619, + "step": 3730 + }, + { + "epoch": 2.9381646317447814, + "grad_norm": 0.586040735244751, + "learning_rate": 1.1175e-05, + "loss": 0.0568, + "step": 3731 + }, + { + "epoch": 2.9389523434423, + "grad_norm": 0.7873704433441162, + "learning_rate": 1.1178e-05, + "loss": 0.0601, + "step": 3732 + }, + { + 
"epoch": 2.939740055139819, + "grad_norm": 0.8781585693359375, + "learning_rate": 1.1181e-05, + "loss": 0.0649, + "step": 3733 + }, + { + "epoch": 2.9405277668373375, + "grad_norm": 0.6399778723716736, + "learning_rate": 1.1184000000000002e-05, + "loss": 0.0427, + "step": 3734 + }, + { + "epoch": 2.941315478534856, + "grad_norm": 0.9738752245903015, + "learning_rate": 1.1187e-05, + "loss": 0.0657, + "step": 3735 + }, + { + "epoch": 2.942103190232375, + "grad_norm": 1.7016379833221436, + "learning_rate": 1.119e-05, + "loss": 0.0473, + "step": 3736 + }, + { + "epoch": 2.9428909019298937, + "grad_norm": 0.7622055411338806, + "learning_rate": 1.1193e-05, + "loss": 0.0891, + "step": 3737 + }, + { + "epoch": 2.9436786136274122, + "grad_norm": 0.7481237053871155, + "learning_rate": 1.1196e-05, + "loss": 0.0524, + "step": 3738 + }, + { + "epoch": 2.944466325324931, + "grad_norm": 0.8981318473815918, + "learning_rate": 1.1199e-05, + "loss": 0.0545, + "step": 3739 + }, + { + "epoch": 2.94525403702245, + "grad_norm": 1.0718331336975098, + "learning_rate": 1.1202e-05, + "loss": 0.0839, + "step": 3740 + }, + { + "epoch": 2.9460417487199684, + "grad_norm": 4.065292835235596, + "learning_rate": 1.1205e-05, + "loss": 0.552, + "step": 3741 + }, + { + "epoch": 2.9468294604174874, + "grad_norm": 1.3027299642562866, + "learning_rate": 1.1208e-05, + "loss": 0.3394, + "step": 3742 + }, + { + "epoch": 2.947617172115006, + "grad_norm": 2.6634774208068848, + "learning_rate": 1.1211e-05, + "loss": 0.2267, + "step": 3743 + }, + { + "epoch": 2.9484048838125245, + "grad_norm": 0.9774680137634277, + "learning_rate": 1.1214000000000001e-05, + "loss": 0.1959, + "step": 3744 + }, + { + "epoch": 2.949192595510043, + "grad_norm": 0.968720018863678, + "learning_rate": 1.1217000000000001e-05, + "loss": 0.1832, + "step": 3745 + }, + { + "epoch": 2.949980307207562, + "grad_norm": 0.7433813214302063, + "learning_rate": 1.1220000000000001e-05, + "loss": 0.0918, + "step": 3746 + }, + { + "epoch": 2.9507680189050807, + "grad_norm": 0.7946711182594299, + "learning_rate": 1.1222999999999999e-05, + "loss": 0.1049, + "step": 3747 + }, + { + "epoch": 2.9515557306025997, + "grad_norm": 0.8997898101806641, + "learning_rate": 1.1225999999999999e-05, + "loss": 0.0665, + "step": 3748 + }, + { + "epoch": 2.9523434423001182, + "grad_norm": 0.6508486270904541, + "learning_rate": 1.1229e-05, + "loss": 0.0569, + "step": 3749 + }, + { + "epoch": 2.953131153997637, + "grad_norm": 0.5169578194618225, + "learning_rate": 1.1232e-05, + "loss": 0.0399, + "step": 3750 + }, + { + "epoch": 2.9539188656951554, + "grad_norm": 0.8425366878509521, + "learning_rate": 1.1235e-05, + "loss": 0.0444, + "step": 3751 + }, + { + "epoch": 2.9547065773926744, + "grad_norm": 0.4717499911785126, + "learning_rate": 1.1238e-05, + "loss": 0.0469, + "step": 3752 + }, + { + "epoch": 2.955494289090193, + "grad_norm": 0.5731096267700195, + "learning_rate": 1.1241e-05, + "loss": 0.0495, + "step": 3753 + }, + { + "epoch": 2.956282000787712, + "grad_norm": 0.5635340809822083, + "learning_rate": 1.1244000000000001e-05, + "loss": 0.0519, + "step": 3754 + }, + { + "epoch": 2.9570697124852305, + "grad_norm": 0.5327789187431335, + "learning_rate": 1.1247000000000001e-05, + "loss": 0.0422, + "step": 3755 + }, + { + "epoch": 2.957857424182749, + "grad_norm": 0.6697182655334473, + "learning_rate": 1.125e-05, + "loss": 0.0652, + "step": 3756 + }, + { + "epoch": 2.9586451358802677, + "grad_norm": 0.4640757739543915, + "learning_rate": 1.1253e-05, + "loss": 0.0331, + "step": 3757 + }, + { + 
"epoch": 2.9594328475777867, + "grad_norm": 0.453124463558197, + "learning_rate": 1.1256e-05, + "loss": 0.0433, + "step": 3758 + }, + { + "epoch": 2.9602205592753053, + "grad_norm": 1.4537990093231201, + "learning_rate": 1.1259e-05, + "loss": 0.0519, + "step": 3759 + }, + { + "epoch": 2.961008270972824, + "grad_norm": 0.9286530613899231, + "learning_rate": 1.1262e-05, + "loss": 0.039, + "step": 3760 + }, + { + "epoch": 2.961795982670343, + "grad_norm": 0.9717118740081787, + "learning_rate": 1.1265e-05, + "loss": 0.0648, + "step": 3761 + }, + { + "epoch": 2.9625836943678614, + "grad_norm": 0.921646237373352, + "learning_rate": 1.1268e-05, + "loss": 0.0574, + "step": 3762 + }, + { + "epoch": 2.96337140606538, + "grad_norm": 0.6992313265800476, + "learning_rate": 1.1271e-05, + "loss": 0.0275, + "step": 3763 + }, + { + "epoch": 2.9641591177628985, + "grad_norm": 1.200265645980835, + "learning_rate": 1.1274e-05, + "loss": 0.0837, + "step": 3764 + }, + { + "epoch": 2.9649468294604175, + "grad_norm": 0.5932478308677673, + "learning_rate": 1.1277e-05, + "loss": 0.0513, + "step": 3765 + }, + { + "epoch": 2.965734541157936, + "grad_norm": 0.6030976176261902, + "learning_rate": 1.128e-05, + "loss": 0.0505, + "step": 3766 + }, + { + "epoch": 2.966522252855455, + "grad_norm": 0.6539469957351685, + "learning_rate": 1.1283e-05, + "loss": 0.0399, + "step": 3767 + }, + { + "epoch": 2.9673099645529737, + "grad_norm": 0.8630111813545227, + "learning_rate": 1.1286e-05, + "loss": 0.0516, + "step": 3768 + }, + { + "epoch": 2.9680976762504923, + "grad_norm": 0.9183276295661926, + "learning_rate": 1.1289000000000002e-05, + "loss": 0.034, + "step": 3769 + }, + { + "epoch": 2.968885387948011, + "grad_norm": 0.8105270862579346, + "learning_rate": 1.1292000000000001e-05, + "loss": 0.0484, + "step": 3770 + }, + { + "epoch": 2.96967309964553, + "grad_norm": 1.0096428394317627, + "learning_rate": 1.1295000000000001e-05, + "loss": 0.0795, + "step": 3771 + }, + { + "epoch": 2.9704608113430484, + "grad_norm": 0.7536454200744629, + "learning_rate": 1.1298e-05, + "loss": 0.0409, + "step": 3772 + }, + { + "epoch": 2.9712485230405674, + "grad_norm": 0.6108353137969971, + "learning_rate": 1.1300999999999999e-05, + "loss": 0.0438, + "step": 3773 + }, + { + "epoch": 2.972036234738086, + "grad_norm": 0.8008527755737305, + "learning_rate": 1.1304e-05, + "loss": 0.0574, + "step": 3774 + }, + { + "epoch": 2.9728239464356045, + "grad_norm": 0.8820787668228149, + "learning_rate": 1.1307e-05, + "loss": 0.0575, + "step": 3775 + }, + { + "epoch": 2.973611658133123, + "grad_norm": 0.6229186058044434, + "learning_rate": 1.131e-05, + "loss": 0.0364, + "step": 3776 + }, + { + "epoch": 2.974399369830642, + "grad_norm": 0.7672805190086365, + "learning_rate": 1.1313e-05, + "loss": 0.0441, + "step": 3777 + }, + { + "epoch": 2.9751870815281607, + "grad_norm": 1.0336889028549194, + "learning_rate": 1.1316e-05, + "loss": 0.0693, + "step": 3778 + }, + { + "epoch": 2.9759747932256793, + "grad_norm": 0.8257912397384644, + "learning_rate": 1.1319000000000001e-05, + "loss": 0.0533, + "step": 3779 + }, + { + "epoch": 2.9767625049231983, + "grad_norm": 0.8720191717147827, + "learning_rate": 1.1322000000000001e-05, + "loss": 0.0543, + "step": 3780 + }, + { + "epoch": 2.977550216620717, + "grad_norm": 0.5471386313438416, + "learning_rate": 1.1325e-05, + "loss": 0.0375, + "step": 3781 + }, + { + "epoch": 2.9783379283182354, + "grad_norm": 1.6313724517822266, + "learning_rate": 1.1328e-05, + "loss": 0.0717, + "step": 3782 + }, + { + "epoch": 2.979125640015754, 
+ "grad_norm": 0.6566583514213562, + "learning_rate": 1.1331e-05, + "loss": 0.0512, + "step": 3783 + }, + { + "epoch": 2.979913351713273, + "grad_norm": 0.6330272555351257, + "learning_rate": 1.1334e-05, + "loss": 0.0463, + "step": 3784 + }, + { + "epoch": 2.9807010634107916, + "grad_norm": 0.8951604962348938, + "learning_rate": 1.1337e-05, + "loss": 0.0551, + "step": 3785 + }, + { + "epoch": 2.9814887751083106, + "grad_norm": 0.8987985253334045, + "learning_rate": 1.134e-05, + "loss": 0.0612, + "step": 3786 + }, + { + "epoch": 2.982276486805829, + "grad_norm": 0.9178686738014221, + "learning_rate": 1.1343e-05, + "loss": 0.0586, + "step": 3787 + }, + { + "epoch": 2.9830641985033477, + "grad_norm": 1.0889356136322021, + "learning_rate": 1.1346e-05, + "loss": 0.0656, + "step": 3788 + }, + { + "epoch": 2.9838519102008663, + "grad_norm": 1.480254888534546, + "learning_rate": 1.1349000000000001e-05, + "loss": 0.0657, + "step": 3789 + }, + { + "epoch": 2.9846396218983853, + "grad_norm": 0.9037389755249023, + "learning_rate": 1.1352e-05, + "loss": 0.054, + "step": 3790 + }, + { + "epoch": 2.985427333595904, + "grad_norm": 1.1620819568634033, + "learning_rate": 1.1355e-05, + "loss": 0.2678, + "step": 3791 + }, + { + "epoch": 2.986215045293423, + "grad_norm": 1.3313831090927124, + "learning_rate": 1.1358e-05, + "loss": 0.2382, + "step": 3792 + }, + { + "epoch": 2.9870027569909414, + "grad_norm": 1.3080836534500122, + "learning_rate": 1.1361e-05, + "loss": 0.1703, + "step": 3793 + }, + { + "epoch": 2.98779046868846, + "grad_norm": 0.810477077960968, + "learning_rate": 1.1364000000000002e-05, + "loss": 0.0846, + "step": 3794 + }, + { + "epoch": 2.9885781803859786, + "grad_norm": 0.5188721418380737, + "learning_rate": 1.1367000000000001e-05, + "loss": 0.0434, + "step": 3795 + }, + { + "epoch": 2.9893658920834976, + "grad_norm": 0.7948912978172302, + "learning_rate": 1.137e-05, + "loss": 0.0594, + "step": 3796 + }, + { + "epoch": 2.990153603781016, + "grad_norm": 0.5674085021018982, + "learning_rate": 1.1373e-05, + "loss": 0.0332, + "step": 3797 + }, + { + "epoch": 2.9909413154785347, + "grad_norm": 0.6176050305366516, + "learning_rate": 1.1376e-05, + "loss": 0.0488, + "step": 3798 + }, + { + "epoch": 2.9917290271760537, + "grad_norm": 0.44723403453826904, + "learning_rate": 1.1379e-05, + "loss": 0.0398, + "step": 3799 + }, + { + "epoch": 2.9925167388735723, + "grad_norm": 0.7474955320358276, + "learning_rate": 1.1382e-05, + "loss": 0.0583, + "step": 3800 + }, + { + "epoch": 2.993304450571091, + "grad_norm": 0.5594581365585327, + "learning_rate": 1.1385e-05, + "loss": 0.039, + "step": 3801 + }, + { + "epoch": 2.9940921622686094, + "grad_norm": 0.8535451889038086, + "learning_rate": 1.1388e-05, + "loss": 0.0397, + "step": 3802 + }, + { + "epoch": 2.9948798739661284, + "grad_norm": 0.859883725643158, + "learning_rate": 1.1391e-05, + "loss": 0.0628, + "step": 3803 + }, + { + "epoch": 2.995667585663647, + "grad_norm": 0.5917317867279053, + "learning_rate": 1.1394000000000001e-05, + "loss": 0.0427, + "step": 3804 + }, + { + "epoch": 2.996455297361166, + "grad_norm": 0.4937160015106201, + "learning_rate": 1.1397000000000001e-05, + "loss": 0.0425, + "step": 3805 + }, + { + "epoch": 2.9972430090586846, + "grad_norm": 1.1916555166244507, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.0582, + "step": 3806 + }, + { + "epoch": 2.998030720756203, + "grad_norm": 0.9881109595298767, + "learning_rate": 1.1403e-05, + "loss": 0.0557, + "step": 3807 + }, + { + "epoch": 2.9988184324537217, + "grad_norm": 
0.8701132535934448, + "learning_rate": 1.1406e-05, + "loss": 0.0531, + "step": 3808 + }, + { + "epoch": 2.9996061441512407, + "grad_norm": 1.0137581825256348, + "learning_rate": 1.1409e-05, + "loss": 0.0658, + "step": 3809 + }, + { + "epoch": 3.0, + "grad_norm": 1.1403098106384277, + "learning_rate": 1.1412e-05, + "loss": 0.0547, + "step": 3810 + }, + { + "epoch": 3.0007877116975186, + "grad_norm": 1.1267873048782349, + "learning_rate": 1.1415e-05, + "loss": 0.3967, + "step": 3811 + }, + { + "epoch": 3.0015754233950376, + "grad_norm": 0.8789886832237244, + "learning_rate": 1.1418e-05, + "loss": 0.2705, + "step": 3812 + }, + { + "epoch": 3.002363135092556, + "grad_norm": 1.2015959024429321, + "learning_rate": 1.1421e-05, + "loss": 0.2586, + "step": 3813 + }, + { + "epoch": 3.0031508467900747, + "grad_norm": 1.2411750555038452, + "learning_rate": 1.1424000000000001e-05, + "loss": 0.1936, + "step": 3814 + }, + { + "epoch": 3.0039385584875937, + "grad_norm": 1.4347119331359863, + "learning_rate": 1.1427000000000001e-05, + "loss": 0.14, + "step": 3815 + }, + { + "epoch": 3.0047262701851123, + "grad_norm": 0.7147073149681091, + "learning_rate": 1.143e-05, + "loss": 0.0838, + "step": 3816 + }, + { + "epoch": 3.005513981882631, + "grad_norm": 1.1736812591552734, + "learning_rate": 1.1433e-05, + "loss": 0.1439, + "step": 3817 + }, + { + "epoch": 3.00630169358015, + "grad_norm": 0.5557313561439514, + "learning_rate": 1.1436e-05, + "loss": 0.0509, + "step": 3818 + }, + { + "epoch": 3.0070894052776684, + "grad_norm": 0.7735299468040466, + "learning_rate": 1.1439e-05, + "loss": 0.0792, + "step": 3819 + }, + { + "epoch": 3.007877116975187, + "grad_norm": 0.5157554745674133, + "learning_rate": 1.1442000000000002e-05, + "loss": 0.0317, + "step": 3820 + }, + { + "epoch": 3.0086648286727056, + "grad_norm": 0.9039798378944397, + "learning_rate": 1.1445e-05, + "loss": 0.0433, + "step": 3821 + }, + { + "epoch": 3.0094525403702246, + "grad_norm": 0.5256069302558899, + "learning_rate": 1.1448e-05, + "loss": 0.0444, + "step": 3822 + }, + { + "epoch": 3.010240252067743, + "grad_norm": 0.6702919006347656, + "learning_rate": 1.1451e-05, + "loss": 0.047, + "step": 3823 + }, + { + "epoch": 3.0110279637652617, + "grad_norm": 0.4620499014854431, + "learning_rate": 1.1453999999999999e-05, + "loss": 0.0308, + "step": 3824 + }, + { + "epoch": 3.0118156754627807, + "grad_norm": 0.4446265399456024, + "learning_rate": 1.1457e-05, + "loss": 0.0218, + "step": 3825 + }, + { + "epoch": 3.0126033871602993, + "grad_norm": 0.7944394946098328, + "learning_rate": 1.146e-05, + "loss": 0.0494, + "step": 3826 + }, + { + "epoch": 3.013391098857818, + "grad_norm": 0.745747983455658, + "learning_rate": 1.1463e-05, + "loss": 0.0333, + "step": 3827 + }, + { + "epoch": 3.014178810555337, + "grad_norm": 0.8209229111671448, + "learning_rate": 1.1466e-05, + "loss": 0.0543, + "step": 3828 + }, + { + "epoch": 3.0149665222528554, + "grad_norm": 0.37028592824935913, + "learning_rate": 1.1469e-05, + "loss": 0.0289, + "step": 3829 + }, + { + "epoch": 3.015754233950374, + "grad_norm": 0.5358646512031555, + "learning_rate": 1.1472000000000001e-05, + "loss": 0.0344, + "step": 3830 + }, + { + "epoch": 3.016541945647893, + "grad_norm": 0.591387927532196, + "learning_rate": 1.1475000000000001e-05, + "loss": 0.0311, + "step": 3831 + }, + { + "epoch": 3.0173296573454116, + "grad_norm": 0.8837469220161438, + "learning_rate": 1.1478000000000001e-05, + "loss": 0.0537, + "step": 3832 + }, + { + "epoch": 3.01811736904293, + "grad_norm": 0.8782399296760559, + 
"learning_rate": 1.1480999999999999e-05, + "loss": 0.0558, + "step": 3833 + }, + { + "epoch": 3.018905080740449, + "grad_norm": 0.7049795985221863, + "learning_rate": 1.1483999999999999e-05, + "loss": 0.0406, + "step": 3834 + }, + { + "epoch": 3.0196927924379677, + "grad_norm": 0.8613286018371582, + "learning_rate": 1.1487e-05, + "loss": 0.0651, + "step": 3835 + }, + { + "epoch": 3.0204805041354863, + "grad_norm": 0.8511703610420227, + "learning_rate": 1.149e-05, + "loss": 0.0397, + "step": 3836 + }, + { + "epoch": 3.0212682158330053, + "grad_norm": 0.8498990535736084, + "learning_rate": 1.1493e-05, + "loss": 0.0522, + "step": 3837 + }, + { + "epoch": 3.022055927530524, + "grad_norm": 0.5649842023849487, + "learning_rate": 1.1496e-05, + "loss": 0.0397, + "step": 3838 + }, + { + "epoch": 3.0228436392280424, + "grad_norm": 0.6585394740104675, + "learning_rate": 1.1499e-05, + "loss": 0.0338, + "step": 3839 + }, + { + "epoch": 3.0236313509255615, + "grad_norm": 0.7145583629608154, + "learning_rate": 1.1502000000000001e-05, + "loss": 0.0422, + "step": 3840 + }, + { + "epoch": 3.02441906262308, + "grad_norm": 0.7870445847511292, + "learning_rate": 1.1505e-05, + "loss": 0.0523, + "step": 3841 + }, + { + "epoch": 3.0252067743205986, + "grad_norm": 0.591414213180542, + "learning_rate": 1.1508e-05, + "loss": 0.0481, + "step": 3842 + }, + { + "epoch": 3.025994486018117, + "grad_norm": 0.4865707755088806, + "learning_rate": 1.1511e-05, + "loss": 0.0342, + "step": 3843 + }, + { + "epoch": 3.026782197715636, + "grad_norm": 0.7309476733207703, + "learning_rate": 1.1514e-05, + "loss": 0.0488, + "step": 3844 + }, + { + "epoch": 3.0275699094131547, + "grad_norm": 0.7633036971092224, + "learning_rate": 1.1517e-05, + "loss": 0.0338, + "step": 3845 + }, + { + "epoch": 3.0283576211106733, + "grad_norm": 1.0334068536758423, + "learning_rate": 1.152e-05, + "loss": 0.0548, + "step": 3846 + }, + { + "epoch": 3.0291453328081923, + "grad_norm": 0.8770681619644165, + "learning_rate": 1.1523e-05, + "loss": 0.0557, + "step": 3847 + }, + { + "epoch": 3.029933044505711, + "grad_norm": 0.5125308632850647, + "learning_rate": 1.1526e-05, + "loss": 0.0275, + "step": 3848 + }, + { + "epoch": 3.0307207562032294, + "grad_norm": 1.1051772832870483, + "learning_rate": 1.1529e-05, + "loss": 0.0485, + "step": 3849 + }, + { + "epoch": 3.0315084679007485, + "grad_norm": 0.6886886954307556, + "learning_rate": 1.1532e-05, + "loss": 0.0445, + "step": 3850 + }, + { + "epoch": 3.032296179598267, + "grad_norm": 1.073586344718933, + "learning_rate": 1.1535e-05, + "loss": 0.0755, + "step": 3851 + }, + { + "epoch": 3.0330838912957856, + "grad_norm": 0.8920929431915283, + "learning_rate": 1.1538e-05, + "loss": 0.0409, + "step": 3852 + }, + { + "epoch": 3.0338716029933046, + "grad_norm": 0.7502365708351135, + "learning_rate": 1.1541e-05, + "loss": 0.0394, + "step": 3853 + }, + { + "epoch": 3.034659314690823, + "grad_norm": 0.598853588104248, + "learning_rate": 1.1544e-05, + "loss": 0.0491, + "step": 3854 + }, + { + "epoch": 3.0354470263883417, + "grad_norm": 0.9283138513565063, + "learning_rate": 1.1547000000000001e-05, + "loss": 0.0312, + "step": 3855 + }, + { + "epoch": 3.0362347380858608, + "grad_norm": 0.6639292240142822, + "learning_rate": 1.1550000000000001e-05, + "loss": 0.0433, + "step": 3856 + }, + { + "epoch": 3.0370224497833793, + "grad_norm": 1.331373691558838, + "learning_rate": 1.1553000000000001e-05, + "loss": 0.057, + "step": 3857 + }, + { + "epoch": 3.037810161480898, + "grad_norm": 0.8521243929862976, + "learning_rate": 
1.1555999999999999e-05, + "loss": 0.0493, + "step": 3858 + }, + { + "epoch": 3.038597873178417, + "grad_norm": 1.3227293491363525, + "learning_rate": 1.1558999999999999e-05, + "loss": 0.0726, + "step": 3859 + }, + { + "epoch": 3.0393855848759355, + "grad_norm": 0.9349537491798401, + "learning_rate": 1.1562e-05, + "loss": 0.0688, + "step": 3860 + }, + { + "epoch": 3.040173296573454, + "grad_norm": 1.4691708087921143, + "learning_rate": 1.1565e-05, + "loss": 0.413, + "step": 3861 + }, + { + "epoch": 3.0409610082709726, + "grad_norm": 1.177639126777649, + "learning_rate": 1.1568e-05, + "loss": 0.2761, + "step": 3862 + }, + { + "epoch": 3.0417487199684916, + "grad_norm": 1.9794902801513672, + "learning_rate": 1.1571e-05, + "loss": 0.3705, + "step": 3863 + }, + { + "epoch": 3.04253643166601, + "grad_norm": 1.251172661781311, + "learning_rate": 1.1574e-05, + "loss": 0.3007, + "step": 3864 + }, + { + "epoch": 3.0433241433635287, + "grad_norm": 1.020132064819336, + "learning_rate": 1.1577000000000001e-05, + "loss": 0.1969, + "step": 3865 + }, + { + "epoch": 3.0441118550610478, + "grad_norm": 0.8879207968711853, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0992, + "step": 3866 + }, + { + "epoch": 3.0448995667585663, + "grad_norm": 0.7223156094551086, + "learning_rate": 1.1583e-05, + "loss": 0.0981, + "step": 3867 + }, + { + "epoch": 3.045687278456085, + "grad_norm": 0.604021430015564, + "learning_rate": 1.1586e-05, + "loss": 0.0514, + "step": 3868 + }, + { + "epoch": 3.046474990153604, + "grad_norm": 0.553321897983551, + "learning_rate": 1.1589e-05, + "loss": 0.0459, + "step": 3869 + }, + { + "epoch": 3.0472627018511225, + "grad_norm": 0.48561620712280273, + "learning_rate": 1.1592e-05, + "loss": 0.0535, + "step": 3870 + }, + { + "epoch": 3.048050413548641, + "grad_norm": 1.4280242919921875, + "learning_rate": 1.1595e-05, + "loss": 0.0454, + "step": 3871 + }, + { + "epoch": 3.04883812524616, + "grad_norm": 0.7370725274085999, + "learning_rate": 1.1598e-05, + "loss": 0.0548, + "step": 3872 + }, + { + "epoch": 3.0496258369436786, + "grad_norm": 0.8943104147911072, + "learning_rate": 1.1601e-05, + "loss": 0.0896, + "step": 3873 + }, + { + "epoch": 3.050413548641197, + "grad_norm": 0.5664016008377075, + "learning_rate": 1.1604e-05, + "loss": 0.0301, + "step": 3874 + }, + { + "epoch": 3.051201260338716, + "grad_norm": 0.5787067413330078, + "learning_rate": 1.1607000000000001e-05, + "loss": 0.0282, + "step": 3875 + }, + { + "epoch": 3.0519889720362348, + "grad_norm": 0.5211849808692932, + "learning_rate": 1.161e-05, + "loss": 0.0265, + "step": 3876 + }, + { + "epoch": 3.0527766837337533, + "grad_norm": 0.8637042045593262, + "learning_rate": 1.1613e-05, + "loss": 0.0508, + "step": 3877 + }, + { + "epoch": 3.0535643954312723, + "grad_norm": 0.5283514261245728, + "learning_rate": 1.1616e-05, + "loss": 0.0291, + "step": 3878 + }, + { + "epoch": 3.054352107128791, + "grad_norm": 1.2429536581039429, + "learning_rate": 1.1619e-05, + "loss": 0.036, + "step": 3879 + }, + { + "epoch": 3.0551398188263095, + "grad_norm": 0.44337430596351624, + "learning_rate": 1.1622000000000002e-05, + "loss": 0.04, + "step": 3880 + }, + { + "epoch": 3.0559275305238285, + "grad_norm": 0.4989849030971527, + "learning_rate": 1.1625000000000001e-05, + "loss": 0.0318, + "step": 3881 + }, + { + "epoch": 3.056715242221347, + "grad_norm": 0.8232018351554871, + "learning_rate": 1.1628e-05, + "loss": 0.0579, + "step": 3882 + }, + { + "epoch": 3.0575029539188656, + "grad_norm": 0.3394777178764343, + "learning_rate": 1.1631e-05, + 
"loss": 0.019, + "step": 3883 + }, + { + "epoch": 3.058290665616384, + "grad_norm": 0.9948216676712036, + "learning_rate": 1.1633999999999999e-05, + "loss": 0.0306, + "step": 3884 + }, + { + "epoch": 3.059078377313903, + "grad_norm": 0.8851935267448425, + "learning_rate": 1.1637e-05, + "loss": 0.0439, + "step": 3885 + }, + { + "epoch": 3.0598660890114218, + "grad_norm": 0.4352758526802063, + "learning_rate": 1.164e-05, + "loss": 0.0251, + "step": 3886 + }, + { + "epoch": 3.0606538007089403, + "grad_norm": 1.1821738481521606, + "learning_rate": 1.1643e-05, + "loss": 0.048, + "step": 3887 + }, + { + "epoch": 3.0614415124064593, + "grad_norm": 0.9184879064559937, + "learning_rate": 1.1646e-05, + "loss": 0.0451, + "step": 3888 + }, + { + "epoch": 3.062229224103978, + "grad_norm": 2.0754449367523193, + "learning_rate": 1.1649e-05, + "loss": 0.0645, + "step": 3889 + }, + { + "epoch": 3.0630169358014965, + "grad_norm": 0.6805292963981628, + "learning_rate": 1.1652000000000001e-05, + "loss": 0.0273, + "step": 3890 + }, + { + "epoch": 3.0638046474990155, + "grad_norm": 0.4925066828727722, + "learning_rate": 1.1655000000000001e-05, + "loss": 0.0339, + "step": 3891 + }, + { + "epoch": 3.064592359196534, + "grad_norm": 1.1091945171356201, + "learning_rate": 1.1658000000000001e-05, + "loss": 0.0502, + "step": 3892 + }, + { + "epoch": 3.0653800708940526, + "grad_norm": 0.5691070556640625, + "learning_rate": 1.1661e-05, + "loss": 0.0478, + "step": 3893 + }, + { + "epoch": 3.0661677825915716, + "grad_norm": 0.5427806973457336, + "learning_rate": 1.1664e-05, + "loss": 0.0516, + "step": 3894 + }, + { + "epoch": 3.06695549428909, + "grad_norm": 0.6105404496192932, + "learning_rate": 1.1667e-05, + "loss": 0.0345, + "step": 3895 + }, + { + "epoch": 3.0677432059866088, + "grad_norm": 0.7432419061660767, + "learning_rate": 1.167e-05, + "loss": 0.0455, + "step": 3896 + }, + { + "epoch": 3.068530917684128, + "grad_norm": 0.5576518774032593, + "learning_rate": 1.1673e-05, + "loss": 0.0356, + "step": 3897 + }, + { + "epoch": 3.0693186293816463, + "grad_norm": 0.6129207015037537, + "learning_rate": 1.1676e-05, + "loss": 0.0436, + "step": 3898 + }, + { + "epoch": 3.070106341079165, + "grad_norm": 0.6477587223052979, + "learning_rate": 1.1679e-05, + "loss": 0.0331, + "step": 3899 + }, + { + "epoch": 3.070894052776684, + "grad_norm": 0.6854813694953918, + "learning_rate": 1.1682000000000001e-05, + "loss": 0.0392, + "step": 3900 + }, + { + "epoch": 3.0716817644742025, + "grad_norm": 0.7655742168426514, + "learning_rate": 1.1685e-05, + "loss": 0.0505, + "step": 3901 + }, + { + "epoch": 3.072469476171721, + "grad_norm": 0.8856099247932434, + "learning_rate": 1.1688e-05, + "loss": 0.0554, + "step": 3902 + }, + { + "epoch": 3.07325718786924, + "grad_norm": 0.8415340185165405, + "learning_rate": 1.1691e-05, + "loss": 0.0744, + "step": 3903 + }, + { + "epoch": 3.0740448995667586, + "grad_norm": 0.6309134364128113, + "learning_rate": 1.1694e-05, + "loss": 0.0415, + "step": 3904 + }, + { + "epoch": 3.074832611264277, + "grad_norm": 0.9045529365539551, + "learning_rate": 1.1697000000000002e-05, + "loss": 0.05, + "step": 3905 + }, + { + "epoch": 3.0756203229617958, + "grad_norm": 0.7955474853515625, + "learning_rate": 1.1700000000000001e-05, + "loss": 0.0446, + "step": 3906 + }, + { + "epoch": 3.076408034659315, + "grad_norm": 0.7847703695297241, + "learning_rate": 1.1703e-05, + "loss": 0.046, + "step": 3907 + }, + { + "epoch": 3.0771957463568334, + "grad_norm": 0.6648415923118591, + "learning_rate": 1.1706e-05, + "loss": 0.0364, 
+ "step": 3908 + }, + { + "epoch": 3.077983458054352, + "grad_norm": 1.0741996765136719, + "learning_rate": 1.1709e-05, + "loss": 0.0576, + "step": 3909 + }, + { + "epoch": 3.078771169751871, + "grad_norm": 0.9018031358718872, + "learning_rate": 1.1712e-05, + "loss": 0.0703, + "step": 3910 + }, + { + "epoch": 3.0795588814493895, + "grad_norm": 2.7618210315704346, + "learning_rate": 1.1715e-05, + "loss": 0.4326, + "step": 3911 + }, + { + "epoch": 3.080346593146908, + "grad_norm": 1.2924737930297852, + "learning_rate": 1.1718e-05, + "loss": 0.3, + "step": 3912 + }, + { + "epoch": 3.081134304844427, + "grad_norm": 1.3179211616516113, + "learning_rate": 1.1721e-05, + "loss": 0.2963, + "step": 3913 + }, + { + "epoch": 3.0819220165419456, + "grad_norm": 0.9998614192008972, + "learning_rate": 1.1724e-05, + "loss": 0.216, + "step": 3914 + }, + { + "epoch": 3.082709728239464, + "grad_norm": 1.1108624935150146, + "learning_rate": 1.1727000000000001e-05, + "loss": 0.1873, + "step": 3915 + }, + { + "epoch": 3.083497439936983, + "grad_norm": 1.610350489616394, + "learning_rate": 1.1730000000000001e-05, + "loss": 0.1422, + "step": 3916 + }, + { + "epoch": 3.084285151634502, + "grad_norm": 0.5921132564544678, + "learning_rate": 1.1733000000000001e-05, + "loss": 0.0571, + "step": 3917 + }, + { + "epoch": 3.0850728633320204, + "grad_norm": 0.5166023373603821, + "learning_rate": 1.1736e-05, + "loss": 0.072, + "step": 3918 + }, + { + "epoch": 3.0858605750295394, + "grad_norm": 0.7882480621337891, + "learning_rate": 1.1738999999999999e-05, + "loss": 0.1044, + "step": 3919 + }, + { + "epoch": 3.086648286727058, + "grad_norm": 0.7795553803443909, + "learning_rate": 1.1742e-05, + "loss": 0.0704, + "step": 3920 + }, + { + "epoch": 3.0874359984245765, + "grad_norm": 1.24544358253479, + "learning_rate": 1.1745e-05, + "loss": 0.0499, + "step": 3921 + }, + { + "epoch": 3.0882237101220955, + "grad_norm": 0.4929734468460083, + "learning_rate": 1.1748e-05, + "loss": 0.0409, + "step": 3922 + }, + { + "epoch": 3.089011421819614, + "grad_norm": 0.8495798110961914, + "learning_rate": 1.1751e-05, + "loss": 0.0533, + "step": 3923 + }, + { + "epoch": 3.0897991335171326, + "grad_norm": 0.532278299331665, + "learning_rate": 1.1754e-05, + "loss": 0.0325, + "step": 3924 + }, + { + "epoch": 3.090586845214651, + "grad_norm": 0.5099316239356995, + "learning_rate": 1.1757000000000001e-05, + "loss": 0.0301, + "step": 3925 + }, + { + "epoch": 3.0913745569121702, + "grad_norm": 0.5152562260627747, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.0267, + "step": 3926 + }, + { + "epoch": 3.092162268609689, + "grad_norm": 0.6593733429908752, + "learning_rate": 1.1763e-05, + "loss": 0.0407, + "step": 3927 + }, + { + "epoch": 3.0929499803072074, + "grad_norm": 0.44765013456344604, + "learning_rate": 1.1766e-05, + "loss": 0.0254, + "step": 3928 + }, + { + "epoch": 3.0937376920047264, + "grad_norm": 0.6198313236236572, + "learning_rate": 1.1769e-05, + "loss": 0.037, + "step": 3929 + }, + { + "epoch": 3.094525403702245, + "grad_norm": 1.0801931619644165, + "learning_rate": 1.1772000000000002e-05, + "loss": 0.0404, + "step": 3930 + }, + { + "epoch": 3.0953131153997635, + "grad_norm": 0.7183708548545837, + "learning_rate": 1.1775000000000002e-05, + "loss": 0.0649, + "step": 3931 + }, + { + "epoch": 3.0961008270972825, + "grad_norm": 0.5667235851287842, + "learning_rate": 1.1778e-05, + "loss": 0.0381, + "step": 3932 + }, + { + "epoch": 3.096888538794801, + "grad_norm": 0.6474065184593201, + "learning_rate": 1.1781e-05, + "loss": 0.0445, + 
"step": 3933 + }, + { + "epoch": 3.0976762504923196, + "grad_norm": 0.5204691886901855, + "learning_rate": 1.1784e-05, + "loss": 0.036, + "step": 3934 + }, + { + "epoch": 3.0984639621898387, + "grad_norm": 0.5751348733901978, + "learning_rate": 1.1787e-05, + "loss": 0.0362, + "step": 3935 + }, + { + "epoch": 3.0992516738873572, + "grad_norm": 0.6830968260765076, + "learning_rate": 1.179e-05, + "loss": 0.0442, + "step": 3936 + }, + { + "epoch": 3.100039385584876, + "grad_norm": 0.7225962281227112, + "learning_rate": 1.1793e-05, + "loss": 0.0306, + "step": 3937 + }, + { + "epoch": 3.100827097282395, + "grad_norm": 0.8234646320343018, + "learning_rate": 1.1796e-05, + "loss": 0.0491, + "step": 3938 + }, + { + "epoch": 3.1016148089799134, + "grad_norm": 0.6113240718841553, + "learning_rate": 1.1799e-05, + "loss": 0.0433, + "step": 3939 + }, + { + "epoch": 3.102402520677432, + "grad_norm": 0.7144742012023926, + "learning_rate": 1.1802000000000002e-05, + "loss": 0.0515, + "step": 3940 + }, + { + "epoch": 3.103190232374951, + "grad_norm": 1.160273790359497, + "learning_rate": 1.1805000000000001e-05, + "loss": 0.0462, + "step": 3941 + }, + { + "epoch": 3.1039779440724695, + "grad_norm": 13.142682075500488, + "learning_rate": 1.1808000000000001e-05, + "loss": 0.1239, + "step": 3942 + }, + { + "epoch": 3.104765655769988, + "grad_norm": 0.8279339075088501, + "learning_rate": 1.1811000000000001e-05, + "loss": 0.0735, + "step": 3943 + }, + { + "epoch": 3.105553367467507, + "grad_norm": 0.5725799202919006, + "learning_rate": 1.1813999999999999e-05, + "loss": 0.027, + "step": 3944 + }, + { + "epoch": 3.1063410791650257, + "grad_norm": 0.9332057237625122, + "learning_rate": 1.1816999999999999e-05, + "loss": 0.0354, + "step": 3945 + }, + { + "epoch": 3.1071287908625442, + "grad_norm": 0.6836509704589844, + "learning_rate": 1.182e-05, + "loss": 0.038, + "step": 3946 + }, + { + "epoch": 3.1079165025600632, + "grad_norm": 0.4888162612915039, + "learning_rate": 1.1823e-05, + "loss": 0.0306, + "step": 3947 + }, + { + "epoch": 3.108704214257582, + "grad_norm": 0.9794334769248962, + "learning_rate": 1.1826e-05, + "loss": 0.0467, + "step": 3948 + }, + { + "epoch": 3.1094919259551004, + "grad_norm": 0.9837256669998169, + "learning_rate": 1.1829e-05, + "loss": 0.0565, + "step": 3949 + }, + { + "epoch": 3.110279637652619, + "grad_norm": 0.47869521379470825, + "learning_rate": 1.1832e-05, + "loss": 0.0326, + "step": 3950 + }, + { + "epoch": 3.111067349350138, + "grad_norm": 0.6737895607948303, + "learning_rate": 1.1835000000000001e-05, + "loss": 0.038, + "step": 3951 + }, + { + "epoch": 3.1118550610476565, + "grad_norm": 0.7969237565994263, + "learning_rate": 1.1838e-05, + "loss": 0.0527, + "step": 3952 + }, + { + "epoch": 3.112642772745175, + "grad_norm": 0.5994583964347839, + "learning_rate": 1.1841e-05, + "loss": 0.0374, + "step": 3953 + }, + { + "epoch": 3.113430484442694, + "grad_norm": 0.6237952709197998, + "learning_rate": 1.1844e-05, + "loss": 0.0544, + "step": 3954 + }, + { + "epoch": 3.1142181961402127, + "grad_norm": 0.9490984678268433, + "learning_rate": 1.1847e-05, + "loss": 0.0603, + "step": 3955 + }, + { + "epoch": 3.1150059078377312, + "grad_norm": 0.6428765058517456, + "learning_rate": 1.185e-05, + "loss": 0.0355, + "step": 3956 + }, + { + "epoch": 3.1157936195352502, + "grad_norm": 0.8135315179824829, + "learning_rate": 1.1853e-05, + "loss": 0.0665, + "step": 3957 + }, + { + "epoch": 3.116581331232769, + "grad_norm": 0.9486034512519836, + "learning_rate": 1.1856e-05, + "loss": 0.0502, + "step": 3958 + 
}, + { + "epoch": 3.1173690429302874, + "grad_norm": 0.7727532982826233, + "learning_rate": 1.1859e-05, + "loss": 0.0477, + "step": 3959 + }, + { + "epoch": 3.1181567546278064, + "grad_norm": 1.4762104749679565, + "learning_rate": 1.1862e-05, + "loss": 0.1312, + "step": 3960 + }, + { + "epoch": 3.118944466325325, + "grad_norm": 1.4103902578353882, + "learning_rate": 1.1865e-05, + "loss": 0.3343, + "step": 3961 + }, + { + "epoch": 3.1197321780228435, + "grad_norm": 2.5963134765625, + "learning_rate": 1.1868e-05, + "loss": 0.4456, + "step": 3962 + }, + { + "epoch": 3.1205198897203625, + "grad_norm": 1.1286615133285522, + "learning_rate": 1.1871e-05, + "loss": 0.2784, + "step": 3963 + }, + { + "epoch": 3.121307601417881, + "grad_norm": 0.9285112619400024, + "learning_rate": 1.1874e-05, + "loss": 0.1863, + "step": 3964 + }, + { + "epoch": 3.1220953131153997, + "grad_norm": 0.8822343945503235, + "learning_rate": 1.1877e-05, + "loss": 0.1468, + "step": 3965 + }, + { + "epoch": 3.1228830248129187, + "grad_norm": 0.7597814202308655, + "learning_rate": 1.1880000000000001e-05, + "loss": 0.0969, + "step": 3966 + }, + { + "epoch": 3.1236707365104373, + "grad_norm": 0.7520023584365845, + "learning_rate": 1.1883000000000001e-05, + "loss": 0.0621, + "step": 3967 + }, + { + "epoch": 3.124458448207956, + "grad_norm": 0.5703044533729553, + "learning_rate": 1.1886e-05, + "loss": 0.0481, + "step": 3968 + }, + { + "epoch": 3.1252461599054744, + "grad_norm": 0.7709327936172485, + "learning_rate": 1.1889e-05, + "loss": 0.0497, + "step": 3969 + }, + { + "epoch": 3.1260338716029934, + "grad_norm": 0.5086915493011475, + "learning_rate": 1.1891999999999999e-05, + "loss": 0.0495, + "step": 3970 + }, + { + "epoch": 3.126821583300512, + "grad_norm": 0.9852067232131958, + "learning_rate": 1.1895e-05, + "loss": 0.0542, + "step": 3971 + }, + { + "epoch": 3.1276092949980305, + "grad_norm": 0.47526705265045166, + "learning_rate": 1.1898e-05, + "loss": 0.0477, + "step": 3972 + }, + { + "epoch": 3.1283970066955495, + "grad_norm": 0.48456448316574097, + "learning_rate": 1.1901e-05, + "loss": 0.0244, + "step": 3973 + }, + { + "epoch": 3.129184718393068, + "grad_norm": 0.5649906396865845, + "learning_rate": 1.1904e-05, + "loss": 0.0288, + "step": 3974 + }, + { + "epoch": 3.1299724300905867, + "grad_norm": 0.4428614675998688, + "learning_rate": 1.1907e-05, + "loss": 0.0291, + "step": 3975 + }, + { + "epoch": 3.1307601417881057, + "grad_norm": 0.5377501249313354, + "learning_rate": 1.1910000000000001e-05, + "loss": 0.04, + "step": 3976 + }, + { + "epoch": 3.1315478534856243, + "grad_norm": 1.6106568574905396, + "learning_rate": 1.1913000000000001e-05, + "loss": 0.0326, + "step": 3977 + }, + { + "epoch": 3.132335565183143, + "grad_norm": 0.6719521880149841, + "learning_rate": 1.1916e-05, + "loss": 0.0511, + "step": 3978 + }, + { + "epoch": 3.133123276880662, + "grad_norm": 0.6794100403785706, + "learning_rate": 1.1919e-05, + "loss": 0.0387, + "step": 3979 + }, + { + "epoch": 3.1339109885781804, + "grad_norm": 0.6592605113983154, + "learning_rate": 1.1922e-05, + "loss": 0.0362, + "step": 3980 + }, + { + "epoch": 3.134698700275699, + "grad_norm": 1.0759309530258179, + "learning_rate": 1.1925e-05, + "loss": 0.0609, + "step": 3981 + }, + { + "epoch": 3.135486411973218, + "grad_norm": 4.904702663421631, + "learning_rate": 1.1928e-05, + "loss": 0.0256, + "step": 3982 + }, + { + "epoch": 3.1362741236707365, + "grad_norm": 0.6323040127754211, + "learning_rate": 1.1931e-05, + "loss": 0.04, + "step": 3983 + }, + { + "epoch": 
3.137061835368255, + "grad_norm": 1.3002936840057373, + "learning_rate": 1.1934e-05, + "loss": 0.0496, + "step": 3984 + }, + { + "epoch": 3.137849547065774, + "grad_norm": 0.737598717212677, + "learning_rate": 1.1937e-05, + "loss": 0.0555, + "step": 3985 + }, + { + "epoch": 3.1386372587632927, + "grad_norm": 0.6912407279014587, + "learning_rate": 1.1940000000000001e-05, + "loss": 0.0397, + "step": 3986 + }, + { + "epoch": 3.1394249704608113, + "grad_norm": 0.49505847692489624, + "learning_rate": 1.1943e-05, + "loss": 0.0318, + "step": 3987 + }, + { + "epoch": 3.14021268215833, + "grad_norm": 0.6807467937469482, + "learning_rate": 1.1946e-05, + "loss": 0.0457, + "step": 3988 + }, + { + "epoch": 3.141000393855849, + "grad_norm": 0.588776707649231, + "learning_rate": 1.1949e-05, + "loss": 0.0336, + "step": 3989 + }, + { + "epoch": 3.1417881055533674, + "grad_norm": 0.6181431412696838, + "learning_rate": 1.1952e-05, + "loss": 0.0379, + "step": 3990 + }, + { + "epoch": 3.142575817250886, + "grad_norm": 1.327311396598816, + "learning_rate": 1.1955000000000002e-05, + "loss": 0.0454, + "step": 3991 + }, + { + "epoch": 3.143363528948405, + "grad_norm": 0.903589129447937, + "learning_rate": 1.1958000000000001e-05, + "loss": 0.0626, + "step": 3992 + }, + { + "epoch": 3.1441512406459236, + "grad_norm": 0.5579115748405457, + "learning_rate": 1.1961e-05, + "loss": 0.0417, + "step": 3993 + }, + { + "epoch": 3.144938952343442, + "grad_norm": 0.9365033507347107, + "learning_rate": 1.1964e-05, + "loss": 0.1141, + "step": 3994 + }, + { + "epoch": 3.145726664040961, + "grad_norm": 0.5499711632728577, + "learning_rate": 1.1966999999999999e-05, + "loss": 0.0373, + "step": 3995 + }, + { + "epoch": 3.1465143757384797, + "grad_norm": 0.606633722782135, + "learning_rate": 1.197e-05, + "loss": 0.0476, + "step": 3996 + }, + { + "epoch": 3.1473020874359983, + "grad_norm": 0.7210782170295715, + "learning_rate": 1.1973e-05, + "loss": 0.041, + "step": 3997 + }, + { + "epoch": 3.1480897991335173, + "grad_norm": 0.6461611390113831, + "learning_rate": 1.1976e-05, + "loss": 0.0383, + "step": 3998 + }, + { + "epoch": 3.148877510831036, + "grad_norm": 1.095171332359314, + "learning_rate": 1.1979e-05, + "loss": 0.0489, + "step": 3999 + }, + { + "epoch": 3.1496652225285544, + "grad_norm": 0.9431239366531372, + "learning_rate": 1.1982e-05, + "loss": 0.0471, + "step": 4000 + }, + { + "epoch": 3.1496652225285544, + "eval_cer": 0.14337558648935894, + "eval_loss": 0.45013427734375, + "eval_runtime": 16.4116, + "eval_samples_per_second": 18.524, + "eval_steps_per_second": 0.609, + "eval_wer": 0.4873369148119724, + "step": 4000 + }, + { + "epoch": 3.1504529342260734, + "grad_norm": 0.9166675806045532, + "learning_rate": 1.1985000000000001e-05, + "loss": 0.037, + "step": 4001 + }, + { + "epoch": 3.151240645923592, + "grad_norm": 0.7710642218589783, + "learning_rate": 1.1988000000000001e-05, + "loss": 0.0646, + "step": 4002 + }, + { + "epoch": 3.1520283576211106, + "grad_norm": 0.9048500657081604, + "learning_rate": 1.1991000000000001e-05, + "loss": 0.04, + "step": 4003 + }, + { + "epoch": 3.1528160693186296, + "grad_norm": 0.8844276070594788, + "learning_rate": 1.1994e-05, + "loss": 0.0573, + "step": 4004 + }, + { + "epoch": 3.153603781016148, + "grad_norm": 1.0975717306137085, + "learning_rate": 1.1996999999999999e-05, + "loss": 0.05, + "step": 4005 + }, + { + "epoch": 3.1543914927136667, + "grad_norm": 1.0119140148162842, + "learning_rate": 1.2e-05, + "loss": 0.0483, + "step": 4006 + }, + { + "epoch": 3.1551792044111857, + 
"grad_norm": 1.1195330619812012, + "learning_rate": 1.2003e-05, + "loss": 0.0529, + "step": 4007 + }, + { + "epoch": 3.1559669161087043, + "grad_norm": 0.814975380897522, + "learning_rate": 1.2006e-05, + "loss": 0.0385, + "step": 4008 + }, + { + "epoch": 3.156754627806223, + "grad_norm": 0.7776085734367371, + "learning_rate": 1.2009e-05, + "loss": 0.0414, + "step": 4009 + }, + { + "epoch": 3.157542339503742, + "grad_norm": 1.4161275625228882, + "learning_rate": 1.2012e-05, + "loss": 0.0652, + "step": 4010 + }, + { + "epoch": 3.1583300512012604, + "grad_norm": 3.7383382320404053, + "learning_rate": 1.2015000000000001e-05, + "loss": 0.4944, + "step": 4011 + }, + { + "epoch": 3.159117762898779, + "grad_norm": 1.467667818069458, + "learning_rate": 1.2018e-05, + "loss": 0.2778, + "step": 4012 + }, + { + "epoch": 3.1599054745962976, + "grad_norm": 1.410555124282837, + "learning_rate": 1.2021e-05, + "loss": 0.2484, + "step": 4013 + }, + { + "epoch": 3.1606931862938166, + "grad_norm": 1.7129709720611572, + "learning_rate": 1.2024e-05, + "loss": 0.2027, + "step": 4014 + }, + { + "epoch": 3.161480897991335, + "grad_norm": 1.7313114404678345, + "learning_rate": 1.2027e-05, + "loss": 0.2227, + "step": 4015 + }, + { + "epoch": 3.1622686096888537, + "grad_norm": 0.9008122682571411, + "learning_rate": 1.2030000000000002e-05, + "loss": 0.1031, + "step": 4016 + }, + { + "epoch": 3.1630563213863727, + "grad_norm": 0.8992700576782227, + "learning_rate": 1.2033000000000002e-05, + "loss": 0.0809, + "step": 4017 + }, + { + "epoch": 3.1638440330838913, + "grad_norm": 1.3145664930343628, + "learning_rate": 1.2036e-05, + "loss": 0.0857, + "step": 4018 + }, + { + "epoch": 3.16463174478141, + "grad_norm": 0.8716297745704651, + "learning_rate": 1.2039e-05, + "loss": 0.1307, + "step": 4019 + }, + { + "epoch": 3.165419456478929, + "grad_norm": 0.6370618343353271, + "learning_rate": 1.2042e-05, + "loss": 0.0446, + "step": 4020 + }, + { + "epoch": 3.1662071681764474, + "grad_norm": 0.7256255149841309, + "learning_rate": 1.2045e-05, + "loss": 0.0404, + "step": 4021 + }, + { + "epoch": 3.166994879873966, + "grad_norm": 0.7520545721054077, + "learning_rate": 1.2048e-05, + "loss": 0.073, + "step": 4022 + }, + { + "epoch": 3.167782591571485, + "grad_norm": 0.608822226524353, + "learning_rate": 1.2051e-05, + "loss": 0.0372, + "step": 4023 + }, + { + "epoch": 3.1685703032690036, + "grad_norm": 0.8541470170021057, + "learning_rate": 1.2054e-05, + "loss": 0.0317, + "step": 4024 + }, + { + "epoch": 3.169358014966522, + "grad_norm": 1.1175055503845215, + "learning_rate": 1.2057e-05, + "loss": 0.0871, + "step": 4025 + }, + { + "epoch": 3.170145726664041, + "grad_norm": 1.0257474184036255, + "learning_rate": 1.2060000000000001e-05, + "loss": 0.0316, + "step": 4026 + }, + { + "epoch": 3.1709334383615597, + "grad_norm": 0.5932887196540833, + "learning_rate": 1.2063000000000001e-05, + "loss": 0.0336, + "step": 4027 + }, + { + "epoch": 3.1717211500590783, + "grad_norm": 0.7310899496078491, + "learning_rate": 1.2066000000000001e-05, + "loss": 0.0441, + "step": 4028 + }, + { + "epoch": 3.1725088617565973, + "grad_norm": 0.8221806287765503, + "learning_rate": 1.2069e-05, + "loss": 0.0518, + "step": 4029 + }, + { + "epoch": 3.173296573454116, + "grad_norm": 0.47474780678749084, + "learning_rate": 1.2071999999999999e-05, + "loss": 0.0292, + "step": 4030 + }, + { + "epoch": 3.1740842851516344, + "grad_norm": 0.8857983350753784, + "learning_rate": 1.2075e-05, + "loss": 0.0472, + "step": 4031 + }, + { + "epoch": 3.174871996849153, + "grad_norm": 
0.6336103081703186, + "learning_rate": 1.2078e-05, + "loss": 0.0614, + "step": 4032 + }, + { + "epoch": 3.175659708546672, + "grad_norm": 0.592275083065033, + "learning_rate": 1.2081e-05, + "loss": 0.0496, + "step": 4033 + }, + { + "epoch": 3.1764474202441906, + "grad_norm": 0.6228069067001343, + "learning_rate": 1.2084e-05, + "loss": 0.0422, + "step": 4034 + }, + { + "epoch": 3.177235131941709, + "grad_norm": 0.6050815582275391, + "learning_rate": 1.2087e-05, + "loss": 0.0385, + "step": 4035 + }, + { + "epoch": 3.178022843639228, + "grad_norm": 0.4364112913608551, + "learning_rate": 1.2090000000000001e-05, + "loss": 0.0355, + "step": 4036 + }, + { + "epoch": 3.1788105553367467, + "grad_norm": 1.1816593408584595, + "learning_rate": 1.2093000000000001e-05, + "loss": 0.0413, + "step": 4037 + }, + { + "epoch": 3.1795982670342653, + "grad_norm": 1.9020034074783325, + "learning_rate": 1.2096e-05, + "loss": 0.0548, + "step": 4038 + }, + { + "epoch": 3.1803859787317843, + "grad_norm": 0.5851032733917236, + "learning_rate": 1.2099e-05, + "loss": 0.0402, + "step": 4039 + }, + { + "epoch": 3.181173690429303, + "grad_norm": 0.9253275990486145, + "learning_rate": 1.2102e-05, + "loss": 0.0492, + "step": 4040 + }, + { + "epoch": 3.1819614021268214, + "grad_norm": 0.6447612047195435, + "learning_rate": 1.2105000000000002e-05, + "loss": 0.0514, + "step": 4041 + }, + { + "epoch": 3.1827491138243404, + "grad_norm": 0.618815004825592, + "learning_rate": 1.2108e-05, + "loss": 0.0344, + "step": 4042 + }, + { + "epoch": 3.183536825521859, + "grad_norm": 0.7200620770454407, + "learning_rate": 1.2111e-05, + "loss": 0.0335, + "step": 4043 + }, + { + "epoch": 3.1843245372193776, + "grad_norm": 1.000442385673523, + "learning_rate": 1.2114e-05, + "loss": 0.0618, + "step": 4044 + }, + { + "epoch": 3.1851122489168966, + "grad_norm": 0.7763901948928833, + "learning_rate": 1.2117e-05, + "loss": 0.0502, + "step": 4045 + }, + { + "epoch": 3.185899960614415, + "grad_norm": 0.6096683144569397, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.0307, + "step": 4046 + }, + { + "epoch": 3.1866876723119337, + "grad_norm": 0.5447818636894226, + "learning_rate": 1.2123e-05, + "loss": 0.0282, + "step": 4047 + }, + { + "epoch": 3.1874753840094527, + "grad_norm": 0.8078464865684509, + "learning_rate": 1.2126e-05, + "loss": 0.0557, + "step": 4048 + }, + { + "epoch": 3.1882630957069713, + "grad_norm": 0.7779615521430969, + "learning_rate": 1.2129e-05, + "loss": 0.0447, + "step": 4049 + }, + { + "epoch": 3.18905080740449, + "grad_norm": 0.9021788239479065, + "learning_rate": 1.2132e-05, + "loss": 0.0559, + "step": 4050 + }, + { + "epoch": 3.1898385191020084, + "grad_norm": 1.0724892616271973, + "learning_rate": 1.2135000000000002e-05, + "loss": 0.0348, + "step": 4051 + }, + { + "epoch": 3.1906262307995275, + "grad_norm": 0.824126124382019, + "learning_rate": 1.2138000000000001e-05, + "loss": 0.0518, + "step": 4052 + }, + { + "epoch": 3.191413942497046, + "grad_norm": 0.9372316002845764, + "learning_rate": 1.2141000000000001e-05, + "loss": 0.045, + "step": 4053 + }, + { + "epoch": 3.1922016541945646, + "grad_norm": 0.9260129928588867, + "learning_rate": 1.2144e-05, + "loss": 0.0489, + "step": 4054 + }, + { + "epoch": 3.1929893658920836, + "grad_norm": 0.8953214287757874, + "learning_rate": 1.2146999999999999e-05, + "loss": 0.0631, + "step": 4055 + }, + { + "epoch": 3.193777077589602, + "grad_norm": 0.9410547614097595, + "learning_rate": 1.215e-05, + "loss": 0.0527, + "step": 4056 + }, + { + "epoch": 3.1945647892871207, + "grad_norm": 
0.684792697429657, + "learning_rate": 1.2153e-05, + "loss": 0.0298, + "step": 4057 + }, + { + "epoch": 3.1953525009846397, + "grad_norm": 0.5579074025154114, + "learning_rate": 1.2156e-05, + "loss": 0.0497, + "step": 4058 + }, + { + "epoch": 3.1961402126821583, + "grad_norm": 1.2375023365020752, + "learning_rate": 1.2159e-05, + "loss": 0.0636, + "step": 4059 + }, + { + "epoch": 3.196927924379677, + "grad_norm": 1.0494543313980103, + "learning_rate": 1.2162e-05, + "loss": 0.0531, + "step": 4060 + }, + { + "epoch": 3.197715636077196, + "grad_norm": 1.8698290586471558, + "learning_rate": 1.2165000000000001e-05, + "loss": 0.3727, + "step": 4061 + }, + { + "epoch": 3.1985033477747145, + "grad_norm": 0.9649758338928223, + "learning_rate": 1.2168000000000001e-05, + "loss": 0.2519, + "step": 4062 + }, + { + "epoch": 3.199291059472233, + "grad_norm": 1.014600396156311, + "learning_rate": 1.2171000000000001e-05, + "loss": 0.2364, + "step": 4063 + }, + { + "epoch": 3.200078771169752, + "grad_norm": 1.4843723773956299, + "learning_rate": 1.2174e-05, + "loss": 0.2619, + "step": 4064 + }, + { + "epoch": 3.2008664828672706, + "grad_norm": 1.2166025638580322, + "learning_rate": 1.2177e-05, + "loss": 0.2413, + "step": 4065 + }, + { + "epoch": 3.201654194564789, + "grad_norm": 1.1227006912231445, + "learning_rate": 1.2180000000000002e-05, + "loss": 0.1205, + "step": 4066 + }, + { + "epoch": 3.202441906262308, + "grad_norm": 0.4855048954486847, + "learning_rate": 1.2183e-05, + "loss": 0.0557, + "step": 4067 + }, + { + "epoch": 3.2032296179598267, + "grad_norm": 0.9717891812324524, + "learning_rate": 1.2186e-05, + "loss": 0.1159, + "step": 4068 + }, + { + "epoch": 3.2040173296573453, + "grad_norm": 0.7549824714660645, + "learning_rate": 1.2189e-05, + "loss": 0.095, + "step": 4069 + }, + { + "epoch": 3.2048050413548643, + "grad_norm": 0.843210756778717, + "learning_rate": 1.2192e-05, + "loss": 0.0479, + "step": 4070 + }, + { + "epoch": 3.205592753052383, + "grad_norm": 1.623909831047058, + "learning_rate": 1.2195e-05, + "loss": 0.0942, + "step": 4071 + }, + { + "epoch": 3.2063804647499015, + "grad_norm": 0.547566294670105, + "learning_rate": 1.2198e-05, + "loss": 0.0592, + "step": 4072 + }, + { + "epoch": 3.2071681764474205, + "grad_norm": 0.5263308882713318, + "learning_rate": 1.2201e-05, + "loss": 0.042, + "step": 4073 + }, + { + "epoch": 3.207955888144939, + "grad_norm": 0.44006118178367615, + "learning_rate": 1.2204e-05, + "loss": 0.0431, + "step": 4074 + }, + { + "epoch": 3.2087435998424576, + "grad_norm": 0.6853240728378296, + "learning_rate": 1.2207e-05, + "loss": 0.0406, + "step": 4075 + }, + { + "epoch": 3.209531311539976, + "grad_norm": 0.5651113390922546, + "learning_rate": 1.221e-05, + "loss": 0.0417, + "step": 4076 + }, + { + "epoch": 3.210319023237495, + "grad_norm": 0.5679463744163513, + "learning_rate": 1.2213000000000001e-05, + "loss": 0.0355, + "step": 4077 + }, + { + "epoch": 3.2111067349350138, + "grad_norm": 0.6257662177085876, + "learning_rate": 1.2216000000000001e-05, + "loss": 0.0337, + "step": 4078 + }, + { + "epoch": 3.2118944466325323, + "grad_norm": 0.5538316965103149, + "learning_rate": 1.2219e-05, + "loss": 0.03, + "step": 4079 + }, + { + "epoch": 3.2126821583300513, + "grad_norm": 0.4370489716529846, + "learning_rate": 1.2222e-05, + "loss": 0.0453, + "step": 4080 + }, + { + "epoch": 3.21346987002757, + "grad_norm": 1.2698116302490234, + "learning_rate": 1.2224999999999999e-05, + "loss": 0.0433, + "step": 4081 + }, + { + "epoch": 3.2142575817250885, + "grad_norm": 
0.6094589829444885, + "learning_rate": 1.2228e-05, + "loss": 0.0385, + "step": 4082 + }, + { + "epoch": 3.2150452934226075, + "grad_norm": 0.6859515905380249, + "learning_rate": 1.2231e-05, + "loss": 0.0304, + "step": 4083 + }, + { + "epoch": 3.215833005120126, + "grad_norm": 0.6686913371086121, + "learning_rate": 1.2234e-05, + "loss": 0.0363, + "step": 4084 + }, + { + "epoch": 3.2166207168176446, + "grad_norm": 0.9757808446884155, + "learning_rate": 1.2237e-05, + "loss": 0.0533, + "step": 4085 + }, + { + "epoch": 3.2174084285151636, + "grad_norm": 0.6310076713562012, + "learning_rate": 1.224e-05, + "loss": 0.0283, + "step": 4086 + }, + { + "epoch": 3.218196140212682, + "grad_norm": 0.7322906851768494, + "learning_rate": 1.2243000000000001e-05, + "loss": 0.0484, + "step": 4087 + }, + { + "epoch": 3.2189838519102008, + "grad_norm": 0.5736980438232422, + "learning_rate": 1.2246000000000001e-05, + "loss": 0.0366, + "step": 4088 + }, + { + "epoch": 3.2197715636077198, + "grad_norm": 0.5416637063026428, + "learning_rate": 1.2249e-05, + "loss": 0.0331, + "step": 4089 + }, + { + "epoch": 3.2205592753052383, + "grad_norm": 0.5929494500160217, + "learning_rate": 1.2252e-05, + "loss": 0.034, + "step": 4090 + }, + { + "epoch": 3.221346987002757, + "grad_norm": 1.4879941940307617, + "learning_rate": 1.2254999999999999e-05, + "loss": 0.054, + "step": 4091 + }, + { + "epoch": 3.222134698700276, + "grad_norm": 0.49586430191993713, + "learning_rate": 1.2258e-05, + "loss": 0.0305, + "step": 4092 + }, + { + "epoch": 3.2229224103977945, + "grad_norm": 0.6423540711402893, + "learning_rate": 1.2261e-05, + "loss": 0.0477, + "step": 4093 + }, + { + "epoch": 3.223710122095313, + "grad_norm": 0.6464357972145081, + "learning_rate": 1.2264e-05, + "loss": 0.0504, + "step": 4094 + }, + { + "epoch": 3.2244978337928316, + "grad_norm": 0.7897395491600037, + "learning_rate": 1.2267e-05, + "loss": 0.0466, + "step": 4095 + }, + { + "epoch": 3.2252855454903506, + "grad_norm": 0.6037055850028992, + "learning_rate": 1.227e-05, + "loss": 0.0302, + "step": 4096 + }, + { + "epoch": 3.226073257187869, + "grad_norm": 0.6180537939071655, + "learning_rate": 1.2273000000000001e-05, + "loss": 0.0506, + "step": 4097 + }, + { + "epoch": 3.2268609688853878, + "grad_norm": 0.6900700926780701, + "learning_rate": 1.2276e-05, + "loss": 0.0452, + "step": 4098 + }, + { + "epoch": 3.2276486805829068, + "grad_norm": 0.7490521669387817, + "learning_rate": 1.2279e-05, + "loss": 0.0344, + "step": 4099 + }, + { + "epoch": 3.2284363922804253, + "grad_norm": 0.7296422719955444, + "learning_rate": 1.2282e-05, + "loss": 0.0535, + "step": 4100 + }, + { + "epoch": 3.229224103977944, + "grad_norm": 0.521769642829895, + "learning_rate": 1.2285e-05, + "loss": 0.0335, + "step": 4101 + }, + { + "epoch": 3.230011815675463, + "grad_norm": 1.134818434715271, + "learning_rate": 1.2288000000000002e-05, + "loss": 0.0493, + "step": 4102 + }, + { + "epoch": 3.2307995273729815, + "grad_norm": 0.7208835482597351, + "learning_rate": 1.2291000000000001e-05, + "loss": 0.0405, + "step": 4103 + }, + { + "epoch": 3.2315872390705, + "grad_norm": 0.8036493062973022, + "learning_rate": 1.2294e-05, + "loss": 0.0513, + "step": 4104 + }, + { + "epoch": 3.232374950768019, + "grad_norm": 0.8924216628074646, + "learning_rate": 1.2297e-05, + "loss": 0.0478, + "step": 4105 + }, + { + "epoch": 3.2331626624655376, + "grad_norm": 0.6925222277641296, + "learning_rate": 1.2299999999999999e-05, + "loss": 0.0535, + "step": 4106 + }, + { + "epoch": 3.233950374163056, + "grad_norm": 
0.9203606843948364, + "learning_rate": 1.2303e-05, + "loss": 0.0541, + "step": 4107 + }, + { + "epoch": 3.234738085860575, + "grad_norm": 0.6428502202033997, + "learning_rate": 1.2306e-05, + "loss": 0.0341, + "step": 4108 + }, + { + "epoch": 3.2355257975580938, + "grad_norm": 0.7880935668945312, + "learning_rate": 1.2309e-05, + "loss": 0.0353, + "step": 4109 + }, + { + "epoch": 3.2363135092556123, + "grad_norm": 0.7984086871147156, + "learning_rate": 1.2312e-05, + "loss": 0.0624, + "step": 4110 + }, + { + "epoch": 3.2371012209531314, + "grad_norm": 1.6785931587219238, + "learning_rate": 1.2315e-05, + "loss": 0.3648, + "step": 4111 + }, + { + "epoch": 3.23788893265065, + "grad_norm": 1.1752362251281738, + "learning_rate": 1.2318000000000001e-05, + "loss": 0.3346, + "step": 4112 + }, + { + "epoch": 3.2386766443481685, + "grad_norm": 1.0681610107421875, + "learning_rate": 1.2321000000000001e-05, + "loss": 0.1904, + "step": 4113 + }, + { + "epoch": 3.239464356045687, + "grad_norm": 1.2582305669784546, + "learning_rate": 1.2324000000000001e-05, + "loss": 0.2518, + "step": 4114 + }, + { + "epoch": 3.240252067743206, + "grad_norm": 1.2018146514892578, + "learning_rate": 1.2327e-05, + "loss": 0.2321, + "step": 4115 + }, + { + "epoch": 3.2410397794407246, + "grad_norm": 0.6249655485153198, + "learning_rate": 1.2329999999999999e-05, + "loss": 0.0889, + "step": 4116 + }, + { + "epoch": 3.241827491138243, + "grad_norm": 0.7844008803367615, + "learning_rate": 1.2333e-05, + "loss": 0.0671, + "step": 4117 + }, + { + "epoch": 3.242615202835762, + "grad_norm": 0.5484212040901184, + "learning_rate": 1.2336e-05, + "loss": 0.0788, + "step": 4118 + }, + { + "epoch": 3.243402914533281, + "grad_norm": 0.7872232794761658, + "learning_rate": 1.2339e-05, + "loss": 0.0515, + "step": 4119 + }, + { + "epoch": 3.2441906262307993, + "grad_norm": 0.5034776329994202, + "learning_rate": 1.2342e-05, + "loss": 0.0267, + "step": 4120 + }, + { + "epoch": 3.2449783379283184, + "grad_norm": 0.6655566692352295, + "learning_rate": 1.2345e-05, + "loss": 0.0802, + "step": 4121 + }, + { + "epoch": 3.245766049625837, + "grad_norm": 1.06536066532135, + "learning_rate": 1.2348000000000001e-05, + "loss": 0.0574, + "step": 4122 + }, + { + "epoch": 3.2465537613233555, + "grad_norm": 0.6820407509803772, + "learning_rate": 1.2351e-05, + "loss": 0.0625, + "step": 4123 + }, + { + "epoch": 3.2473414730208745, + "grad_norm": 0.6477457284927368, + "learning_rate": 1.2354e-05, + "loss": 0.0487, + "step": 4124 + }, + { + "epoch": 3.248129184718393, + "grad_norm": 0.5915496349334717, + "learning_rate": 1.2357e-05, + "loss": 0.0378, + "step": 4125 + }, + { + "epoch": 3.2489168964159116, + "grad_norm": 0.7723541855812073, + "learning_rate": 1.236e-05, + "loss": 0.0471, + "step": 4126 + }, + { + "epoch": 3.2497046081134306, + "grad_norm": 0.55119389295578, + "learning_rate": 1.2363000000000002e-05, + "loss": 0.026, + "step": 4127 + }, + { + "epoch": 3.250492319810949, + "grad_norm": 0.7538642883300781, + "learning_rate": 1.2366e-05, + "loss": 0.0418, + "step": 4128 + }, + { + "epoch": 3.251280031508468, + "grad_norm": 0.713594377040863, + "learning_rate": 1.2369e-05, + "loss": 0.043, + "step": 4129 + }, + { + "epoch": 3.252067743205987, + "grad_norm": 0.6611047983169556, + "learning_rate": 1.2372e-05, + "loss": 0.0401, + "step": 4130 + }, + { + "epoch": 3.2528554549035054, + "grad_norm": 0.668782114982605, + "learning_rate": 1.2375e-05, + "loss": 0.0408, + "step": 4131 + }, + { + "epoch": 3.253643166601024, + "grad_norm": 0.5110369324684143, + 
"learning_rate": 1.2378e-05, + "loss": 0.0314, + "step": 4132 + }, + { + "epoch": 3.2544308782985425, + "grad_norm": 0.7685160040855408, + "learning_rate": 1.2381e-05, + "loss": 0.0536, + "step": 4133 + }, + { + "epoch": 3.2552185899960615, + "grad_norm": 0.5752913355827332, + "learning_rate": 1.2384e-05, + "loss": 0.0303, + "step": 4134 + }, + { + "epoch": 3.25600630169358, + "grad_norm": 0.5898373126983643, + "learning_rate": 1.2387e-05, + "loss": 0.0454, + "step": 4135 + }, + { + "epoch": 3.256794013391099, + "grad_norm": 0.5944299101829529, + "learning_rate": 1.239e-05, + "loss": 0.0592, + "step": 4136 + }, + { + "epoch": 3.2575817250886177, + "grad_norm": 0.6147855520248413, + "learning_rate": 1.2393000000000001e-05, + "loss": 0.04, + "step": 4137 + }, + { + "epoch": 3.258369436786136, + "grad_norm": 0.6842803359031677, + "learning_rate": 1.2396000000000001e-05, + "loss": 0.0501, + "step": 4138 + }, + { + "epoch": 3.259157148483655, + "grad_norm": 0.5849815011024475, + "learning_rate": 1.2399000000000001e-05, + "loss": 0.054, + "step": 4139 + }, + { + "epoch": 3.259944860181174, + "grad_norm": 0.6486673951148987, + "learning_rate": 1.2402e-05, + "loss": 0.0382, + "step": 4140 + }, + { + "epoch": 3.2607325718786924, + "grad_norm": 1.0785175561904907, + "learning_rate": 1.2404999999999999e-05, + "loss": 0.0331, + "step": 4141 + }, + { + "epoch": 3.261520283576211, + "grad_norm": 0.6827090978622437, + "learning_rate": 1.2408e-05, + "loss": 0.045, + "step": 4142 + }, + { + "epoch": 3.26230799527373, + "grad_norm": 1.0268739461898804, + "learning_rate": 1.2411e-05, + "loss": 0.0506, + "step": 4143 + }, + { + "epoch": 3.2630957069712485, + "grad_norm": 0.9748472571372986, + "learning_rate": 1.2414e-05, + "loss": 0.0621, + "step": 4144 + }, + { + "epoch": 3.263883418668767, + "grad_norm": 0.823531985282898, + "learning_rate": 1.2417e-05, + "loss": 0.0561, + "step": 4145 + }, + { + "epoch": 3.264671130366286, + "grad_norm": 0.6802054047584534, + "learning_rate": 1.242e-05, + "loss": 0.0326, + "step": 4146 + }, + { + "epoch": 3.2654588420638047, + "grad_norm": 0.6834481358528137, + "learning_rate": 1.2423000000000001e-05, + "loss": 0.029, + "step": 4147 + }, + { + "epoch": 3.2662465537613232, + "grad_norm": 0.42066165804862976, + "learning_rate": 1.2426000000000001e-05, + "loss": 0.0266, + "step": 4148 + }, + { + "epoch": 3.2670342654588422, + "grad_norm": 0.6421510577201843, + "learning_rate": 1.2429e-05, + "loss": 0.0359, + "step": 4149 + }, + { + "epoch": 3.267821977156361, + "grad_norm": 0.9034837484359741, + "learning_rate": 1.2432e-05, + "loss": 0.0501, + "step": 4150 + }, + { + "epoch": 3.2686096888538794, + "grad_norm": 0.7416817545890808, + "learning_rate": 1.2435e-05, + "loss": 0.0489, + "step": 4151 + }, + { + "epoch": 3.2693974005513984, + "grad_norm": 0.597087025642395, + "learning_rate": 1.2438000000000002e-05, + "loss": 0.0437, + "step": 4152 + }, + { + "epoch": 3.270185112248917, + "grad_norm": 0.7805439829826355, + "learning_rate": 1.2441e-05, + "loss": 0.0347, + "step": 4153 + }, + { + "epoch": 3.2709728239464355, + "grad_norm": 0.712030827999115, + "learning_rate": 1.2444e-05, + "loss": 0.0528, + "step": 4154 + }, + { + "epoch": 3.2717605356439545, + "grad_norm": 0.9615911245346069, + "learning_rate": 1.2447e-05, + "loss": 0.0303, + "step": 4155 + }, + { + "epoch": 3.272548247341473, + "grad_norm": 0.7386966347694397, + "learning_rate": 1.245e-05, + "loss": 0.0378, + "step": 4156 + }, + { + "epoch": 3.2733359590389917, + "grad_norm": 1.4781163930892944, + "learning_rate": 
1.2453000000000001e-05, + "loss": 0.0493, + "step": 4157 + }, + { + "epoch": 3.2741236707365102, + "grad_norm": 0.9883362054824829, + "learning_rate": 1.2456e-05, + "loss": 0.0382, + "step": 4158 + }, + { + "epoch": 3.2749113824340292, + "grad_norm": 0.8000332713127136, + "learning_rate": 1.2459e-05, + "loss": 0.051, + "step": 4159 + }, + { + "epoch": 3.275699094131548, + "grad_norm": 1.152640700340271, + "learning_rate": 1.2462e-05, + "loss": 0.0542, + "step": 4160 + }, + { + "epoch": 3.2764868058290664, + "grad_norm": 1.3987599611282349, + "learning_rate": 1.2465e-05, + "loss": 0.3116, + "step": 4161 + }, + { + "epoch": 3.2772745175265854, + "grad_norm": 1.2345993518829346, + "learning_rate": 1.2468000000000002e-05, + "loss": 0.2855, + "step": 4162 + }, + { + "epoch": 3.278062229224104, + "grad_norm": 1.3382893800735474, + "learning_rate": 1.2471000000000001e-05, + "loss": 0.251, + "step": 4163 + }, + { + "epoch": 3.2788499409216225, + "grad_norm": 1.2375632524490356, + "learning_rate": 1.2474000000000001e-05, + "loss": 0.1707, + "step": 4164 + }, + { + "epoch": 3.2796376526191415, + "grad_norm": 0.7890026569366455, + "learning_rate": 1.2477e-05, + "loss": 0.1296, + "step": 4165 + }, + { + "epoch": 3.28042536431666, + "grad_norm": 0.6674181818962097, + "learning_rate": 1.2479999999999999e-05, + "loss": 0.0893, + "step": 4166 + }, + { + "epoch": 3.2812130760141787, + "grad_norm": 0.524045467376709, + "learning_rate": 1.2483e-05, + "loss": 0.0575, + "step": 4167 + }, + { + "epoch": 3.2820007877116977, + "grad_norm": 0.7632150053977966, + "learning_rate": 1.2486e-05, + "loss": 0.0541, + "step": 4168 + }, + { + "epoch": 3.2827884994092162, + "grad_norm": 0.4581703543663025, + "learning_rate": 1.2489e-05, + "loss": 0.0376, + "step": 4169 + }, + { + "epoch": 3.283576211106735, + "grad_norm": 0.9615974426269531, + "learning_rate": 1.2492e-05, + "loss": 0.0593, + "step": 4170 + }, + { + "epoch": 3.284363922804254, + "grad_norm": 0.6332741975784302, + "learning_rate": 1.2495e-05, + "loss": 0.04, + "step": 4171 + }, + { + "epoch": 3.2851516345017724, + "grad_norm": 0.4856887757778168, + "learning_rate": 1.2498000000000001e-05, + "loss": 0.0247, + "step": 4172 + }, + { + "epoch": 3.285939346199291, + "grad_norm": 0.6632201671600342, + "learning_rate": 1.2501000000000001e-05, + "loss": 0.0458, + "step": 4173 + }, + { + "epoch": 3.28672705789681, + "grad_norm": 0.5502901673316956, + "learning_rate": 1.2504000000000001e-05, + "loss": 0.0318, + "step": 4174 + }, + { + "epoch": 3.2875147695943285, + "grad_norm": 0.7584769129753113, + "learning_rate": 1.2507e-05, + "loss": 0.0557, + "step": 4175 + }, + { + "epoch": 3.288302481291847, + "grad_norm": 0.8393949270248413, + "learning_rate": 1.251e-05, + "loss": 0.0369, + "step": 4176 + }, + { + "epoch": 3.2890901929893657, + "grad_norm": 0.4575367569923401, + "learning_rate": 1.2513e-05, + "loss": 0.0315, + "step": 4177 + }, + { + "epoch": 3.2898779046868847, + "grad_norm": 0.4824506342411041, + "learning_rate": 1.2516e-05, + "loss": 0.0333, + "step": 4178 + }, + { + "epoch": 3.2906656163844032, + "grad_norm": 0.6603240966796875, + "learning_rate": 1.2519e-05, + "loss": 0.0374, + "step": 4179 + }, + { + "epoch": 3.2914533280819223, + "grad_norm": 0.6026250123977661, + "learning_rate": 1.2522e-05, + "loss": 0.0396, + "step": 4180 + }, + { + "epoch": 3.292241039779441, + "grad_norm": 0.7422687411308289, + "learning_rate": 1.2525e-05, + "loss": 0.0538, + "step": 4181 + }, + { + "epoch": 3.2930287514769594, + "grad_norm": 0.45306310057640076, + "learning_rate": 
1.2528000000000001e-05, + "loss": 0.0317, + "step": 4182 + }, + { + "epoch": 3.293816463174478, + "grad_norm": 0.6239453554153442, + "learning_rate": 1.2531e-05, + "loss": 0.0509, + "step": 4183 + }, + { + "epoch": 3.294604174871997, + "grad_norm": 0.7086141705513, + "learning_rate": 1.2534e-05, + "loss": 0.0425, + "step": 4184 + }, + { + "epoch": 3.2953918865695155, + "grad_norm": 0.5242235064506531, + "learning_rate": 1.2537e-05, + "loss": 0.0418, + "step": 4185 + }, + { + "epoch": 3.296179598267034, + "grad_norm": 0.599339485168457, + "learning_rate": 1.254e-05, + "loss": 0.0413, + "step": 4186 + }, + { + "epoch": 3.296967309964553, + "grad_norm": 0.4459702968597412, + "learning_rate": 1.2543000000000002e-05, + "loss": 0.0352, + "step": 4187 + }, + { + "epoch": 3.2977550216620717, + "grad_norm": 1.1343791484832764, + "learning_rate": 1.2546000000000002e-05, + "loss": 0.0625, + "step": 4188 + }, + { + "epoch": 3.2985427333595903, + "grad_norm": 0.39101701974868774, + "learning_rate": 1.2549000000000001e-05, + "loss": 0.0275, + "step": 4189 + }, + { + "epoch": 3.2993304450571093, + "grad_norm": 0.7671141028404236, + "learning_rate": 1.2552e-05, + "loss": 0.0475, + "step": 4190 + }, + { + "epoch": 3.300118156754628, + "grad_norm": 0.49752599000930786, + "learning_rate": 1.2555e-05, + "loss": 0.0303, + "step": 4191 + }, + { + "epoch": 3.3009058684521464, + "grad_norm": 0.6647692918777466, + "learning_rate": 1.2558e-05, + "loss": 0.0536, + "step": 4192 + }, + { + "epoch": 3.3016935801496654, + "grad_norm": 0.7490576505661011, + "learning_rate": 1.2561e-05, + "loss": 0.0337, + "step": 4193 + }, + { + "epoch": 3.302481291847184, + "grad_norm": 0.6668580770492554, + "learning_rate": 1.2564e-05, + "loss": 0.0447, + "step": 4194 + }, + { + "epoch": 3.3032690035447025, + "grad_norm": 0.5027128458023071, + "learning_rate": 1.2567e-05, + "loss": 0.033, + "step": 4195 + }, + { + "epoch": 3.304056715242221, + "grad_norm": 0.5857109427452087, + "learning_rate": 1.257e-05, + "loss": 0.0327, + "step": 4196 + }, + { + "epoch": 3.30484442693974, + "grad_norm": 0.46187731623649597, + "learning_rate": 1.2573e-05, + "loss": 0.0362, + "step": 4197 + }, + { + "epoch": 3.3056321386372587, + "grad_norm": 0.5232667326927185, + "learning_rate": 1.2576000000000001e-05, + "loss": 0.0541, + "step": 4198 + }, + { + "epoch": 3.3064198503347777, + "grad_norm": 0.6528952717781067, + "learning_rate": 1.2579000000000001e-05, + "loss": 0.0575, + "step": 4199 + }, + { + "epoch": 3.3072075620322963, + "grad_norm": 0.8417725563049316, + "learning_rate": 1.2582e-05, + "loss": 0.0495, + "step": 4200 + }, + { + "epoch": 3.307995273729815, + "grad_norm": 0.8714038729667664, + "learning_rate": 1.2585e-05, + "loss": 0.0582, + "step": 4201 + }, + { + "epoch": 3.3087829854273334, + "grad_norm": 0.698313295841217, + "learning_rate": 1.2587999999999999e-05, + "loss": 0.0478, + "step": 4202 + }, + { + "epoch": 3.3095706971248524, + "grad_norm": 0.7445794939994812, + "learning_rate": 1.2591e-05, + "loss": 0.0452, + "step": 4203 + }, + { + "epoch": 3.310358408822371, + "grad_norm": 1.63778817653656, + "learning_rate": 1.2594e-05, + "loss": 0.0661, + "step": 4204 + }, + { + "epoch": 3.3111461205198895, + "grad_norm": 0.7721985578536987, + "learning_rate": 1.2597e-05, + "loss": 0.0556, + "step": 4205 + }, + { + "epoch": 3.3119338322174086, + "grad_norm": 0.7189369201660156, + "learning_rate": 1.26e-05, + "loss": 0.0529, + "step": 4206 + }, + { + "epoch": 3.312721543914927, + "grad_norm": 0.9025474786758423, + "learning_rate": 1.2603e-05, + 
"loss": 0.0678, + "step": 4207 + }, + { + "epoch": 3.3135092556124457, + "grad_norm": 0.608988881111145, + "learning_rate": 1.2606000000000001e-05, + "loss": 0.0331, + "step": 4208 + }, + { + "epoch": 3.3142969673099647, + "grad_norm": 0.889510452747345, + "learning_rate": 1.2609e-05, + "loss": 0.0584, + "step": 4209 + }, + { + "epoch": 3.3150846790074833, + "grad_norm": 0.9222556352615356, + "learning_rate": 1.2612e-05, + "loss": 0.0483, + "step": 4210 + }, + { + "epoch": 3.315872390705002, + "grad_norm": 1.1284918785095215, + "learning_rate": 1.2615e-05, + "loss": 0.4174, + "step": 4211 + }, + { + "epoch": 3.316660102402521, + "grad_norm": 1.5712335109710693, + "learning_rate": 1.2618e-05, + "loss": 0.3862, + "step": 4212 + }, + { + "epoch": 3.3174478141000394, + "grad_norm": 0.9952114224433899, + "learning_rate": 1.2621000000000002e-05, + "loss": 0.2311, + "step": 4213 + }, + { + "epoch": 3.318235525797558, + "grad_norm": 0.9052881598472595, + "learning_rate": 1.2624e-05, + "loss": 0.2543, + "step": 4214 + }, + { + "epoch": 3.319023237495077, + "grad_norm": 0.932630181312561, + "learning_rate": 1.2627e-05, + "loss": 0.1798, + "step": 4215 + }, + { + "epoch": 3.3198109491925956, + "grad_norm": 0.6634971499443054, + "learning_rate": 1.263e-05, + "loss": 0.0771, + "step": 4216 + }, + { + "epoch": 3.320598660890114, + "grad_norm": 0.8772913813591003, + "learning_rate": 1.2633e-05, + "loss": 0.0893, + "step": 4217 + }, + { + "epoch": 3.321386372587633, + "grad_norm": 3.480023145675659, + "learning_rate": 1.2636e-05, + "loss": 0.0881, + "step": 4218 + }, + { + "epoch": 3.3221740842851517, + "grad_norm": 0.620434045791626, + "learning_rate": 1.2639e-05, + "loss": 0.0913, + "step": 4219 + }, + { + "epoch": 3.3229617959826703, + "grad_norm": 0.4712977707386017, + "learning_rate": 1.2642e-05, + "loss": 0.0427, + "step": 4220 + }, + { + "epoch": 3.323749507680189, + "grad_norm": 0.8011451363563538, + "learning_rate": 1.2645e-05, + "loss": 0.0623, + "step": 4221 + }, + { + "epoch": 3.324537219377708, + "grad_norm": 0.5595332980155945, + "learning_rate": 1.2648e-05, + "loss": 0.0458, + "step": 4222 + }, + { + "epoch": 3.3253249310752264, + "grad_norm": 0.45360326766967773, + "learning_rate": 1.2651000000000001e-05, + "loss": 0.0342, + "step": 4223 + }, + { + "epoch": 3.326112642772745, + "grad_norm": 0.9444568753242493, + "learning_rate": 1.2654000000000001e-05, + "loss": 0.0559, + "step": 4224 + }, + { + "epoch": 3.326900354470264, + "grad_norm": 0.41675865650177, + "learning_rate": 1.2657000000000001e-05, + "loss": 0.0177, + "step": 4225 + }, + { + "epoch": 3.3276880661677826, + "grad_norm": 0.8433734774589539, + "learning_rate": 1.2659999999999999e-05, + "loss": 0.0626, + "step": 4226 + }, + { + "epoch": 3.328475777865301, + "grad_norm": 0.5688547492027283, + "learning_rate": 1.2662999999999999e-05, + "loss": 0.0275, + "step": 4227 + }, + { + "epoch": 3.32926348956282, + "grad_norm": 0.6491416692733765, + "learning_rate": 1.2666e-05, + "loss": 0.0419, + "step": 4228 + }, + { + "epoch": 3.3300512012603387, + "grad_norm": 0.5729240775108337, + "learning_rate": 1.2669e-05, + "loss": 0.0398, + "step": 4229 + }, + { + "epoch": 3.3308389129578573, + "grad_norm": 0.5734041929244995, + "learning_rate": 1.2672e-05, + "loss": 0.0328, + "step": 4230 + }, + { + "epoch": 3.3316266246553763, + "grad_norm": 0.5817105770111084, + "learning_rate": 1.2675e-05, + "loss": 0.0386, + "step": 4231 + }, + { + "epoch": 3.332414336352895, + "grad_norm": 1.0530307292938232, + "learning_rate": 1.2678e-05, + "loss": 0.0347, 
+ "step": 4232 + }, + { + "epoch": 3.3332020480504134, + "grad_norm": 0.34940680861473083, + "learning_rate": 1.2681000000000001e-05, + "loss": 0.0306, + "step": 4233 + }, + { + "epoch": 3.3339897597479324, + "grad_norm": 0.37531837821006775, + "learning_rate": 1.2684000000000001e-05, + "loss": 0.0339, + "step": 4234 + }, + { + "epoch": 3.334777471445451, + "grad_norm": 0.5451421141624451, + "learning_rate": 1.2687e-05, + "loss": 0.0562, + "step": 4235 + }, + { + "epoch": 3.3355651831429696, + "grad_norm": 1.000906229019165, + "learning_rate": 1.269e-05, + "loss": 0.0432, + "step": 4236 + }, + { + "epoch": 3.3363528948404886, + "grad_norm": 1.0071935653686523, + "learning_rate": 1.2693e-05, + "loss": 0.1019, + "step": 4237 + }, + { + "epoch": 3.337140606538007, + "grad_norm": 0.4755503237247467, + "learning_rate": 1.2696000000000002e-05, + "loss": 0.0261, + "step": 4238 + }, + { + "epoch": 3.3379283182355257, + "grad_norm": 0.6146532297134399, + "learning_rate": 1.2699e-05, + "loss": 0.0571, + "step": 4239 + }, + { + "epoch": 3.3387160299330443, + "grad_norm": 0.840123176574707, + "learning_rate": 1.2702e-05, + "loss": 0.0499, + "step": 4240 + }, + { + "epoch": 3.3395037416305633, + "grad_norm": 0.7891411781311035, + "learning_rate": 1.2705e-05, + "loss": 0.06, + "step": 4241 + }, + { + "epoch": 3.340291453328082, + "grad_norm": 0.5145392417907715, + "learning_rate": 1.2708e-05, + "loss": 0.0377, + "step": 4242 + }, + { + "epoch": 3.341079165025601, + "grad_norm": 0.6061033606529236, + "learning_rate": 1.2711e-05, + "loss": 0.0509, + "step": 4243 + }, + { + "epoch": 3.3418668767231194, + "grad_norm": 0.6866207718849182, + "learning_rate": 1.2714e-05, + "loss": 0.0466, + "step": 4244 + }, + { + "epoch": 3.342654588420638, + "grad_norm": 0.6267085075378418, + "learning_rate": 1.2717e-05, + "loss": 0.0375, + "step": 4245 + }, + { + "epoch": 3.3434423001181566, + "grad_norm": 0.6952674388885498, + "learning_rate": 1.272e-05, + "loss": 0.0492, + "step": 4246 + }, + { + "epoch": 3.3442300118156756, + "grad_norm": 0.941451907157898, + "learning_rate": 1.2723e-05, + "loss": 0.0393, + "step": 4247 + }, + { + "epoch": 3.345017723513194, + "grad_norm": 0.5516367554664612, + "learning_rate": 1.2726000000000001e-05, + "loss": 0.0367, + "step": 4248 + }, + { + "epoch": 3.3458054352107127, + "grad_norm": 0.6251102685928345, + "learning_rate": 1.2729000000000001e-05, + "loss": 0.0296, + "step": 4249 + }, + { + "epoch": 3.3465931469082317, + "grad_norm": 0.8186331391334534, + "learning_rate": 1.2732000000000001e-05, + "loss": 0.0397, + "step": 4250 + }, + { + "epoch": 3.3473808586057503, + "grad_norm": 0.6157922148704529, + "learning_rate": 1.2735e-05, + "loss": 0.0457, + "step": 4251 + }, + { + "epoch": 3.348168570303269, + "grad_norm": 0.793036162853241, + "learning_rate": 1.2737999999999999e-05, + "loss": 0.0406, + "step": 4252 + }, + { + "epoch": 3.348956282000788, + "grad_norm": 0.7275445461273193, + "learning_rate": 1.2741e-05, + "loss": 0.0487, + "step": 4253 + }, + { + "epoch": 3.3497439936983064, + "grad_norm": 1.1245062351226807, + "learning_rate": 1.2744e-05, + "loss": 0.0636, + "step": 4254 + }, + { + "epoch": 3.350531705395825, + "grad_norm": 0.7537308931350708, + "learning_rate": 1.2747e-05, + "loss": 0.0513, + "step": 4255 + }, + { + "epoch": 3.351319417093344, + "grad_norm": 0.7430124878883362, + "learning_rate": 1.275e-05, + "loss": 0.0285, + "step": 4256 + }, + { + "epoch": 3.3521071287908626, + "grad_norm": 0.547439694404602, + "learning_rate": 1.2753e-05, + "loss": 0.033, + "step": 4257 
+ }, + { + "epoch": 3.352894840488381, + "grad_norm": 0.9258630275726318, + "learning_rate": 1.2756000000000001e-05, + "loss": 0.0707, + "step": 4258 + }, + { + "epoch": 3.3536825521858997, + "grad_norm": 0.7675349712371826, + "learning_rate": 1.2759000000000001e-05, + "loss": 0.0566, + "step": 4259 + }, + { + "epoch": 3.3544702638834187, + "grad_norm": 0.7682092785835266, + "learning_rate": 1.2762e-05, + "loss": 0.0506, + "step": 4260 + }, + { + "epoch": 3.3552579755809373, + "grad_norm": 1.9224830865859985, + "learning_rate": 1.2765e-05, + "loss": 0.4279, + "step": 4261 + }, + { + "epoch": 3.3560456872784563, + "grad_norm": 1.092612385749817, + "learning_rate": 1.2768e-05, + "loss": 0.3527, + "step": 4262 + }, + { + "epoch": 3.356833398975975, + "grad_norm": 1.0751913785934448, + "learning_rate": 1.2771e-05, + "loss": 0.2493, + "step": 4263 + }, + { + "epoch": 3.3576211106734934, + "grad_norm": 0.8989518880844116, + "learning_rate": 1.2774e-05, + "loss": 0.1903, + "step": 4264 + }, + { + "epoch": 3.358408822371012, + "grad_norm": 1.416426420211792, + "learning_rate": 1.2777e-05, + "loss": 0.1457, + "step": 4265 + }, + { + "epoch": 3.359196534068531, + "grad_norm": 1.0193160772323608, + "learning_rate": 1.278e-05, + "loss": 0.1769, + "step": 4266 + }, + { + "epoch": 3.3599842457660496, + "grad_norm": 0.9858120679855347, + "learning_rate": 1.2783e-05, + "loss": 0.101, + "step": 4267 + }, + { + "epoch": 3.360771957463568, + "grad_norm": 0.5596039891242981, + "learning_rate": 1.2786000000000001e-05, + "loss": 0.0661, + "step": 4268 + }, + { + "epoch": 3.361559669161087, + "grad_norm": 0.6483952403068542, + "learning_rate": 1.2789e-05, + "loss": 0.0497, + "step": 4269 + }, + { + "epoch": 3.3623473808586057, + "grad_norm": 0.932285726070404, + "learning_rate": 1.2792e-05, + "loss": 0.0561, + "step": 4270 + }, + { + "epoch": 3.3631350925561243, + "grad_norm": 0.7785527110099792, + "learning_rate": 1.2795e-05, + "loss": 0.0333, + "step": 4271 + }, + { + "epoch": 3.3639228042536433, + "grad_norm": 0.480754017829895, + "learning_rate": 1.2798e-05, + "loss": 0.0228, + "step": 4272 + }, + { + "epoch": 3.364710515951162, + "grad_norm": 0.49007657170295715, + "learning_rate": 1.2801000000000002e-05, + "loss": 0.0307, + "step": 4273 + }, + { + "epoch": 3.3654982276486805, + "grad_norm": 0.8994677662849426, + "learning_rate": 1.2804000000000001e-05, + "loss": 0.0395, + "step": 4274 + }, + { + "epoch": 3.3662859393461995, + "grad_norm": 0.5983802080154419, + "learning_rate": 1.2807000000000001e-05, + "loss": 0.0647, + "step": 4275 + }, + { + "epoch": 3.367073651043718, + "grad_norm": 0.910150408744812, + "learning_rate": 1.281e-05, + "loss": 0.0394, + "step": 4276 + }, + { + "epoch": 3.3678613627412366, + "grad_norm": 0.5515006184577942, + "learning_rate": 1.2812999999999999e-05, + "loss": 0.0435, + "step": 4277 + }, + { + "epoch": 3.3686490744387556, + "grad_norm": 0.5854009389877319, + "learning_rate": 1.2816e-05, + "loss": 0.036, + "step": 4278 + }, + { + "epoch": 3.369436786136274, + "grad_norm": 0.6635561585426331, + "learning_rate": 1.2819e-05, + "loss": 0.0414, + "step": 4279 + }, + { + "epoch": 3.3702244978337927, + "grad_norm": 0.8753423094749451, + "learning_rate": 1.2822e-05, + "loss": 0.0363, + "step": 4280 + }, + { + "epoch": 3.3710122095313118, + "grad_norm": 0.6970058679580688, + "learning_rate": 1.2825e-05, + "loss": 0.0359, + "step": 4281 + }, + { + "epoch": 3.3717999212288303, + "grad_norm": 0.4681422710418701, + "learning_rate": 1.2828e-05, + "loss": 0.034, + "step": 4282 + }, + { + 
"epoch": 3.372587632926349, + "grad_norm": 0.8153987526893616, + "learning_rate": 1.2831000000000001e-05, + "loss": 0.0427, + "step": 4283 + }, + { + "epoch": 3.3733753446238675, + "grad_norm": 0.4703180491924286, + "learning_rate": 1.2834000000000001e-05, + "loss": 0.0345, + "step": 4284 + }, + { + "epoch": 3.3741630563213865, + "grad_norm": 0.46849098801612854, + "learning_rate": 1.2837000000000001e-05, + "loss": 0.0232, + "step": 4285 + }, + { + "epoch": 3.374950768018905, + "grad_norm": 0.500266432762146, + "learning_rate": 1.284e-05, + "loss": 0.0336, + "step": 4286 + }, + { + "epoch": 3.3757384797164236, + "grad_norm": 0.563986599445343, + "learning_rate": 1.2843e-05, + "loss": 0.0264, + "step": 4287 + }, + { + "epoch": 3.3765261914139426, + "grad_norm": 0.9331568479537964, + "learning_rate": 1.2846e-05, + "loss": 0.0343, + "step": 4288 + }, + { + "epoch": 3.377313903111461, + "grad_norm": 0.9601365327835083, + "learning_rate": 1.2849e-05, + "loss": 0.0481, + "step": 4289 + }, + { + "epoch": 3.3781016148089797, + "grad_norm": 0.7159706950187683, + "learning_rate": 1.2852e-05, + "loss": 0.0439, + "step": 4290 + }, + { + "epoch": 3.3788893265064988, + "grad_norm": 0.5304768681526184, + "learning_rate": 1.2855e-05, + "loss": 0.0294, + "step": 4291 + }, + { + "epoch": 3.3796770382040173, + "grad_norm": 0.9607317447662354, + "learning_rate": 1.2858e-05, + "loss": 0.0486, + "step": 4292 + }, + { + "epoch": 3.380464749901536, + "grad_norm": 0.6542136073112488, + "learning_rate": 1.2861000000000001e-05, + "loss": 0.0435, + "step": 4293 + }, + { + "epoch": 3.381252461599055, + "grad_norm": 0.7334176301956177, + "learning_rate": 1.2864e-05, + "loss": 0.0499, + "step": 4294 + }, + { + "epoch": 3.3820401732965735, + "grad_norm": 0.5551656484603882, + "learning_rate": 1.2867e-05, + "loss": 0.0447, + "step": 4295 + }, + { + "epoch": 3.382827884994092, + "grad_norm": 0.8023624420166016, + "learning_rate": 1.287e-05, + "loss": 0.0604, + "step": 4296 + }, + { + "epoch": 3.383615596691611, + "grad_norm": 0.6493339538574219, + "learning_rate": 1.2873e-05, + "loss": 0.0372, + "step": 4297 + }, + { + "epoch": 3.3844033083891296, + "grad_norm": 0.6654216051101685, + "learning_rate": 1.2876000000000002e-05, + "loss": 0.0571, + "step": 4298 + }, + { + "epoch": 3.385191020086648, + "grad_norm": 0.8177908658981323, + "learning_rate": 1.2879000000000002e-05, + "loss": 0.0529, + "step": 4299 + }, + { + "epoch": 3.385978731784167, + "grad_norm": 0.7300742268562317, + "learning_rate": 1.2882e-05, + "loss": 0.0263, + "step": 4300 + }, + { + "epoch": 3.3867664434816858, + "grad_norm": 0.5752741694450378, + "learning_rate": 1.2885e-05, + "loss": 0.0463, + "step": 4301 + }, + { + "epoch": 3.3875541551792043, + "grad_norm": 0.7149348258972168, + "learning_rate": 1.2888e-05, + "loss": 0.0397, + "step": 4302 + }, + { + "epoch": 3.388341866876723, + "grad_norm": 0.57847660779953, + "learning_rate": 1.2891e-05, + "loss": 0.0435, + "step": 4303 + }, + { + "epoch": 3.389129578574242, + "grad_norm": 0.7735539078712463, + "learning_rate": 1.2894e-05, + "loss": 0.0517, + "step": 4304 + }, + { + "epoch": 3.3899172902717605, + "grad_norm": 0.8624207973480225, + "learning_rate": 1.2897e-05, + "loss": 0.0416, + "step": 4305 + }, + { + "epoch": 3.3907050019692795, + "grad_norm": 1.0902915000915527, + "learning_rate": 1.29e-05, + "loss": 0.0547, + "step": 4306 + }, + { + "epoch": 3.391492713666798, + "grad_norm": 0.6699715852737427, + "learning_rate": 1.2903e-05, + "loss": 0.0402, + "step": 4307 + }, + { + "epoch": 
3.3922804253643166, + "grad_norm": 0.7031717896461487, + "learning_rate": 1.2906000000000001e-05, + "loss": 0.0431, + "step": 4308 + }, + { + "epoch": 3.393068137061835, + "grad_norm": 0.8910537362098694, + "learning_rate": 1.2909000000000001e-05, + "loss": 0.0489, + "step": 4309 + }, + { + "epoch": 3.393855848759354, + "grad_norm": 0.7881303429603577, + "learning_rate": 1.2912000000000001e-05, + "loss": 0.055, + "step": 4310 + }, + { + "epoch": 3.3946435604568728, + "grad_norm": 1.8764052391052246, + "learning_rate": 1.2915000000000001e-05, + "loss": 0.4681, + "step": 4311 + }, + { + "epoch": 3.3954312721543913, + "grad_norm": 1.2543952465057373, + "learning_rate": 1.2917999999999999e-05, + "loss": 0.3752, + "step": 4312 + }, + { + "epoch": 3.3962189838519103, + "grad_norm": 1.5631824731826782, + "learning_rate": 1.2921e-05, + "loss": 0.3367, + "step": 4313 + }, + { + "epoch": 3.397006695549429, + "grad_norm": 1.702230453491211, + "learning_rate": 1.2924e-05, + "loss": 0.2448, + "step": 4314 + }, + { + "epoch": 3.3977944072469475, + "grad_norm": 1.0955604314804077, + "learning_rate": 1.2927e-05, + "loss": 0.1582, + "step": 4315 + }, + { + "epoch": 3.3985821189444665, + "grad_norm": 1.0371026992797852, + "learning_rate": 1.293e-05, + "loss": 0.1253, + "step": 4316 + }, + { + "epoch": 3.399369830641985, + "grad_norm": 1.1306242942810059, + "learning_rate": 1.2933e-05, + "loss": 0.1734, + "step": 4317 + }, + { + "epoch": 3.4001575423395036, + "grad_norm": 0.7592105865478516, + "learning_rate": 1.2936000000000001e-05, + "loss": 0.0546, + "step": 4318 + }, + { + "epoch": 3.4009452540370226, + "grad_norm": 1.2015093564987183, + "learning_rate": 1.2939000000000001e-05, + "loss": 0.0485, + "step": 4319 + }, + { + "epoch": 3.401732965734541, + "grad_norm": 1.1145999431610107, + "learning_rate": 1.2942e-05, + "loss": 0.0942, + "step": 4320 + }, + { + "epoch": 3.4025206774320598, + "grad_norm": 0.6898665428161621, + "learning_rate": 1.2945e-05, + "loss": 0.0518, + "step": 4321 + }, + { + "epoch": 3.4033083891295783, + "grad_norm": 0.9680213928222656, + "learning_rate": 1.2948e-05, + "loss": 0.0618, + "step": 4322 + }, + { + "epoch": 3.4040961008270973, + "grad_norm": 0.5861354470252991, + "learning_rate": 1.2951e-05, + "loss": 0.0437, + "step": 4323 + }, + { + "epoch": 3.404883812524616, + "grad_norm": 0.5138019323348999, + "learning_rate": 1.2954000000000002e-05, + "loss": 0.0308, + "step": 4324 + }, + { + "epoch": 3.405671524222135, + "grad_norm": 0.5377653241157532, + "learning_rate": 1.2957e-05, + "loss": 0.0346, + "step": 4325 + }, + { + "epoch": 3.4064592359196535, + "grad_norm": 0.7539350986480713, + "learning_rate": 1.296e-05, + "loss": 0.055, + "step": 4326 + }, + { + "epoch": 3.407246947617172, + "grad_norm": 0.43814483284950256, + "learning_rate": 1.2963e-05, + "loss": 0.0209, + "step": 4327 + }, + { + "epoch": 3.4080346593146906, + "grad_norm": 0.3892878293991089, + "learning_rate": 1.2966e-05, + "loss": 0.0273, + "step": 4328 + }, + { + "epoch": 3.4088223710122096, + "grad_norm": 1.0930229425430298, + "learning_rate": 1.2969e-05, + "loss": 0.0421, + "step": 4329 + }, + { + "epoch": 3.409610082709728, + "grad_norm": 0.6238739490509033, + "learning_rate": 1.2972e-05, + "loss": 0.0378, + "step": 4330 + }, + { + "epoch": 3.4103977944072468, + "grad_norm": 0.6948608756065369, + "learning_rate": 1.2975e-05, + "loss": 0.0611, + "step": 4331 + }, + { + "epoch": 3.411185506104766, + "grad_norm": 0.5414325594902039, + "learning_rate": 1.2978e-05, + "loss": 0.0372, + "step": 4332 + }, + { + 
"epoch": 3.4119732178022844, + "grad_norm": 0.6074135303497314, + "learning_rate": 1.2981e-05, + "loss": 0.0406, + "step": 4333 + }, + { + "epoch": 3.412760929499803, + "grad_norm": 0.5423781275749207, + "learning_rate": 1.2984000000000001e-05, + "loss": 0.0292, + "step": 4334 + }, + { + "epoch": 3.413548641197322, + "grad_norm": 0.47212788462638855, + "learning_rate": 1.2987000000000001e-05, + "loss": 0.037, + "step": 4335 + }, + { + "epoch": 3.4143363528948405, + "grad_norm": 0.8344385623931885, + "learning_rate": 1.2990000000000001e-05, + "loss": 0.0516, + "step": 4336 + }, + { + "epoch": 3.415124064592359, + "grad_norm": 0.4756212830543518, + "learning_rate": 1.2992999999999999e-05, + "loss": 0.0337, + "step": 4337 + }, + { + "epoch": 3.415911776289878, + "grad_norm": 0.6220102906227112, + "learning_rate": 1.2995999999999999e-05, + "loss": 0.0287, + "step": 4338 + }, + { + "epoch": 3.4166994879873966, + "grad_norm": 0.5282211303710938, + "learning_rate": 1.2999e-05, + "loss": 0.0294, + "step": 4339 + }, + { + "epoch": 3.417487199684915, + "grad_norm": 0.5071007609367371, + "learning_rate": 1.3002e-05, + "loss": 0.0306, + "step": 4340 + }, + { + "epoch": 3.4182749113824342, + "grad_norm": 1.3853610754013062, + "learning_rate": 1.3005e-05, + "loss": 0.0551, + "step": 4341 + }, + { + "epoch": 3.419062623079953, + "grad_norm": 0.5367956757545471, + "learning_rate": 1.3008e-05, + "loss": 0.0445, + "step": 4342 + }, + { + "epoch": 3.4198503347774714, + "grad_norm": 0.493026465177536, + "learning_rate": 1.3011e-05, + "loss": 0.0277, + "step": 4343 + }, + { + "epoch": 3.4206380464749904, + "grad_norm": 1.3219879865646362, + "learning_rate": 1.3014000000000001e-05, + "loss": 0.0502, + "step": 4344 + }, + { + "epoch": 3.421425758172509, + "grad_norm": 0.8509877920150757, + "learning_rate": 1.3017000000000001e-05, + "loss": 0.0398, + "step": 4345 + }, + { + "epoch": 3.4222134698700275, + "grad_norm": 0.44382238388061523, + "learning_rate": 1.302e-05, + "loss": 0.0184, + "step": 4346 + }, + { + "epoch": 3.423001181567546, + "grad_norm": 0.6554411053657532, + "learning_rate": 1.3023e-05, + "loss": 0.037, + "step": 4347 + }, + { + "epoch": 3.423788893265065, + "grad_norm": 0.5189496278762817, + "learning_rate": 1.3026e-05, + "loss": 0.0418, + "step": 4348 + }, + { + "epoch": 3.4245766049625836, + "grad_norm": 0.3905388116836548, + "learning_rate": 1.3029e-05, + "loss": 0.0268, + "step": 4349 + }, + { + "epoch": 3.425364316660102, + "grad_norm": 0.6919323801994324, + "learning_rate": 1.3032e-05, + "loss": 0.0319, + "step": 4350 + }, + { + "epoch": 3.4261520283576212, + "grad_norm": 0.5787458419799805, + "learning_rate": 1.3035e-05, + "loss": 0.0267, + "step": 4351 + }, + { + "epoch": 3.42693974005514, + "grad_norm": 0.5164385437965393, + "learning_rate": 1.3038e-05, + "loss": 0.0318, + "step": 4352 + }, + { + "epoch": 3.4277274517526584, + "grad_norm": 0.8131687045097351, + "learning_rate": 1.3041e-05, + "loss": 0.0491, + "step": 4353 + }, + { + "epoch": 3.4285151634501774, + "grad_norm": 0.6988740563392639, + "learning_rate": 1.3044e-05, + "loss": 0.0453, + "step": 4354 + }, + { + "epoch": 3.429302875147696, + "grad_norm": 0.731473445892334, + "learning_rate": 1.3047e-05, + "loss": 0.0569, + "step": 4355 + }, + { + "epoch": 3.4300905868452145, + "grad_norm": 0.703393280506134, + "learning_rate": 1.305e-05, + "loss": 0.0379, + "step": 4356 + }, + { + "epoch": 3.4308782985427335, + "grad_norm": 0.8869132995605469, + "learning_rate": 1.3053e-05, + "loss": 0.0404, + "step": 4357 + }, + { + "epoch": 
3.431666010240252, + "grad_norm": 0.709222674369812, + "learning_rate": 1.3056e-05, + "loss": 0.0482, + "step": 4358 + }, + { + "epoch": 3.4324537219377707, + "grad_norm": 0.7927533984184265, + "learning_rate": 1.3059000000000002e-05, + "loss": 0.0284, + "step": 4359 + }, + { + "epoch": 3.4332414336352897, + "grad_norm": 0.8191909193992615, + "learning_rate": 1.3062000000000001e-05, + "loss": 0.0426, + "step": 4360 + }, + { + "epoch": 3.4340291453328082, + "grad_norm": 2.4612395763397217, + "learning_rate": 1.3065000000000001e-05, + "loss": 0.4357, + "step": 4361 + }, + { + "epoch": 3.434816857030327, + "grad_norm": 1.2643779516220093, + "learning_rate": 1.3068e-05, + "loss": 0.3204, + "step": 4362 + }, + { + "epoch": 3.435604568727846, + "grad_norm": 0.8449984192848206, + "learning_rate": 1.3070999999999999e-05, + "loss": 0.2378, + "step": 4363 + }, + { + "epoch": 3.4363922804253644, + "grad_norm": 1.1791229248046875, + "learning_rate": 1.3074e-05, + "loss": 0.2167, + "step": 4364 + }, + { + "epoch": 3.437179992122883, + "grad_norm": 0.7561640739440918, + "learning_rate": 1.3077e-05, + "loss": 0.1016, + "step": 4365 + }, + { + "epoch": 3.4379677038204015, + "grad_norm": 0.692566454410553, + "learning_rate": 1.308e-05, + "loss": 0.0653, + "step": 4366 + }, + { + "epoch": 3.4387554155179205, + "grad_norm": 0.5355672240257263, + "learning_rate": 1.3083e-05, + "loss": 0.07, + "step": 4367 + }, + { + "epoch": 3.439543127215439, + "grad_norm": 0.6395969390869141, + "learning_rate": 1.3086e-05, + "loss": 0.0717, + "step": 4368 + }, + { + "epoch": 3.440330838912958, + "grad_norm": 0.599572479724884, + "learning_rate": 1.3089000000000001e-05, + "loss": 0.0395, + "step": 4369 + }, + { + "epoch": 3.4411185506104767, + "grad_norm": 0.5067611932754517, + "learning_rate": 1.3092000000000001e-05, + "loss": 0.0412, + "step": 4370 + }, + { + "epoch": 3.4419062623079952, + "grad_norm": 0.8068516254425049, + "learning_rate": 1.3095e-05, + "loss": 0.0446, + "step": 4371 + }, + { + "epoch": 3.442693974005514, + "grad_norm": 0.751352846622467, + "learning_rate": 1.3098e-05, + "loss": 0.0466, + "step": 4372 + }, + { + "epoch": 3.443481685703033, + "grad_norm": 1.0376030206680298, + "learning_rate": 1.3101e-05, + "loss": 0.0561, + "step": 4373 + }, + { + "epoch": 3.4442693974005514, + "grad_norm": 0.4469934105873108, + "learning_rate": 1.3104e-05, + "loss": 0.0277, + "step": 4374 + }, + { + "epoch": 3.44505710909807, + "grad_norm": 0.846080482006073, + "learning_rate": 1.3107e-05, + "loss": 0.0513, + "step": 4375 + }, + { + "epoch": 3.445844820795589, + "grad_norm": 0.39386221766471863, + "learning_rate": 1.311e-05, + "loss": 0.0289, + "step": 4376 + }, + { + "epoch": 3.4466325324931075, + "grad_norm": 0.6623865365982056, + "learning_rate": 1.3113e-05, + "loss": 0.0457, + "step": 4377 + }, + { + "epoch": 3.447420244190626, + "grad_norm": 0.634091854095459, + "learning_rate": 1.3116e-05, + "loss": 0.0345, + "step": 4378 + }, + { + "epoch": 3.448207955888145, + "grad_norm": 0.6744831204414368, + "learning_rate": 1.3119000000000001e-05, + "loss": 0.0372, + "step": 4379 + }, + { + "epoch": 3.4489956675856637, + "grad_norm": 0.6980535984039307, + "learning_rate": 1.3122e-05, + "loss": 0.0392, + "step": 4380 + }, + { + "epoch": 3.4497833792831822, + "grad_norm": 0.7403380274772644, + "learning_rate": 1.3125e-05, + "loss": 0.0322, + "step": 4381 + }, + { + "epoch": 3.4505710909807012, + "grad_norm": 0.618099570274353, + "learning_rate": 1.3128e-05, + "loss": 0.0378, + "step": 4382 + }, + { + "epoch": 3.45135880267822, 
+ "grad_norm": 0.5577452778816223, + "learning_rate": 1.3131e-05, + "loss": 0.041, + "step": 4383 + }, + { + "epoch": 3.4521465143757384, + "grad_norm": 0.5546232461929321, + "learning_rate": 1.3134000000000002e-05, + "loss": 0.0499, + "step": 4384 + }, + { + "epoch": 3.452934226073257, + "grad_norm": 0.9579913020133972, + "learning_rate": 1.3137000000000001e-05, + "loss": 0.0548, + "step": 4385 + }, + { + "epoch": 3.453721937770776, + "grad_norm": 0.6456385254859924, + "learning_rate": 1.314e-05, + "loss": 0.0383, + "step": 4386 + }, + { + "epoch": 3.4545096494682945, + "grad_norm": 1.018191933631897, + "learning_rate": 1.3143e-05, + "loss": 0.0481, + "step": 4387 + }, + { + "epoch": 3.4552973611658135, + "grad_norm": 0.664828360080719, + "learning_rate": 1.3146e-05, + "loss": 0.0281, + "step": 4388 + }, + { + "epoch": 3.456085072863332, + "grad_norm": 0.5802586674690247, + "learning_rate": 1.3149e-05, + "loss": 0.0414, + "step": 4389 + }, + { + "epoch": 3.4568727845608507, + "grad_norm": 0.6665833592414856, + "learning_rate": 1.3152e-05, + "loss": 0.0319, + "step": 4390 + }, + { + "epoch": 3.4576604962583692, + "grad_norm": 0.7914044260978699, + "learning_rate": 1.3155e-05, + "loss": 0.0564, + "step": 4391 + }, + { + "epoch": 3.4584482079558883, + "grad_norm": 0.7539932727813721, + "learning_rate": 1.3158e-05, + "loss": 0.0393, + "step": 4392 + }, + { + "epoch": 3.459235919653407, + "grad_norm": 0.575615644454956, + "learning_rate": 1.3161e-05, + "loss": 0.0373, + "step": 4393 + }, + { + "epoch": 3.4600236313509254, + "grad_norm": 1.2584296464920044, + "learning_rate": 1.3164000000000001e-05, + "loss": 0.066, + "step": 4394 + }, + { + "epoch": 3.4608113430484444, + "grad_norm": 0.709675669670105, + "learning_rate": 1.3167000000000001e-05, + "loss": 0.0367, + "step": 4395 + }, + { + "epoch": 3.461599054745963, + "grad_norm": 0.8897528648376465, + "learning_rate": 1.3170000000000001e-05, + "loss": 0.0358, + "step": 4396 + }, + { + "epoch": 3.4623867664434815, + "grad_norm": 0.8151706457138062, + "learning_rate": 1.3173e-05, + "loss": 0.0439, + "step": 4397 + }, + { + "epoch": 3.4631744781410005, + "grad_norm": 0.49639156460762024, + "learning_rate": 1.3175999999999999e-05, + "loss": 0.0308, + "step": 4398 + }, + { + "epoch": 3.463962189838519, + "grad_norm": 0.8934942483901978, + "learning_rate": 1.3179e-05, + "loss": 0.0353, + "step": 4399 + }, + { + "epoch": 3.4647499015360377, + "grad_norm": 1.1196657419204712, + "learning_rate": 1.3182e-05, + "loss": 0.0663, + "step": 4400 + }, + { + "epoch": 3.4655376132335567, + "grad_norm": 0.7877858877182007, + "learning_rate": 1.3185e-05, + "loss": 0.045, + "step": 4401 + }, + { + "epoch": 3.4663253249310753, + "grad_norm": 1.176571249961853, + "learning_rate": 1.3188e-05, + "loss": 0.0513, + "step": 4402 + }, + { + "epoch": 3.467113036628594, + "grad_norm": 0.7639464139938354, + "learning_rate": 1.3191e-05, + "loss": 0.0556, + "step": 4403 + }, + { + "epoch": 3.467900748326113, + "grad_norm": 0.450760155916214, + "learning_rate": 1.3194000000000001e-05, + "loss": 0.0276, + "step": 4404 + }, + { + "epoch": 3.4686884600236314, + "grad_norm": 1.5761055946350098, + "learning_rate": 1.3197000000000001e-05, + "loss": 0.0618, + "step": 4405 + }, + { + "epoch": 3.46947617172115, + "grad_norm": 0.6764607429504395, + "learning_rate": 1.32e-05, + "loss": 0.0289, + "step": 4406 + }, + { + "epoch": 3.470263883418669, + "grad_norm": 0.9872034788131714, + "learning_rate": 1.3203e-05, + "loss": 0.0633, + "step": 4407 + }, + { + "epoch": 3.4710515951161875, + 
"grad_norm": 1.631022334098816, + "learning_rate": 1.3206e-05, + "loss": 0.0546, + "step": 4408 + }, + { + "epoch": 3.471839306813706, + "grad_norm": 0.9434762001037598, + "learning_rate": 1.3209000000000002e-05, + "loss": 0.0684, + "step": 4409 + }, + { + "epoch": 3.4726270185112247, + "grad_norm": 1.346133828163147, + "learning_rate": 1.3212000000000002e-05, + "loss": 0.0665, + "step": 4410 + }, + { + "epoch": 3.4734147302087437, + "grad_norm": 1.6345129013061523, + "learning_rate": 1.3215e-05, + "loss": 0.4489, + "step": 4411 + }, + { + "epoch": 3.4742024419062623, + "grad_norm": 1.0559735298156738, + "learning_rate": 1.3218e-05, + "loss": 0.339, + "step": 4412 + }, + { + "epoch": 3.474990153603781, + "grad_norm": 1.0621230602264404, + "learning_rate": 1.3221e-05, + "loss": 0.2922, + "step": 4413 + }, + { + "epoch": 3.4757778653013, + "grad_norm": 1.0709342956542969, + "learning_rate": 1.3224e-05, + "loss": 0.2315, + "step": 4414 + }, + { + "epoch": 3.4765655769988184, + "grad_norm": 0.9823777079582214, + "learning_rate": 1.3227e-05, + "loss": 0.1351, + "step": 4415 + }, + { + "epoch": 3.477353288696337, + "grad_norm": 1.077075719833374, + "learning_rate": 1.323e-05, + "loss": 0.1758, + "step": 4416 + }, + { + "epoch": 3.478141000393856, + "grad_norm": 0.5154903531074524, + "learning_rate": 1.3233e-05, + "loss": 0.0455, + "step": 4417 + }, + { + "epoch": 3.4789287120913746, + "grad_norm": 0.49221110343933105, + "learning_rate": 1.3236e-05, + "loss": 0.0476, + "step": 4418 + }, + { + "epoch": 3.479716423788893, + "grad_norm": 0.49967122077941895, + "learning_rate": 1.3239000000000001e-05, + "loss": 0.0403, + "step": 4419 + }, + { + "epoch": 3.480504135486412, + "grad_norm": 0.4885390102863312, + "learning_rate": 1.3242000000000001e-05, + "loss": 0.0425, + "step": 4420 + }, + { + "epoch": 3.4812918471839307, + "grad_norm": 0.7519790530204773, + "learning_rate": 1.3245000000000001e-05, + "loss": 0.0449, + "step": 4421 + }, + { + "epoch": 3.4820795588814493, + "grad_norm": 1.0102506875991821, + "learning_rate": 1.3248000000000001e-05, + "loss": 0.0535, + "step": 4422 + }, + { + "epoch": 3.4828672705789683, + "grad_norm": 0.7890254855155945, + "learning_rate": 1.3250999999999999e-05, + "loss": 0.038, + "step": 4423 + }, + { + "epoch": 3.483654982276487, + "grad_norm": 0.36521169543266296, + "learning_rate": 1.3254e-05, + "loss": 0.0293, + "step": 4424 + }, + { + "epoch": 3.4844426939740054, + "grad_norm": 0.9640789031982422, + "learning_rate": 1.3257e-05, + "loss": 0.0389, + "step": 4425 + }, + { + "epoch": 3.4852304056715244, + "grad_norm": 0.682693600654602, + "learning_rate": 1.326e-05, + "loss": 0.0344, + "step": 4426 + }, + { + "epoch": 3.486018117369043, + "grad_norm": 0.5153148174285889, + "learning_rate": 1.3263e-05, + "loss": 0.0214, + "step": 4427 + }, + { + "epoch": 3.4868058290665616, + "grad_norm": 0.5954959988594055, + "learning_rate": 1.3266e-05, + "loss": 0.0374, + "step": 4428 + }, + { + "epoch": 3.48759354076408, + "grad_norm": 0.8668987154960632, + "learning_rate": 1.3269000000000001e-05, + "loss": 0.0324, + "step": 4429 + }, + { + "epoch": 3.488381252461599, + "grad_norm": 0.6700613498687744, + "learning_rate": 1.3272000000000001e-05, + "loss": 0.0395, + "step": 4430 + }, + { + "epoch": 3.4891689641591177, + "grad_norm": 0.5456323623657227, + "learning_rate": 1.3275e-05, + "loss": 0.0414, + "step": 4431 + }, + { + "epoch": 3.4899566758566367, + "grad_norm": 0.874834418296814, + "learning_rate": 1.3278e-05, + "loss": 0.0572, + "step": 4432 + }, + { + "epoch": 
3.4907443875541553, + "grad_norm": 0.39359912276268005, + "learning_rate": 1.3281e-05, + "loss": 0.0178, + "step": 4433 + }, + { + "epoch": 3.491532099251674, + "grad_norm": 0.4736989736557007, + "learning_rate": 1.3284000000000002e-05, + "loss": 0.0226, + "step": 4434 + }, + { + "epoch": 3.4923198109491924, + "grad_norm": 0.7741962671279907, + "learning_rate": 1.3287e-05, + "loss": 0.0413, + "step": 4435 + }, + { + "epoch": 3.4931075226467114, + "grad_norm": 0.6763736605644226, + "learning_rate": 1.329e-05, + "loss": 0.0395, + "step": 4436 + }, + { + "epoch": 3.49389523434423, + "grad_norm": 0.8367438912391663, + "learning_rate": 1.3293e-05, + "loss": 0.0419, + "step": 4437 + }, + { + "epoch": 3.4946829460417486, + "grad_norm": 0.42102792859077454, + "learning_rate": 1.3296e-05, + "loss": 0.0277, + "step": 4438 + }, + { + "epoch": 3.4954706577392676, + "grad_norm": 1.0858253240585327, + "learning_rate": 1.3299000000000001e-05, + "loss": 0.0312, + "step": 4439 + }, + { + "epoch": 3.496258369436786, + "grad_norm": 0.8242518901824951, + "learning_rate": 1.3302e-05, + "loss": 0.0419, + "step": 4440 + }, + { + "epoch": 3.4970460811343047, + "grad_norm": 1.1794750690460205, + "learning_rate": 1.3305e-05, + "loss": 0.039, + "step": 4441 + }, + { + "epoch": 3.4978337928318237, + "grad_norm": 0.9782588481903076, + "learning_rate": 1.3308e-05, + "loss": 0.0375, + "step": 4442 + }, + { + "epoch": 3.4986215045293423, + "grad_norm": 0.7809491753578186, + "learning_rate": 1.3311e-05, + "loss": 0.0505, + "step": 4443 + }, + { + "epoch": 3.499409216226861, + "grad_norm": 0.7372303009033203, + "learning_rate": 1.3314e-05, + "loss": 0.0514, + "step": 4444 + }, + { + "epoch": 3.50019692792438, + "grad_norm": 0.6095656156539917, + "learning_rate": 1.3317000000000001e-05, + "loss": 0.0469, + "step": 4445 + }, + { + "epoch": 3.5009846396218984, + "grad_norm": 0.699089765548706, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.031, + "step": 4446 + }, + { + "epoch": 3.501772351319417, + "grad_norm": 0.7517332434654236, + "learning_rate": 1.3323000000000001e-05, + "loss": 0.0567, + "step": 4447 + }, + { + "epoch": 3.5025600630169356, + "grad_norm": 0.6226314902305603, + "learning_rate": 1.3325999999999999e-05, + "loss": 0.0386, + "step": 4448 + }, + { + "epoch": 3.5033477747144546, + "grad_norm": 0.6902349591255188, + "learning_rate": 1.3328999999999999e-05, + "loss": 0.0527, + "step": 4449 + }, + { + "epoch": 3.504135486411973, + "grad_norm": 0.8033363819122314, + "learning_rate": 1.3332e-05, + "loss": 0.0547, + "step": 4450 + }, + { + "epoch": 3.504923198109492, + "grad_norm": 0.8249981999397278, + "learning_rate": 1.3335e-05, + "loss": 0.0579, + "step": 4451 + }, + { + "epoch": 3.5057109098070107, + "grad_norm": 0.3759691119194031, + "learning_rate": 1.3338e-05, + "loss": 0.0276, + "step": 4452 + }, + { + "epoch": 3.5064986215045293, + "grad_norm": 0.601763129234314, + "learning_rate": 1.3341e-05, + "loss": 0.0446, + "step": 4453 + }, + { + "epoch": 3.507286333202048, + "grad_norm": 0.8268535137176514, + "learning_rate": 1.3344e-05, + "loss": 0.0652, + "step": 4454 + }, + { + "epoch": 3.508074044899567, + "grad_norm": 0.6364887356758118, + "learning_rate": 1.3347000000000001e-05, + "loss": 0.0468, + "step": 4455 + }, + { + "epoch": 3.5088617565970854, + "grad_norm": 0.5921075940132141, + "learning_rate": 1.3350000000000001e-05, + "loss": 0.0305, + "step": 4456 + }, + { + "epoch": 3.5096494682946044, + "grad_norm": 0.49598029255867004, + "learning_rate": 1.3353e-05, + "loss": 0.0271, + "step": 4457 + }, 
+ { + "epoch": 3.510437179992123, + "grad_norm": 0.7807714939117432, + "learning_rate": 1.3356e-05, + "loss": 0.0691, + "step": 4458 + }, + { + "epoch": 3.5112248916896416, + "grad_norm": 1.132732629776001, + "learning_rate": 1.3359e-05, + "loss": 0.0876, + "step": 4459 + }, + { + "epoch": 3.51201260338716, + "grad_norm": 0.9237645864486694, + "learning_rate": 1.3362e-05, + "loss": 0.0393, + "step": 4460 + }, + { + "epoch": 3.512800315084679, + "grad_norm": 1.5806350708007812, + "learning_rate": 1.3365e-05, + "loss": 0.4085, + "step": 4461 + }, + { + "epoch": 3.5135880267821977, + "grad_norm": 1.4201871156692505, + "learning_rate": 1.3368e-05, + "loss": 0.2904, + "step": 4462 + }, + { + "epoch": 3.5143757384797163, + "grad_norm": 0.8419984579086304, + "learning_rate": 1.3371e-05, + "loss": 0.1839, + "step": 4463 + }, + { + "epoch": 3.5151634501772353, + "grad_norm": 0.9199970364570618, + "learning_rate": 1.3374e-05, + "loss": 0.1833, + "step": 4464 + }, + { + "epoch": 3.515951161874754, + "grad_norm": 1.2706702947616577, + "learning_rate": 1.3377e-05, + "loss": 0.1922, + "step": 4465 + }, + { + "epoch": 3.5167388735722724, + "grad_norm": 0.4404146671295166, + "learning_rate": 1.338e-05, + "loss": 0.0553, + "step": 4466 + }, + { + "epoch": 3.517526585269791, + "grad_norm": 0.5647871494293213, + "learning_rate": 1.3383e-05, + "loss": 0.0547, + "step": 4467 + }, + { + "epoch": 3.51831429696731, + "grad_norm": 0.707777202129364, + "learning_rate": 1.3386e-05, + "loss": 0.0598, + "step": 4468 + }, + { + "epoch": 3.5191020086648286, + "grad_norm": 0.3537159264087677, + "learning_rate": 1.3389e-05, + "loss": 0.0264, + "step": 4469 + }, + { + "epoch": 3.5198897203623476, + "grad_norm": 0.562580943107605, + "learning_rate": 1.3392000000000002e-05, + "loss": 0.057, + "step": 4470 + }, + { + "epoch": 3.520677432059866, + "grad_norm": 0.8344609141349792, + "learning_rate": 1.3395000000000001e-05, + "loss": 0.0552, + "step": 4471 + }, + { + "epoch": 3.5214651437573847, + "grad_norm": 0.6193169951438904, + "learning_rate": 1.3398e-05, + "loss": 0.0668, + "step": 4472 + }, + { + "epoch": 3.5222528554549033, + "grad_norm": 0.803816556930542, + "learning_rate": 1.3401e-05, + "loss": 0.0596, + "step": 4473 + }, + { + "epoch": 3.5230405671524223, + "grad_norm": 0.6139846444129944, + "learning_rate": 1.3403999999999999e-05, + "loss": 0.0318, + "step": 4474 + }, + { + "epoch": 3.523828278849941, + "grad_norm": 0.3656644821166992, + "learning_rate": 1.3407e-05, + "loss": 0.0253, + "step": 4475 + }, + { + "epoch": 3.52461599054746, + "grad_norm": 0.5374866127967834, + "learning_rate": 1.341e-05, + "loss": 0.0437, + "step": 4476 + }, + { + "epoch": 3.5254037022449785, + "grad_norm": 0.7629187107086182, + "learning_rate": 1.3413e-05, + "loss": 0.0511, + "step": 4477 + }, + { + "epoch": 3.526191413942497, + "grad_norm": 0.7291738986968994, + "learning_rate": 1.3416e-05, + "loss": 0.0293, + "step": 4478 + }, + { + "epoch": 3.5269791256400156, + "grad_norm": 0.9126257300376892, + "learning_rate": 1.3419e-05, + "loss": 0.0423, + "step": 4479 + }, + { + "epoch": 3.5277668373375346, + "grad_norm": 0.7843734622001648, + "learning_rate": 1.3422000000000001e-05, + "loss": 0.0488, + "step": 4480 + }, + { + "epoch": 3.528554549035053, + "grad_norm": 0.6919388771057129, + "learning_rate": 1.3425000000000001e-05, + "loss": 0.0398, + "step": 4481 + }, + { + "epoch": 3.5293422607325717, + "grad_norm": 0.5705134868621826, + "learning_rate": 1.3428000000000001e-05, + "loss": 0.0428, + "step": 4482 + }, + { + "epoch": 
3.5301299724300907, + "grad_norm": 0.6566071510314941, + "learning_rate": 1.3431e-05, + "loss": 0.0411, + "step": 4483 + }, + { + "epoch": 3.5309176841276093, + "grad_norm": 0.603489339351654, + "learning_rate": 1.3433999999999999e-05, + "loss": 0.0465, + "step": 4484 + }, + { + "epoch": 3.531705395825128, + "grad_norm": 0.5092867016792297, + "learning_rate": 1.3437e-05, + "loss": 0.044, + "step": 4485 + }, + { + "epoch": 3.5324931075226464, + "grad_norm": 0.6933104395866394, + "learning_rate": 1.344e-05, + "loss": 0.0523, + "step": 4486 + }, + { + "epoch": 3.5332808192201655, + "grad_norm": 0.8128195405006409, + "learning_rate": 1.3443e-05, + "loss": 0.0195, + "step": 4487 + }, + { + "epoch": 3.534068530917684, + "grad_norm": 0.47508832812309265, + "learning_rate": 1.3446e-05, + "loss": 0.0277, + "step": 4488 + }, + { + "epoch": 3.534856242615203, + "grad_norm": 0.3646450638771057, + "learning_rate": 1.3449e-05, + "loss": 0.028, + "step": 4489 + }, + { + "epoch": 3.5356439543127216, + "grad_norm": 0.6450896263122559, + "learning_rate": 1.3452000000000001e-05, + "loss": 0.0412, + "step": 4490 + }, + { + "epoch": 3.53643166601024, + "grad_norm": 1.6521621942520142, + "learning_rate": 1.3455e-05, + "loss": 0.0451, + "step": 4491 + }, + { + "epoch": 3.5372193777077587, + "grad_norm": 0.8346592783927917, + "learning_rate": 1.3458e-05, + "loss": 0.0521, + "step": 4492 + }, + { + "epoch": 3.5380070894052777, + "grad_norm": 0.643434464931488, + "learning_rate": 1.3461e-05, + "loss": 0.0441, + "step": 4493 + }, + { + "epoch": 3.5387948011027963, + "grad_norm": 0.6627169251441956, + "learning_rate": 1.3464e-05, + "loss": 0.038, + "step": 4494 + }, + { + "epoch": 3.5395825128003153, + "grad_norm": 1.064671516418457, + "learning_rate": 1.3467000000000002e-05, + "loss": 0.0426, + "step": 4495 + }, + { + "epoch": 3.540370224497834, + "grad_norm": 0.4710356891155243, + "learning_rate": 1.3470000000000001e-05, + "loss": 0.0239, + "step": 4496 + }, + { + "epoch": 3.5411579361953525, + "grad_norm": 0.7374324202537537, + "learning_rate": 1.3473e-05, + "loss": 0.03, + "step": 4497 + }, + { + "epoch": 3.541945647892871, + "grad_norm": 0.8377308249473572, + "learning_rate": 1.3476e-05, + "loss": 0.0404, + "step": 4498 + }, + { + "epoch": 3.54273335959039, + "grad_norm": 0.8576449751853943, + "learning_rate": 1.3479e-05, + "loss": 0.0656, + "step": 4499 + }, + { + "epoch": 3.5435210712879086, + "grad_norm": 0.6797581911087036, + "learning_rate": 1.3482e-05, + "loss": 0.0422, + "step": 4500 + }, + { + "epoch": 3.544308782985427, + "grad_norm": 1.9214550256729126, + "learning_rate": 1.3485e-05, + "loss": 0.0483, + "step": 4501 + }, + { + "epoch": 3.545096494682946, + "grad_norm": 1.2564343214035034, + "learning_rate": 1.3488e-05, + "loss": 0.0719, + "step": 4502 + }, + { + "epoch": 3.5458842063804648, + "grad_norm": 0.6697081327438354, + "learning_rate": 1.3491e-05, + "loss": 0.0413, + "step": 4503 + }, + { + "epoch": 3.5466719180779833, + "grad_norm": 0.7563186883926392, + "learning_rate": 1.3494e-05, + "loss": 0.0604, + "step": 4504 + }, + { + "epoch": 3.5474596297755023, + "grad_norm": 0.690087616443634, + "learning_rate": 1.3497000000000001e-05, + "loss": 0.0537, + "step": 4505 + }, + { + "epoch": 3.548247341473021, + "grad_norm": 0.5806155800819397, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.0352, + "step": 4506 + }, + { + "epoch": 3.5490350531705395, + "grad_norm": 0.8868584632873535, + "learning_rate": 1.3503000000000001e-05, + "loss": 0.0889, + "step": 4507 + }, + { + "epoch": 
3.5498227648680585, + "grad_norm": 0.7181980609893799, + "learning_rate": 1.3506e-05, + "loss": 0.0523, + "step": 4508 + }, + { + "epoch": 3.550610476565577, + "grad_norm": 0.7592438459396362, + "learning_rate": 1.3508999999999999e-05, + "loss": 0.0512, + "step": 4509 + }, + { + "epoch": 3.5513981882630956, + "grad_norm": 1.1744804382324219, + "learning_rate": 1.3512e-05, + "loss": 0.0841, + "step": 4510 + }, + { + "epoch": 3.552185899960614, + "grad_norm": 1.738540768623352, + "learning_rate": 1.3515e-05, + "loss": 0.369, + "step": 4511 + }, + { + "epoch": 3.552973611658133, + "grad_norm": 0.9719489812850952, + "learning_rate": 1.3518e-05, + "loss": 0.2731, + "step": 4512 + }, + { + "epoch": 3.5537613233556518, + "grad_norm": 1.0206208229064941, + "learning_rate": 1.3521e-05, + "loss": 0.2264, + "step": 4513 + }, + { + "epoch": 3.5545490350531708, + "grad_norm": 1.0781677961349487, + "learning_rate": 1.3524e-05, + "loss": 0.2576, + "step": 4514 + }, + { + "epoch": 3.5553367467506893, + "grad_norm": 0.8284268975257874, + "learning_rate": 1.3527000000000001e-05, + "loss": 0.1446, + "step": 4515 + }, + { + "epoch": 3.556124458448208, + "grad_norm": 0.678076446056366, + "learning_rate": 1.3530000000000001e-05, + "loss": 0.0791, + "step": 4516 + }, + { + "epoch": 3.5569121701457265, + "grad_norm": 0.6316031813621521, + "learning_rate": 1.3533e-05, + "loss": 0.0749, + "step": 4517 + }, + { + "epoch": 3.5576998818432455, + "grad_norm": 0.9171388149261475, + "learning_rate": 1.3536e-05, + "loss": 0.091, + "step": 4518 + }, + { + "epoch": 3.558487593540764, + "grad_norm": 0.7491846680641174, + "learning_rate": 1.3539e-05, + "loss": 0.0503, + "step": 4519 + }, + { + "epoch": 3.559275305238283, + "grad_norm": 0.6634436845779419, + "learning_rate": 1.3542000000000002e-05, + "loss": 0.0565, + "step": 4520 + }, + { + "epoch": 3.5600630169358016, + "grad_norm": 0.46499380469322205, + "learning_rate": 1.3545e-05, + "loss": 0.0229, + "step": 4521 + }, + { + "epoch": 3.56085072863332, + "grad_norm": 0.37323084473609924, + "learning_rate": 1.3548e-05, + "loss": 0.0214, + "step": 4522 + }, + { + "epoch": 3.5616384403308388, + "grad_norm": 0.5967015027999878, + "learning_rate": 1.3551e-05, + "loss": 0.0396, + "step": 4523 + }, + { + "epoch": 3.5624261520283578, + "grad_norm": 0.3952008783817291, + "learning_rate": 1.3554e-05, + "loss": 0.0359, + "step": 4524 + }, + { + "epoch": 3.5632138637258763, + "grad_norm": 0.3919144570827484, + "learning_rate": 1.3557e-05, + "loss": 0.0196, + "step": 4525 + }, + { + "epoch": 3.564001575423395, + "grad_norm": 0.5122617483139038, + "learning_rate": 1.356e-05, + "loss": 0.0302, + "step": 4526 + }, + { + "epoch": 3.564789287120914, + "grad_norm": 0.49006959795951843, + "learning_rate": 1.3563e-05, + "loss": 0.0394, + "step": 4527 + }, + { + "epoch": 3.5655769988184325, + "grad_norm": 0.6295151710510254, + "learning_rate": 1.3566e-05, + "loss": 0.0305, + "step": 4528 + }, + { + "epoch": 3.566364710515951, + "grad_norm": 0.5344297289848328, + "learning_rate": 1.3569e-05, + "loss": 0.0409, + "step": 4529 + }, + { + "epoch": 3.5671524222134696, + "grad_norm": 0.47690749168395996, + "learning_rate": 1.3572000000000002e-05, + "loss": 0.0201, + "step": 4530 + }, + { + "epoch": 3.5679401339109886, + "grad_norm": 0.6280548572540283, + "learning_rate": 1.3575000000000001e-05, + "loss": 0.0307, + "step": 4531 + }, + { + "epoch": 3.568727845608507, + "grad_norm": 0.5516762733459473, + "learning_rate": 1.3578000000000001e-05, + "loss": 0.0364, + "step": 4532 + }, + { + "epoch": 
3.569515557306026, + "grad_norm": 0.42058616876602173, + "learning_rate": 1.3581000000000001e-05, + "loss": 0.0303, + "step": 4533 + }, + { + "epoch": 3.5703032690035448, + "grad_norm": 0.5064677596092224, + "learning_rate": 1.3583999999999999e-05, + "loss": 0.0402, + "step": 4534 + }, + { + "epoch": 3.5710909807010633, + "grad_norm": 0.6367992162704468, + "learning_rate": 1.3587e-05, + "loss": 0.0508, + "step": 4535 + }, + { + "epoch": 3.571878692398582, + "grad_norm": 0.5603300333023071, + "learning_rate": 1.359e-05, + "loss": 0.0338, + "step": 4536 + }, + { + "epoch": 3.572666404096101, + "grad_norm": 0.7877813577651978, + "learning_rate": 1.3593e-05, + "loss": 0.0292, + "step": 4537 + }, + { + "epoch": 3.5734541157936195, + "grad_norm": 0.7061344385147095, + "learning_rate": 1.3596e-05, + "loss": 0.0415, + "step": 4538 + }, + { + "epoch": 3.5742418274911385, + "grad_norm": 0.6024488210678101, + "learning_rate": 1.3599e-05, + "loss": 0.0531, + "step": 4539 + }, + { + "epoch": 3.575029539188657, + "grad_norm": 0.7078216671943665, + "learning_rate": 1.3602000000000001e-05, + "loss": 0.0362, + "step": 4540 + }, + { + "epoch": 3.5758172508861756, + "grad_norm": 0.9344009757041931, + "learning_rate": 1.3605000000000001e-05, + "loss": 0.0406, + "step": 4541 + }, + { + "epoch": 3.576604962583694, + "grad_norm": 0.7130720615386963, + "learning_rate": 1.3608e-05, + "loss": 0.0376, + "step": 4542 + }, + { + "epoch": 3.577392674281213, + "grad_norm": 0.5234359502792358, + "learning_rate": 1.3611e-05, + "loss": 0.0408, + "step": 4543 + }, + { + "epoch": 3.578180385978732, + "grad_norm": 3.7866742610931396, + "learning_rate": 1.3614e-05, + "loss": 0.0509, + "step": 4544 + }, + { + "epoch": 3.5789680976762503, + "grad_norm": 0.7235111594200134, + "learning_rate": 1.3617000000000002e-05, + "loss": 0.042, + "step": 4545 + }, + { + "epoch": 3.5797558093737694, + "grad_norm": 0.5735681056976318, + "learning_rate": 1.362e-05, + "loss": 0.0437, + "step": 4546 + }, + { + "epoch": 3.580543521071288, + "grad_norm": 3.1087069511413574, + "learning_rate": 1.3623e-05, + "loss": 0.0564, + "step": 4547 + }, + { + "epoch": 3.5813312327688065, + "grad_norm": 0.4094490110874176, + "learning_rate": 1.3626e-05, + "loss": 0.0332, + "step": 4548 + }, + { + "epoch": 3.582118944466325, + "grad_norm": 0.6246773600578308, + "learning_rate": 1.3629e-05, + "loss": 0.0403, + "step": 4549 + }, + { + "epoch": 3.582906656163844, + "grad_norm": 0.6629802584648132, + "learning_rate": 1.3632000000000001e-05, + "loss": 0.0353, + "step": 4550 + }, + { + "epoch": 3.5836943678613626, + "grad_norm": 0.5467895865440369, + "learning_rate": 1.3635e-05, + "loss": 0.0357, + "step": 4551 + }, + { + "epoch": 3.5844820795588817, + "grad_norm": 0.7103897333145142, + "learning_rate": 1.3638e-05, + "loss": 0.0507, + "step": 4552 + }, + { + "epoch": 3.5852697912564, + "grad_norm": 1.056341290473938, + "learning_rate": 1.3641e-05, + "loss": 0.064, + "step": 4553 + }, + { + "epoch": 3.586057502953919, + "grad_norm": 0.6566010117530823, + "learning_rate": 1.3644e-05, + "loss": 0.0453, + "step": 4554 + }, + { + "epoch": 3.5868452146514374, + "grad_norm": 0.8605183959007263, + "learning_rate": 1.3647000000000002e-05, + "loss": 0.044, + "step": 4555 + }, + { + "epoch": 3.5876329263489564, + "grad_norm": 0.7698177695274353, + "learning_rate": 1.3650000000000001e-05, + "loss": 0.0404, + "step": 4556 + }, + { + "epoch": 3.588420638046475, + "grad_norm": 1.0372554063796997, + "learning_rate": 1.3653000000000001e-05, + "loss": 0.0521, + "step": 4557 + }, + { + 
"epoch": 3.589208349743994, + "grad_norm": 0.9010123014450073, + "learning_rate": 1.3656e-05, + "loss": 0.0616, + "step": 4558 + }, + { + "epoch": 3.5899960614415125, + "grad_norm": 0.7755023837089539, + "learning_rate": 1.3659e-05, + "loss": 0.0541, + "step": 4559 + }, + { + "epoch": 3.590783773139031, + "grad_norm": 0.7180358171463013, + "learning_rate": 1.3662e-05, + "loss": 0.0566, + "step": 4560 + }, + { + "epoch": 3.5915714848365496, + "grad_norm": 3.2491395473480225, + "learning_rate": 1.3665e-05, + "loss": 0.4775, + "step": 4561 + }, + { + "epoch": 3.5923591965340687, + "grad_norm": 0.8763353228569031, + "learning_rate": 1.3668e-05, + "loss": 0.2588, + "step": 4562 + }, + { + "epoch": 3.5931469082315872, + "grad_norm": 1.0877572298049927, + "learning_rate": 1.3671e-05, + "loss": 0.3372, + "step": 4563 + }, + { + "epoch": 3.593934619929106, + "grad_norm": 0.9231072068214417, + "learning_rate": 1.3674e-05, + "loss": 0.1935, + "step": 4564 + }, + { + "epoch": 3.594722331626625, + "grad_norm": 0.6588059663772583, + "learning_rate": 1.3677000000000001e-05, + "loss": 0.1175, + "step": 4565 + }, + { + "epoch": 3.5955100433241434, + "grad_norm": 1.0428365468978882, + "learning_rate": 1.3680000000000001e-05, + "loss": 0.1268, + "step": 4566 + }, + { + "epoch": 3.596297755021662, + "grad_norm": 1.2057565450668335, + "learning_rate": 1.3683000000000001e-05, + "loss": 0.0591, + "step": 4567 + }, + { + "epoch": 3.597085466719181, + "grad_norm": 0.5937696099281311, + "learning_rate": 1.3686e-05, + "loss": 0.0791, + "step": 4568 + }, + { + "epoch": 3.5978731784166995, + "grad_norm": 0.4141480624675751, + "learning_rate": 1.3689e-05, + "loss": 0.0235, + "step": 4569 + }, + { + "epoch": 3.598660890114218, + "grad_norm": 0.39761489629745483, + "learning_rate": 1.3691999999999999e-05, + "loss": 0.0339, + "step": 4570 + }, + { + "epoch": 3.599448601811737, + "grad_norm": 0.488211452960968, + "learning_rate": 1.3695e-05, + "loss": 0.0287, + "step": 4571 + }, + { + "epoch": 3.6002363135092557, + "grad_norm": 0.5518326759338379, + "learning_rate": 1.3698e-05, + "loss": 0.0439, + "step": 4572 + }, + { + "epoch": 3.6010240252067742, + "grad_norm": 0.44303613901138306, + "learning_rate": 1.3701e-05, + "loss": 0.0218, + "step": 4573 + }, + { + "epoch": 3.601811736904293, + "grad_norm": 0.610385537147522, + "learning_rate": 1.3704e-05, + "loss": 0.0527, + "step": 4574 + }, + { + "epoch": 3.602599448601812, + "grad_norm": 0.3700445890426636, + "learning_rate": 1.3707e-05, + "loss": 0.0283, + "step": 4575 + }, + { + "epoch": 3.6033871602993304, + "grad_norm": 0.5539349317550659, + "learning_rate": 1.3710000000000001e-05, + "loss": 0.0596, + "step": 4576 + }, + { + "epoch": 3.6041748719968494, + "grad_norm": 0.6683676838874817, + "learning_rate": 1.3713e-05, + "loss": 0.0381, + "step": 4577 + }, + { + "epoch": 3.604962583694368, + "grad_norm": 0.8959924578666687, + "learning_rate": 1.3716e-05, + "loss": 0.0387, + "step": 4578 + }, + { + "epoch": 3.6057502953918865, + "grad_norm": 0.43489670753479004, + "learning_rate": 1.3719e-05, + "loss": 0.022, + "step": 4579 + }, + { + "epoch": 3.606538007089405, + "grad_norm": 0.40448805689811707, + "learning_rate": 1.3722e-05, + "loss": 0.0251, + "step": 4580 + }, + { + "epoch": 3.607325718786924, + "grad_norm": 0.6855457425117493, + "learning_rate": 1.3725000000000002e-05, + "loss": 0.0467, + "step": 4581 + }, + { + "epoch": 3.6081134304844427, + "grad_norm": 0.5747031569480896, + "learning_rate": 1.3728000000000001e-05, + "loss": 0.0313, + "step": 4582 + }, + { + 
"epoch": 3.6089011421819617, + "grad_norm": 0.6324950456619263, + "learning_rate": 1.3731e-05, + "loss": 0.0453, + "step": 4583 + }, + { + "epoch": 3.6096888538794802, + "grad_norm": 0.7838295102119446, + "learning_rate": 1.3734e-05, + "loss": 0.0403, + "step": 4584 + }, + { + "epoch": 3.610476565576999, + "grad_norm": 0.5317318439483643, + "learning_rate": 1.3736999999999999e-05, + "loss": 0.0342, + "step": 4585 + }, + { + "epoch": 3.6112642772745174, + "grad_norm": 1.4349759817123413, + "learning_rate": 1.374e-05, + "loss": 0.0376, + "step": 4586 + }, + { + "epoch": 3.6120519889720364, + "grad_norm": 0.8116076588630676, + "learning_rate": 1.3743e-05, + "loss": 0.0309, + "step": 4587 + }, + { + "epoch": 3.612839700669555, + "grad_norm": 0.5043283104896545, + "learning_rate": 1.3746e-05, + "loss": 0.0361, + "step": 4588 + }, + { + "epoch": 3.6136274123670735, + "grad_norm": 0.6677416563034058, + "learning_rate": 1.3749e-05, + "loss": 0.0317, + "step": 4589 + }, + { + "epoch": 3.6144151240645925, + "grad_norm": 0.4792120158672333, + "learning_rate": 1.3752e-05, + "loss": 0.025, + "step": 4590 + }, + { + "epoch": 3.615202835762111, + "grad_norm": 0.7522311210632324, + "learning_rate": 1.3755000000000001e-05, + "loss": 0.0349, + "step": 4591 + }, + { + "epoch": 3.6159905474596297, + "grad_norm": 0.6417582631111145, + "learning_rate": 1.3758000000000001e-05, + "loss": 0.0411, + "step": 4592 + }, + { + "epoch": 3.6167782591571482, + "grad_norm": 0.43476545810699463, + "learning_rate": 1.3761000000000001e-05, + "loss": 0.0313, + "step": 4593 + }, + { + "epoch": 3.6175659708546672, + "grad_norm": 0.7778320908546448, + "learning_rate": 1.3764e-05, + "loss": 0.0407, + "step": 4594 + }, + { + "epoch": 3.618353682552186, + "grad_norm": 0.9283460378646851, + "learning_rate": 1.3766999999999999e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 3.619141394249705, + "grad_norm": 0.7048974633216858, + "learning_rate": 1.377e-05, + "loss": 0.0337, + "step": 4596 + }, + { + "epoch": 3.6199291059472234, + "grad_norm": 0.6849662661552429, + "learning_rate": 1.3773e-05, + "loss": 0.0478, + "step": 4597 + }, + { + "epoch": 3.620716817644742, + "grad_norm": 0.6275836825370789, + "learning_rate": 1.3776e-05, + "loss": 0.0394, + "step": 4598 + }, + { + "epoch": 3.6215045293422605, + "grad_norm": 0.6965243816375732, + "learning_rate": 1.3779e-05, + "loss": 0.0363, + "step": 4599 + }, + { + "epoch": 3.6222922410397795, + "grad_norm": 0.8198168873786926, + "learning_rate": 1.3782e-05, + "loss": 0.0648, + "step": 4600 + }, + { + "epoch": 3.623079952737298, + "grad_norm": 0.6233096718788147, + "learning_rate": 1.3785000000000001e-05, + "loss": 0.034, + "step": 4601 + }, + { + "epoch": 3.623867664434817, + "grad_norm": 1.044008493423462, + "learning_rate": 1.3788e-05, + "loss": 0.075, + "step": 4602 + }, + { + "epoch": 3.6246553761323357, + "grad_norm": 0.9946860671043396, + "learning_rate": 1.3791e-05, + "loss": 0.0402, + "step": 4603 + }, + { + "epoch": 3.6254430878298542, + "grad_norm": 0.6432536244392395, + "learning_rate": 1.3794e-05, + "loss": 0.042, + "step": 4604 + }, + { + "epoch": 3.626230799527373, + "grad_norm": 0.7333672046661377, + "learning_rate": 1.3797e-05, + "loss": 0.0561, + "step": 4605 + }, + { + "epoch": 3.627018511224892, + "grad_norm": 0.8356862664222717, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.0565, + "step": 4606 + }, + { + "epoch": 3.6278062229224104, + "grad_norm": 0.7690117955207825, + "learning_rate": 1.3803e-05, + "loss": 0.0346, + "step": 4607 + }, + { + "epoch": 
3.628593934619929, + "grad_norm": 0.9401739835739136, + "learning_rate": 1.3806e-05, + "loss": 0.046, + "step": 4608 + }, + { + "epoch": 3.629381646317448, + "grad_norm": 0.7630672454833984, + "learning_rate": 1.3809e-05, + "loss": 0.0632, + "step": 4609 + }, + { + "epoch": 3.6301693580149665, + "grad_norm": 0.8935710191726685, + "learning_rate": 1.3812e-05, + "loss": 0.0732, + "step": 4610 + }, + { + "epoch": 3.630957069712485, + "grad_norm": 2.0075714588165283, + "learning_rate": 1.3815e-05, + "loss": 0.3632, + "step": 4611 + }, + { + "epoch": 3.6317447814100037, + "grad_norm": 1.3353450298309326, + "learning_rate": 1.3818e-05, + "loss": 0.2984, + "step": 4612 + }, + { + "epoch": 3.6325324931075227, + "grad_norm": 1.0516090393066406, + "learning_rate": 1.3821e-05, + "loss": 0.2074, + "step": 4613 + }, + { + "epoch": 3.6333202048050413, + "grad_norm": 0.8556323051452637, + "learning_rate": 1.3824e-05, + "loss": 0.1716, + "step": 4614 + }, + { + "epoch": 3.6341079165025603, + "grad_norm": 1.143963098526001, + "learning_rate": 1.3827e-05, + "loss": 0.2092, + "step": 4615 + }, + { + "epoch": 3.634895628200079, + "grad_norm": 0.6893602013587952, + "learning_rate": 1.3830000000000001e-05, + "loss": 0.0764, + "step": 4616 + }, + { + "epoch": 3.6356833398975974, + "grad_norm": 0.4302988350391388, + "learning_rate": 1.3833000000000001e-05, + "loss": 0.0401, + "step": 4617 + }, + { + "epoch": 3.636471051595116, + "grad_norm": 0.6583705544471741, + "learning_rate": 1.3836000000000001e-05, + "loss": 0.0477, + "step": 4618 + }, + { + "epoch": 3.637258763292635, + "grad_norm": 0.6147873997688293, + "learning_rate": 1.3839e-05, + "loss": 0.0431, + "step": 4619 + }, + { + "epoch": 3.6380464749901535, + "grad_norm": 0.7345755100250244, + "learning_rate": 1.3841999999999999e-05, + "loss": 0.056, + "step": 4620 + }, + { + "epoch": 3.6388341866876726, + "grad_norm": 0.550675094127655, + "learning_rate": 1.3845e-05, + "loss": 0.0485, + "step": 4621 + }, + { + "epoch": 3.639621898385191, + "grad_norm": 0.6242322325706482, + "learning_rate": 1.3848e-05, + "loss": 0.0422, + "step": 4622 + }, + { + "epoch": 3.6404096100827097, + "grad_norm": 0.8845615983009338, + "learning_rate": 1.3851e-05, + "loss": 0.0313, + "step": 4623 + }, + { + "epoch": 3.6411973217802283, + "grad_norm": 0.5454835295677185, + "learning_rate": 1.3854e-05, + "loss": 0.0305, + "step": 4624 + }, + { + "epoch": 3.6419850334777473, + "grad_norm": 0.722859263420105, + "learning_rate": 1.3857e-05, + "loss": 0.0453, + "step": 4625 + }, + { + "epoch": 3.642772745175266, + "grad_norm": 0.8145389556884766, + "learning_rate": 1.3860000000000001e-05, + "loss": 0.0478, + "step": 4626 + }, + { + "epoch": 3.6435604568727844, + "grad_norm": 0.6233885884284973, + "learning_rate": 1.3863000000000001e-05, + "loss": 0.0367, + "step": 4627 + }, + { + "epoch": 3.6443481685703034, + "grad_norm": 0.7261312007904053, + "learning_rate": 1.3866e-05, + "loss": 0.0166, + "step": 4628 + }, + { + "epoch": 3.645135880267822, + "grad_norm": 0.5644577741622925, + "learning_rate": 1.3869e-05, + "loss": 0.03, + "step": 4629 + }, + { + "epoch": 3.6459235919653405, + "grad_norm": 0.6530725955963135, + "learning_rate": 1.3872e-05, + "loss": 0.0307, + "step": 4630 + }, + { + "epoch": 3.646711303662859, + "grad_norm": 0.6710081696510315, + "learning_rate": 1.3875000000000002e-05, + "loss": 0.0461, + "step": 4631 + }, + { + "epoch": 3.647499015360378, + "grad_norm": 0.6511311531066895, + "learning_rate": 1.3878e-05, + "loss": 0.0428, + "step": 4632 + }, + { + "epoch": 
3.6482867270578967, + "grad_norm": 0.5221413969993591, + "learning_rate": 1.3881e-05, + "loss": 0.0428, + "step": 4633 + }, + { + "epoch": 3.6490744387554157, + "grad_norm": 1.0726381540298462, + "learning_rate": 1.3884e-05, + "loss": 0.0414, + "step": 4634 + }, + { + "epoch": 3.6498621504529343, + "grad_norm": 0.5740726590156555, + "learning_rate": 1.3887e-05, + "loss": 0.0287, + "step": 4635 + }, + { + "epoch": 3.650649862150453, + "grad_norm": 1.2528679370880127, + "learning_rate": 1.389e-05, + "loss": 0.0518, + "step": 4636 + }, + { + "epoch": 3.6514375738479714, + "grad_norm": 0.6977338790893555, + "learning_rate": 1.3893e-05, + "loss": 0.0493, + "step": 4637 + }, + { + "epoch": 3.6522252855454904, + "grad_norm": 0.3102222979068756, + "learning_rate": 1.3896e-05, + "loss": 0.0219, + "step": 4638 + }, + { + "epoch": 3.653012997243009, + "grad_norm": 0.5625612735748291, + "learning_rate": 1.3899e-05, + "loss": 0.0324, + "step": 4639 + }, + { + "epoch": 3.653800708940528, + "grad_norm": 0.9886036515235901, + "learning_rate": 1.3902e-05, + "loss": 0.0577, + "step": 4640 + }, + { + "epoch": 3.6545884206380466, + "grad_norm": 0.6411997675895691, + "learning_rate": 1.3905000000000002e-05, + "loss": 0.0388, + "step": 4641 + }, + { + "epoch": 3.655376132335565, + "grad_norm": 1.702729344367981, + "learning_rate": 1.3908000000000001e-05, + "loss": 0.0363, + "step": 4642 + }, + { + "epoch": 3.6561638440330837, + "grad_norm": 1.181565761566162, + "learning_rate": 1.3911000000000001e-05, + "loss": 0.0546, + "step": 4643 + }, + { + "epoch": 3.6569515557306027, + "grad_norm": 4.785036563873291, + "learning_rate": 1.3914e-05, + "loss": 0.0274, + "step": 4644 + }, + { + "epoch": 3.6577392674281213, + "grad_norm": 0.6566200852394104, + "learning_rate": 1.3916999999999999e-05, + "loss": 0.0447, + "step": 4645 + }, + { + "epoch": 3.6585269791256403, + "grad_norm": 0.5207884311676025, + "learning_rate": 1.392e-05, + "loss": 0.0349, + "step": 4646 + }, + { + "epoch": 3.659314690823159, + "grad_norm": 0.535078763961792, + "learning_rate": 1.3923e-05, + "loss": 0.0239, + "step": 4647 + }, + { + "epoch": 3.6601024025206774, + "grad_norm": 0.5649013519287109, + "learning_rate": 1.3926e-05, + "loss": 0.0435, + "step": 4648 + }, + { + "epoch": 3.660890114218196, + "grad_norm": 0.6202024817466736, + "learning_rate": 1.3929e-05, + "loss": 0.0268, + "step": 4649 + }, + { + "epoch": 3.661677825915715, + "grad_norm": 0.9158331751823425, + "learning_rate": 1.3932e-05, + "loss": 0.0447, + "step": 4650 + }, + { + "epoch": 3.6624655376132336, + "grad_norm": 0.4518716335296631, + "learning_rate": 1.3935000000000001e-05, + "loss": 0.0348, + "step": 4651 + }, + { + "epoch": 3.663253249310752, + "grad_norm": 0.689892590045929, + "learning_rate": 1.3938000000000001e-05, + "loss": 0.0528, + "step": 4652 + }, + { + "epoch": 3.664040961008271, + "grad_norm": 0.49381235241889954, + "learning_rate": 1.3941000000000001e-05, + "loss": 0.0385, + "step": 4653 + }, + { + "epoch": 3.6648286727057897, + "grad_norm": 0.8595020771026611, + "learning_rate": 1.3944e-05, + "loss": 0.0508, + "step": 4654 + }, + { + "epoch": 3.6656163844033083, + "grad_norm": 0.7102919816970825, + "learning_rate": 1.3947e-05, + "loss": 0.0505, + "step": 4655 + }, + { + "epoch": 3.666404096100827, + "grad_norm": 1.5672036409378052, + "learning_rate": 1.395e-05, + "loss": 0.0769, + "step": 4656 + }, + { + "epoch": 3.667191807798346, + "grad_norm": 0.6165370941162109, + "learning_rate": 1.3953e-05, + "loss": 0.0396, + "step": 4657 + }, + { + "epoch": 
3.6679795194958644, + "grad_norm": 1.0647269487380981, + "learning_rate": 1.3956e-05, + "loss": 0.0502, + "step": 4658 + }, + { + "epoch": 3.6687672311933834, + "grad_norm": 0.6547660231590271, + "learning_rate": 1.3959e-05, + "loss": 0.0283, + "step": 4659 + }, + { + "epoch": 3.669554942890902, + "grad_norm": 0.9027572870254517, + "learning_rate": 1.3962e-05, + "loss": 0.0537, + "step": 4660 + }, + { + "epoch": 3.6703426545884206, + "grad_norm": 1.2299683094024658, + "learning_rate": 1.3965000000000001e-05, + "loss": 0.3562, + "step": 4661 + }, + { + "epoch": 3.671130366285939, + "grad_norm": 1.0264763832092285, + "learning_rate": 1.3968e-05, + "loss": 0.257, + "step": 4662 + }, + { + "epoch": 3.671918077983458, + "grad_norm": 1.0527048110961914, + "learning_rate": 1.3971e-05, + "loss": 0.2433, + "step": 4663 + }, + { + "epoch": 3.6727057896809767, + "grad_norm": 1.3296431303024292, + "learning_rate": 1.3974e-05, + "loss": 0.1863, + "step": 4664 + }, + { + "epoch": 3.6734935013784957, + "grad_norm": 1.0902726650238037, + "learning_rate": 1.3977e-05, + "loss": 0.1465, + "step": 4665 + }, + { + "epoch": 3.6742812130760143, + "grad_norm": 0.5874989628791809, + "learning_rate": 1.3980000000000002e-05, + "loss": 0.0784, + "step": 4666 + }, + { + "epoch": 3.675068924773533, + "grad_norm": 0.6882678270339966, + "learning_rate": 1.3983000000000001e-05, + "loss": 0.0668, + "step": 4667 + }, + { + "epoch": 3.6758566364710514, + "grad_norm": 0.7161747217178345, + "learning_rate": 1.3986000000000001e-05, + "loss": 0.0569, + "step": 4668 + }, + { + "epoch": 3.6766443481685704, + "grad_norm": 0.5177010893821716, + "learning_rate": 1.3989e-05, + "loss": 0.0408, + "step": 4669 + }, + { + "epoch": 3.677432059866089, + "grad_norm": 0.5527803897857666, + "learning_rate": 1.3992e-05, + "loss": 0.0536, + "step": 4670 + }, + { + "epoch": 3.6782197715636076, + "grad_norm": 0.4687885046005249, + "learning_rate": 1.3995e-05, + "loss": 0.0385, + "step": 4671 + }, + { + "epoch": 3.6790074832611266, + "grad_norm": 0.574359655380249, + "learning_rate": 1.3998e-05, + "loss": 0.0343, + "step": 4672 + }, + { + "epoch": 3.679795194958645, + "grad_norm": 0.518426775932312, + "learning_rate": 1.4001e-05, + "loss": 0.0454, + "step": 4673 + }, + { + "epoch": 3.6805829066561637, + "grad_norm": 0.3772810697555542, + "learning_rate": 1.4004e-05, + "loss": 0.0317, + "step": 4674 + }, + { + "epoch": 3.6813706183536823, + "grad_norm": 0.7514921426773071, + "learning_rate": 1.4007e-05, + "loss": 0.0445, + "step": 4675 + }, + { + "epoch": 3.6821583300512013, + "grad_norm": 0.5101496577262878, + "learning_rate": 1.4010000000000001e-05, + "loss": 0.0415, + "step": 4676 + }, + { + "epoch": 3.68294604174872, + "grad_norm": 0.5903269648551941, + "learning_rate": 1.4013000000000001e-05, + "loss": 0.0407, + "step": 4677 + }, + { + "epoch": 3.683733753446239, + "grad_norm": 0.5485287308692932, + "learning_rate": 1.4016000000000001e-05, + "loss": 0.041, + "step": 4678 + }, + { + "epoch": 3.6845214651437574, + "grad_norm": 0.4348003566265106, + "learning_rate": 1.4019e-05, + "loss": 0.0262, + "step": 4679 + }, + { + "epoch": 3.685309176841276, + "grad_norm": 0.5983947515487671, + "learning_rate": 1.4022e-05, + "loss": 0.0322, + "step": 4680 + }, + { + "epoch": 3.6860968885387946, + "grad_norm": 0.5548213124275208, + "learning_rate": 1.4025e-05, + "loss": 0.0393, + "step": 4681 + }, + { + "epoch": 3.6868846002363136, + "grad_norm": 0.4490991532802582, + "learning_rate": 1.4028e-05, + "loss": 0.032, + "step": 4682 + }, + { + "epoch": 
3.687672311933832, + "grad_norm": 0.47771671414375305, + "learning_rate": 1.4031e-05, + "loss": 0.0252, + "step": 4683 + }, + { + "epoch": 3.688460023631351, + "grad_norm": 0.6627445816993713, + "learning_rate": 1.4034e-05, + "loss": 0.0458, + "step": 4684 + }, + { + "epoch": 3.6892477353288697, + "grad_norm": 1.0530149936676025, + "learning_rate": 1.4037e-05, + "loss": 0.082, + "step": 4685 + }, + { + "epoch": 3.6900354470263883, + "grad_norm": 0.477845162153244, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.0445, + "step": 4686 + }, + { + "epoch": 3.690823158723907, + "grad_norm": 0.5356115698814392, + "learning_rate": 1.4043000000000001e-05, + "loss": 0.0363, + "step": 4687 + }, + { + "epoch": 3.691610870421426, + "grad_norm": 0.9806830883026123, + "learning_rate": 1.4046e-05, + "loss": 0.0475, + "step": 4688 + }, + { + "epoch": 3.6923985821189445, + "grad_norm": 0.8052389621734619, + "learning_rate": 1.4049e-05, + "loss": 0.0455, + "step": 4689 + }, + { + "epoch": 3.693186293816463, + "grad_norm": 0.5266993641853333, + "learning_rate": 1.4052e-05, + "loss": 0.0362, + "step": 4690 + }, + { + "epoch": 3.693974005513982, + "grad_norm": 0.5381647348403931, + "learning_rate": 1.4055000000000002e-05, + "loss": 0.0368, + "step": 4691 + }, + { + "epoch": 3.6947617172115006, + "grad_norm": 0.9213071465492249, + "learning_rate": 1.4058000000000002e-05, + "loss": 0.0426, + "step": 4692 + }, + { + "epoch": 3.695549428909019, + "grad_norm": 0.6083928942680359, + "learning_rate": 1.4061e-05, + "loss": 0.0354, + "step": 4693 + }, + { + "epoch": 3.6963371406065377, + "grad_norm": 0.39724937081336975, + "learning_rate": 1.4064e-05, + "loss": 0.022, + "step": 4694 + }, + { + "epoch": 3.6971248523040567, + "grad_norm": 0.6303129196166992, + "learning_rate": 1.4067e-05, + "loss": 0.0273, + "step": 4695 + }, + { + "epoch": 3.6979125640015753, + "grad_norm": 0.7902131080627441, + "learning_rate": 1.4069999999999999e-05, + "loss": 0.0429, + "step": 4696 + }, + { + "epoch": 3.6987002756990943, + "grad_norm": 0.670024573802948, + "learning_rate": 1.4073e-05, + "loss": 0.0492, + "step": 4697 + }, + { + "epoch": 3.699487987396613, + "grad_norm": 0.9039899110794067, + "learning_rate": 1.4076e-05, + "loss": 0.0511, + "step": 4698 + }, + { + "epoch": 3.7002756990941315, + "grad_norm": 0.5812016129493713, + "learning_rate": 1.4079e-05, + "loss": 0.0299, + "step": 4699 + }, + { + "epoch": 3.70106341079165, + "grad_norm": 0.7523219585418701, + "learning_rate": 1.4082e-05, + "loss": 0.0302, + "step": 4700 + }, + { + "epoch": 3.701851122489169, + "grad_norm": 0.6939859986305237, + "learning_rate": 1.4085e-05, + "loss": 0.0267, + "step": 4701 + }, + { + "epoch": 3.7026388341866876, + "grad_norm": 0.7053356170654297, + "learning_rate": 1.4088000000000001e-05, + "loss": 0.0382, + "step": 4702 + }, + { + "epoch": 3.7034265458842066, + "grad_norm": 0.5080006122589111, + "learning_rate": 1.4091000000000001e-05, + "loss": 0.0305, + "step": 4703 + }, + { + "epoch": 3.704214257581725, + "grad_norm": 1.6591957807540894, + "learning_rate": 1.4094000000000001e-05, + "loss": 0.0833, + "step": 4704 + }, + { + "epoch": 3.7050019692792437, + "grad_norm": 0.6040511727333069, + "learning_rate": 1.4097e-05, + "loss": 0.0346, + "step": 4705 + }, + { + "epoch": 3.7057896809767623, + "grad_norm": 0.7001109719276428, + "learning_rate": 1.4099999999999999e-05, + "loss": 0.039, + "step": 4706 + }, + { + "epoch": 3.7065773926742813, + "grad_norm": 0.7651590704917908, + "learning_rate": 1.4103e-05, + "loss": 0.0526, + "step": 4707 + }, + 
{ + "epoch": 3.7073651043718, + "grad_norm": 0.6576030254364014, + "learning_rate": 1.4106e-05, + "loss": 0.0505, + "step": 4708 + }, + { + "epoch": 3.708152816069319, + "grad_norm": 0.7606425285339355, + "learning_rate": 1.4109e-05, + "loss": 0.0404, + "step": 4709 + }, + { + "epoch": 3.7089405277668375, + "grad_norm": 1.4061487913131714, + "learning_rate": 1.4112e-05, + "loss": 0.0573, + "step": 4710 + }, + { + "epoch": 3.709728239464356, + "grad_norm": 1.6811031103134155, + "learning_rate": 1.4115e-05, + "loss": 0.3648, + "step": 4711 + }, + { + "epoch": 3.7105159511618746, + "grad_norm": 1.0669842958450317, + "learning_rate": 1.4118000000000001e-05, + "loss": 0.3105, + "step": 4712 + }, + { + "epoch": 3.7113036628593936, + "grad_norm": 1.0664957761764526, + "learning_rate": 1.4121e-05, + "loss": 0.2704, + "step": 4713 + }, + { + "epoch": 3.712091374556912, + "grad_norm": 0.9694311022758484, + "learning_rate": 1.4124e-05, + "loss": 0.1901, + "step": 4714 + }, + { + "epoch": 3.7128790862544307, + "grad_norm": 0.6391901969909668, + "learning_rate": 1.4127e-05, + "loss": 0.1048, + "step": 4715 + }, + { + "epoch": 3.7136667979519498, + "grad_norm": 0.8318548798561096, + "learning_rate": 1.413e-05, + "loss": 0.1112, + "step": 4716 + }, + { + "epoch": 3.7144545096494683, + "grad_norm": 0.5459532141685486, + "learning_rate": 1.4133000000000002e-05, + "loss": 0.0534, + "step": 4717 + }, + { + "epoch": 3.715242221346987, + "grad_norm": 0.5451174378395081, + "learning_rate": 1.4136e-05, + "loss": 0.0752, + "step": 4718 + }, + { + "epoch": 3.7160299330445055, + "grad_norm": 0.408914715051651, + "learning_rate": 1.4139e-05, + "loss": 0.0353, + "step": 4719 + }, + { + "epoch": 3.7168176447420245, + "grad_norm": 2.851891040802002, + "learning_rate": 1.4142e-05, + "loss": 0.0423, + "step": 4720 + }, + { + "epoch": 3.717605356439543, + "grad_norm": 0.5917152762413025, + "learning_rate": 1.4145e-05, + "loss": 0.0313, + "step": 4721 + }, + { + "epoch": 3.718393068137062, + "grad_norm": 0.437985360622406, + "learning_rate": 1.4148e-05, + "loss": 0.0374, + "step": 4722 + }, + { + "epoch": 3.7191807798345806, + "grad_norm": 0.6366431713104248, + "learning_rate": 1.4151e-05, + "loss": 0.0344, + "step": 4723 + }, + { + "epoch": 3.719968491532099, + "grad_norm": 0.6444607973098755, + "learning_rate": 1.4154e-05, + "loss": 0.0282, + "step": 4724 + }, + { + "epoch": 3.7207562032296178, + "grad_norm": 0.538231372833252, + "learning_rate": 1.4157e-05, + "loss": 0.0386, + "step": 4725 + }, + { + "epoch": 3.7215439149271368, + "grad_norm": 0.43542352318763733, + "learning_rate": 1.416e-05, + "loss": 0.0392, + "step": 4726 + }, + { + "epoch": 3.7223316266246553, + "grad_norm": 0.7710838913917542, + "learning_rate": 1.4163000000000001e-05, + "loss": 0.0347, + "step": 4727 + }, + { + "epoch": 3.7231193383221743, + "grad_norm": 0.5840590000152588, + "learning_rate": 1.4166000000000001e-05, + "loss": 0.05, + "step": 4728 + }, + { + "epoch": 3.723907050019693, + "grad_norm": 0.6824948787689209, + "learning_rate": 1.4169000000000001e-05, + "loss": 0.0482, + "step": 4729 + }, + { + "epoch": 3.7246947617172115, + "grad_norm": 0.7244492173194885, + "learning_rate": 1.4172e-05, + "loss": 0.0533, + "step": 4730 + }, + { + "epoch": 3.72548247341473, + "grad_norm": 0.6499211192131042, + "learning_rate": 1.4174999999999999e-05, + "loss": 0.047, + "step": 4731 + }, + { + "epoch": 3.726270185112249, + "grad_norm": 0.5618826746940613, + "learning_rate": 1.4178e-05, + "loss": 0.0348, + "step": 4732 + }, + { + "epoch": 
3.7270578968097676, + "grad_norm": 0.7404304146766663, + "learning_rate": 1.4181e-05, + "loss": 0.0394, + "step": 4733 + }, + { + "epoch": 3.727845608507286, + "grad_norm": 0.472950279712677, + "learning_rate": 1.4184e-05, + "loss": 0.0398, + "step": 4734 + }, + { + "epoch": 3.728633320204805, + "grad_norm": 0.6334661245346069, + "learning_rate": 1.4187e-05, + "loss": 0.0373, + "step": 4735 + }, + { + "epoch": 3.7294210319023238, + "grad_norm": 0.4342467188835144, + "learning_rate": 1.419e-05, + "loss": 0.0304, + "step": 4736 + }, + { + "epoch": 3.7302087435998423, + "grad_norm": 0.43918392062187195, + "learning_rate": 1.4193000000000001e-05, + "loss": 0.0385, + "step": 4737 + }, + { + "epoch": 3.730996455297361, + "grad_norm": 0.750268280506134, + "learning_rate": 1.4196000000000001e-05, + "loss": 0.0338, + "step": 4738 + }, + { + "epoch": 3.73178416699488, + "grad_norm": 0.647983968257904, + "learning_rate": 1.4199e-05, + "loss": 0.0448, + "step": 4739 + }, + { + "epoch": 3.7325718786923985, + "grad_norm": 0.5326806902885437, + "learning_rate": 1.4202e-05, + "loss": 0.0298, + "step": 4740 + }, + { + "epoch": 3.7333595903899175, + "grad_norm": 0.6876817941665649, + "learning_rate": 1.4205e-05, + "loss": 0.0482, + "step": 4741 + }, + { + "epoch": 3.734147302087436, + "grad_norm": 1.0919370651245117, + "learning_rate": 1.4208e-05, + "loss": 0.0433, + "step": 4742 + }, + { + "epoch": 3.7349350137849546, + "grad_norm": 0.6521375179290771, + "learning_rate": 1.4211e-05, + "loss": 0.0347, + "step": 4743 + }, + { + "epoch": 3.735722725482473, + "grad_norm": 0.7480848431587219, + "learning_rate": 1.4214e-05, + "loss": 0.0376, + "step": 4744 + }, + { + "epoch": 3.736510437179992, + "grad_norm": 0.521490216255188, + "learning_rate": 1.4217e-05, + "loss": 0.0383, + "step": 4745 + }, + { + "epoch": 3.7372981488775108, + "grad_norm": 1.055763840675354, + "learning_rate": 1.422e-05, + "loss": 0.0418, + "step": 4746 + }, + { + "epoch": 3.73808586057503, + "grad_norm": 0.46539565920829773, + "learning_rate": 1.4223000000000001e-05, + "loss": 0.0228, + "step": 4747 + }, + { + "epoch": 3.7388735722725484, + "grad_norm": 0.5195013284683228, + "learning_rate": 1.4226e-05, + "loss": 0.0236, + "step": 4748 + }, + { + "epoch": 3.739661283970067, + "grad_norm": 0.621508002281189, + "learning_rate": 1.4229e-05, + "loss": 0.0392, + "step": 4749 + }, + { + "epoch": 3.7404489956675855, + "grad_norm": 0.8250309824943542, + "learning_rate": 1.4232e-05, + "loss": 0.0544, + "step": 4750 + }, + { + "epoch": 3.7412367073651045, + "grad_norm": 0.8217473030090332, + "learning_rate": 1.4235e-05, + "loss": 0.0319, + "step": 4751 + }, + { + "epoch": 3.742024419062623, + "grad_norm": 0.5050244927406311, + "learning_rate": 1.4238000000000002e-05, + "loss": 0.0256, + "step": 4752 + }, + { + "epoch": 3.7428121307601416, + "grad_norm": 0.878165066242218, + "learning_rate": 1.4241000000000001e-05, + "loss": 0.0572, + "step": 4753 + }, + { + "epoch": 3.7435998424576606, + "grad_norm": 0.6653318405151367, + "learning_rate": 1.4244000000000001e-05, + "loss": 0.0433, + "step": 4754 + }, + { + "epoch": 3.744387554155179, + "grad_norm": 0.8912068605422974, + "learning_rate": 1.4247e-05, + "loss": 0.0605, + "step": 4755 + }, + { + "epoch": 3.7451752658526978, + "grad_norm": 0.7892940044403076, + "learning_rate": 1.4249999999999999e-05, + "loss": 0.04, + "step": 4756 + }, + { + "epoch": 3.7459629775502163, + "grad_norm": 1.1175265312194824, + "learning_rate": 1.4253e-05, + "loss": 0.0476, + "step": 4757 + }, + { + "epoch": 
3.7467506892477354, + "grad_norm": 0.721002459526062, + "learning_rate": 1.4256e-05, + "loss": 0.046, + "step": 4758 + }, + { + "epoch": 3.747538400945254, + "grad_norm": 0.9286454319953918, + "learning_rate": 1.4259e-05, + "loss": 0.0279, + "step": 4759 + }, + { + "epoch": 3.748326112642773, + "grad_norm": 0.8543899655342102, + "learning_rate": 1.4262e-05, + "loss": 0.0457, + "step": 4760 + }, + { + "epoch": 3.7491138243402915, + "grad_norm": 2.650447130203247, + "learning_rate": 1.4265e-05, + "loss": 0.5007, + "step": 4761 + }, + { + "epoch": 3.74990153603781, + "grad_norm": 1.4620110988616943, + "learning_rate": 1.4268000000000001e-05, + "loss": 0.3228, + "step": 4762 + }, + { + "epoch": 3.7506892477353286, + "grad_norm": 1.3027162551879883, + "learning_rate": 1.4271000000000001e-05, + "loss": 0.2802, + "step": 4763 + }, + { + "epoch": 3.7514769594328476, + "grad_norm": 0.893100917339325, + "learning_rate": 1.4274000000000001e-05, + "loss": 0.1873, + "step": 4764 + }, + { + "epoch": 3.752264671130366, + "grad_norm": 0.9210391640663147, + "learning_rate": 1.4277e-05, + "loss": 0.1307, + "step": 4765 + }, + { + "epoch": 3.7530523828278852, + "grad_norm": 0.8739075064659119, + "learning_rate": 1.428e-05, + "loss": 0.1367, + "step": 4766 + }, + { + "epoch": 3.753840094525404, + "grad_norm": 0.7808665037155151, + "learning_rate": 1.4283e-05, + "loss": 0.0993, + "step": 4767 + }, + { + "epoch": 3.7546278062229224, + "grad_norm": 0.5497227311134338, + "learning_rate": 1.4286e-05, + "loss": 0.0663, + "step": 4768 + }, + { + "epoch": 3.755415517920441, + "grad_norm": 0.40082505345344543, + "learning_rate": 1.4289e-05, + "loss": 0.0463, + "step": 4769 + }, + { + "epoch": 3.75620322961796, + "grad_norm": 1.8506239652633667, + "learning_rate": 1.4292e-05, + "loss": 0.0392, + "step": 4770 + }, + { + "epoch": 3.7569909413154785, + "grad_norm": 0.5438307523727417, + "learning_rate": 1.4295e-05, + "loss": 0.0322, + "step": 4771 + }, + { + "epoch": 3.7577786530129975, + "grad_norm": 0.5414202213287354, + "learning_rate": 1.4298000000000001e-05, + "loss": 0.0449, + "step": 4772 + }, + { + "epoch": 3.758566364710516, + "grad_norm": 0.37862610816955566, + "learning_rate": 1.4301e-05, + "loss": 0.025, + "step": 4773 + }, + { + "epoch": 3.7593540764080347, + "grad_norm": 0.7152016758918762, + "learning_rate": 1.4304e-05, + "loss": 0.0326, + "step": 4774 + }, + { + "epoch": 3.760141788105553, + "grad_norm": 0.7405872941017151, + "learning_rate": 1.4307e-05, + "loss": 0.0631, + "step": 4775 + }, + { + "epoch": 3.7609294998030722, + "grad_norm": 1.052172303199768, + "learning_rate": 1.431e-05, + "loss": 0.0704, + "step": 4776 + }, + { + "epoch": 3.761717211500591, + "grad_norm": 0.5962929725646973, + "learning_rate": 1.4313000000000002e-05, + "loss": 0.037, + "step": 4777 + }, + { + "epoch": 3.7625049231981094, + "grad_norm": 0.41990625858306885, + "learning_rate": 1.4316000000000002e-05, + "loss": 0.0302, + "step": 4778 + }, + { + "epoch": 3.7632926348956284, + "grad_norm": 0.6524969935417175, + "learning_rate": 1.4319e-05, + "loss": 0.0399, + "step": 4779 + }, + { + "epoch": 3.764080346593147, + "grad_norm": 0.4315299093723297, + "learning_rate": 1.4322e-05, + "loss": 0.0284, + "step": 4780 + }, + { + "epoch": 3.7648680582906655, + "grad_norm": 0.5954996943473816, + "learning_rate": 1.4325e-05, + "loss": 0.0293, + "step": 4781 + }, + { + "epoch": 3.765655769988184, + "grad_norm": 1.1301114559173584, + "learning_rate": 1.4328e-05, + "loss": 0.0412, + "step": 4782 + }, + { + "epoch": 3.766443481685703, + 
"grad_norm": 0.4664642810821533, + "learning_rate": 1.4331e-05, + "loss": 0.0243, + "step": 4783 + }, + { + "epoch": 3.7672311933832217, + "grad_norm": 0.7004461288452148, + "learning_rate": 1.4334e-05, + "loss": 0.0548, + "step": 4784 + }, + { + "epoch": 3.7680189050807407, + "grad_norm": 0.8350341320037842, + "learning_rate": 1.4337e-05, + "loss": 0.0916, + "step": 4785 + }, + { + "epoch": 3.7688066167782592, + "grad_norm": 0.5709887146949768, + "learning_rate": 1.434e-05, + "loss": 0.0284, + "step": 4786 + }, + { + "epoch": 3.769594328475778, + "grad_norm": 0.9805128574371338, + "learning_rate": 1.4343000000000001e-05, + "loss": 0.0499, + "step": 4787 + }, + { + "epoch": 3.7703820401732964, + "grad_norm": 0.41167962551116943, + "learning_rate": 1.4346000000000001e-05, + "loss": 0.0314, + "step": 4788 + }, + { + "epoch": 3.7711697518708154, + "grad_norm": 0.44796058535575867, + "learning_rate": 1.4349000000000001e-05, + "loss": 0.0271, + "step": 4789 + }, + { + "epoch": 3.771957463568334, + "grad_norm": 0.8534653186798096, + "learning_rate": 1.4352e-05, + "loss": 0.0392, + "step": 4790 + }, + { + "epoch": 3.772745175265853, + "grad_norm": 0.904370129108429, + "learning_rate": 1.4355e-05, + "loss": 0.0372, + "step": 4791 + }, + { + "epoch": 3.7735328869633715, + "grad_norm": 0.736432671546936, + "learning_rate": 1.4358e-05, + "loss": 0.0379, + "step": 4792 + }, + { + "epoch": 3.77432059866089, + "grad_norm": 0.5853505730628967, + "learning_rate": 1.4361e-05, + "loss": 0.043, + "step": 4793 + }, + { + "epoch": 3.7751083103584087, + "grad_norm": 0.6784687042236328, + "learning_rate": 1.4364e-05, + "loss": 0.0508, + "step": 4794 + }, + { + "epoch": 3.7758960220559277, + "grad_norm": 0.5323972105979919, + "learning_rate": 1.4367e-05, + "loss": 0.0362, + "step": 4795 + }, + { + "epoch": 3.7766837337534462, + "grad_norm": 0.5867098569869995, + "learning_rate": 1.437e-05, + "loss": 0.0402, + "step": 4796 + }, + { + "epoch": 3.777471445450965, + "grad_norm": 0.6126879453659058, + "learning_rate": 1.4373000000000001e-05, + "loss": 0.0292, + "step": 4797 + }, + { + "epoch": 3.778259157148484, + "grad_norm": 0.3678357005119324, + "learning_rate": 1.4376000000000001e-05, + "loss": 0.0167, + "step": 4798 + }, + { + "epoch": 3.7790468688460024, + "grad_norm": 0.8056368827819824, + "learning_rate": 1.4379e-05, + "loss": 0.0381, + "step": 4799 + }, + { + "epoch": 3.779834580543521, + "grad_norm": 0.7524096965789795, + "learning_rate": 1.4382e-05, + "loss": 0.0385, + "step": 4800 + }, + { + "epoch": 3.7806222922410395, + "grad_norm": 0.6119118928909302, + "learning_rate": 1.4385e-05, + "loss": 0.0431, + "step": 4801 + }, + { + "epoch": 3.7814100039385585, + "grad_norm": 0.6763418912887573, + "learning_rate": 1.4388000000000002e-05, + "loss": 0.0388, + "step": 4802 + }, + { + "epoch": 3.782197715636077, + "grad_norm": 0.47399476170539856, + "learning_rate": 1.4391000000000002e-05, + "loss": 0.0281, + "step": 4803 + }, + { + "epoch": 3.782985427333596, + "grad_norm": 0.7123922109603882, + "learning_rate": 1.4394e-05, + "loss": 0.0549, + "step": 4804 + }, + { + "epoch": 3.7837731390311147, + "grad_norm": 0.6955159902572632, + "learning_rate": 1.4397e-05, + "loss": 0.0513, + "step": 4805 + }, + { + "epoch": 3.7845608507286332, + "grad_norm": 0.6233432292938232, + "learning_rate": 1.44e-05, + "loss": 0.0418, + "step": 4806 + }, + { + "epoch": 3.785348562426152, + "grad_norm": 1.1327152252197266, + "learning_rate": 1.4403e-05, + "loss": 0.0732, + "step": 4807 + }, + { + "epoch": 3.786136274123671, + 
"grad_norm": 0.8017009496688843, + "learning_rate": 1.4406e-05, + "loss": 0.0457, + "step": 4808 + }, + { + "epoch": 3.7869239858211894, + "grad_norm": 1.0182321071624756, + "learning_rate": 1.4409e-05, + "loss": 0.0886, + "step": 4809 + }, + { + "epoch": 3.7877116975187084, + "grad_norm": 0.8457806706428528, + "learning_rate": 1.4412e-05, + "loss": 0.0659, + "step": 4810 + }, + { + "epoch": 3.788499409216227, + "grad_norm": 1.2805399894714355, + "learning_rate": 1.4415e-05, + "loss": 0.3665, + "step": 4811 + }, + { + "epoch": 3.7892871209137455, + "grad_norm": 1.4848525524139404, + "learning_rate": 1.4418000000000002e-05, + "loss": 0.3667, + "step": 4812 + }, + { + "epoch": 3.790074832611264, + "grad_norm": 1.2528278827667236, + "learning_rate": 1.4421000000000001e-05, + "loss": 0.3175, + "step": 4813 + }, + { + "epoch": 3.790862544308783, + "grad_norm": 1.1708396673202515, + "learning_rate": 1.4424000000000001e-05, + "loss": 0.2154, + "step": 4814 + }, + { + "epoch": 3.7916502560063017, + "grad_norm": 1.3351614475250244, + "learning_rate": 1.4427000000000001e-05, + "loss": 0.311, + "step": 4815 + }, + { + "epoch": 3.7924379677038202, + "grad_norm": 1.1149920225143433, + "learning_rate": 1.4429999999999999e-05, + "loss": 0.1331, + "step": 4816 + }, + { + "epoch": 3.7932256794013393, + "grad_norm": 0.5670040845870972, + "learning_rate": 1.4433e-05, + "loss": 0.0555, + "step": 4817 + }, + { + "epoch": 3.794013391098858, + "grad_norm": 0.5534640550613403, + "learning_rate": 1.4436e-05, + "loss": 0.0555, + "step": 4818 + }, + { + "epoch": 3.7948011027963764, + "grad_norm": 0.514578640460968, + "learning_rate": 1.4439e-05, + "loss": 0.0554, + "step": 4819 + }, + { + "epoch": 3.795588814493895, + "grad_norm": 0.67873215675354, + "learning_rate": 1.4442e-05, + "loss": 0.0432, + "step": 4820 + }, + { + "epoch": 3.796376526191414, + "grad_norm": 0.6079312562942505, + "learning_rate": 1.4445e-05, + "loss": 0.0529, + "step": 4821 + }, + { + "epoch": 3.7971642378889325, + "grad_norm": 0.3692733645439148, + "learning_rate": 1.4448e-05, + "loss": 0.0407, + "step": 4822 + }, + { + "epoch": 3.7979519495864515, + "grad_norm": 0.6557374596595764, + "learning_rate": 1.4451000000000001e-05, + "loss": 0.0311, + "step": 4823 + }, + { + "epoch": 3.79873966128397, + "grad_norm": 0.40777117013931274, + "learning_rate": 1.4454000000000001e-05, + "loss": 0.0432, + "step": 4824 + }, + { + "epoch": 3.7995273729814887, + "grad_norm": 0.32383811473846436, + "learning_rate": 1.4457e-05, + "loss": 0.0184, + "step": 4825 + }, + { + "epoch": 3.8003150846790072, + "grad_norm": 0.6320709586143494, + "learning_rate": 1.446e-05, + "loss": 0.0489, + "step": 4826 + }, + { + "epoch": 3.8011027963765263, + "grad_norm": 0.7921866178512573, + "learning_rate": 1.4463e-05, + "loss": 0.0511, + "step": 4827 + }, + { + "epoch": 3.801890508074045, + "grad_norm": 0.6873631477355957, + "learning_rate": 1.4466e-05, + "loss": 0.0297, + "step": 4828 + }, + { + "epoch": 3.802678219771564, + "grad_norm": 0.4149545729160309, + "learning_rate": 1.4469e-05, + "loss": 0.0194, + "step": 4829 + }, + { + "epoch": 3.8034659314690824, + "grad_norm": 0.5331857800483704, + "learning_rate": 1.4472e-05, + "loss": 0.045, + "step": 4830 + }, + { + "epoch": 3.804253643166601, + "grad_norm": 0.7473304271697998, + "learning_rate": 1.4475e-05, + "loss": 0.0469, + "step": 4831 + }, + { + "epoch": 3.8050413548641195, + "grad_norm": 0.907132625579834, + "learning_rate": 1.4478e-05, + "loss": 0.0489, + "step": 4832 + }, + { + "epoch": 3.8058290665616386, + 
"grad_norm": 0.3452204763889313, + "learning_rate": 1.4481e-05, + "loss": 0.021, + "step": 4833 + }, + { + "epoch": 3.806616778259157, + "grad_norm": 1.3224928379058838, + "learning_rate": 1.4484e-05, + "loss": 0.1099, + "step": 4834 + }, + { + "epoch": 3.807404489956676, + "grad_norm": 0.562435507774353, + "learning_rate": 1.4487e-05, + "loss": 0.033, + "step": 4835 + }, + { + "epoch": 3.8081922016541947, + "grad_norm": 0.5824658274650574, + "learning_rate": 1.449e-05, + "loss": 0.0457, + "step": 4836 + }, + { + "epoch": 3.8089799133517133, + "grad_norm": 0.5218472480773926, + "learning_rate": 1.4493e-05, + "loss": 0.0326, + "step": 4837 + }, + { + "epoch": 3.809767625049232, + "grad_norm": 0.6488819122314453, + "learning_rate": 1.4496000000000001e-05, + "loss": 0.0467, + "step": 4838 + }, + { + "epoch": 3.810555336746751, + "grad_norm": 0.5575054883956909, + "learning_rate": 1.4499000000000001e-05, + "loss": 0.0379, + "step": 4839 + }, + { + "epoch": 3.8113430484442694, + "grad_norm": 0.5799517035484314, + "learning_rate": 1.4502000000000001e-05, + "loss": 0.0389, + "step": 4840 + }, + { + "epoch": 3.812130760141788, + "grad_norm": 0.434048056602478, + "learning_rate": 1.4505e-05, + "loss": 0.0279, + "step": 4841 + }, + { + "epoch": 3.812918471839307, + "grad_norm": 0.6892963647842407, + "learning_rate": 1.4507999999999999e-05, + "loss": 0.0295, + "step": 4842 + }, + { + "epoch": 3.8137061835368256, + "grad_norm": 0.5656698346138, + "learning_rate": 1.4511e-05, + "loss": 0.0451, + "step": 4843 + }, + { + "epoch": 3.814493895234344, + "grad_norm": 0.7034119963645935, + "learning_rate": 1.4514e-05, + "loss": 0.0287, + "step": 4844 + }, + { + "epoch": 3.8152816069318627, + "grad_norm": 0.5041642189025879, + "learning_rate": 1.4517e-05, + "loss": 0.0354, + "step": 4845 + }, + { + "epoch": 3.8160693186293817, + "grad_norm": 0.6192951202392578, + "learning_rate": 1.452e-05, + "loss": 0.0505, + "step": 4846 + }, + { + "epoch": 3.8168570303269003, + "grad_norm": 0.8157956004142761, + "learning_rate": 1.4523e-05, + "loss": 0.039, + "step": 4847 + }, + { + "epoch": 3.8176447420244193, + "grad_norm": 0.5730500221252441, + "learning_rate": 1.4526000000000001e-05, + "loss": 0.044, + "step": 4848 + }, + { + "epoch": 3.818432453721938, + "grad_norm": 0.39595040678977966, + "learning_rate": 1.4529000000000001e-05, + "loss": 0.0219, + "step": 4849 + }, + { + "epoch": 3.8192201654194564, + "grad_norm": 0.9527972936630249, + "learning_rate": 1.4532e-05, + "loss": 0.0549, + "step": 4850 + }, + { + "epoch": 3.820007877116975, + "grad_norm": 0.31006598472595215, + "learning_rate": 1.4535e-05, + "loss": 0.0246, + "step": 4851 + }, + { + "epoch": 3.820795588814494, + "grad_norm": 1.1242562532424927, + "learning_rate": 1.4538e-05, + "loss": 0.0313, + "step": 4852 + }, + { + "epoch": 3.8215833005120126, + "grad_norm": 0.5073027014732361, + "learning_rate": 1.4541e-05, + "loss": 0.0251, + "step": 4853 + }, + { + "epoch": 3.8223710122095316, + "grad_norm": 1.1155965328216553, + "learning_rate": 1.4544e-05, + "loss": 0.0525, + "step": 4854 + }, + { + "epoch": 3.82315872390705, + "grad_norm": 0.599601149559021, + "learning_rate": 1.4547e-05, + "loss": 0.0312, + "step": 4855 + }, + { + "epoch": 3.8239464356045687, + "grad_norm": 0.6772897839546204, + "learning_rate": 1.455e-05, + "loss": 0.0325, + "step": 4856 + }, + { + "epoch": 3.8247341473020873, + "grad_norm": 0.615839421749115, + "learning_rate": 1.4553e-05, + "loss": 0.0432, + "step": 4857 + }, + { + "epoch": 3.8255218589996063, + "grad_norm": 0.688793957233429, 
+ "learning_rate": 1.4556000000000001e-05, + "loss": 0.0349, + "step": 4858 + }, + { + "epoch": 3.826309570697125, + "grad_norm": 0.7725232243537903, + "learning_rate": 1.4559e-05, + "loss": 0.0465, + "step": 4859 + }, + { + "epoch": 3.8270972823946434, + "grad_norm": 1.6334017515182495, + "learning_rate": 1.4562e-05, + "loss": 0.0795, + "step": 4860 + }, + { + "epoch": 3.8278849940921624, + "grad_norm": 2.5743179321289062, + "learning_rate": 1.4565e-05, + "loss": 0.4565, + "step": 4861 + }, + { + "epoch": 3.828672705789681, + "grad_norm": 1.8168065547943115, + "learning_rate": 1.4568e-05, + "loss": 0.3249, + "step": 4862 + }, + { + "epoch": 3.8294604174871996, + "grad_norm": 1.144874095916748, + "learning_rate": 1.4571000000000002e-05, + "loss": 0.2309, + "step": 4863 + }, + { + "epoch": 3.830248129184718, + "grad_norm": 0.8291231989860535, + "learning_rate": 1.4574000000000001e-05, + "loss": 0.2178, + "step": 4864 + }, + { + "epoch": 3.831035840882237, + "grad_norm": 0.9613180160522461, + "learning_rate": 1.4577e-05, + "loss": 0.1746, + "step": 4865 + }, + { + "epoch": 3.8318235525797557, + "grad_norm": 0.8223774433135986, + "learning_rate": 1.458e-05, + "loss": 0.1059, + "step": 4866 + }, + { + "epoch": 3.8326112642772747, + "grad_norm": 0.6746121048927307, + "learning_rate": 1.4582999999999999e-05, + "loss": 0.0605, + "step": 4867 + }, + { + "epoch": 3.8333989759747933, + "grad_norm": 0.4004138708114624, + "learning_rate": 1.4586e-05, + "loss": 0.0483, + "step": 4868 + }, + { + "epoch": 3.834186687672312, + "grad_norm": 0.5294477343559265, + "learning_rate": 1.4589e-05, + "loss": 0.0574, + "step": 4869 + }, + { + "epoch": 3.8349743993698304, + "grad_norm": 0.4457085430622101, + "learning_rate": 1.4592e-05, + "loss": 0.0313, + "step": 4870 + }, + { + "epoch": 3.8357621110673494, + "grad_norm": 0.4946294128894806, + "learning_rate": 1.4595e-05, + "loss": 0.0465, + "step": 4871 + }, + { + "epoch": 3.836549822764868, + "grad_norm": 0.5696046948432922, + "learning_rate": 1.4598e-05, + "loss": 0.0579, + "step": 4872 + }, + { + "epoch": 3.837337534462387, + "grad_norm": 0.786452054977417, + "learning_rate": 1.4601000000000001e-05, + "loss": 0.1034, + "step": 4873 + }, + { + "epoch": 3.8381252461599056, + "grad_norm": 0.3256356418132782, + "learning_rate": 1.4604000000000001e-05, + "loss": 0.0239, + "step": 4874 + }, + { + "epoch": 3.838912957857424, + "grad_norm": 1.5061473846435547, + "learning_rate": 1.4607000000000001e-05, + "loss": 0.0355, + "step": 4875 + }, + { + "epoch": 3.8397006695549427, + "grad_norm": 0.4550386071205139, + "learning_rate": 1.461e-05, + "loss": 0.0298, + "step": 4876 + }, + { + "epoch": 3.8404883812524617, + "grad_norm": 0.3421264886856079, + "learning_rate": 1.4613e-05, + "loss": 0.0259, + "step": 4877 + }, + { + "epoch": 3.8412760929499803, + "grad_norm": 0.42165446281433105, + "learning_rate": 1.4616e-05, + "loss": 0.0354, + "step": 4878 + }, + { + "epoch": 3.842063804647499, + "grad_norm": 0.4284925162792206, + "learning_rate": 1.4619e-05, + "loss": 0.0305, + "step": 4879 + }, + { + "epoch": 3.842851516345018, + "grad_norm": 0.4608660042285919, + "learning_rate": 1.4622e-05, + "loss": 0.0282, + "step": 4880 + }, + { + "epoch": 3.8436392280425364, + "grad_norm": 1.0195378065109253, + "learning_rate": 1.4625e-05, + "loss": 0.0386, + "step": 4881 + }, + { + "epoch": 3.844426939740055, + "grad_norm": 0.38012629747390747, + "learning_rate": 1.4628e-05, + "loss": 0.0157, + "step": 4882 + }, + { + "epoch": 3.8452146514375736, + "grad_norm": 0.4906005561351776, + 
"learning_rate": 1.4631000000000001e-05, + "loss": 0.0235, + "step": 4883 + }, + { + "epoch": 3.8460023631350926, + "grad_norm": 0.8613882064819336, + "learning_rate": 1.4634e-05, + "loss": 0.0454, + "step": 4884 + }, + { + "epoch": 3.846790074832611, + "grad_norm": 0.5567412972450256, + "learning_rate": 1.4637e-05, + "loss": 0.0468, + "step": 4885 + }, + { + "epoch": 3.84757778653013, + "grad_norm": 0.6180558800697327, + "learning_rate": 1.464e-05, + "loss": 0.0445, + "step": 4886 + }, + { + "epoch": 3.8483654982276487, + "grad_norm": 0.4845811724662781, + "learning_rate": 1.4643e-05, + "loss": 0.0258, + "step": 4887 + }, + { + "epoch": 3.8491532099251673, + "grad_norm": 0.4502617120742798, + "learning_rate": 1.4646000000000002e-05, + "loss": 0.023, + "step": 4888 + }, + { + "epoch": 3.849940921622686, + "grad_norm": 0.46870535612106323, + "learning_rate": 1.4649000000000002e-05, + "loss": 0.0317, + "step": 4889 + }, + { + "epoch": 3.850728633320205, + "grad_norm": 0.6354352235794067, + "learning_rate": 1.4652e-05, + "loss": 0.0264, + "step": 4890 + }, + { + "epoch": 3.8515163450177234, + "grad_norm": 1.1186301708221436, + "learning_rate": 1.4655e-05, + "loss": 0.0284, + "step": 4891 + }, + { + "epoch": 3.8523040567152425, + "grad_norm": 0.5697392225265503, + "learning_rate": 1.4658e-05, + "loss": 0.0245, + "step": 4892 + }, + { + "epoch": 3.853091768412761, + "grad_norm": 0.5432916879653931, + "learning_rate": 1.4661e-05, + "loss": 0.0208, + "step": 4893 + }, + { + "epoch": 3.8538794801102796, + "grad_norm": 1.043566107749939, + "learning_rate": 1.4664e-05, + "loss": 0.0359, + "step": 4894 + }, + { + "epoch": 3.854667191807798, + "grad_norm": 0.7067296504974365, + "learning_rate": 1.4667e-05, + "loss": 0.0215, + "step": 4895 + }, + { + "epoch": 3.855454903505317, + "grad_norm": 0.9632616639137268, + "learning_rate": 1.467e-05, + "loss": 0.0461, + "step": 4896 + }, + { + "epoch": 3.8562426152028357, + "grad_norm": 0.9223142862319946, + "learning_rate": 1.4673e-05, + "loss": 0.0442, + "step": 4897 + }, + { + "epoch": 3.8570303269003547, + "grad_norm": 0.5752460360527039, + "learning_rate": 1.4676000000000001e-05, + "loss": 0.0383, + "step": 4898 + }, + { + "epoch": 3.8578180385978733, + "grad_norm": 0.4650450646877289, + "learning_rate": 1.4679000000000001e-05, + "loss": 0.0354, + "step": 4899 + }, + { + "epoch": 3.858605750295392, + "grad_norm": 0.5466020703315735, + "learning_rate": 1.4682000000000001e-05, + "loss": 0.0499, + "step": 4900 + }, + { + "epoch": 3.8593934619929104, + "grad_norm": 0.48963460326194763, + "learning_rate": 1.4685000000000001e-05, + "loss": 0.0362, + "step": 4901 + }, + { + "epoch": 3.8601811736904295, + "grad_norm": 0.6115140318870544, + "learning_rate": 1.4687999999999999e-05, + "loss": 0.0504, + "step": 4902 + }, + { + "epoch": 3.860968885387948, + "grad_norm": 0.5913634896278381, + "learning_rate": 1.4691e-05, + "loss": 0.0382, + "step": 4903 + }, + { + "epoch": 3.8617565970854666, + "grad_norm": 1.1883186101913452, + "learning_rate": 1.4694e-05, + "loss": 0.0693, + "step": 4904 + }, + { + "epoch": 3.8625443087829856, + "grad_norm": 0.5547631978988647, + "learning_rate": 1.4697e-05, + "loss": 0.0316, + "step": 4905 + }, + { + "epoch": 3.863332020480504, + "grad_norm": 0.8251798748970032, + "learning_rate": 1.47e-05, + "loss": 0.0578, + "step": 4906 + }, + { + "epoch": 3.8641197321780227, + "grad_norm": 0.6471707224845886, + "learning_rate": 1.4703e-05, + "loss": 0.0489, + "step": 4907 + }, + { + "epoch": 3.8649074438755413, + "grad_norm": 0.6861079335212708, 
+ "learning_rate": 1.4706000000000001e-05, + "loss": 0.0331, + "step": 4908 + }, + { + "epoch": 3.8656951555730603, + "grad_norm": 0.8575870990753174, + "learning_rate": 1.4709000000000001e-05, + "loss": 0.0911, + "step": 4909 + }, + { + "epoch": 3.866482867270579, + "grad_norm": 0.6400560140609741, + "learning_rate": 1.4712e-05, + "loss": 0.0469, + "step": 4910 + }, + { + "epoch": 3.867270578968098, + "grad_norm": 0.8106240630149841, + "learning_rate": 1.4715e-05, + "loss": 0.2856, + "step": 4911 + }, + { + "epoch": 3.8680582906656165, + "grad_norm": 1.1106367111206055, + "learning_rate": 1.4718e-05, + "loss": 0.2868, + "step": 4912 + }, + { + "epoch": 3.868846002363135, + "grad_norm": 0.8594955205917358, + "learning_rate": 1.4721000000000002e-05, + "loss": 0.2949, + "step": 4913 + }, + { + "epoch": 3.8696337140606536, + "grad_norm": 1.1643532514572144, + "learning_rate": 1.4724e-05, + "loss": 0.1983, + "step": 4914 + }, + { + "epoch": 3.8704214257581726, + "grad_norm": 0.7772351503372192, + "learning_rate": 1.4727e-05, + "loss": 0.1744, + "step": 4915 + }, + { + "epoch": 3.871209137455691, + "grad_norm": 0.5712994933128357, + "learning_rate": 1.473e-05, + "loss": 0.1191, + "step": 4916 + }, + { + "epoch": 3.87199684915321, + "grad_norm": 0.6285783648490906, + "learning_rate": 1.4733e-05, + "loss": 0.0862, + "step": 4917 + }, + { + "epoch": 3.8727845608507288, + "grad_norm": 0.4757649302482605, + "learning_rate": 1.4736000000000001e-05, + "loss": 0.0473, + "step": 4918 + }, + { + "epoch": 3.8735722725482473, + "grad_norm": 0.35666924715042114, + "learning_rate": 1.4739e-05, + "loss": 0.0335, + "step": 4919 + }, + { + "epoch": 3.874359984245766, + "grad_norm": 0.36364465951919556, + "learning_rate": 1.4742e-05, + "loss": 0.029, + "step": 4920 + }, + { + "epoch": 3.875147695943285, + "grad_norm": 0.5233517289161682, + "learning_rate": 1.4745e-05, + "loss": 0.0364, + "step": 4921 + }, + { + "epoch": 3.8759354076408035, + "grad_norm": 0.41840308904647827, + "learning_rate": 1.4748e-05, + "loss": 0.0266, + "step": 4922 + }, + { + "epoch": 3.876723119338322, + "grad_norm": 0.7020249366760254, + "learning_rate": 1.4751000000000002e-05, + "loss": 0.0286, + "step": 4923 + }, + { + "epoch": 3.877510831035841, + "grad_norm": 0.673532247543335, + "learning_rate": 1.4754000000000001e-05, + "loss": 0.0334, + "step": 4924 + }, + { + "epoch": 3.8782985427333596, + "grad_norm": 0.44096770882606506, + "learning_rate": 1.4757000000000001e-05, + "loss": 0.0397, + "step": 4925 + }, + { + "epoch": 3.879086254430878, + "grad_norm": 0.5387856960296631, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.0333, + "step": 4926 + }, + { + "epoch": 3.8798739661283967, + "grad_norm": 0.717677891254425, + "learning_rate": 1.4762999999999999e-05, + "loss": 0.0427, + "step": 4927 + }, + { + "epoch": 3.8806616778259158, + "grad_norm": 2.2638683319091797, + "learning_rate": 1.4766e-05, + "loss": 0.0494, + "step": 4928 + }, + { + "epoch": 3.8814493895234343, + "grad_norm": 0.5559812784194946, + "learning_rate": 1.4769e-05, + "loss": 0.0348, + "step": 4929 + }, + { + "epoch": 3.8822371012209533, + "grad_norm": 0.47659939527511597, + "learning_rate": 1.4772e-05, + "loss": 0.0289, + "step": 4930 + }, + { + "epoch": 3.883024812918472, + "grad_norm": 0.3031049966812134, + "learning_rate": 1.4775e-05, + "loss": 0.0258, + "step": 4931 + }, + { + "epoch": 3.8838125246159905, + "grad_norm": 0.5532894730567932, + "learning_rate": 1.4778e-05, + "loss": 0.0364, + "step": 4932 + }, + { + "epoch": 3.884600236313509, + "grad_norm": 
0.5211607813835144, + "learning_rate": 1.4781000000000001e-05, + "loss": 0.0317, + "step": 4933 + }, + { + "epoch": 3.885387948011028, + "grad_norm": 1.1386340856552124, + "learning_rate": 1.4784000000000001e-05, + "loss": 0.0385, + "step": 4934 + }, + { + "epoch": 3.8861756597085466, + "grad_norm": 0.6531775593757629, + "learning_rate": 1.4787000000000001e-05, + "loss": 0.0247, + "step": 4935 + }, + { + "epoch": 3.8869633714060656, + "grad_norm": 0.47159069776535034, + "learning_rate": 1.479e-05, + "loss": 0.0394, + "step": 4936 + }, + { + "epoch": 3.887751083103584, + "grad_norm": 1.5828511714935303, + "learning_rate": 1.4793e-05, + "loss": 0.0519, + "step": 4937 + }, + { + "epoch": 3.8885387948011028, + "grad_norm": 0.3718748092651367, + "learning_rate": 1.4796000000000002e-05, + "loss": 0.0283, + "step": 4938 + }, + { + "epoch": 3.8893265064986213, + "grad_norm": 0.5686085224151611, + "learning_rate": 1.4799e-05, + "loss": 0.0458, + "step": 4939 + }, + { + "epoch": 3.8901142181961403, + "grad_norm": 0.4783523380756378, + "learning_rate": 1.4802e-05, + "loss": 0.0478, + "step": 4940 + }, + { + "epoch": 3.890901929893659, + "grad_norm": 0.9355679750442505, + "learning_rate": 1.4805e-05, + "loss": 0.0337, + "step": 4941 + }, + { + "epoch": 3.8916896415911775, + "grad_norm": 0.9166386723518372, + "learning_rate": 1.4808e-05, + "loss": 0.0336, + "step": 4942 + }, + { + "epoch": 3.8924773532886965, + "grad_norm": 0.5163027048110962, + "learning_rate": 1.4811000000000001e-05, + "loss": 0.0437, + "step": 4943 + }, + { + "epoch": 3.893265064986215, + "grad_norm": 0.6909233331680298, + "learning_rate": 1.4814e-05, + "loss": 0.0621, + "step": 4944 + }, + { + "epoch": 3.8940527766837336, + "grad_norm": 0.6146838068962097, + "learning_rate": 1.4817e-05, + "loss": 0.0558, + "step": 4945 + }, + { + "epoch": 3.894840488381252, + "grad_norm": 0.5025572776794434, + "learning_rate": 1.482e-05, + "loss": 0.0386, + "step": 4946 + }, + { + "epoch": 3.895628200078771, + "grad_norm": 0.6777660846710205, + "learning_rate": 1.4823e-05, + "loss": 0.0501, + "step": 4947 + }, + { + "epoch": 3.8964159117762898, + "grad_norm": 0.6119566559791565, + "learning_rate": 1.4826e-05, + "loss": 0.0401, + "step": 4948 + }, + { + "epoch": 3.8972036234738088, + "grad_norm": 0.5382979512214661, + "learning_rate": 1.4829000000000002e-05, + "loss": 0.0381, + "step": 4949 + }, + { + "epoch": 3.8979913351713273, + "grad_norm": 0.3727637529373169, + "learning_rate": 1.4832000000000001e-05, + "loss": 0.0259, + "step": 4950 + }, + { + "epoch": 3.898779046868846, + "grad_norm": 0.5167244672775269, + "learning_rate": 1.4835e-05, + "loss": 0.0249, + "step": 4951 + }, + { + "epoch": 3.8995667585663645, + "grad_norm": 0.9619451761245728, + "learning_rate": 1.4838e-05, + "loss": 0.0519, + "step": 4952 + }, + { + "epoch": 3.9003544702638835, + "grad_norm": 0.7896140813827515, + "learning_rate": 1.4840999999999999e-05, + "loss": 0.0389, + "step": 4953 + }, + { + "epoch": 3.901142181961402, + "grad_norm": 0.8900781869888306, + "learning_rate": 1.4844e-05, + "loss": 0.0432, + "step": 4954 + }, + { + "epoch": 3.901929893658921, + "grad_norm": 0.5415450930595398, + "learning_rate": 1.4847e-05, + "loss": 0.0365, + "step": 4955 + }, + { + "epoch": 3.9027176053564396, + "grad_norm": 0.7768602967262268, + "learning_rate": 1.485e-05, + "loss": 0.0425, + "step": 4956 + }, + { + "epoch": 3.903505317053958, + "grad_norm": 0.7196398973464966, + "learning_rate": 1.4853e-05, + "loss": 0.0355, + "step": 4957 + }, + { + "epoch": 3.9042930287514768, + 
"grad_norm": 0.736572802066803, + "learning_rate": 1.4856e-05, + "loss": 0.0408, + "step": 4958 + }, + { + "epoch": 3.905080740448996, + "grad_norm": 1.2145094871520996, + "learning_rate": 1.4859000000000001e-05, + "loss": 0.0543, + "step": 4959 + }, + { + "epoch": 3.9058684521465143, + "grad_norm": 1.2816499471664429, + "learning_rate": 1.4862000000000001e-05, + "loss": 0.0904, + "step": 4960 + }, + { + "epoch": 3.9066561638440334, + "grad_norm": 1.3389228582382202, + "learning_rate": 1.4865e-05, + "loss": 0.3083, + "step": 4961 + }, + { + "epoch": 3.907443875541552, + "grad_norm": 1.4300360679626465, + "learning_rate": 1.4868e-05, + "loss": 0.2912, + "step": 4962 + }, + { + "epoch": 3.9082315872390705, + "grad_norm": 1.4112001657485962, + "learning_rate": 1.4871e-05, + "loss": 0.294, + "step": 4963 + }, + { + "epoch": 3.909019298936589, + "grad_norm": 0.7806482315063477, + "learning_rate": 1.4874e-05, + "loss": 0.1656, + "step": 4964 + }, + { + "epoch": 3.909807010634108, + "grad_norm": 1.0245335102081299, + "learning_rate": 1.4877e-05, + "loss": 0.156, + "step": 4965 + }, + { + "epoch": 3.9105947223316266, + "grad_norm": 0.5047885775566101, + "learning_rate": 1.488e-05, + "loss": 0.0542, + "step": 4966 + }, + { + "epoch": 3.911382434029145, + "grad_norm": 0.5230110883712769, + "learning_rate": 1.4883e-05, + "loss": 0.0638, + "step": 4967 + }, + { + "epoch": 3.912170145726664, + "grad_norm": 0.40351244807243347, + "learning_rate": 1.4886e-05, + "loss": 0.0522, + "step": 4968 + }, + { + "epoch": 3.912957857424183, + "grad_norm": 0.4352516531944275, + "learning_rate": 1.4889000000000001e-05, + "loss": 0.0461, + "step": 4969 + }, + { + "epoch": 3.9137455691217014, + "grad_norm": 0.3761723041534424, + "learning_rate": 1.4892e-05, + "loss": 0.0407, + "step": 4970 + }, + { + "epoch": 3.91453328081922, + "grad_norm": 0.3517801761627197, + "learning_rate": 1.4895e-05, + "loss": 0.0359, + "step": 4971 + }, + { + "epoch": 3.915320992516739, + "grad_norm": 0.516165554523468, + "learning_rate": 1.4898e-05, + "loss": 0.057, + "step": 4972 + }, + { + "epoch": 3.9161087042142575, + "grad_norm": 0.4578306972980499, + "learning_rate": 1.4901e-05, + "loss": 0.0329, + "step": 4973 + }, + { + "epoch": 3.9168964159117765, + "grad_norm": 0.393002986907959, + "learning_rate": 1.4904000000000002e-05, + "loss": 0.0326, + "step": 4974 + }, + { + "epoch": 3.917684127609295, + "grad_norm": 0.5200191736221313, + "learning_rate": 1.4907000000000001e-05, + "loss": 0.0287, + "step": 4975 + }, + { + "epoch": 3.9184718393068136, + "grad_norm": 0.8652962446212769, + "learning_rate": 1.491e-05, + "loss": 0.021, + "step": 4976 + }, + { + "epoch": 3.919259551004332, + "grad_norm": 0.574653148651123, + "learning_rate": 1.4913e-05, + "loss": 0.0416, + "step": 4977 + }, + { + "epoch": 3.920047262701851, + "grad_norm": 0.5379305481910706, + "learning_rate": 1.4915999999999999e-05, + "loss": 0.0244, + "step": 4978 + }, + { + "epoch": 3.92083497439937, + "grad_norm": 0.4729829430580139, + "learning_rate": 1.4919e-05, + "loss": 0.0323, + "step": 4979 + }, + { + "epoch": 3.921622686096889, + "grad_norm": 0.4642939567565918, + "learning_rate": 1.4922e-05, + "loss": 0.032, + "step": 4980 + }, + { + "epoch": 3.9224103977944074, + "grad_norm": 0.49414536356925964, + "learning_rate": 1.4925e-05, + "loss": 0.0236, + "step": 4981 + }, + { + "epoch": 3.923198109491926, + "grad_norm": 0.5166230201721191, + "learning_rate": 1.4928e-05, + "loss": 0.0297, + "step": 4982 + }, + { + "epoch": 3.9239858211894445, + "grad_norm": 0.4299338161945343, 
+ "learning_rate": 1.4931e-05, + "loss": 0.0269, + "step": 4983 + }, + { + "epoch": 3.9247735328869635, + "grad_norm": 0.6033605337142944, + "learning_rate": 1.4934000000000001e-05, + "loss": 0.0347, + "step": 4984 + }, + { + "epoch": 3.925561244584482, + "grad_norm": 0.4063400626182556, + "learning_rate": 1.4937000000000001e-05, + "loss": 0.018, + "step": 4985 + }, + { + "epoch": 3.9263489562820006, + "grad_norm": 0.4514407813549042, + "learning_rate": 1.4940000000000001e-05, + "loss": 0.0327, + "step": 4986 + }, + { + "epoch": 3.9271366679795197, + "grad_norm": 0.338556706905365, + "learning_rate": 1.4943e-05, + "loss": 0.0175, + "step": 4987 + }, + { + "epoch": 3.9279243796770382, + "grad_norm": 0.6432173848152161, + "learning_rate": 1.4945999999999999e-05, + "loss": 0.035, + "step": 4988 + }, + { + "epoch": 3.928712091374557, + "grad_norm": 0.5728266835212708, + "learning_rate": 1.4949e-05, + "loss": 0.039, + "step": 4989 + }, + { + "epoch": 3.9294998030720754, + "grad_norm": 0.5111736059188843, + "learning_rate": 1.4952e-05, + "loss": 0.0327, + "step": 4990 + }, + { + "epoch": 3.9302875147695944, + "grad_norm": 0.8516116142272949, + "learning_rate": 1.4955e-05, + "loss": 0.054, + "step": 4991 + }, + { + "epoch": 3.931075226467113, + "grad_norm": 0.5890465378761292, + "learning_rate": 1.4958e-05, + "loss": 0.0347, + "step": 4992 + }, + { + "epoch": 3.931862938164632, + "grad_norm": 0.468377947807312, + "learning_rate": 1.4961e-05, + "loss": 0.0268, + "step": 4993 + }, + { + "epoch": 3.9326506498621505, + "grad_norm": 0.9033798575401306, + "learning_rate": 1.4964000000000001e-05, + "loss": 0.0419, + "step": 4994 + }, + { + "epoch": 3.933438361559669, + "grad_norm": 0.50638747215271, + "learning_rate": 1.4967000000000001e-05, + "loss": 0.0243, + "step": 4995 + }, + { + "epoch": 3.9342260732571877, + "grad_norm": 0.6115638613700867, + "learning_rate": 1.497e-05, + "loss": 0.0415, + "step": 4996 + }, + { + "epoch": 3.9350137849547067, + "grad_norm": 0.5852790474891663, + "learning_rate": 1.4973e-05, + "loss": 0.0427, + "step": 4997 + }, + { + "epoch": 3.9358014966522252, + "grad_norm": 0.5730295777320862, + "learning_rate": 1.4976e-05, + "loss": 0.0383, + "step": 4998 + }, + { + "epoch": 3.9365892083497442, + "grad_norm": 0.5962824821472168, + "learning_rate": 1.4979000000000002e-05, + "loss": 0.0451, + "step": 4999 + }, + { + "epoch": 3.937376920047263, + "grad_norm": 0.601884663105011, + "learning_rate": 1.4982e-05, + "loss": 0.0452, + "step": 5000 + }, + { + "epoch": 3.937376920047263, + "eval_cer": 0.14099074578116494, + "eval_loss": 0.42835116386413574, + "eval_runtime": 16.2465, + "eval_samples_per_second": 18.712, + "eval_steps_per_second": 0.616, + "eval_wer": 0.4806216423637759, + "step": 5000 + }, + { + "epoch": 3.9381646317447814, + "grad_norm": 0.5609424114227295, + "learning_rate": 1.4985e-05, + "loss": 0.0355, + "step": 5001 + }, + { + "epoch": 3.9389523434423, + "grad_norm": 0.668885350227356, + "learning_rate": 1.4988e-05, + "loss": 0.0463, + "step": 5002 + }, + { + "epoch": 3.939740055139819, + "grad_norm": 1.0522466897964478, + "learning_rate": 1.4991e-05, + "loss": 0.0451, + "step": 5003 + }, + { + "epoch": 3.9405277668373375, + "grad_norm": 0.7151015996932983, + "learning_rate": 1.4994e-05, + "loss": 0.0346, + "step": 5004 + }, + { + "epoch": 3.941315478534856, + "grad_norm": 0.628150224685669, + "learning_rate": 1.4997e-05, + "loss": 0.0462, + "step": 5005 + }, + { + "epoch": 3.942103190232375, + "grad_norm": 1.4586844444274902, + "learning_rate": 1.5e-05, + "loss": 
0.0428, + "step": 5006 + }, + { + "epoch": 3.9428909019298937, + "grad_norm": 0.8860594034194946, + "learning_rate": 1.5003e-05, + "loss": 0.0563, + "step": 5007 + }, + { + "epoch": 3.9436786136274122, + "grad_norm": 0.7119929790496826, + "learning_rate": 1.5006e-05, + "loss": 0.0483, + "step": 5008 + }, + { + "epoch": 3.944466325324931, + "grad_norm": 0.6531721949577332, + "learning_rate": 1.5009e-05, + "loss": 0.0443, + "step": 5009 + }, + { + "epoch": 3.94525403702245, + "grad_norm": 0.6804248690605164, + "learning_rate": 1.5012e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 3.9460417487199684, + "grad_norm": 1.2961481809616089, + "learning_rate": 1.5015e-05, + "loss": 0.3907, + "step": 5011 + }, + { + "epoch": 3.9468294604174874, + "grad_norm": 1.0798715353012085, + "learning_rate": 1.5018000000000001e-05, + "loss": 0.2477, + "step": 5012 + }, + { + "epoch": 3.947617172115006, + "grad_norm": 1.0786280632019043, + "learning_rate": 1.5021e-05, + "loss": 0.263, + "step": 5013 + }, + { + "epoch": 3.9484048838125245, + "grad_norm": 1.1789276599884033, + "learning_rate": 1.5024e-05, + "loss": 0.2376, + "step": 5014 + }, + { + "epoch": 3.949192595510043, + "grad_norm": 1.0992356538772583, + "learning_rate": 1.5027e-05, + "loss": 0.2099, + "step": 5015 + }, + { + "epoch": 3.949980307207562, + "grad_norm": 1.0041347742080688, + "learning_rate": 1.503e-05, + "loss": 0.151, + "step": 5016 + }, + { + "epoch": 3.9507680189050807, + "grad_norm": 0.6341467499732971, + "learning_rate": 1.5033e-05, + "loss": 0.074, + "step": 5017 + }, + { + "epoch": 3.9515557306025997, + "grad_norm": 0.7094671130180359, + "learning_rate": 1.5036e-05, + "loss": 0.1052, + "step": 5018 + }, + { + "epoch": 3.9523434423001182, + "grad_norm": 0.49841737747192383, + "learning_rate": 1.5039e-05, + "loss": 0.0635, + "step": 5019 + }, + { + "epoch": 3.953131153997637, + "grad_norm": 0.28369465470314026, + "learning_rate": 1.5042e-05, + "loss": 0.0199, + "step": 5020 + }, + { + "epoch": 3.9539188656951554, + "grad_norm": 0.5543424487113953, + "learning_rate": 1.5044999999999999e-05, + "loss": 0.0549, + "step": 5021 + }, + { + "epoch": 3.9547065773926744, + "grad_norm": 0.7823871374130249, + "learning_rate": 1.5048000000000002e-05, + "loss": 0.0508, + "step": 5022 + }, + { + "epoch": 3.955494289090193, + "grad_norm": 0.42235618829727173, + "learning_rate": 1.5051000000000002e-05, + "loss": 0.0302, + "step": 5023 + }, + { + "epoch": 3.956282000787712, + "grad_norm": 0.4790932834148407, + "learning_rate": 1.5054000000000002e-05, + "loss": 0.0274, + "step": 5024 + }, + { + "epoch": 3.9570697124852305, + "grad_norm": 0.5020702481269836, + "learning_rate": 1.5057e-05, + "loss": 0.0381, + "step": 5025 + }, + { + "epoch": 3.957857424182749, + "grad_norm": 0.920745313167572, + "learning_rate": 1.506e-05, + "loss": 0.037, + "step": 5026 + }, + { + "epoch": 3.9586451358802677, + "grad_norm": 0.8484207391738892, + "learning_rate": 1.5063e-05, + "loss": 0.0471, + "step": 5027 + }, + { + "epoch": 3.9594328475777867, + "grad_norm": 0.5745425820350647, + "learning_rate": 1.5066e-05, + "loss": 0.0434, + "step": 5028 + }, + { + "epoch": 3.9602205592753053, + "grad_norm": 0.4492958188056946, + "learning_rate": 1.5069e-05, + "loss": 0.0276, + "step": 5029 + }, + { + "epoch": 3.961008270972824, + "grad_norm": 0.7678581476211548, + "learning_rate": 1.5071999999999999e-05, + "loss": 0.0676, + "step": 5030 + }, + { + "epoch": 3.961795982670343, + "grad_norm": 0.5548162460327148, + "learning_rate": 1.5074999999999999e-05, + "loss": 0.0351, + 
"step": 5031 + }, + { + "epoch": 3.9625836943678614, + "grad_norm": 0.5675791501998901, + "learning_rate": 1.5078000000000002e-05, + "loss": 0.0323, + "step": 5032 + }, + { + "epoch": 3.96337140606538, + "grad_norm": 0.9375559091567993, + "learning_rate": 1.5081000000000002e-05, + "loss": 0.0433, + "step": 5033 + }, + { + "epoch": 3.9641591177628985, + "grad_norm": 0.8439798951148987, + "learning_rate": 1.5084000000000002e-05, + "loss": 0.0704, + "step": 5034 + }, + { + "epoch": 3.9649468294604175, + "grad_norm": 0.6109095811843872, + "learning_rate": 1.5087000000000001e-05, + "loss": 0.0478, + "step": 5035 + }, + { + "epoch": 3.965734541157936, + "grad_norm": 0.6368817687034607, + "learning_rate": 1.5090000000000001e-05, + "loss": 0.0271, + "step": 5036 + }, + { + "epoch": 3.966522252855455, + "grad_norm": 0.5024219155311584, + "learning_rate": 1.5093e-05, + "loss": 0.0279, + "step": 5037 + }, + { + "epoch": 3.9673099645529737, + "grad_norm": 0.7201393842697144, + "learning_rate": 1.5095999999999999e-05, + "loss": 0.0299, + "step": 5038 + }, + { + "epoch": 3.9680976762504923, + "grad_norm": 0.8204613924026489, + "learning_rate": 1.5098999999999999e-05, + "loss": 0.0327, + "step": 5039 + }, + { + "epoch": 3.968885387948011, + "grad_norm": 0.7137365937232971, + "learning_rate": 1.5101999999999999e-05, + "loss": 0.0505, + "step": 5040 + }, + { + "epoch": 3.96967309964553, + "grad_norm": 0.6411713361740112, + "learning_rate": 1.5104999999999999e-05, + "loss": 0.0608, + "step": 5041 + }, + { + "epoch": 3.9704608113430484, + "grad_norm": 0.6640939712524414, + "learning_rate": 1.5108000000000002e-05, + "loss": 0.0393, + "step": 5042 + }, + { + "epoch": 3.9712485230405674, + "grad_norm": 0.35541191697120667, + "learning_rate": 1.5111000000000002e-05, + "loss": 0.0169, + "step": 5043 + }, + { + "epoch": 3.972036234738086, + "grad_norm": 0.5347671508789062, + "learning_rate": 1.5114000000000001e-05, + "loss": 0.034, + "step": 5044 + }, + { + "epoch": 3.9728239464356045, + "grad_norm": 0.5460036396980286, + "learning_rate": 1.5117000000000001e-05, + "loss": 0.0238, + "step": 5045 + }, + { + "epoch": 3.973611658133123, + "grad_norm": 0.7533693909645081, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.0608, + "step": 5046 + }, + { + "epoch": 3.974399369830642, + "grad_norm": 0.5031937956809998, + "learning_rate": 1.5123e-05, + "loss": 0.0323, + "step": 5047 + }, + { + "epoch": 3.9751870815281607, + "grad_norm": 0.7904192805290222, + "learning_rate": 1.5126e-05, + "loss": 0.0447, + "step": 5048 + }, + { + "epoch": 3.9759747932256793, + "grad_norm": 0.7124625444412231, + "learning_rate": 1.5129e-05, + "loss": 0.0477, + "step": 5049 + }, + { + "epoch": 3.9767625049231983, + "grad_norm": 0.4975137412548065, + "learning_rate": 1.5131999999999998e-05, + "loss": 0.0274, + "step": 5050 + }, + { + "epoch": 3.977550216620717, + "grad_norm": 0.5877405405044556, + "learning_rate": 1.5134999999999998e-05, + "loss": 0.0488, + "step": 5051 + }, + { + "epoch": 3.9783379283182354, + "grad_norm": 0.5654944181442261, + "learning_rate": 1.5138000000000001e-05, + "loss": 0.029, + "step": 5052 + }, + { + "epoch": 3.979125640015754, + "grad_norm": 1.372489094734192, + "learning_rate": 1.5141000000000001e-05, + "loss": 0.0541, + "step": 5053 + }, + { + "epoch": 3.979913351713273, + "grad_norm": 0.6166438460350037, + "learning_rate": 1.5144000000000001e-05, + "loss": 0.0419, + "step": 5054 + }, + { + "epoch": 3.9807010634107916, + "grad_norm": 0.5683581233024597, + "learning_rate": 1.5147e-05, + "loss": 0.0351, + 
"step": 5055 + }, + { + "epoch": 3.9814887751083106, + "grad_norm": 1.0520321130752563, + "learning_rate": 1.515e-05, + "loss": 0.0642, + "step": 5056 + }, + { + "epoch": 3.982276486805829, + "grad_norm": 0.5806874632835388, + "learning_rate": 1.5153e-05, + "loss": 0.0287, + "step": 5057 + }, + { + "epoch": 3.9830641985033477, + "grad_norm": 0.5718485116958618, + "learning_rate": 1.5156e-05, + "loss": 0.0336, + "step": 5058 + }, + { + "epoch": 3.9838519102008663, + "grad_norm": 0.5574166774749756, + "learning_rate": 1.5159e-05, + "loss": 0.0418, + "step": 5059 + }, + { + "epoch": 3.9846396218983853, + "grad_norm": 0.6192001700401306, + "learning_rate": 1.5162e-05, + "loss": 0.0382, + "step": 5060 + }, + { + "epoch": 3.985427333595904, + "grad_norm": 1.3131922483444214, + "learning_rate": 1.5165e-05, + "loss": 0.3211, + "step": 5061 + }, + { + "epoch": 3.986215045293423, + "grad_norm": 1.2557058334350586, + "learning_rate": 1.5168000000000001e-05, + "loss": 0.1845, + "step": 5062 + }, + { + "epoch": 3.9870027569909414, + "grad_norm": 0.6219980120658875, + "learning_rate": 1.5171000000000001e-05, + "loss": 0.0987, + "step": 5063 + }, + { + "epoch": 3.98779046868846, + "grad_norm": 0.5068855881690979, + "learning_rate": 1.5174e-05, + "loss": 0.0453, + "step": 5064 + }, + { + "epoch": 3.9885781803859786, + "grad_norm": 0.38740888237953186, + "learning_rate": 1.5177e-05, + "loss": 0.0358, + "step": 5065 + }, + { + "epoch": 3.9893658920834976, + "grad_norm": 0.49256572127342224, + "learning_rate": 1.518e-05, + "loss": 0.0257, + "step": 5066 + }, + { + "epoch": 3.990153603781016, + "grad_norm": 0.8089724183082581, + "learning_rate": 1.5183e-05, + "loss": 0.055, + "step": 5067 + }, + { + "epoch": 3.9909413154785347, + "grad_norm": 0.6952820420265198, + "learning_rate": 1.5186e-05, + "loss": 0.035, + "step": 5068 + }, + { + "epoch": 3.9917290271760537, + "grad_norm": 0.9493537545204163, + "learning_rate": 1.5189e-05, + "loss": 0.0328, + "step": 5069 + }, + { + "epoch": 3.9925167388735723, + "grad_norm": 1.09300696849823, + "learning_rate": 1.5192e-05, + "loss": 0.0578, + "step": 5070 + }, + { + "epoch": 3.993304450571091, + "grad_norm": 0.4218403697013855, + "learning_rate": 1.5195e-05, + "loss": 0.026, + "step": 5071 + }, + { + "epoch": 3.9940921622686094, + "grad_norm": 0.6966230273246765, + "learning_rate": 1.5198000000000003e-05, + "loss": 0.0431, + "step": 5072 + }, + { + "epoch": 3.9948798739661284, + "grad_norm": 0.9658969640731812, + "learning_rate": 1.5201000000000002e-05, + "loss": 0.0432, + "step": 5073 + }, + { + "epoch": 3.995667585663647, + "grad_norm": 0.7346752285957336, + "learning_rate": 1.5204e-05, + "loss": 0.0413, + "step": 5074 + }, + { + "epoch": 3.996455297361166, + "grad_norm": 0.6577554941177368, + "learning_rate": 1.5207e-05, + "loss": 0.0369, + "step": 5075 + }, + { + "epoch": 3.9972430090586846, + "grad_norm": 0.8729365468025208, + "learning_rate": 1.521e-05, + "loss": 0.0511, + "step": 5076 + }, + { + "epoch": 3.998030720756203, + "grad_norm": 0.7378578782081604, + "learning_rate": 1.5213e-05, + "loss": 0.0504, + "step": 5077 + }, + { + "epoch": 3.9988184324537217, + "grad_norm": 1.255558729171753, + "learning_rate": 1.5216e-05, + "loss": 0.0888, + "step": 5078 + }, + { + "epoch": 3.9996061441512407, + "grad_norm": 1.0794259309768677, + "learning_rate": 1.5219e-05, + "loss": 0.0611, + "step": 5079 + }, + { + "epoch": 4.0, + "grad_norm": 1.6930363178253174, + "learning_rate": 1.5222e-05, + "loss": 0.0222, + "step": 5080 + }, + { + "epoch": 4.000787711697519, + 
"grad_norm": 4.12112283706665, + "learning_rate": 1.5224999999999999e-05, + "loss": 0.4972, + "step": 5081 + }, + { + "epoch": 4.001575423395037, + "grad_norm": 1.3147015571594238, + "learning_rate": 1.5228000000000002e-05, + "loss": 0.25, + "step": 5082 + }, + { + "epoch": 4.002363135092556, + "grad_norm": 0.9717898964881897, + "learning_rate": 1.5231000000000002e-05, + "loss": 0.2241, + "step": 5083 + }, + { + "epoch": 4.003150846790075, + "grad_norm": 1.1988154649734497, + "learning_rate": 1.5234000000000002e-05, + "loss": 0.1961, + "step": 5084 + }, + { + "epoch": 4.003938558487594, + "grad_norm": 1.2116769552230835, + "learning_rate": 1.5237000000000002e-05, + "loss": 0.1409, + "step": 5085 + }, + { + "epoch": 4.004726270185112, + "grad_norm": 1.1099491119384766, + "learning_rate": 1.524e-05, + "loss": 0.1337, + "step": 5086 + }, + { + "epoch": 4.005513981882631, + "grad_norm": 0.5740519762039185, + "learning_rate": 1.5243e-05, + "loss": 0.0669, + "step": 5087 + }, + { + "epoch": 4.006301693580149, + "grad_norm": 0.5760665535926819, + "learning_rate": 1.5246e-05, + "loss": 0.0553, + "step": 5088 + }, + { + "epoch": 4.007089405277668, + "grad_norm": 0.7723935842514038, + "learning_rate": 1.5249e-05, + "loss": 0.0367, + "step": 5089 + }, + { + "epoch": 4.0078771169751874, + "grad_norm": 0.39286571741104126, + "learning_rate": 1.5251999999999999e-05, + "loss": 0.0463, + "step": 5090 + }, + { + "epoch": 4.008664828672706, + "grad_norm": 0.4038597643375397, + "learning_rate": 1.5254999999999999e-05, + "loss": 0.0303, + "step": 5091 + }, + { + "epoch": 4.009452540370225, + "grad_norm": 0.7885717153549194, + "learning_rate": 1.5258000000000002e-05, + "loss": 0.0524, + "step": 5092 + }, + { + "epoch": 4.010240252067743, + "grad_norm": 0.43539294600486755, + "learning_rate": 1.5261000000000002e-05, + "loss": 0.0357, + "step": 5093 + }, + { + "epoch": 4.011027963765262, + "grad_norm": 0.3695143163204193, + "learning_rate": 1.5264e-05, + "loss": 0.0291, + "step": 5094 + }, + { + "epoch": 4.01181567546278, + "grad_norm": 0.4414513111114502, + "learning_rate": 1.5267e-05, + "loss": 0.0238, + "step": 5095 + }, + { + "epoch": 4.0126033871603, + "grad_norm": 0.5507470965385437, + "learning_rate": 1.527e-05, + "loss": 0.0408, + "step": 5096 + }, + { + "epoch": 4.013391098857818, + "grad_norm": 0.373266339302063, + "learning_rate": 1.5273e-05, + "loss": 0.0177, + "step": 5097 + }, + { + "epoch": 4.014178810555337, + "grad_norm": 0.3903069496154785, + "learning_rate": 1.5276e-05, + "loss": 0.0253, + "step": 5098 + }, + { + "epoch": 4.014966522252855, + "grad_norm": 0.5820688605308533, + "learning_rate": 1.5279e-05, + "loss": 0.0355, + "step": 5099 + }, + { + "epoch": 4.015754233950374, + "grad_norm": 0.5807932615280151, + "learning_rate": 1.5282e-05, + "loss": 0.027, + "step": 5100 + }, + { + "epoch": 4.016541945647893, + "grad_norm": 0.7196099758148193, + "learning_rate": 1.5285e-05, + "loss": 0.0395, + "step": 5101 + }, + { + "epoch": 4.017329657345411, + "grad_norm": 0.40012893080711365, + "learning_rate": 1.5288000000000003e-05, + "loss": 0.0299, + "step": 5102 + }, + { + "epoch": 4.018117369042931, + "grad_norm": 0.5787605047225952, + "learning_rate": 1.5291000000000003e-05, + "loss": 0.0316, + "step": 5103 + }, + { + "epoch": 4.018905080740449, + "grad_norm": 0.6158865690231323, + "learning_rate": 1.5294000000000003e-05, + "loss": 0.0312, + "step": 5104 + }, + { + "epoch": 4.019692792437968, + "grad_norm": 0.4159129858016968, + "learning_rate": 1.5297e-05, + "loss": 0.0305, + "step": 5105 + }, + { 
+ "epoch": 4.020480504135486, + "grad_norm": 0.5441680550575256, + "learning_rate": 1.53e-05, + "loss": 0.0281, + "step": 5106 + }, + { + "epoch": 4.021268215833005, + "grad_norm": 1.7980777025222778, + "learning_rate": 1.5303e-05, + "loss": 0.0309, + "step": 5107 + }, + { + "epoch": 4.022055927530523, + "grad_norm": 0.798596978187561, + "learning_rate": 1.5306e-05, + "loss": 0.0421, + "step": 5108 + }, + { + "epoch": 4.022843639228043, + "grad_norm": 0.561821460723877, + "learning_rate": 1.5309e-05, + "loss": 0.0446, + "step": 5109 + }, + { + "epoch": 4.0236313509255615, + "grad_norm": 0.7019943594932556, + "learning_rate": 1.5312e-05, + "loss": 0.0465, + "step": 5110 + }, + { + "epoch": 4.02441906262308, + "grad_norm": 0.6090093851089478, + "learning_rate": 1.5314999999999998e-05, + "loss": 0.0434, + "step": 5111 + }, + { + "epoch": 4.025206774320599, + "grad_norm": 0.47900134325027466, + "learning_rate": 1.5318e-05, + "loss": 0.0292, + "step": 5112 + }, + { + "epoch": 4.025994486018117, + "grad_norm": 0.8769748210906982, + "learning_rate": 1.5321e-05, + "loss": 0.0349, + "step": 5113 + }, + { + "epoch": 4.026782197715636, + "grad_norm": 0.47499391436576843, + "learning_rate": 1.5324e-05, + "loss": 0.0312, + "step": 5114 + }, + { + "epoch": 4.027569909413155, + "grad_norm": 0.5351762771606445, + "learning_rate": 1.5327e-05, + "loss": 0.0371, + "step": 5115 + }, + { + "epoch": 4.028357621110674, + "grad_norm": 0.5189850330352783, + "learning_rate": 1.533e-05, + "loss": 0.0361, + "step": 5116 + }, + { + "epoch": 4.029145332808192, + "grad_norm": 1.0241032838821411, + "learning_rate": 1.5333e-05, + "loss": 0.0239, + "step": 5117 + }, + { + "epoch": 4.029933044505711, + "grad_norm": 0.5501588582992554, + "learning_rate": 1.5336e-05, + "loss": 0.0267, + "step": 5118 + }, + { + "epoch": 4.0307207562032294, + "grad_norm": 0.5747263431549072, + "learning_rate": 1.5339e-05, + "loss": 0.0377, + "step": 5119 + }, + { + "epoch": 4.031508467900748, + "grad_norm": 0.7430927753448486, + "learning_rate": 1.5342e-05, + "loss": 0.0428, + "step": 5120 + }, + { + "epoch": 4.0322961795982675, + "grad_norm": 0.36286023259162903, + "learning_rate": 1.5345e-05, + "loss": 0.0208, + "step": 5121 + }, + { + "epoch": 4.033083891295786, + "grad_norm": 1.184988021850586, + "learning_rate": 1.5348000000000003e-05, + "loss": 0.0288, + "step": 5122 + }, + { + "epoch": 4.033871602993305, + "grad_norm": 0.7466497421264648, + "learning_rate": 1.5351000000000003e-05, + "loss": 0.0254, + "step": 5123 + }, + { + "epoch": 4.034659314690823, + "grad_norm": 0.798378050327301, + "learning_rate": 1.5354000000000002e-05, + "loss": 0.0371, + "step": 5124 + }, + { + "epoch": 4.035447026388342, + "grad_norm": 0.6211969256401062, + "learning_rate": 1.5357000000000002e-05, + "loss": 0.0318, + "step": 5125 + }, + { + "epoch": 4.03623473808586, + "grad_norm": 0.5383312702178955, + "learning_rate": 1.5360000000000002e-05, + "loss": 0.0329, + "step": 5126 + }, + { + "epoch": 4.037022449783379, + "grad_norm": 0.6459225416183472, + "learning_rate": 1.5363000000000002e-05, + "loss": 0.0368, + "step": 5127 + }, + { + "epoch": 4.037810161480898, + "grad_norm": 0.6414954662322998, + "learning_rate": 1.5366e-05, + "loss": 0.0238, + "step": 5128 + }, + { + "epoch": 4.038597873178417, + "grad_norm": 0.7128236293792725, + "learning_rate": 1.5368999999999998e-05, + "loss": 0.0456, + "step": 5129 + }, + { + "epoch": 4.0393855848759355, + "grad_norm": 0.5465481281280518, + "learning_rate": 1.5371999999999998e-05, + "loss": 0.0292, + "step": 5130 + }, + 
{ + "epoch": 4.040173296573454, + "grad_norm": 1.6441887617111206, + "learning_rate": 1.5374999999999998e-05, + "loss": 0.4415, + "step": 5131 + }, + { + "epoch": 4.040961008270973, + "grad_norm": 1.5518052577972412, + "learning_rate": 1.5377999999999997e-05, + "loss": 0.346, + "step": 5132 + }, + { + "epoch": 4.041748719968491, + "grad_norm": 1.868662714958191, + "learning_rate": 1.5381e-05, + "loss": 0.3533, + "step": 5133 + }, + { + "epoch": 4.042536431666011, + "grad_norm": 1.3176186084747314, + "learning_rate": 1.5384e-05, + "loss": 0.2361, + "step": 5134 + }, + { + "epoch": 4.043324143363529, + "grad_norm": 1.4369608163833618, + "learning_rate": 1.5387e-05, + "loss": 0.1809, + "step": 5135 + }, + { + "epoch": 4.044111855061048, + "grad_norm": 0.5989001393318176, + "learning_rate": 1.539e-05, + "loss": 0.0619, + "step": 5136 + }, + { + "epoch": 4.044899566758566, + "grad_norm": 0.640106737613678, + "learning_rate": 1.5393e-05, + "loss": 0.0988, + "step": 5137 + }, + { + "epoch": 4.045687278456085, + "grad_norm": 0.5702539086341858, + "learning_rate": 1.5396e-05, + "loss": 0.0409, + "step": 5138 + }, + { + "epoch": 4.0464749901536035, + "grad_norm": 0.4532310962677002, + "learning_rate": 1.5399e-05, + "loss": 0.0332, + "step": 5139 + }, + { + "epoch": 4.047262701851123, + "grad_norm": 0.39804401993751526, + "learning_rate": 1.5402e-05, + "loss": 0.0272, + "step": 5140 + }, + { + "epoch": 4.0480504135486415, + "grad_norm": 0.5099515318870544, + "learning_rate": 1.5405e-05, + "loss": 0.0343, + "step": 5141 + }, + { + "epoch": 4.04883812524616, + "grad_norm": 0.5259493589401245, + "learning_rate": 1.5408e-05, + "loss": 0.0309, + "step": 5142 + }, + { + "epoch": 4.049625836943679, + "grad_norm": 0.38475000858306885, + "learning_rate": 1.5411000000000002e-05, + "loss": 0.0239, + "step": 5143 + }, + { + "epoch": 4.050413548641197, + "grad_norm": 0.36764073371887207, + "learning_rate": 1.5414000000000002e-05, + "loss": 0.0216, + "step": 5144 + }, + { + "epoch": 4.051201260338716, + "grad_norm": 0.4659872353076935, + "learning_rate": 1.5417e-05, + "loss": 0.0221, + "step": 5145 + }, + { + "epoch": 4.051988972036234, + "grad_norm": 0.7708364129066467, + "learning_rate": 1.542e-05, + "loss": 0.0615, + "step": 5146 + }, + { + "epoch": 4.052776683733754, + "grad_norm": 0.5861685872077942, + "learning_rate": 1.5423e-05, + "loss": 0.0499, + "step": 5147 + }, + { + "epoch": 4.053564395431272, + "grad_norm": 0.43874311447143555, + "learning_rate": 1.5426e-05, + "loss": 0.0238, + "step": 5148 + }, + { + "epoch": 4.054352107128791, + "grad_norm": 0.47601887583732605, + "learning_rate": 1.5429e-05, + "loss": 0.0308, + "step": 5149 + }, + { + "epoch": 4.0551398188263095, + "grad_norm": 0.6176022887229919, + "learning_rate": 1.5432e-05, + "loss": 0.0394, + "step": 5150 + }, + { + "epoch": 4.055927530523828, + "grad_norm": 0.8373801708221436, + "learning_rate": 1.5435e-05, + "loss": 0.0339, + "step": 5151 + }, + { + "epoch": 4.056715242221347, + "grad_norm": 0.4007309079170227, + "learning_rate": 1.5438e-05, + "loss": 0.0293, + "step": 5152 + }, + { + "epoch": 4.057502953918866, + "grad_norm": 0.5054627060890198, + "learning_rate": 1.5441000000000003e-05, + "loss": 0.0396, + "step": 5153 + }, + { + "epoch": 4.058290665616385, + "grad_norm": 0.9482771158218384, + "learning_rate": 1.5444e-05, + "loss": 0.0408, + "step": 5154 + }, + { + "epoch": 4.059078377313903, + "grad_norm": 0.26458296179771423, + "learning_rate": 1.5447e-05, + "loss": 0.02, + "step": 5155 + }, + { + "epoch": 4.059866089011422, + 
"grad_norm": 0.4374644160270691, + "learning_rate": 1.545e-05, + "loss": 0.0216, + "step": 5156 + }, + { + "epoch": 4.06065380070894, + "grad_norm": 0.5567907691001892, + "learning_rate": 1.5453e-05, + "loss": 0.0294, + "step": 5157 + }, + { + "epoch": 4.061441512406459, + "grad_norm": 0.3623482882976532, + "learning_rate": 1.5456e-05, + "loss": 0.0209, + "step": 5158 + }, + { + "epoch": 4.062229224103978, + "grad_norm": 0.5656280517578125, + "learning_rate": 1.5459e-05, + "loss": 0.0307, + "step": 5159 + }, + { + "epoch": 4.063016935801497, + "grad_norm": 0.4190707504749298, + "learning_rate": 1.5462e-05, + "loss": 0.0283, + "step": 5160 + }, + { + "epoch": 4.0638046474990155, + "grad_norm": 0.8081962466239929, + "learning_rate": 1.5465e-05, + "loss": 0.0578, + "step": 5161 + }, + { + "epoch": 4.064592359196534, + "grad_norm": 0.3173358738422394, + "learning_rate": 1.5467999999999998e-05, + "loss": 0.0143, + "step": 5162 + }, + { + "epoch": 4.065380070894053, + "grad_norm": 0.7225285768508911, + "learning_rate": 1.5471e-05, + "loss": 0.0273, + "step": 5163 + }, + { + "epoch": 4.066167782591571, + "grad_norm": 0.7182549834251404, + "learning_rate": 1.5474e-05, + "loss": 0.035, + "step": 5164 + }, + { + "epoch": 4.06695549428909, + "grad_norm": 0.40784868597984314, + "learning_rate": 1.5477e-05, + "loss": 0.0254, + "step": 5165 + }, + { + "epoch": 4.067743205986609, + "grad_norm": 0.44992175698280334, + "learning_rate": 1.548e-05, + "loss": 0.0302, + "step": 5166 + }, + { + "epoch": 4.068530917684128, + "grad_norm": 1.014388084411621, + "learning_rate": 1.5483e-05, + "loss": 0.0386, + "step": 5167 + }, + { + "epoch": 4.069318629381646, + "grad_norm": 0.617440402507782, + "learning_rate": 1.5486e-05, + "loss": 0.0455, + "step": 5168 + }, + { + "epoch": 4.070106341079165, + "grad_norm": 0.8819822669029236, + "learning_rate": 1.5489e-05, + "loss": 0.0417, + "step": 5169 + }, + { + "epoch": 4.0708940527766835, + "grad_norm": 0.7433648109436035, + "learning_rate": 1.5492e-05, + "loss": 0.0611, + "step": 5170 + }, + { + "epoch": 4.071681764474202, + "grad_norm": 0.5200750231742859, + "learning_rate": 1.5495e-05, + "loss": 0.0338, + "step": 5171 + }, + { + "epoch": 4.0724694761717215, + "grad_norm": 1.0636777877807617, + "learning_rate": 1.5498e-05, + "loss": 0.0399, + "step": 5172 + }, + { + "epoch": 4.07325718786924, + "grad_norm": 0.5048949718475342, + "learning_rate": 1.5501000000000003e-05, + "loss": 0.0298, + "step": 5173 + }, + { + "epoch": 4.074044899566759, + "grad_norm": 0.5200029015541077, + "learning_rate": 1.5504000000000003e-05, + "loss": 0.038, + "step": 5174 + }, + { + "epoch": 4.074832611264277, + "grad_norm": 0.5954045057296753, + "learning_rate": 1.5507000000000002e-05, + "loss": 0.0553, + "step": 5175 + }, + { + "epoch": 4.075620322961796, + "grad_norm": 0.7046144008636475, + "learning_rate": 1.5510000000000002e-05, + "loss": 0.0374, + "step": 5176 + }, + { + "epoch": 4.076408034659314, + "grad_norm": 0.5807145833969116, + "learning_rate": 1.5513000000000002e-05, + "loss": 0.0317, + "step": 5177 + }, + { + "epoch": 4.077195746356834, + "grad_norm": 0.5352111458778381, + "learning_rate": 1.5516000000000002e-05, + "loss": 0.0344, + "step": 5178 + }, + { + "epoch": 4.077983458054352, + "grad_norm": 0.65446537733078, + "learning_rate": 1.5518999999999998e-05, + "loss": 0.0445, + "step": 5179 + }, + { + "epoch": 4.078771169751871, + "grad_norm": 0.7230373620986938, + "learning_rate": 1.5521999999999998e-05, + "loss": 0.0488, + "step": 5180 + }, + { + "epoch": 4.0795588814493895, + 
"grad_norm": 1.1964952945709229, + "learning_rate": 1.5524999999999998e-05, + "loss": 0.3257, + "step": 5181 + }, + { + "epoch": 4.080346593146908, + "grad_norm": 0.9530233144760132, + "learning_rate": 1.5527999999999998e-05, + "loss": 0.245, + "step": 5182 + }, + { + "epoch": 4.081134304844427, + "grad_norm": 1.227309226989746, + "learning_rate": 1.5531e-05, + "loss": 0.2106, + "step": 5183 + }, + { + "epoch": 4.081922016541945, + "grad_norm": 0.6738479733467102, + "learning_rate": 1.5534e-05, + "loss": 0.147, + "step": 5184 + }, + { + "epoch": 4.082709728239465, + "grad_norm": 0.8704549670219421, + "learning_rate": 1.5537e-05, + "loss": 0.1466, + "step": 5185 + }, + { + "epoch": 4.083497439936983, + "grad_norm": 1.1361274719238281, + "learning_rate": 1.554e-05, + "loss": 0.2033, + "step": 5186 + }, + { + "epoch": 4.084285151634502, + "grad_norm": 0.5256837010383606, + "learning_rate": 1.5543e-05, + "loss": 0.053, + "step": 5187 + }, + { + "epoch": 4.08507286333202, + "grad_norm": 0.4109761118888855, + "learning_rate": 1.5546e-05, + "loss": 0.0396, + "step": 5188 + }, + { + "epoch": 4.085860575029539, + "grad_norm": 0.43289676308631897, + "learning_rate": 1.5549e-05, + "loss": 0.0353, + "step": 5189 + }, + { + "epoch": 4.0866482867270575, + "grad_norm": 0.6068711280822754, + "learning_rate": 1.5552e-05, + "loss": 0.0718, + "step": 5190 + }, + { + "epoch": 4.087435998424577, + "grad_norm": 0.5054280757904053, + "learning_rate": 1.5555e-05, + "loss": 0.0401, + "step": 5191 + }, + { + "epoch": 4.0882237101220955, + "grad_norm": 0.4078775644302368, + "learning_rate": 1.5558e-05, + "loss": 0.0458, + "step": 5192 + }, + { + "epoch": 4.089011421819614, + "grad_norm": 0.36477822065353394, + "learning_rate": 1.5561000000000002e-05, + "loss": 0.0293, + "step": 5193 + }, + { + "epoch": 4.089799133517133, + "grad_norm": 0.5970736145973206, + "learning_rate": 1.5564000000000002e-05, + "loss": 0.033, + "step": 5194 + }, + { + "epoch": 4.090586845214651, + "grad_norm": 0.29677313566207886, + "learning_rate": 1.5567000000000002e-05, + "loss": 0.0229, + "step": 5195 + }, + { + "epoch": 4.09137455691217, + "grad_norm": 0.5101578235626221, + "learning_rate": 1.5570000000000002e-05, + "loss": 0.0332, + "step": 5196 + }, + { + "epoch": 4.092162268609689, + "grad_norm": 1.0249627828598022, + "learning_rate": 1.5573e-05, + "loss": 0.0313, + "step": 5197 + }, + { + "epoch": 4.092949980307208, + "grad_norm": 0.6707391142845154, + "learning_rate": 1.5576e-05, + "loss": 0.0449, + "step": 5198 + }, + { + "epoch": 4.093737692004726, + "grad_norm": 0.3861648440361023, + "learning_rate": 1.5579e-05, + "loss": 0.0296, + "step": 5199 + }, + { + "epoch": 4.094525403702245, + "grad_norm": 0.6181694865226746, + "learning_rate": 1.5582e-05, + "loss": 0.0403, + "step": 5200 + }, + { + "epoch": 4.0953131153997635, + "grad_norm": 0.4755183756351471, + "learning_rate": 1.5585e-05, + "loss": 0.0339, + "step": 5201 + }, + { + "epoch": 4.096100827097282, + "grad_norm": 2.701281785964966, + "learning_rate": 1.5588e-05, + "loss": 0.0281, + "step": 5202 + }, + { + "epoch": 4.0968885387948015, + "grad_norm": 0.7173565626144409, + "learning_rate": 1.5591e-05, + "loss": 0.0321, + "step": 5203 + }, + { + "epoch": 4.09767625049232, + "grad_norm": 0.5894919037818909, + "learning_rate": 1.5594e-05, + "loss": 0.0299, + "step": 5204 + }, + { + "epoch": 4.098463962189839, + "grad_norm": 0.5307112336158752, + "learning_rate": 1.5597e-05, + "loss": 0.0321, + "step": 5205 + }, + { + "epoch": 4.099251673887357, + "grad_norm": 0.7372357249259949, + 
"learning_rate": 1.56e-05, + "loss": 0.0369, + "step": 5206 + }, + { + "epoch": 4.100039385584876, + "grad_norm": 0.38947442173957825, + "learning_rate": 1.5603e-05, + "loss": 0.0219, + "step": 5207 + }, + { + "epoch": 4.100827097282394, + "grad_norm": 0.4829106628894806, + "learning_rate": 1.5606e-05, + "loss": 0.0253, + "step": 5208 + }, + { + "epoch": 4.101614808979913, + "grad_norm": 0.40381288528442383, + "learning_rate": 1.5609e-05, + "loss": 0.023, + "step": 5209 + }, + { + "epoch": 4.102402520677432, + "grad_norm": 0.5647479891777039, + "learning_rate": 1.5612e-05, + "loss": 0.0411, + "step": 5210 + }, + { + "epoch": 4.103190232374951, + "grad_norm": 0.5145600438117981, + "learning_rate": 1.5615e-05, + "loss": 0.0354, + "step": 5211 + }, + { + "epoch": 4.1039779440724695, + "grad_norm": 0.66538405418396, + "learning_rate": 1.5618e-05, + "loss": 0.0252, + "step": 5212 + }, + { + "epoch": 4.104765655769988, + "grad_norm": 0.5543336868286133, + "learning_rate": 1.5621000000000002e-05, + "loss": 0.0279, + "step": 5213 + }, + { + "epoch": 4.105553367467507, + "grad_norm": 0.7167937159538269, + "learning_rate": 1.5624e-05, + "loss": 0.026, + "step": 5214 + }, + { + "epoch": 4.106341079165025, + "grad_norm": 0.5016396045684814, + "learning_rate": 1.5627e-05, + "loss": 0.0213, + "step": 5215 + }, + { + "epoch": 4.107128790862545, + "grad_norm": 0.44494521617889404, + "learning_rate": 1.563e-05, + "loss": 0.0288, + "step": 5216 + }, + { + "epoch": 4.107916502560063, + "grad_norm": 0.6197389960289001, + "learning_rate": 1.5633e-05, + "loss": 0.0407, + "step": 5217 + }, + { + "epoch": 4.108704214257582, + "grad_norm": 0.5299838185310364, + "learning_rate": 1.5636e-05, + "loss": 0.0271, + "step": 5218 + }, + { + "epoch": 4.1094919259551, + "grad_norm": 0.4526289105415344, + "learning_rate": 1.5639e-05, + "loss": 0.0248, + "step": 5219 + }, + { + "epoch": 4.110279637652619, + "grad_norm": 0.8390743732452393, + "learning_rate": 1.5642e-05, + "loss": 0.0332, + "step": 5220 + }, + { + "epoch": 4.1110673493501375, + "grad_norm": 0.5051974058151245, + "learning_rate": 1.5645e-05, + "loss": 0.0287, + "step": 5221 + }, + { + "epoch": 4.111855061047657, + "grad_norm": 0.7835407853126526, + "learning_rate": 1.5648e-05, + "loss": 0.0447, + "step": 5222 + }, + { + "epoch": 4.1126427727451755, + "grad_norm": 0.62204909324646, + "learning_rate": 1.5651000000000003e-05, + "loss": 0.024, + "step": 5223 + }, + { + "epoch": 4.113430484442694, + "grad_norm": 0.7181949615478516, + "learning_rate": 1.5654000000000003e-05, + "loss": 0.0484, + "step": 5224 + }, + { + "epoch": 4.114218196140213, + "grad_norm": 0.55841064453125, + "learning_rate": 1.5657000000000003e-05, + "loss": 0.023, + "step": 5225 + }, + { + "epoch": 4.115005907837731, + "grad_norm": 0.7823938727378845, + "learning_rate": 1.5660000000000003e-05, + "loss": 0.0274, + "step": 5226 + }, + { + "epoch": 4.11579361953525, + "grad_norm": 0.996212363243103, + "learning_rate": 1.5663000000000002e-05, + "loss": 0.0359, + "step": 5227 + }, + { + "epoch": 4.116581331232768, + "grad_norm": 0.681903064250946, + "learning_rate": 1.5666e-05, + "loss": 0.0499, + "step": 5228 + }, + { + "epoch": 4.117369042930288, + "grad_norm": 0.6058109998703003, + "learning_rate": 1.5669e-05, + "loss": 0.0417, + "step": 5229 + }, + { + "epoch": 4.118156754627806, + "grad_norm": 0.664665162563324, + "learning_rate": 1.5672e-05, + "loss": 0.0495, + "step": 5230 + }, + { + "epoch": 4.118944466325325, + "grad_norm": 1.246105670928955, + "learning_rate": 1.5674999999999998e-05, + 
"loss": 0.2983, + "step": 5231 + }, + { + "epoch": 4.1197321780228435, + "grad_norm": 1.1530015468597412, + "learning_rate": 1.5677999999999998e-05, + "loss": 0.3536, + "step": 5232 + }, + { + "epoch": 4.120519889720362, + "grad_norm": 0.8736244440078735, + "learning_rate": 1.5681e-05, + "loss": 0.2393, + "step": 5233 + }, + { + "epoch": 4.121307601417881, + "grad_norm": 1.0364892482757568, + "learning_rate": 1.5684e-05, + "loss": 0.1769, + "step": 5234 + }, + { + "epoch": 4.1220953131154, + "grad_norm": 1.3214889764785767, + "learning_rate": 1.5687e-05, + "loss": 0.1949, + "step": 5235 + }, + { + "epoch": 4.122883024812919, + "grad_norm": 1.0360246896743774, + "learning_rate": 1.569e-05, + "loss": 0.0806, + "step": 5236 + }, + { + "epoch": 4.123670736510437, + "grad_norm": 0.47145238518714905, + "learning_rate": 1.5693e-05, + "loss": 0.0361, + "step": 5237 + }, + { + "epoch": 4.124458448207956, + "grad_norm": 0.35247212648391724, + "learning_rate": 1.5696e-05, + "loss": 0.0413, + "step": 5238 + }, + { + "epoch": 4.125246159905474, + "grad_norm": 0.48994842171669006, + "learning_rate": 1.5699e-05, + "loss": 0.04, + "step": 5239 + }, + { + "epoch": 4.126033871602993, + "grad_norm": 0.40524232387542725, + "learning_rate": 1.5702e-05, + "loss": 0.0392, + "step": 5240 + }, + { + "epoch": 4.126821583300512, + "grad_norm": 1.4073846340179443, + "learning_rate": 1.5705e-05, + "loss": 0.0712, + "step": 5241 + }, + { + "epoch": 4.127609294998031, + "grad_norm": 0.8835009336471558, + "learning_rate": 1.5708e-05, + "loss": 0.0346, + "step": 5242 + }, + { + "epoch": 4.1283970066955495, + "grad_norm": 0.4291238784790039, + "learning_rate": 1.5711000000000003e-05, + "loss": 0.0284, + "step": 5243 + }, + { + "epoch": 4.129184718393068, + "grad_norm": 0.4932642877101898, + "learning_rate": 1.5714000000000002e-05, + "loss": 0.0449, + "step": 5244 + }, + { + "epoch": 4.129972430090587, + "grad_norm": 0.26855969429016113, + "learning_rate": 1.5717000000000002e-05, + "loss": 0.0143, + "step": 5245 + }, + { + "epoch": 4.130760141788105, + "grad_norm": 0.5638571381568909, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.0404, + "step": 5246 + }, + { + "epoch": 4.131547853485625, + "grad_norm": 0.4551561176776886, + "learning_rate": 1.5723000000000002e-05, + "loss": 0.0208, + "step": 5247 + }, + { + "epoch": 4.132335565183143, + "grad_norm": 0.4132668673992157, + "learning_rate": 1.5726e-05, + "loss": 0.019, + "step": 5248 + }, + { + "epoch": 4.133123276880662, + "grad_norm": 0.46898096799850464, + "learning_rate": 1.5729e-05, + "loss": 0.0237, + "step": 5249 + }, + { + "epoch": 4.13391098857818, + "grad_norm": 0.3941424489021301, + "learning_rate": 1.5732e-05, + "loss": 0.019, + "step": 5250 + }, + { + "epoch": 4.134698700275699, + "grad_norm": 1.1930519342422485, + "learning_rate": 1.5735e-05, + "loss": 0.0423, + "step": 5251 + }, + { + "epoch": 4.1354864119732175, + "grad_norm": 0.6496204733848572, + "learning_rate": 1.5737999999999997e-05, + "loss": 0.0328, + "step": 5252 + }, + { + "epoch": 4.136274123670736, + "grad_norm": 0.5842166543006897, + "learning_rate": 1.5741e-05, + "loss": 0.0359, + "step": 5253 + }, + { + "epoch": 4.137061835368256, + "grad_norm": 0.7054948210716248, + "learning_rate": 1.5744e-05, + "loss": 0.0384, + "step": 5254 + }, + { + "epoch": 4.137849547065774, + "grad_norm": 1.0422370433807373, + "learning_rate": 1.5747e-05, + "loss": 0.1007, + "step": 5255 + }, + { + "epoch": 4.138637258763293, + "grad_norm": 0.7228564023971558, + "learning_rate": 1.575e-05, + "loss": 0.0287, + 
"step": 5256 + }, + { + "epoch": 4.139424970460811, + "grad_norm": 0.45464345812797546, + "learning_rate": 1.5753e-05, + "loss": 0.0247, + "step": 5257 + }, + { + "epoch": 4.14021268215833, + "grad_norm": 0.557765007019043, + "learning_rate": 1.5756e-05, + "loss": 0.0299, + "step": 5258 + }, + { + "epoch": 4.141000393855848, + "grad_norm": 0.5768417716026306, + "learning_rate": 1.5759e-05, + "loss": 0.0313, + "step": 5259 + }, + { + "epoch": 4.141788105553368, + "grad_norm": 0.6668322682380676, + "learning_rate": 1.5762e-05, + "loss": 0.037, + "step": 5260 + }, + { + "epoch": 4.142575817250886, + "grad_norm": 0.9883134365081787, + "learning_rate": 1.5765e-05, + "loss": 0.0436, + "step": 5261 + }, + { + "epoch": 4.143363528948405, + "grad_norm": 0.3184272348880768, + "learning_rate": 1.5768e-05, + "loss": 0.0165, + "step": 5262 + }, + { + "epoch": 4.1441512406459236, + "grad_norm": 1.07703697681427, + "learning_rate": 1.5771e-05, + "loss": 0.0276, + "step": 5263 + }, + { + "epoch": 4.144938952343442, + "grad_norm": 4.08777379989624, + "learning_rate": 1.5774000000000002e-05, + "loss": 0.0421, + "step": 5264 + }, + { + "epoch": 4.145726664040961, + "grad_norm": 0.6041809320449829, + "learning_rate": 1.5777e-05, + "loss": 0.0337, + "step": 5265 + }, + { + "epoch": 4.14651437573848, + "grad_norm": 0.3494212031364441, + "learning_rate": 1.578e-05, + "loss": 0.0155, + "step": 5266 + }, + { + "epoch": 4.147302087435999, + "grad_norm": 0.4804694652557373, + "learning_rate": 1.5783e-05, + "loss": 0.0273, + "step": 5267 + }, + { + "epoch": 4.148089799133517, + "grad_norm": 0.5578288435935974, + "learning_rate": 1.5786e-05, + "loss": 0.0344, + "step": 5268 + }, + { + "epoch": 4.148877510831036, + "grad_norm": 0.4417451024055481, + "learning_rate": 1.5789e-05, + "loss": 0.0202, + "step": 5269 + }, + { + "epoch": 4.149665222528554, + "grad_norm": 0.5161182284355164, + "learning_rate": 1.5792e-05, + "loss": 0.039, + "step": 5270 + }, + { + "epoch": 4.150452934226073, + "grad_norm": 0.7209335565567017, + "learning_rate": 1.5795e-05, + "loss": 0.0319, + "step": 5271 + }, + { + "epoch": 4.1512406459235915, + "grad_norm": 0.7133419513702393, + "learning_rate": 1.5798e-05, + "loss": 0.0375, + "step": 5272 + }, + { + "epoch": 4.152028357621111, + "grad_norm": 0.5802139639854431, + "learning_rate": 1.5801e-05, + "loss": 0.0265, + "step": 5273 + }, + { + "epoch": 4.15281606931863, + "grad_norm": 0.9459667801856995, + "learning_rate": 1.5804000000000003e-05, + "loss": 0.0548, + "step": 5274 + }, + { + "epoch": 4.153603781016148, + "grad_norm": 0.5306031107902527, + "learning_rate": 1.5807000000000003e-05, + "loss": 0.0292, + "step": 5275 + }, + { + "epoch": 4.154391492713667, + "grad_norm": 0.6036173701286316, + "learning_rate": 1.5810000000000003e-05, + "loss": 0.0263, + "step": 5276 + }, + { + "epoch": 4.155179204411185, + "grad_norm": 0.633965790271759, + "learning_rate": 1.5813e-05, + "loss": 0.0378, + "step": 5277 + }, + { + "epoch": 4.155966916108704, + "grad_norm": 0.8775047659873962, + "learning_rate": 1.5816e-05, + "loss": 0.0655, + "step": 5278 + }, + { + "epoch": 4.156754627806223, + "grad_norm": 1.3300260305404663, + "learning_rate": 1.5819e-05, + "loss": 0.0441, + "step": 5279 + }, + { + "epoch": 4.157542339503742, + "grad_norm": 1.1343932151794434, + "learning_rate": 1.5822e-05, + "loss": 0.0482, + "step": 5280 + }, + { + "epoch": 4.15833005120126, + "grad_norm": 2.0954556465148926, + "learning_rate": 1.5825e-05, + "loss": 0.4267, + "step": 5281 + }, + { + "epoch": 4.159117762898779, + "grad_norm": 
0.9705693125724792, + "learning_rate": 1.5827999999999998e-05, + "loss": 0.3087, + "step": 5282 + }, + { + "epoch": 4.159905474596298, + "grad_norm": 0.8090206384658813, + "learning_rate": 1.5830999999999998e-05, + "loss": 0.2485, + "step": 5283 + }, + { + "epoch": 4.160693186293816, + "grad_norm": 0.8649008274078369, + "learning_rate": 1.5834e-05, + "loss": 0.2206, + "step": 5284 + }, + { + "epoch": 4.161480897991336, + "grad_norm": 1.0271512269973755, + "learning_rate": 1.5837e-05, + "loss": 0.1283, + "step": 5285 + }, + { + "epoch": 4.162268609688854, + "grad_norm": 1.8673442602157593, + "learning_rate": 1.584e-05, + "loss": 0.1104, + "step": 5286 + }, + { + "epoch": 4.163056321386373, + "grad_norm": 0.965336263179779, + "learning_rate": 1.5843e-05, + "loss": 0.1103, + "step": 5287 + }, + { + "epoch": 4.163844033083891, + "grad_norm": 0.5562377572059631, + "learning_rate": 1.5846e-05, + "loss": 0.0514, + "step": 5288 + }, + { + "epoch": 4.16463174478141, + "grad_norm": 0.42193636298179626, + "learning_rate": 1.5849e-05, + "loss": 0.0445, + "step": 5289 + }, + { + "epoch": 4.165419456478928, + "grad_norm": 0.5654952526092529, + "learning_rate": 1.5852e-05, + "loss": 0.0348, + "step": 5290 + }, + { + "epoch": 4.166207168176447, + "grad_norm": 0.5575994849205017, + "learning_rate": 1.5855e-05, + "loss": 0.0381, + "step": 5291 + }, + { + "epoch": 4.166994879873966, + "grad_norm": 0.304195374250412, + "learning_rate": 1.5858e-05, + "loss": 0.0234, + "step": 5292 + }, + { + "epoch": 4.167782591571485, + "grad_norm": 0.42914271354675293, + "learning_rate": 1.5861e-05, + "loss": 0.0412, + "step": 5293 + }, + { + "epoch": 4.168570303269004, + "grad_norm": 0.3799884617328644, + "learning_rate": 1.5864000000000003e-05, + "loss": 0.023, + "step": 5294 + }, + { + "epoch": 4.169358014966522, + "grad_norm": 0.39390793442726135, + "learning_rate": 1.5867000000000002e-05, + "loss": 0.0173, + "step": 5295 + }, + { + "epoch": 4.170145726664041, + "grad_norm": 0.5305705666542053, + "learning_rate": 1.5870000000000002e-05, + "loss": 0.0419, + "step": 5296 + }, + { + "epoch": 4.170933438361559, + "grad_norm": 0.41983774304389954, + "learning_rate": 1.5873000000000002e-05, + "loss": 0.0275, + "step": 5297 + }, + { + "epoch": 4.171721150059079, + "grad_norm": 0.7027325630187988, + "learning_rate": 1.5876000000000002e-05, + "loss": 0.0327, + "step": 5298 + }, + { + "epoch": 4.172508861756597, + "grad_norm": 0.4275326430797577, + "learning_rate": 1.5879e-05, + "loss": 0.0327, + "step": 5299 + }, + { + "epoch": 4.173296573454116, + "grad_norm": 0.3807413876056671, + "learning_rate": 1.5882e-05, + "loss": 0.0341, + "step": 5300 + }, + { + "epoch": 4.174084285151634, + "grad_norm": 0.8066030740737915, + "learning_rate": 1.5884999999999998e-05, + "loss": 0.0401, + "step": 5301 + }, + { + "epoch": 4.174871996849153, + "grad_norm": 1.4809436798095703, + "learning_rate": 1.5887999999999998e-05, + "loss": 0.0361, + "step": 5302 + }, + { + "epoch": 4.175659708546672, + "grad_norm": 0.6994068026542664, + "learning_rate": 1.5890999999999997e-05, + "loss": 0.0329, + "step": 5303 + }, + { + "epoch": 4.176447420244191, + "grad_norm": 0.6849287152290344, + "learning_rate": 1.5894e-05, + "loss": 0.027, + "step": 5304 + }, + { + "epoch": 4.17723513194171, + "grad_norm": 1.3382529020309448, + "learning_rate": 1.5897e-05, + "loss": 0.027, + "step": 5305 + }, + { + "epoch": 4.178022843639228, + "grad_norm": 0.7416152358055115, + "learning_rate": 1.59e-05, + "loss": 0.0422, + "step": 5306 + }, + { + "epoch": 4.178810555336747, + 
"grad_norm": 0.5530991554260254, + "learning_rate": 1.5903e-05, + "loss": 0.022, + "step": 5307 + }, + { + "epoch": 4.179598267034265, + "grad_norm": 0.4414308965206146, + "learning_rate": 1.5906e-05, + "loss": 0.0307, + "step": 5308 + }, + { + "epoch": 4.180385978731784, + "grad_norm": 0.6009773015975952, + "learning_rate": 1.5909e-05, + "loss": 0.0497, + "step": 5309 + }, + { + "epoch": 4.181173690429302, + "grad_norm": 0.6818907260894775, + "learning_rate": 1.5912e-05, + "loss": 0.0349, + "step": 5310 + }, + { + "epoch": 4.181961402126822, + "grad_norm": 0.431222140789032, + "learning_rate": 1.5915e-05, + "loss": 0.0158, + "step": 5311 + }, + { + "epoch": 4.1827491138243404, + "grad_norm": 0.8006718158721924, + "learning_rate": 1.5918e-05, + "loss": 0.0635, + "step": 5312 + }, + { + "epoch": 4.183536825521859, + "grad_norm": 0.49068138003349304, + "learning_rate": 1.5921e-05, + "loss": 0.0279, + "step": 5313 + }, + { + "epoch": 4.184324537219378, + "grad_norm": 0.6002467274665833, + "learning_rate": 1.5924000000000002e-05, + "loss": 0.0421, + "step": 5314 + }, + { + "epoch": 4.185112248916896, + "grad_norm": 1.0669196844100952, + "learning_rate": 1.5927000000000002e-05, + "loss": 0.0233, + "step": 5315 + }, + { + "epoch": 4.185899960614415, + "grad_norm": 0.30353865027427673, + "learning_rate": 1.593e-05, + "loss": 0.018, + "step": 5316 + }, + { + "epoch": 4.186687672311934, + "grad_norm": 0.8393822312355042, + "learning_rate": 1.5933e-05, + "loss": 0.0315, + "step": 5317 + }, + { + "epoch": 4.187475384009453, + "grad_norm": 0.6385977268218994, + "learning_rate": 1.5936e-05, + "loss": 0.0342, + "step": 5318 + }, + { + "epoch": 4.188263095706971, + "grad_norm": 0.43464693427085876, + "learning_rate": 1.5939e-05, + "loss": 0.0261, + "step": 5319 + }, + { + "epoch": 4.18905080740449, + "grad_norm": 0.4988303780555725, + "learning_rate": 1.5942e-05, + "loss": 0.0261, + "step": 5320 + }, + { + "epoch": 4.189838519102008, + "grad_norm": 0.5071204304695129, + "learning_rate": 1.5945e-05, + "loss": 0.0356, + "step": 5321 + }, + { + "epoch": 4.190626230799527, + "grad_norm": 0.37149736285209656, + "learning_rate": 1.5948e-05, + "loss": 0.0204, + "step": 5322 + }, + { + "epoch": 4.1914139424970465, + "grad_norm": 0.868450939655304, + "learning_rate": 1.5951e-05, + "loss": 0.0303, + "step": 5323 + }, + { + "epoch": 4.192201654194565, + "grad_norm": 0.7367833852767944, + "learning_rate": 1.5954000000000003e-05, + "loss": 0.0414, + "step": 5324 + }, + { + "epoch": 4.192989365892084, + "grad_norm": 0.799275815486908, + "learning_rate": 1.5957000000000003e-05, + "loss": 0.0363, + "step": 5325 + }, + { + "epoch": 4.193777077589602, + "grad_norm": 0.9414764046669006, + "learning_rate": 1.596e-05, + "loss": 0.0508, + "step": 5326 + }, + { + "epoch": 4.194564789287121, + "grad_norm": 0.6381154656410217, + "learning_rate": 1.5963e-05, + "loss": 0.0432, + "step": 5327 + }, + { + "epoch": 4.195352500984639, + "grad_norm": 1.308912992477417, + "learning_rate": 1.5966e-05, + "loss": 0.0583, + "step": 5328 + }, + { + "epoch": 4.196140212682159, + "grad_norm": 0.7556124925613403, + "learning_rate": 1.5969e-05, + "loss": 0.0338, + "step": 5329 + }, + { + "epoch": 4.196927924379677, + "grad_norm": 0.7129773497581482, + "learning_rate": 1.5972e-05, + "loss": 0.034, + "step": 5330 + }, + { + "epoch": 4.197715636077196, + "grad_norm": 1.1226558685302734, + "learning_rate": 1.5975e-05, + "loss": 0.3053, + "step": 5331 + }, + { + "epoch": 4.1985033477747145, + "grad_norm": 1.0396283864974976, + "learning_rate": 
1.5978e-05, + "loss": 0.2912, + "step": 5332 + }, + { + "epoch": 4.199291059472233, + "grad_norm": 0.8802515268325806, + "learning_rate": 1.5980999999999998e-05, + "loss": 0.2453, + "step": 5333 + }, + { + "epoch": 4.200078771169752, + "grad_norm": 0.7219302654266357, + "learning_rate": 1.5984e-05, + "loss": 0.1977, + "step": 5334 + }, + { + "epoch": 4.20086648286727, + "grad_norm": 0.9379021525382996, + "learning_rate": 1.5987e-05, + "loss": 0.1725, + "step": 5335 + }, + { + "epoch": 4.20165419456479, + "grad_norm": 0.48629385232925415, + "learning_rate": 1.599e-05, + "loss": 0.0532, + "step": 5336 + }, + { + "epoch": 4.202441906262308, + "grad_norm": 0.38845980167388916, + "learning_rate": 1.5993e-05, + "loss": 0.0485, + "step": 5337 + }, + { + "epoch": 4.203229617959827, + "grad_norm": 0.29817861318588257, + "learning_rate": 1.5996e-05, + "loss": 0.034, + "step": 5338 + }, + { + "epoch": 4.204017329657345, + "grad_norm": 0.374605655670166, + "learning_rate": 1.5999e-05, + "loss": 0.0238, + "step": 5339 + }, + { + "epoch": 4.204805041354864, + "grad_norm": 0.5779505968093872, + "learning_rate": 1.6002e-05, + "loss": 0.0403, + "step": 5340 + }, + { + "epoch": 4.2055927530523824, + "grad_norm": 0.42204827070236206, + "learning_rate": 1.6005e-05, + "loss": 0.0352, + "step": 5341 + }, + { + "epoch": 4.206380464749902, + "grad_norm": 0.45969831943511963, + "learning_rate": 1.6008e-05, + "loss": 0.0439, + "step": 5342 + }, + { + "epoch": 4.2071681764474205, + "grad_norm": 0.48650965094566345, + "learning_rate": 1.6011e-05, + "loss": 0.0326, + "step": 5343 + }, + { + "epoch": 4.207955888144939, + "grad_norm": 0.49141043424606323, + "learning_rate": 1.6014000000000003e-05, + "loss": 0.0295, + "step": 5344 + }, + { + "epoch": 4.208743599842458, + "grad_norm": 0.25088098645210266, + "learning_rate": 1.6017000000000003e-05, + "loss": 0.0131, + "step": 5345 + }, + { + "epoch": 4.209531311539976, + "grad_norm": 0.9112576842308044, + "learning_rate": 1.6020000000000002e-05, + "loss": 0.0273, + "step": 5346 + }, + { + "epoch": 4.210319023237495, + "grad_norm": 0.7679833173751831, + "learning_rate": 1.6023000000000002e-05, + "loss": 0.0254, + "step": 5347 + }, + { + "epoch": 4.211106734935014, + "grad_norm": 0.4086472988128662, + "learning_rate": 1.6026000000000002e-05, + "loss": 0.0283, + "step": 5348 + }, + { + "epoch": 4.211894446632533, + "grad_norm": 0.275397390127182, + "learning_rate": 1.6029000000000002e-05, + "loss": 0.0149, + "step": 5349 + }, + { + "epoch": 4.212682158330051, + "grad_norm": 0.3842198848724365, + "learning_rate": 1.6032e-05, + "loss": 0.0306, + "step": 5350 + }, + { + "epoch": 4.21346987002757, + "grad_norm": 0.5260108113288879, + "learning_rate": 1.6034999999999998e-05, + "loss": 0.0363, + "step": 5351 + }, + { + "epoch": 4.2142575817250885, + "grad_norm": 0.5494160652160645, + "learning_rate": 1.6037999999999998e-05, + "loss": 0.0245, + "step": 5352 + }, + { + "epoch": 4.215045293422607, + "grad_norm": 0.5652170181274414, + "learning_rate": 1.6040999999999998e-05, + "loss": 0.0164, + "step": 5353 + }, + { + "epoch": 4.2158330051201265, + "grad_norm": 0.8045417666435242, + "learning_rate": 1.6044e-05, + "loss": 0.0378, + "step": 5354 + }, + { + "epoch": 4.216620716817645, + "grad_norm": 0.48764437437057495, + "learning_rate": 1.6047e-05, + "loss": 0.0255, + "step": 5355 + }, + { + "epoch": 4.217408428515164, + "grad_norm": 0.3342662751674652, + "learning_rate": 1.605e-05, + "loss": 0.0245, + "step": 5356 + }, + { + "epoch": 4.218196140212682, + "grad_norm": 
0.6683050990104675, + "learning_rate": 1.6053e-05, + "loss": 0.0306, + "step": 5357 + }, + { + "epoch": 4.218983851910201, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.6056e-05, + "loss": 0.0364, + "step": 5358 + }, + { + "epoch": 4.219771563607719, + "grad_norm": 0.5488101840019226, + "learning_rate": 1.6059e-05, + "loss": 0.0265, + "step": 5359 + }, + { + "epoch": 4.220559275305238, + "grad_norm": 1.0120909214019775, + "learning_rate": 1.6062e-05, + "loss": 0.0221, + "step": 5360 + }, + { + "epoch": 4.221346987002757, + "grad_norm": 0.5006376504898071, + "learning_rate": 1.6065e-05, + "loss": 0.0249, + "step": 5361 + }, + { + "epoch": 4.222134698700276, + "grad_norm": 0.5086765885353088, + "learning_rate": 1.6068e-05, + "loss": 0.0164, + "step": 5362 + }, + { + "epoch": 4.2229224103977945, + "grad_norm": 0.4403406083583832, + "learning_rate": 1.6071e-05, + "loss": 0.0297, + "step": 5363 + }, + { + "epoch": 4.223710122095313, + "grad_norm": 0.7490931749343872, + "learning_rate": 1.6074000000000002e-05, + "loss": 0.0388, + "step": 5364 + }, + { + "epoch": 4.224497833792832, + "grad_norm": 0.7239683866500854, + "learning_rate": 1.6077000000000002e-05, + "loss": 0.0324, + "step": 5365 + }, + { + "epoch": 4.22528554549035, + "grad_norm": 0.4530722200870514, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.0223, + "step": 5366 + }, + { + "epoch": 4.22607325718787, + "grad_norm": 0.7489690184593201, + "learning_rate": 1.6083000000000002e-05, + "loss": 0.0223, + "step": 5367 + }, + { + "epoch": 4.226860968885388, + "grad_norm": 0.5057975053787231, + "learning_rate": 1.6086e-05, + "loss": 0.0228, + "step": 5368 + }, + { + "epoch": 4.227648680582907, + "grad_norm": 1.093441367149353, + "learning_rate": 1.6089e-05, + "loss": 0.0439, + "step": 5369 + }, + { + "epoch": 4.228436392280425, + "grad_norm": 0.4453144967556, + "learning_rate": 1.6092e-05, + "loss": 0.0283, + "step": 5370 + }, + { + "epoch": 4.229224103977944, + "grad_norm": 0.4372335374355316, + "learning_rate": 1.6095e-05, + "loss": 0.0239, + "step": 5371 + }, + { + "epoch": 4.2300118156754625, + "grad_norm": 0.7481223344802856, + "learning_rate": 1.6098e-05, + "loss": 0.0304, + "step": 5372 + }, + { + "epoch": 4.230799527372982, + "grad_norm": 0.7578698992729187, + "learning_rate": 1.6101e-05, + "loss": 0.0353, + "step": 5373 + }, + { + "epoch": 4.2315872390705005, + "grad_norm": 0.6098760366439819, + "learning_rate": 1.6104000000000004e-05, + "loss": 0.0273, + "step": 5374 + }, + { + "epoch": 4.232374950768019, + "grad_norm": 0.6502202153205872, + "learning_rate": 1.6107e-05, + "loss": 0.0418, + "step": 5375 + }, + { + "epoch": 4.233162662465538, + "grad_norm": 0.5339159369468689, + "learning_rate": 1.611e-05, + "loss": 0.0284, + "step": 5376 + }, + { + "epoch": 4.233950374163056, + "grad_norm": 0.4269579350948334, + "learning_rate": 1.6113e-05, + "loss": 0.0165, + "step": 5377 + }, + { + "epoch": 4.234738085860575, + "grad_norm": 0.4549376368522644, + "learning_rate": 1.6116e-05, + "loss": 0.0222, + "step": 5378 + }, + { + "epoch": 4.235525797558093, + "grad_norm": 0.39498022198677063, + "learning_rate": 1.6119e-05, + "loss": 0.0226, + "step": 5379 + }, + { + "epoch": 4.236313509255613, + "grad_norm": 1.0178558826446533, + "learning_rate": 1.6122e-05, + "loss": 0.0452, + "step": 5380 + }, + { + "epoch": 4.237101220953131, + "grad_norm": 1.2170307636260986, + "learning_rate": 1.6125e-05, + "loss": 0.3629, + "step": 5381 + }, + { + "epoch": 4.23788893265065, + "grad_norm": 1.414760708808899, + "learning_rate": 
1.6128e-05, + "loss": 0.2808, + "step": 5382 + }, + { + "epoch": 4.2386766443481685, + "grad_norm": 4.325444221496582, + "learning_rate": 1.6131e-05, + "loss": 0.1959, + "step": 5383 + }, + { + "epoch": 4.239464356045687, + "grad_norm": 1.2144896984100342, + "learning_rate": 1.6134e-05, + "loss": 0.2104, + "step": 5384 + }, + { + "epoch": 4.240252067743206, + "grad_norm": 1.0268961191177368, + "learning_rate": 1.6137e-05, + "loss": 0.1458, + "step": 5385 + }, + { + "epoch": 4.241039779440725, + "grad_norm": 0.6253692507743835, + "learning_rate": 1.614e-05, + "loss": 0.0673, + "step": 5386 + }, + { + "epoch": 4.241827491138244, + "grad_norm": 0.3762063682079315, + "learning_rate": 1.6143e-05, + "loss": 0.0564, + "step": 5387 + }, + { + "epoch": 4.242615202835762, + "grad_norm": 0.4768979251384735, + "learning_rate": 1.6146e-05, + "loss": 0.0639, + "step": 5388 + }, + { + "epoch": 4.243402914533281, + "grad_norm": 0.5175498127937317, + "learning_rate": 1.6149e-05, + "loss": 0.0454, + "step": 5389 + }, + { + "epoch": 4.244190626230799, + "grad_norm": 0.5343297719955444, + "learning_rate": 1.6152e-05, + "loss": 0.0419, + "step": 5390 + }, + { + "epoch": 4.244978337928318, + "grad_norm": 0.5152279734611511, + "learning_rate": 1.6155e-05, + "loss": 0.0342, + "step": 5391 + }, + { + "epoch": 4.245766049625837, + "grad_norm": 0.366193950176239, + "learning_rate": 1.6158e-05, + "loss": 0.0209, + "step": 5392 + }, + { + "epoch": 4.246553761323356, + "grad_norm": 0.5120214223861694, + "learning_rate": 1.6161e-05, + "loss": 0.0248, + "step": 5393 + }, + { + "epoch": 4.2473414730208745, + "grad_norm": 0.6572739481925964, + "learning_rate": 1.6164e-05, + "loss": 0.0397, + "step": 5394 + }, + { + "epoch": 4.248129184718393, + "grad_norm": 0.23423990607261658, + "learning_rate": 1.6167000000000003e-05, + "loss": 0.0138, + "step": 5395 + }, + { + "epoch": 4.248916896415912, + "grad_norm": 0.72197026014328, + "learning_rate": 1.6170000000000003e-05, + "loss": 0.0408, + "step": 5396 + }, + { + "epoch": 4.24970460811343, + "grad_norm": 0.4994426965713501, + "learning_rate": 1.6173000000000003e-05, + "loss": 0.0468, + "step": 5397 + }, + { + "epoch": 4.250492319810949, + "grad_norm": 0.3981834352016449, + "learning_rate": 1.6176000000000002e-05, + "loss": 0.0293, + "step": 5398 + }, + { + "epoch": 4.251280031508468, + "grad_norm": 0.5442331433296204, + "learning_rate": 1.6179000000000002e-05, + "loss": 0.0341, + "step": 5399 + }, + { + "epoch": 4.252067743205987, + "grad_norm": 0.5914718508720398, + "learning_rate": 1.6182e-05, + "loss": 0.0233, + "step": 5400 + }, + { + "epoch": 4.252855454903505, + "grad_norm": 1.0940531492233276, + "learning_rate": 1.6185e-05, + "loss": 0.0268, + "step": 5401 + }, + { + "epoch": 4.253643166601024, + "grad_norm": 0.26145312190055847, + "learning_rate": 1.6187999999999998e-05, + "loss": 0.0178, + "step": 5402 + }, + { + "epoch": 4.2544308782985425, + "grad_norm": 0.5384897589683533, + "learning_rate": 1.6190999999999998e-05, + "loss": 0.0291, + "step": 5403 + }, + { + "epoch": 4.255218589996061, + "grad_norm": 0.5849843621253967, + "learning_rate": 1.6193999999999998e-05, + "loss": 0.035, + "step": 5404 + }, + { + "epoch": 4.2560063016935805, + "grad_norm": 0.6933060884475708, + "learning_rate": 1.6197e-05, + "loss": 0.0345, + "step": 5405 + }, + { + "epoch": 4.256794013391099, + "grad_norm": 0.6185795664787292, + "learning_rate": 1.62e-05, + "loss": 0.0252, + "step": 5406 + }, + { + "epoch": 4.257581725088618, + "grad_norm": 0.4367828667163849, + "learning_rate": 1.6203e-05, 
+ "loss": 0.0297, + "step": 5407 + }, + { + "epoch": 4.258369436786136, + "grad_norm": 0.5462090373039246, + "learning_rate": 1.6206e-05, + "loss": 0.0273, + "step": 5408 + }, + { + "epoch": 4.259157148483655, + "grad_norm": 0.7027225494384766, + "learning_rate": 1.6209e-05, + "loss": 0.0319, + "step": 5409 + }, + { + "epoch": 4.259944860181173, + "grad_norm": 0.4699017405509949, + "learning_rate": 1.6212e-05, + "loss": 0.0197, + "step": 5410 + }, + { + "epoch": 4.260732571878693, + "grad_norm": 0.5413260459899902, + "learning_rate": 1.6215e-05, + "loss": 0.0252, + "step": 5411 + }, + { + "epoch": 4.261520283576211, + "grad_norm": 0.7428691983222961, + "learning_rate": 1.6218e-05, + "loss": 0.0306, + "step": 5412 + }, + { + "epoch": 4.26230799527373, + "grad_norm": 0.4235073924064636, + "learning_rate": 1.6221e-05, + "loss": 0.0319, + "step": 5413 + }, + { + "epoch": 4.2630957069712485, + "grad_norm": 0.7665265202522278, + "learning_rate": 1.6224e-05, + "loss": 0.036, + "step": 5414 + }, + { + "epoch": 4.263883418668767, + "grad_norm": 0.8386309146881104, + "learning_rate": 1.6227000000000002e-05, + "loss": 0.0366, + "step": 5415 + }, + { + "epoch": 4.264671130366286, + "grad_norm": 2.740328073501587, + "learning_rate": 1.6230000000000002e-05, + "loss": 0.0417, + "step": 5416 + }, + { + "epoch": 4.265458842063804, + "grad_norm": 0.5726564526557922, + "learning_rate": 1.6233000000000002e-05, + "loss": 0.0349, + "step": 5417 + }, + { + "epoch": 4.266246553761324, + "grad_norm": 1.13239324092865, + "learning_rate": 1.6236000000000002e-05, + "loss": 0.0421, + "step": 5418 + }, + { + "epoch": 4.267034265458842, + "grad_norm": 1.5934439897537231, + "learning_rate": 1.6239e-05, + "loss": 0.0363, + "step": 5419 + }, + { + "epoch": 4.267821977156361, + "grad_norm": 1.024401307106018, + "learning_rate": 1.6242e-05, + "loss": 0.0287, + "step": 5420 + }, + { + "epoch": 4.268609688853879, + "grad_norm": 1.1866904497146606, + "learning_rate": 1.6245e-05, + "loss": 0.0338, + "step": 5421 + }, + { + "epoch": 4.269397400551398, + "grad_norm": 0.6957995891571045, + "learning_rate": 1.6248e-05, + "loss": 0.0285, + "step": 5422 + }, + { + "epoch": 4.2701851122489165, + "grad_norm": 0.8498385548591614, + "learning_rate": 1.6251e-05, + "loss": 0.0455, + "step": 5423 + }, + { + "epoch": 4.270972823946436, + "grad_norm": 0.7522366046905518, + "learning_rate": 1.6253999999999997e-05, + "loss": 0.0646, + "step": 5424 + }, + { + "epoch": 4.2717605356439545, + "grad_norm": 0.688663125038147, + "learning_rate": 1.6257e-05, + "loss": 0.047, + "step": 5425 + }, + { + "epoch": 4.272548247341473, + "grad_norm": 0.8569754362106323, + "learning_rate": 1.626e-05, + "loss": 0.0561, + "step": 5426 + }, + { + "epoch": 4.273335959038992, + "grad_norm": 1.1460644006729126, + "learning_rate": 1.6263e-05, + "loss": 0.0467, + "step": 5427 + }, + { + "epoch": 4.27412367073651, + "grad_norm": 0.5999512672424316, + "learning_rate": 1.6266e-05, + "loss": 0.0309, + "step": 5428 + }, + { + "epoch": 4.274911382434029, + "grad_norm": 0.7937774062156677, + "learning_rate": 1.6269e-05, + "loss": 0.045, + "step": 5429 + }, + { + "epoch": 4.275699094131548, + "grad_norm": 0.7667547464370728, + "learning_rate": 1.6272e-05, + "loss": 0.029, + "step": 5430 + }, + { + "epoch": 4.276486805829067, + "grad_norm": 1.2545182704925537, + "learning_rate": 1.6275e-05, + "loss": 0.3091, + "step": 5431 + }, + { + "epoch": 4.277274517526585, + "grad_norm": 1.2314057350158691, + "learning_rate": 1.6278e-05, + "loss": 0.2544, + "step": 5432 + }, + { + "epoch": 
4.278062229224104, + "grad_norm": 0.7854353189468384, + "learning_rate": 1.6281e-05, + "loss": 0.2066, + "step": 5433 + }, + { + "epoch": 4.2788499409216225, + "grad_norm": 0.954990804195404, + "learning_rate": 1.6284e-05, + "loss": 0.2193, + "step": 5434 + }, + { + "epoch": 4.279637652619141, + "grad_norm": 1.039833903312683, + "learning_rate": 1.6287000000000002e-05, + "loss": 0.1966, + "step": 5435 + }, + { + "epoch": 4.28042536431666, + "grad_norm": 0.9327968955039978, + "learning_rate": 1.629e-05, + "loss": 0.116, + "step": 5436 + }, + { + "epoch": 4.281213076014179, + "grad_norm": 0.7854710817337036, + "learning_rate": 1.6293e-05, + "loss": 0.1329, + "step": 5437 + }, + { + "epoch": 4.282000787711698, + "grad_norm": 0.601596474647522, + "learning_rate": 1.6296e-05, + "loss": 0.042, + "step": 5438 + }, + { + "epoch": 4.282788499409216, + "grad_norm": 0.4443156123161316, + "learning_rate": 1.6299e-05, + "loss": 0.0294, + "step": 5439 + }, + { + "epoch": 4.283576211106735, + "grad_norm": 0.5384508371353149, + "learning_rate": 1.6302e-05, + "loss": 0.0378, + "step": 5440 + }, + { + "epoch": 4.284363922804253, + "grad_norm": 0.38514769077301025, + "learning_rate": 1.6305e-05, + "loss": 0.0306, + "step": 5441 + }, + { + "epoch": 4.285151634501772, + "grad_norm": 0.3747241199016571, + "learning_rate": 1.6308e-05, + "loss": 0.0215, + "step": 5442 + }, + { + "epoch": 4.285939346199291, + "grad_norm": 0.6433278322219849, + "learning_rate": 1.6311e-05, + "loss": 0.0637, + "step": 5443 + }, + { + "epoch": 4.28672705789681, + "grad_norm": 0.5105169415473938, + "learning_rate": 1.6314e-05, + "loss": 0.0251, + "step": 5444 + }, + { + "epoch": 4.2875147695943285, + "grad_norm": 0.4283963441848755, + "learning_rate": 1.6317000000000003e-05, + "loss": 0.0354, + "step": 5445 + }, + { + "epoch": 4.288302481291847, + "grad_norm": 0.7148115038871765, + "learning_rate": 1.6320000000000003e-05, + "loss": 0.0313, + "step": 5446 + }, + { + "epoch": 4.289090192989366, + "grad_norm": 0.418308287858963, + "learning_rate": 1.6323000000000003e-05, + "loss": 0.0393, + "step": 5447 + }, + { + "epoch": 4.289877904686884, + "grad_norm": 0.37523430585861206, + "learning_rate": 1.6326000000000003e-05, + "loss": 0.0217, + "step": 5448 + }, + { + "epoch": 4.290665616384404, + "grad_norm": 0.6590297222137451, + "learning_rate": 1.6329e-05, + "loss": 0.0367, + "step": 5449 + }, + { + "epoch": 4.291453328081922, + "grad_norm": 0.3814299702644348, + "learning_rate": 1.6332e-05, + "loss": 0.0233, + "step": 5450 + }, + { + "epoch": 4.292241039779441, + "grad_norm": 0.7733478546142578, + "learning_rate": 1.6335e-05, + "loss": 0.0278, + "step": 5451 + }, + { + "epoch": 4.293028751476959, + "grad_norm": 0.40697944164276123, + "learning_rate": 1.6338e-05, + "loss": 0.0321, + "step": 5452 + }, + { + "epoch": 4.293816463174478, + "grad_norm": 0.5142633318901062, + "learning_rate": 1.6340999999999998e-05, + "loss": 0.0462, + "step": 5453 + }, + { + "epoch": 4.2946041748719965, + "grad_norm": 1.5377826690673828, + "learning_rate": 1.6343999999999998e-05, + "loss": 0.0414, + "step": 5454 + }, + { + "epoch": 4.295391886569515, + "grad_norm": 0.40684443712234497, + "learning_rate": 1.6347e-05, + "loss": 0.0271, + "step": 5455 + }, + { + "epoch": 4.2961795982670345, + "grad_norm": 0.5302705764770508, + "learning_rate": 1.635e-05, + "loss": 0.0199, + "step": 5456 + }, + { + "epoch": 4.296967309964553, + "grad_norm": 0.49964243173599243, + "learning_rate": 1.6353e-05, + "loss": 0.0296, + "step": 5457 + }, + { + "epoch": 4.297755021662072, + 
"grad_norm": 0.6453192830085754, + "learning_rate": 1.6356e-05, + "loss": 0.0365, + "step": 5458 + }, + { + "epoch": 4.29854273335959, + "grad_norm": 0.563252329826355, + "learning_rate": 1.6359e-05, + "loss": 0.038, + "step": 5459 + }, + { + "epoch": 4.299330445057109, + "grad_norm": 0.5756945610046387, + "learning_rate": 1.6362e-05, + "loss": 0.02, + "step": 5460 + }, + { + "epoch": 4.300118156754628, + "grad_norm": 0.6741040945053101, + "learning_rate": 1.6365e-05, + "loss": 0.0344, + "step": 5461 + }, + { + "epoch": 4.300905868452147, + "grad_norm": 0.726948618888855, + "learning_rate": 1.6368e-05, + "loss": 0.0246, + "step": 5462 + }, + { + "epoch": 4.301693580149665, + "grad_norm": 0.45328038930892944, + "learning_rate": 1.6371e-05, + "loss": 0.027, + "step": 5463 + }, + { + "epoch": 4.302481291847184, + "grad_norm": 0.6281811594963074, + "learning_rate": 1.6374e-05, + "loss": 0.0368, + "step": 5464 + }, + { + "epoch": 4.3032690035447025, + "grad_norm": 0.5450472235679626, + "learning_rate": 1.6377000000000003e-05, + "loss": 0.037, + "step": 5465 + }, + { + "epoch": 4.304056715242221, + "grad_norm": 0.5573403835296631, + "learning_rate": 1.6380000000000002e-05, + "loss": 0.032, + "step": 5466 + }, + { + "epoch": 4.30484442693974, + "grad_norm": 0.43625080585479736, + "learning_rate": 1.6383000000000002e-05, + "loss": 0.0291, + "step": 5467 + }, + { + "epoch": 4.305632138637259, + "grad_norm": 0.43117931485176086, + "learning_rate": 1.6386000000000002e-05, + "loss": 0.0236, + "step": 5468 + }, + { + "epoch": 4.306419850334778, + "grad_norm": 0.522251307964325, + "learning_rate": 1.6389000000000002e-05, + "loss": 0.0341, + "step": 5469 + }, + { + "epoch": 4.307207562032296, + "grad_norm": 0.7238819003105164, + "learning_rate": 1.6392e-05, + "loss": 0.0272, + "step": 5470 + }, + { + "epoch": 4.307995273729815, + "grad_norm": 0.49936774373054504, + "learning_rate": 1.6395e-05, + "loss": 0.0348, + "step": 5471 + }, + { + "epoch": 4.308782985427333, + "grad_norm": 0.707585334777832, + "learning_rate": 1.6398e-05, + "loss": 0.0481, + "step": 5472 + }, + { + "epoch": 4.309570697124852, + "grad_norm": 0.5981131792068481, + "learning_rate": 1.6400999999999998e-05, + "loss": 0.0261, + "step": 5473 + }, + { + "epoch": 4.310358408822371, + "grad_norm": 0.8090868592262268, + "learning_rate": 1.6403999999999997e-05, + "loss": 0.0443, + "step": 5474 + }, + { + "epoch": 4.31114612051989, + "grad_norm": 1.1202107667922974, + "learning_rate": 1.6407e-05, + "loss": 0.0221, + "step": 5475 + }, + { + "epoch": 4.311933832217409, + "grad_norm": 0.6932339668273926, + "learning_rate": 1.641e-05, + "loss": 0.0262, + "step": 5476 + }, + { + "epoch": 4.312721543914927, + "grad_norm": 0.6104475259780884, + "learning_rate": 1.6413e-05, + "loss": 0.0367, + "step": 5477 + }, + { + "epoch": 4.313509255612446, + "grad_norm": 0.6096362471580505, + "learning_rate": 1.6416e-05, + "loss": 0.0454, + "step": 5478 + }, + { + "epoch": 4.314296967309964, + "grad_norm": 1.0176963806152344, + "learning_rate": 1.6419e-05, + "loss": 0.0556, + "step": 5479 + }, + { + "epoch": 4.315084679007484, + "grad_norm": 0.7015489935874939, + "learning_rate": 1.6422e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 4.315872390705002, + "grad_norm": 2.1326310634613037, + "learning_rate": 1.6425e-05, + "loss": 0.4244, + "step": 5481 + }, + { + "epoch": 4.316660102402521, + "grad_norm": 1.1511051654815674, + "learning_rate": 1.6428e-05, + "loss": 0.283, + "step": 5482 + }, + { + "epoch": 4.317447814100039, + "grad_norm": 
0.7068009376525879, + "learning_rate": 1.6431e-05, + "loss": 0.1748, + "step": 5483 + }, + { + "epoch": 4.318235525797558, + "grad_norm": 0.8990950584411621, + "learning_rate": 1.6434e-05, + "loss": 0.1538, + "step": 5484 + }, + { + "epoch": 4.3190232374950766, + "grad_norm": 0.662264347076416, + "learning_rate": 1.6437000000000002e-05, + "loss": 0.1036, + "step": 5485 + }, + { + "epoch": 4.319810949192595, + "grad_norm": 0.7300097942352295, + "learning_rate": 1.6440000000000002e-05, + "loss": 0.0795, + "step": 5486 + }, + { + "epoch": 4.320598660890115, + "grad_norm": 0.43612903356552124, + "learning_rate": 1.6443e-05, + "loss": 0.0611, + "step": 5487 + }, + { + "epoch": 4.321386372587633, + "grad_norm": 0.40926694869995117, + "learning_rate": 1.6446e-05, + "loss": 0.0454, + "step": 5488 + }, + { + "epoch": 4.322174084285152, + "grad_norm": 0.5997318029403687, + "learning_rate": 1.6449e-05, + "loss": 0.068, + "step": 5489 + }, + { + "epoch": 4.32296179598267, + "grad_norm": 0.4149211049079895, + "learning_rate": 1.6452e-05, + "loss": 0.0336, + "step": 5490 + }, + { + "epoch": 4.323749507680189, + "grad_norm": 0.386126309633255, + "learning_rate": 1.6455e-05, + "loss": 0.0341, + "step": 5491 + }, + { + "epoch": 4.324537219377707, + "grad_norm": 0.4039648771286011, + "learning_rate": 1.6458e-05, + "loss": 0.0207, + "step": 5492 + }, + { + "epoch": 4.325324931075227, + "grad_norm": 0.5084262490272522, + "learning_rate": 1.6461e-05, + "loss": 0.04, + "step": 5493 + }, + { + "epoch": 4.326112642772745, + "grad_norm": 0.4561798870563507, + "learning_rate": 1.6464e-05, + "loss": 0.02, + "step": 5494 + }, + { + "epoch": 4.326900354470264, + "grad_norm": 0.4582112729549408, + "learning_rate": 1.6467000000000003e-05, + "loss": 0.0331, + "step": 5495 + }, + { + "epoch": 4.327688066167783, + "grad_norm": 0.46027588844299316, + "learning_rate": 1.6470000000000003e-05, + "loss": 0.0372, + "step": 5496 + }, + { + "epoch": 4.328475777865301, + "grad_norm": 0.5828298330307007, + "learning_rate": 1.6473000000000003e-05, + "loss": 0.0357, + "step": 5497 + }, + { + "epoch": 4.32926348956282, + "grad_norm": 0.5336993932723999, + "learning_rate": 1.6476e-05, + "loss": 0.0303, + "step": 5498 + }, + { + "epoch": 4.330051201260339, + "grad_norm": 0.7260932922363281, + "learning_rate": 1.6479e-05, + "loss": 0.0381, + "step": 5499 + }, + { + "epoch": 4.330838912957858, + "grad_norm": 0.4434891939163208, + "learning_rate": 1.6482e-05, + "loss": 0.0154, + "step": 5500 + }, + { + "epoch": 4.331626624655376, + "grad_norm": 0.7052164673805237, + "learning_rate": 1.6485e-05, + "loss": 0.0507, + "step": 5501 + }, + { + "epoch": 4.332414336352895, + "grad_norm": 0.6491687297821045, + "learning_rate": 1.6488e-05, + "loss": 0.0232, + "step": 5502 + }, + { + "epoch": 4.333202048050413, + "grad_norm": 0.38996654748916626, + "learning_rate": 1.6491e-05, + "loss": 0.0235, + "step": 5503 + }, + { + "epoch": 4.333989759747932, + "grad_norm": 0.44110456109046936, + "learning_rate": 1.6493999999999998e-05, + "loss": 0.0395, + "step": 5504 + }, + { + "epoch": 4.334777471445451, + "grad_norm": 0.446480929851532, + "learning_rate": 1.6497e-05, + "loss": 0.0257, + "step": 5505 + }, + { + "epoch": 4.33556518314297, + "grad_norm": 0.47964024543762207, + "learning_rate": 1.65e-05, + "loss": 0.0233, + "step": 5506 + }, + { + "epoch": 4.336352894840489, + "grad_norm": 0.7355039119720459, + "learning_rate": 1.6503e-05, + "loss": 0.0373, + "step": 5507 + }, + { + "epoch": 4.337140606538007, + "grad_norm": 0.5044031143188477, + "learning_rate": 
1.6506e-05, + "loss": 0.0542, + "step": 5508 + }, + { + "epoch": 4.337928318235526, + "grad_norm": 0.5245569348335266, + "learning_rate": 1.6509e-05, + "loss": 0.0259, + "step": 5509 + }, + { + "epoch": 4.338716029933044, + "grad_norm": 0.4500069320201874, + "learning_rate": 1.6512e-05, + "loss": 0.0197, + "step": 5510 + }, + { + "epoch": 4.339503741630563, + "grad_norm": 0.42903897166252136, + "learning_rate": 1.6515e-05, + "loss": 0.0286, + "step": 5511 + }, + { + "epoch": 4.340291453328082, + "grad_norm": 0.6164165735244751, + "learning_rate": 1.6518e-05, + "loss": 0.0294, + "step": 5512 + }, + { + "epoch": 4.341079165025601, + "grad_norm": 0.7048717737197876, + "learning_rate": 1.6521e-05, + "loss": 0.0489, + "step": 5513 + }, + { + "epoch": 4.341866876723119, + "grad_norm": 0.408511221408844, + "learning_rate": 1.6524e-05, + "loss": 0.0266, + "step": 5514 + }, + { + "epoch": 4.342654588420638, + "grad_norm": 0.34240710735321045, + "learning_rate": 1.6527e-05, + "loss": 0.0219, + "step": 5515 + }, + { + "epoch": 4.343442300118157, + "grad_norm": 0.6569628119468689, + "learning_rate": 1.6530000000000003e-05, + "loss": 0.0331, + "step": 5516 + }, + { + "epoch": 4.344230011815675, + "grad_norm": 0.5757091045379639, + "learning_rate": 1.6533000000000002e-05, + "loss": 0.0219, + "step": 5517 + }, + { + "epoch": 4.345017723513195, + "grad_norm": 0.6231567859649658, + "learning_rate": 1.6536000000000002e-05, + "loss": 0.0371, + "step": 5518 + }, + { + "epoch": 4.345805435210713, + "grad_norm": 0.7939009666442871, + "learning_rate": 1.6539000000000002e-05, + "loss": 0.0535, + "step": 5519 + }, + { + "epoch": 4.346593146908232, + "grad_norm": 0.7295672297477722, + "learning_rate": 1.6542000000000002e-05, + "loss": 0.032, + "step": 5520 + }, + { + "epoch": 4.34738085860575, + "grad_norm": 0.5322846174240112, + "learning_rate": 1.6545e-05, + "loss": 0.0349, + "step": 5521 + }, + { + "epoch": 4.348168570303269, + "grad_norm": 1.0090066194534302, + "learning_rate": 1.6548e-05, + "loss": 0.0542, + "step": 5522 + }, + { + "epoch": 4.348956282000787, + "grad_norm": 0.49241337180137634, + "learning_rate": 1.6550999999999998e-05, + "loss": 0.0263, + "step": 5523 + }, + { + "epoch": 4.349743993698306, + "grad_norm": 0.47370100021362305, + "learning_rate": 1.6553999999999998e-05, + "loss": 0.026, + "step": 5524 + }, + { + "epoch": 4.3505317053958255, + "grad_norm": 0.6875128746032715, + "learning_rate": 1.6556999999999998e-05, + "loss": 0.0401, + "step": 5525 + }, + { + "epoch": 4.351319417093344, + "grad_norm": 0.6882376074790955, + "learning_rate": 1.656e-05, + "loss": 0.0359, + "step": 5526 + }, + { + "epoch": 4.352107128790863, + "grad_norm": 0.9164915680885315, + "learning_rate": 1.6563e-05, + "loss": 0.0281, + "step": 5527 + }, + { + "epoch": 4.352894840488381, + "grad_norm": 1.2996858358383179, + "learning_rate": 1.6566e-05, + "loss": 0.0394, + "step": 5528 + }, + { + "epoch": 4.3536825521859, + "grad_norm": 0.5053417682647705, + "learning_rate": 1.6569e-05, + "loss": 0.0267, + "step": 5529 + }, + { + "epoch": 4.354470263883418, + "grad_norm": 0.7702280879020691, + "learning_rate": 1.6572e-05, + "loss": 0.0677, + "step": 5530 + }, + { + "epoch": 4.355257975580938, + "grad_norm": 1.690423846244812, + "learning_rate": 1.6575e-05, + "loss": 0.3552, + "step": 5531 + }, + { + "epoch": 4.356045687278456, + "grad_norm": 1.3912962675094604, + "learning_rate": 1.6578e-05, + "loss": 0.2571, + "step": 5532 + }, + { + "epoch": 4.356833398975975, + "grad_norm": 1.169671893119812, + "learning_rate": 1.6581e-05, 
+ "loss": 0.1771, + "step": 5533 + }, + { + "epoch": 4.3576211106734934, + "grad_norm": 0.7269328236579895, + "learning_rate": 1.6584e-05, + "loss": 0.1392, + "step": 5534 + }, + { + "epoch": 4.358408822371012, + "grad_norm": 0.6441969275474548, + "learning_rate": 1.6587e-05, + "loss": 0.0922, + "step": 5535 + }, + { + "epoch": 4.359196534068531, + "grad_norm": 0.5804573893547058, + "learning_rate": 1.6590000000000002e-05, + "loss": 0.0361, + "step": 5536 + }, + { + "epoch": 4.35998424576605, + "grad_norm": 0.6912067532539368, + "learning_rate": 1.6593000000000002e-05, + "loss": 0.0539, + "step": 5537 + }, + { + "epoch": 4.360771957463569, + "grad_norm": 0.49834775924682617, + "learning_rate": 1.6596000000000002e-05, + "loss": 0.0401, + "step": 5538 + }, + { + "epoch": 4.361559669161087, + "grad_norm": 0.3836834728717804, + "learning_rate": 1.6599e-05, + "loss": 0.0358, + "step": 5539 + }, + { + "epoch": 4.362347380858606, + "grad_norm": 0.45484596490859985, + "learning_rate": 1.6602e-05, + "loss": 0.0395, + "step": 5540 + }, + { + "epoch": 4.363135092556124, + "grad_norm": 0.5310401916503906, + "learning_rate": 1.6605e-05, + "loss": 0.0293, + "step": 5541 + }, + { + "epoch": 4.363922804253643, + "grad_norm": 0.31520935893058777, + "learning_rate": 1.6608e-05, + "loss": 0.0188, + "step": 5542 + }, + { + "epoch": 4.364710515951161, + "grad_norm": 0.8239185214042664, + "learning_rate": 1.6611e-05, + "loss": 0.0412, + "step": 5543 + }, + { + "epoch": 4.365498227648681, + "grad_norm": 0.5024242997169495, + "learning_rate": 1.6614e-05, + "loss": 0.0236, + "step": 5544 + }, + { + "epoch": 4.3662859393461995, + "grad_norm": 0.353508323431015, + "learning_rate": 1.6617e-05, + "loss": 0.0239, + "step": 5545 + }, + { + "epoch": 4.367073651043718, + "grad_norm": 1.3363850116729736, + "learning_rate": 1.6620000000000004e-05, + "loss": 0.0356, + "step": 5546 + }, + { + "epoch": 4.367861362741237, + "grad_norm": 1.0271155834197998, + "learning_rate": 1.6623e-05, + "loss": 0.0434, + "step": 5547 + }, + { + "epoch": 4.368649074438755, + "grad_norm": 0.60202556848526, + "learning_rate": 1.6626e-05, + "loss": 0.0345, + "step": 5548 + }, + { + "epoch": 4.369436786136274, + "grad_norm": 0.4361249804496765, + "learning_rate": 1.6629e-05, + "loss": 0.0338, + "step": 5549 + }, + { + "epoch": 4.370224497833793, + "grad_norm": 0.5184730887413025, + "learning_rate": 1.6632e-05, + "loss": 0.0293, + "step": 5550 + }, + { + "epoch": 4.371012209531312, + "grad_norm": 0.6304498314857483, + "learning_rate": 1.6635e-05, + "loss": 0.0237, + "step": 5551 + }, + { + "epoch": 4.37179992122883, + "grad_norm": 0.536281406879425, + "learning_rate": 1.6638e-05, + "loss": 0.0348, + "step": 5552 + }, + { + "epoch": 4.372587632926349, + "grad_norm": 0.37636637687683105, + "learning_rate": 1.6641e-05, + "loss": 0.0428, + "step": 5553 + }, + { + "epoch": 4.3733753446238675, + "grad_norm": 0.5137655735015869, + "learning_rate": 1.6644e-05, + "loss": 0.027, + "step": 5554 + }, + { + "epoch": 4.374163056321386, + "grad_norm": 0.61092609167099, + "learning_rate": 1.6647e-05, + "loss": 0.0507, + "step": 5555 + }, + { + "epoch": 4.3749507680189055, + "grad_norm": 0.6080500483512878, + "learning_rate": 1.665e-05, + "loss": 0.0349, + "step": 5556 + }, + { + "epoch": 4.375738479716424, + "grad_norm": 0.5228967666625977, + "learning_rate": 1.6653e-05, + "loss": 0.0349, + "step": 5557 + }, + { + "epoch": 4.376526191413943, + "grad_norm": 0.5033703446388245, + "learning_rate": 1.6656e-05, + "loss": 0.0289, + "step": 5558 + }, + { + "epoch": 
4.377313903111461, + "grad_norm": 0.4793831706047058, + "learning_rate": 1.6659e-05, + "loss": 0.0289, + "step": 5559 + }, + { + "epoch": 4.37810161480898, + "grad_norm": 0.43951401114463806, + "learning_rate": 1.6662e-05, + "loss": 0.0293, + "step": 5560 + }, + { + "epoch": 4.378889326506498, + "grad_norm": 0.5627932548522949, + "learning_rate": 1.6665e-05, + "loss": 0.04, + "step": 5561 + }, + { + "epoch": 4.379677038204017, + "grad_norm": 1.1188709735870361, + "learning_rate": 1.6668e-05, + "loss": 0.0461, + "step": 5562 + }, + { + "epoch": 4.380464749901536, + "grad_norm": 0.4383431673049927, + "learning_rate": 1.6671e-05, + "loss": 0.0378, + "step": 5563 + }, + { + "epoch": 4.381252461599055, + "grad_norm": 0.6141172051429749, + "learning_rate": 1.6674e-05, + "loss": 0.0343, + "step": 5564 + }, + { + "epoch": 4.3820401732965735, + "grad_norm": 0.6081987619400024, + "learning_rate": 1.6677e-05, + "loss": 0.0334, + "step": 5565 + }, + { + "epoch": 4.382827884994092, + "grad_norm": 0.47505253553390503, + "learning_rate": 1.6680000000000003e-05, + "loss": 0.0281, + "step": 5566 + }, + { + "epoch": 4.383615596691611, + "grad_norm": 0.45470425486564636, + "learning_rate": 1.6683000000000003e-05, + "loss": 0.0299, + "step": 5567 + }, + { + "epoch": 4.384403308389129, + "grad_norm": 0.8246652483940125, + "learning_rate": 1.6686000000000003e-05, + "loss": 0.0498, + "step": 5568 + }, + { + "epoch": 4.385191020086649, + "grad_norm": 1.2133347988128662, + "learning_rate": 1.6689000000000002e-05, + "loss": 0.0319, + "step": 5569 + }, + { + "epoch": 4.385978731784167, + "grad_norm": 0.5785076022148132, + "learning_rate": 1.6692000000000002e-05, + "loss": 0.0351, + "step": 5570 + }, + { + "epoch": 4.386766443481686, + "grad_norm": 0.9229502081871033, + "learning_rate": 1.6695000000000002e-05, + "loss": 0.0364, + "step": 5571 + }, + { + "epoch": 4.387554155179204, + "grad_norm": 0.8189412355422974, + "learning_rate": 1.6698e-05, + "loss": 0.0486, + "step": 5572 + }, + { + "epoch": 4.388341866876723, + "grad_norm": 0.40495991706848145, + "learning_rate": 1.6700999999999998e-05, + "loss": 0.0199, + "step": 5573 + }, + { + "epoch": 4.3891295785742415, + "grad_norm": 0.5168177485466003, + "learning_rate": 1.6703999999999998e-05, + "loss": 0.0329, + "step": 5574 + }, + { + "epoch": 4.389917290271761, + "grad_norm": 0.7120898962020874, + "learning_rate": 1.6706999999999998e-05, + "loss": 0.041, + "step": 5575 + }, + { + "epoch": 4.3907050019692795, + "grad_norm": 0.8351951837539673, + "learning_rate": 1.671e-05, + "loss": 0.0311, + "step": 5576 + }, + { + "epoch": 4.391492713666798, + "grad_norm": 0.5155261158943176, + "learning_rate": 1.6713e-05, + "loss": 0.0333, + "step": 5577 + }, + { + "epoch": 4.392280425364317, + "grad_norm": 0.834392249584198, + "learning_rate": 1.6716e-05, + "loss": 0.0483, + "step": 5578 + }, + { + "epoch": 4.393068137061835, + "grad_norm": 0.7414959669113159, + "learning_rate": 1.6719e-05, + "loss": 0.0377, + "step": 5579 + }, + { + "epoch": 4.393855848759354, + "grad_norm": 0.6478192806243896, + "learning_rate": 1.6722e-05, + "loss": 0.0354, + "step": 5580 + }, + { + "epoch": 4.394643560456872, + "grad_norm": 1.6728357076644897, + "learning_rate": 1.6725e-05, + "loss": 0.3061, + "step": 5581 + }, + { + "epoch": 4.395431272154392, + "grad_norm": 1.0387611389160156, + "learning_rate": 1.6728e-05, + "loss": 0.2737, + "step": 5582 + }, + { + "epoch": 4.39621898385191, + "grad_norm": 1.4023208618164062, + "learning_rate": 1.6731e-05, + "loss": 0.2495, + "step": 5583 + }, + { + 
"epoch": 4.397006695549429, + "grad_norm": 1.7047185897827148, + "learning_rate": 1.6734e-05, + "loss": 0.2139, + "step": 5584 + }, + { + "epoch": 4.3977944072469475, + "grad_norm": 1.0055217742919922, + "learning_rate": 1.6737e-05, + "loss": 0.1811, + "step": 5585 + }, + { + "epoch": 4.398582118944466, + "grad_norm": 0.8143244385719299, + "learning_rate": 1.6740000000000002e-05, + "loss": 0.1293, + "step": 5586 + }, + { + "epoch": 4.3993698306419855, + "grad_norm": 0.6270284652709961, + "learning_rate": 1.6743000000000002e-05, + "loss": 0.0697, + "step": 5587 + }, + { + "epoch": 4.400157542339504, + "grad_norm": 0.6796573400497437, + "learning_rate": 1.6746000000000002e-05, + "loss": 0.0743, + "step": 5588 + }, + { + "epoch": 4.400945254037023, + "grad_norm": 1.215027093887329, + "learning_rate": 1.6749000000000002e-05, + "loss": 0.0465, + "step": 5589 + }, + { + "epoch": 4.401732965734541, + "grad_norm": 0.4517606198787689, + "learning_rate": 1.6752e-05, + "loss": 0.03, + "step": 5590 + }, + { + "epoch": 4.40252067743206, + "grad_norm": 0.41791874170303345, + "learning_rate": 1.6755e-05, + "loss": 0.0362, + "step": 5591 + }, + { + "epoch": 4.403308389129578, + "grad_norm": 0.416753888130188, + "learning_rate": 1.6758e-05, + "loss": 0.034, + "step": 5592 + }, + { + "epoch": 4.404096100827097, + "grad_norm": 0.3561917543411255, + "learning_rate": 1.6761e-05, + "loss": 0.0327, + "step": 5593 + }, + { + "epoch": 4.404883812524616, + "grad_norm": 0.4294114410877228, + "learning_rate": 1.6764e-05, + "loss": 0.0219, + "step": 5594 + }, + { + "epoch": 4.405671524222135, + "grad_norm": 0.5235668420791626, + "learning_rate": 1.6767e-05, + "loss": 0.0314, + "step": 5595 + }, + { + "epoch": 4.4064592359196535, + "grad_norm": 0.36074039340019226, + "learning_rate": 1.677e-05, + "loss": 0.0273, + "step": 5596 + }, + { + "epoch": 4.407246947617172, + "grad_norm": 0.3656328320503235, + "learning_rate": 1.6773e-05, + "loss": 0.0161, + "step": 5597 + }, + { + "epoch": 4.408034659314691, + "grad_norm": 0.3663257360458374, + "learning_rate": 1.6776e-05, + "loss": 0.0203, + "step": 5598 + }, + { + "epoch": 4.408822371012209, + "grad_norm": 0.4894571602344513, + "learning_rate": 1.6779e-05, + "loss": 0.0237, + "step": 5599 + }, + { + "epoch": 4.409610082709729, + "grad_norm": 0.6902357339859009, + "learning_rate": 1.6782e-05, + "loss": 0.0709, + "step": 5600 + }, + { + "epoch": 4.410397794407247, + "grad_norm": 0.4701080024242401, + "learning_rate": 1.6785e-05, + "loss": 0.029, + "step": 5601 + }, + { + "epoch": 4.411185506104766, + "grad_norm": 0.6877861022949219, + "learning_rate": 1.6788e-05, + "loss": 0.0579, + "step": 5602 + }, + { + "epoch": 4.411973217802284, + "grad_norm": 0.648120641708374, + "learning_rate": 1.6791e-05, + "loss": 0.0327, + "step": 5603 + }, + { + "epoch": 4.412760929499803, + "grad_norm": 0.3908609449863434, + "learning_rate": 1.6794e-05, + "loss": 0.0202, + "step": 5604 + }, + { + "epoch": 4.4135486411973215, + "grad_norm": 0.5152987241744995, + "learning_rate": 1.6797e-05, + "loss": 0.0342, + "step": 5605 + }, + { + "epoch": 4.414336352894841, + "grad_norm": 0.5726132988929749, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.0203, + "step": 5606 + }, + { + "epoch": 4.4151240645923595, + "grad_norm": 1.5902345180511475, + "learning_rate": 1.6803e-05, + "loss": 0.0414, + "step": 5607 + }, + { + "epoch": 4.415911776289878, + "grad_norm": 0.37921959161758423, + "learning_rate": 1.6806e-05, + "loss": 0.0236, + "step": 5608 + }, + { + "epoch": 4.416699487987397, + "grad_norm": 
0.48336631059646606, + "learning_rate": 1.6809e-05, + "loss": 0.0331, + "step": 5609 + }, + { + "epoch": 4.417487199684915, + "grad_norm": 0.5485195517539978, + "learning_rate": 1.6812e-05, + "loss": 0.0277, + "step": 5610 + }, + { + "epoch": 4.418274911382434, + "grad_norm": 0.5466840267181396, + "learning_rate": 1.6815e-05, + "loss": 0.0399, + "step": 5611 + }, + { + "epoch": 4.419062623079952, + "grad_norm": 0.4073285758495331, + "learning_rate": 1.6818e-05, + "loss": 0.0263, + "step": 5612 + }, + { + "epoch": 4.419850334777472, + "grad_norm": 0.5566073060035706, + "learning_rate": 1.6821e-05, + "loss": 0.0364, + "step": 5613 + }, + { + "epoch": 4.42063804647499, + "grad_norm": 0.5473660826683044, + "learning_rate": 1.6824e-05, + "loss": 0.0519, + "step": 5614 + }, + { + "epoch": 4.421425758172509, + "grad_norm": 0.480625718832016, + "learning_rate": 1.6827e-05, + "loss": 0.029, + "step": 5615 + }, + { + "epoch": 4.4222134698700275, + "grad_norm": 0.6376765370368958, + "learning_rate": 1.6830000000000003e-05, + "loss": 0.0386, + "step": 5616 + }, + { + "epoch": 4.423001181567546, + "grad_norm": 1.0753384828567505, + "learning_rate": 1.6833000000000003e-05, + "loss": 0.0317, + "step": 5617 + }, + { + "epoch": 4.423788893265065, + "grad_norm": 0.45022812485694885, + "learning_rate": 1.6836000000000003e-05, + "loss": 0.0309, + "step": 5618 + }, + { + "epoch": 4.424576604962584, + "grad_norm": 0.8939723968505859, + "learning_rate": 1.6839000000000003e-05, + "loss": 0.0332, + "step": 5619 + }, + { + "epoch": 4.425364316660103, + "grad_norm": 0.6353593468666077, + "learning_rate": 1.6842000000000002e-05, + "loss": 0.0302, + "step": 5620 + }, + { + "epoch": 4.426152028357621, + "grad_norm": 0.6037166714668274, + "learning_rate": 1.6845e-05, + "loss": 0.0245, + "step": 5621 + }, + { + "epoch": 4.42693974005514, + "grad_norm": 0.7587124705314636, + "learning_rate": 1.6848e-05, + "loss": 0.0355, + "step": 5622 + }, + { + "epoch": 4.427727451752658, + "grad_norm": 0.5048245787620544, + "learning_rate": 1.6851e-05, + "loss": 0.0405, + "step": 5623 + }, + { + "epoch": 4.428515163450177, + "grad_norm": 0.48797059059143066, + "learning_rate": 1.6853999999999998e-05, + "loss": 0.0341, + "step": 5624 + }, + { + "epoch": 4.429302875147696, + "grad_norm": 0.5745418667793274, + "learning_rate": 1.6856999999999998e-05, + "loss": 0.0259, + "step": 5625 + }, + { + "epoch": 4.430090586845215, + "grad_norm": 0.598457932472229, + "learning_rate": 1.686e-05, + "loss": 0.0274, + "step": 5626 + }, + { + "epoch": 4.4308782985427335, + "grad_norm": 3.354126214981079, + "learning_rate": 1.6863e-05, + "loss": 0.0491, + "step": 5627 + }, + { + "epoch": 4.431666010240252, + "grad_norm": 0.4018678069114685, + "learning_rate": 1.6866e-05, + "loss": 0.0265, + "step": 5628 + }, + { + "epoch": 4.432453721937771, + "grad_norm": 0.8255400657653809, + "learning_rate": 1.6869e-05, + "loss": 0.0474, + "step": 5629 + }, + { + "epoch": 4.433241433635289, + "grad_norm": 0.6989447474479675, + "learning_rate": 1.6872e-05, + "loss": 0.0405, + "step": 5630 + }, + { + "epoch": 4.434029145332808, + "grad_norm": 1.4077306985855103, + "learning_rate": 1.6875e-05, + "loss": 0.327, + "step": 5631 + }, + { + "epoch": 4.434816857030327, + "grad_norm": 1.2437041997909546, + "learning_rate": 1.6878e-05, + "loss": 0.2938, + "step": 5632 + }, + { + "epoch": 4.435604568727846, + "grad_norm": 0.7659358382225037, + "learning_rate": 1.6881e-05, + "loss": 0.1749, + "step": 5633 + }, + { + "epoch": 4.436392280425364, + "grad_norm": 0.8643622398376465, + 
"learning_rate": 1.6884e-05, + "loss": 0.1866, + "step": 5634 + }, + { + "epoch": 4.437179992122883, + "grad_norm": 5.231837749481201, + "learning_rate": 1.6887e-05, + "loss": 0.1526, + "step": 5635 + }, + { + "epoch": 4.4379677038204015, + "grad_norm": 1.012130856513977, + "learning_rate": 1.689e-05, + "loss": 0.0922, + "step": 5636 + }, + { + "epoch": 4.43875541551792, + "grad_norm": 0.4509071707725525, + "learning_rate": 1.6893000000000002e-05, + "loss": 0.0454, + "step": 5637 + }, + { + "epoch": 4.4395431272154395, + "grad_norm": 1.2288610935211182, + "learning_rate": 1.6896000000000002e-05, + "loss": 0.0847, + "step": 5638 + }, + { + "epoch": 4.440330838912958, + "grad_norm": 0.27200600504875183, + "learning_rate": 1.6899000000000002e-05, + "loss": 0.0234, + "step": 5639 + }, + { + "epoch": 4.441118550610477, + "grad_norm": 0.27802497148513794, + "learning_rate": 1.6902000000000002e-05, + "loss": 0.0194, + "step": 5640 + }, + { + "epoch": 4.441906262307995, + "grad_norm": 0.5757642388343811, + "learning_rate": 1.6905e-05, + "loss": 0.039, + "step": 5641 + }, + { + "epoch": 4.442693974005514, + "grad_norm": 0.42498964071273804, + "learning_rate": 1.6908e-05, + "loss": 0.0342, + "step": 5642 + }, + { + "epoch": 4.443481685703032, + "grad_norm": 0.7504664659500122, + "learning_rate": 1.6911e-05, + "loss": 0.0471, + "step": 5643 + }, + { + "epoch": 4.444269397400552, + "grad_norm": 0.3895299732685089, + "learning_rate": 1.6914e-05, + "loss": 0.0276, + "step": 5644 + }, + { + "epoch": 4.44505710909807, + "grad_norm": 0.4846903681755066, + "learning_rate": 1.6916999999999997e-05, + "loss": 0.0275, + "step": 5645 + }, + { + "epoch": 4.445844820795589, + "grad_norm": 0.5368661284446716, + "learning_rate": 1.6919999999999997e-05, + "loss": 0.0224, + "step": 5646 + }, + { + "epoch": 4.4466325324931075, + "grad_norm": 0.5669283866882324, + "learning_rate": 1.6923e-05, + "loss": 0.0278, + "step": 5647 + }, + { + "epoch": 4.447420244190626, + "grad_norm": 0.8499312996864319, + "learning_rate": 1.6926e-05, + "loss": 0.0522, + "step": 5648 + }, + { + "epoch": 4.448207955888145, + "grad_norm": 0.35067909955978394, + "learning_rate": 1.6929e-05, + "loss": 0.0295, + "step": 5649 + }, + { + "epoch": 4.448995667585663, + "grad_norm": 0.4815874993801117, + "learning_rate": 1.6932e-05, + "loss": 0.0322, + "step": 5650 + }, + { + "epoch": 4.449783379283183, + "grad_norm": 0.4844551086425781, + "learning_rate": 1.6935e-05, + "loss": 0.0327, + "step": 5651 + }, + { + "epoch": 4.450571090980701, + "grad_norm": 0.4185558259487152, + "learning_rate": 1.6938e-05, + "loss": 0.0279, + "step": 5652 + }, + { + "epoch": 4.45135880267822, + "grad_norm": 0.36351123452186584, + "learning_rate": 1.6941e-05, + "loss": 0.0309, + "step": 5653 + }, + { + "epoch": 4.452146514375738, + "grad_norm": 0.3394171893596649, + "learning_rate": 1.6944e-05, + "loss": 0.0244, + "step": 5654 + }, + { + "epoch": 4.452934226073257, + "grad_norm": 0.5737560987472534, + "learning_rate": 1.6947e-05, + "loss": 0.0441, + "step": 5655 + }, + { + "epoch": 4.4537219377707755, + "grad_norm": 0.4562782645225525, + "learning_rate": 1.695e-05, + "loss": 0.0222, + "step": 5656 + }, + { + "epoch": 4.454509649468295, + "grad_norm": 0.5640928745269775, + "learning_rate": 1.6953000000000002e-05, + "loss": 0.0381, + "step": 5657 + }, + { + "epoch": 4.4552973611658135, + "grad_norm": 0.4557448923587799, + "learning_rate": 1.6956e-05, + "loss": 0.0258, + "step": 5658 + }, + { + "epoch": 4.456085072863332, + "grad_norm": 1.0970373153686523, + "learning_rate": 
1.6959e-05, + "loss": 0.034, + "step": 5659 + }, + { + "epoch": 4.456872784560851, + "grad_norm": 0.6354279518127441, + "learning_rate": 1.6962e-05, + "loss": 0.0313, + "step": 5660 + }, + { + "epoch": 4.457660496258369, + "grad_norm": 0.40942659974098206, + "learning_rate": 1.6965e-05, + "loss": 0.0213, + "step": 5661 + }, + { + "epoch": 4.458448207955888, + "grad_norm": 0.5584750175476074, + "learning_rate": 1.6968e-05, + "loss": 0.0412, + "step": 5662 + }, + { + "epoch": 4.459235919653407, + "grad_norm": 1.017833948135376, + "learning_rate": 1.6971e-05, + "loss": 0.0288, + "step": 5663 + }, + { + "epoch": 4.460023631350926, + "grad_norm": 0.4113842844963074, + "learning_rate": 1.6974e-05, + "loss": 0.0342, + "step": 5664 + }, + { + "epoch": 4.460811343048444, + "grad_norm": 0.7468014359474182, + "learning_rate": 1.6977e-05, + "loss": 0.0433, + "step": 5665 + }, + { + "epoch": 4.461599054745963, + "grad_norm": 0.5439751148223877, + "learning_rate": 1.698e-05, + "loss": 0.0311, + "step": 5666 + }, + { + "epoch": 4.4623867664434815, + "grad_norm": 0.5608392953872681, + "learning_rate": 1.6983000000000003e-05, + "loss": 0.0254, + "step": 5667 + }, + { + "epoch": 4.463174478141, + "grad_norm": 0.48605045676231384, + "learning_rate": 1.6986000000000003e-05, + "loss": 0.0312, + "step": 5668 + }, + { + "epoch": 4.463962189838519, + "grad_norm": 0.4895947277545929, + "learning_rate": 1.6989000000000003e-05, + "loss": 0.0399, + "step": 5669 + }, + { + "epoch": 4.464749901536038, + "grad_norm": 0.5717703104019165, + "learning_rate": 1.6992e-05, + "loss": 0.0339, + "step": 5670 + }, + { + "epoch": 4.465537613233557, + "grad_norm": 0.5972121953964233, + "learning_rate": 1.6995e-05, + "loss": 0.0345, + "step": 5671 + }, + { + "epoch": 4.466325324931075, + "grad_norm": 0.6546984910964966, + "learning_rate": 1.6998e-05, + "loss": 0.047, + "step": 5672 + }, + { + "epoch": 4.467113036628594, + "grad_norm": 0.7289057970046997, + "learning_rate": 1.7001e-05, + "loss": 0.0375, + "step": 5673 + }, + { + "epoch": 4.467900748326112, + "grad_norm": 0.5710685849189758, + "learning_rate": 1.7004e-05, + "loss": 0.0277, + "step": 5674 + }, + { + "epoch": 4.468688460023631, + "grad_norm": 1.9970518350601196, + "learning_rate": 1.7006999999999998e-05, + "loss": 0.046, + "step": 5675 + }, + { + "epoch": 4.46947617172115, + "grad_norm": 0.5495120286941528, + "learning_rate": 1.7009999999999998e-05, + "loss": 0.0379, + "step": 5676 + }, + { + "epoch": 4.470263883418669, + "grad_norm": 0.6093716621398926, + "learning_rate": 1.7013e-05, + "loss": 0.0371, + "step": 5677 + }, + { + "epoch": 4.4710515951161875, + "grad_norm": 0.605340301990509, + "learning_rate": 1.7016e-05, + "loss": 0.0527, + "step": 5678 + }, + { + "epoch": 4.471839306813706, + "grad_norm": 0.5411669015884399, + "learning_rate": 1.7019e-05, + "loss": 0.0397, + "step": 5679 + }, + { + "epoch": 4.472627018511225, + "grad_norm": 0.904004693031311, + "learning_rate": 1.7022e-05, + "loss": 0.0465, + "step": 5680 + }, + { + "epoch": 4.473414730208743, + "grad_norm": 1.697760820388794, + "learning_rate": 1.7025e-05, + "loss": 0.4254, + "step": 5681 + }, + { + "epoch": 4.474202441906263, + "grad_norm": 1.0868042707443237, + "learning_rate": 1.7028e-05, + "loss": 0.3007, + "step": 5682 + }, + { + "epoch": 4.474990153603781, + "grad_norm": 0.6356713175773621, + "learning_rate": 1.7031e-05, + "loss": 0.176, + "step": 5683 + }, + { + "epoch": 4.4757778653013, + "grad_norm": 0.7274854183197021, + "learning_rate": 1.7034e-05, + "loss": 0.2096, + "step": 5684 + }, + { + 
"epoch": 4.476565576998818, + "grad_norm": 0.6512429118156433, + "learning_rate": 1.7037e-05, + "loss": 0.1032, + "step": 5685 + }, + { + "epoch": 4.477353288696337, + "grad_norm": 0.5995899438858032, + "learning_rate": 1.704e-05, + "loss": 0.0826, + "step": 5686 + }, + { + "epoch": 4.4781410003938555, + "grad_norm": 1.020365595817566, + "learning_rate": 1.7043000000000003e-05, + "loss": 0.0511, + "step": 5687 + }, + { + "epoch": 4.478928712091374, + "grad_norm": 1.1536352634429932, + "learning_rate": 1.7046000000000002e-05, + "loss": 0.0885, + "step": 5688 + }, + { + "epoch": 4.479716423788894, + "grad_norm": 0.48851317167282104, + "learning_rate": 1.7049000000000002e-05, + "loss": 0.0444, + "step": 5689 + }, + { + "epoch": 4.480504135486412, + "grad_norm": 0.27225714921951294, + "learning_rate": 1.7052000000000002e-05, + "loss": 0.0213, + "step": 5690 + }, + { + "epoch": 4.481291847183931, + "grad_norm": 0.4160163402557373, + "learning_rate": 1.7055000000000002e-05, + "loss": 0.0359, + "step": 5691 + }, + { + "epoch": 4.482079558881449, + "grad_norm": 0.6039144992828369, + "learning_rate": 1.7058e-05, + "loss": 0.0274, + "step": 5692 + }, + { + "epoch": 4.482867270578968, + "grad_norm": 0.4128055274486542, + "learning_rate": 1.7061e-05, + "loss": 0.0435, + "step": 5693 + }, + { + "epoch": 4.483654982276486, + "grad_norm": 0.5393528342247009, + "learning_rate": 1.7064e-05, + "loss": 0.0686, + "step": 5694 + }, + { + "epoch": 4.484442693974006, + "grad_norm": 0.5349401235580444, + "learning_rate": 1.7066999999999998e-05, + "loss": 0.0432, + "step": 5695 + }, + { + "epoch": 4.485230405671524, + "grad_norm": 0.7119920253753662, + "learning_rate": 1.7069999999999998e-05, + "loss": 0.0401, + "step": 5696 + }, + { + "epoch": 4.486018117369043, + "grad_norm": 0.8904191255569458, + "learning_rate": 1.7073e-05, + "loss": 0.0361, + "step": 5697 + }, + { + "epoch": 4.486805829066562, + "grad_norm": 0.4527822732925415, + "learning_rate": 1.7076e-05, + "loss": 0.0269, + "step": 5698 + }, + { + "epoch": 4.48759354076408, + "grad_norm": 1.2320470809936523, + "learning_rate": 1.7079e-05, + "loss": 0.016, + "step": 5699 + }, + { + "epoch": 4.488381252461599, + "grad_norm": 0.555717945098877, + "learning_rate": 1.7082e-05, + "loss": 0.0192, + "step": 5700 + }, + { + "epoch": 4.489168964159118, + "grad_norm": 0.378844290971756, + "learning_rate": 1.7085e-05, + "loss": 0.0222, + "step": 5701 + }, + { + "epoch": 4.489956675856637, + "grad_norm": 0.49924302101135254, + "learning_rate": 1.7088e-05, + "loss": 0.0304, + "step": 5702 + }, + { + "epoch": 4.490744387554155, + "grad_norm": 0.595614492893219, + "learning_rate": 1.7091e-05, + "loss": 0.0337, + "step": 5703 + }, + { + "epoch": 4.491532099251674, + "grad_norm": 0.4140430986881256, + "learning_rate": 1.7094e-05, + "loss": 0.0305, + "step": 5704 + }, + { + "epoch": 4.492319810949192, + "grad_norm": 0.6601040959358215, + "learning_rate": 1.7097e-05, + "loss": 0.0347, + "step": 5705 + }, + { + "epoch": 4.493107522646711, + "grad_norm": 0.5778909921646118, + "learning_rate": 1.71e-05, + "loss": 0.0239, + "step": 5706 + }, + { + "epoch": 4.4938952343442296, + "grad_norm": 0.4603844881057739, + "learning_rate": 1.7103000000000002e-05, + "loss": 0.0243, + "step": 5707 + }, + { + "epoch": 4.494682946041749, + "grad_norm": 0.4715285003185272, + "learning_rate": 1.7106000000000002e-05, + "loss": 0.0186, + "step": 5708 + }, + { + "epoch": 4.495470657739268, + "grad_norm": 0.6779530644416809, + "learning_rate": 1.7109000000000002e-05, + "loss": 0.0363, + "step": 5709 
+ }, + { + "epoch": 4.496258369436786, + "grad_norm": 0.5832409858703613, + "learning_rate": 1.7112e-05, + "loss": 0.0226, + "step": 5710 + }, + { + "epoch": 4.497046081134305, + "grad_norm": 0.5370351672172546, + "learning_rate": 1.7115e-05, + "loss": 0.0358, + "step": 5711 + }, + { + "epoch": 4.497833792831823, + "grad_norm": 0.5922435522079468, + "learning_rate": 1.7118e-05, + "loss": 0.0275, + "step": 5712 + }, + { + "epoch": 4.498621504529343, + "grad_norm": 0.6523492336273193, + "learning_rate": 1.7121e-05, + "loss": 0.0233, + "step": 5713 + }, + { + "epoch": 4.499409216226861, + "grad_norm": 0.4124923646450043, + "learning_rate": 1.7124e-05, + "loss": 0.0241, + "step": 5714 + }, + { + "epoch": 4.50019692792438, + "grad_norm": 0.3918405771255493, + "learning_rate": 1.7127e-05, + "loss": 0.0238, + "step": 5715 + }, + { + "epoch": 4.500984639621898, + "grad_norm": 0.562737226486206, + "learning_rate": 1.713e-05, + "loss": 0.0285, + "step": 5716 + }, + { + "epoch": 4.501772351319417, + "grad_norm": 0.5968077182769775, + "learning_rate": 1.7133000000000004e-05, + "loss": 0.033, + "step": 5717 + }, + { + "epoch": 4.502560063016936, + "grad_norm": 0.5983127355575562, + "learning_rate": 1.7136000000000003e-05, + "loss": 0.0326, + "step": 5718 + }, + { + "epoch": 4.503347774714454, + "grad_norm": 0.7573153972625732, + "learning_rate": 1.7139e-05, + "loss": 0.0295, + "step": 5719 + }, + { + "epoch": 4.504135486411974, + "grad_norm": 1.015745997428894, + "learning_rate": 1.7142e-05, + "loss": 0.033, + "step": 5720 + }, + { + "epoch": 4.504923198109492, + "grad_norm": 0.6835496425628662, + "learning_rate": 1.7145e-05, + "loss": 0.03, + "step": 5721 + }, + { + "epoch": 4.505710909807011, + "grad_norm": 0.49883508682250977, + "learning_rate": 1.7148e-05, + "loss": 0.0332, + "step": 5722 + }, + { + "epoch": 4.506498621504529, + "grad_norm": 0.8356814980506897, + "learning_rate": 1.7151e-05, + "loss": 0.0288, + "step": 5723 + }, + { + "epoch": 4.507286333202048, + "grad_norm": 0.4436112940311432, + "learning_rate": 1.7154e-05, + "loss": 0.0229, + "step": 5724 + }, + { + "epoch": 4.508074044899566, + "grad_norm": 0.7704896330833435, + "learning_rate": 1.7157e-05, + "loss": 0.0339, + "step": 5725 + }, + { + "epoch": 4.508861756597085, + "grad_norm": 0.7145609855651855, + "learning_rate": 1.716e-05, + "loss": 0.0498, + "step": 5726 + }, + { + "epoch": 4.5096494682946044, + "grad_norm": 0.6906453371047974, + "learning_rate": 1.7163e-05, + "loss": 0.0343, + "step": 5727 + }, + { + "epoch": 4.510437179992123, + "grad_norm": 0.5356752872467041, + "learning_rate": 1.7166e-05, + "loss": 0.0312, + "step": 5728 + }, + { + "epoch": 4.511224891689642, + "grad_norm": 0.9311608672142029, + "learning_rate": 1.7169e-05, + "loss": 0.0521, + "step": 5729 + }, + { + "epoch": 4.51201260338716, + "grad_norm": 0.9404779672622681, + "learning_rate": 1.7172e-05, + "loss": 0.0749, + "step": 5730 + }, + { + "epoch": 4.512800315084679, + "grad_norm": 1.27354896068573, + "learning_rate": 1.7175e-05, + "loss": 0.3252, + "step": 5731 + }, + { + "epoch": 4.513588026782198, + "grad_norm": 0.9987229704856873, + "learning_rate": 1.7178e-05, + "loss": 0.3242, + "step": 5732 + }, + { + "epoch": 4.514375738479717, + "grad_norm": 1.4107047319412231, + "learning_rate": 1.7181e-05, + "loss": 0.2875, + "step": 5733 + }, + { + "epoch": 4.515163450177235, + "grad_norm": 1.0935293436050415, + "learning_rate": 1.7184e-05, + "loss": 0.191, + "step": 5734 + }, + { + "epoch": 4.515951161874754, + "grad_norm": 0.6283718943595886, + 
"learning_rate": 1.7187e-05, + "loss": 0.1179, + "step": 5735 + }, + { + "epoch": 4.516738873572272, + "grad_norm": 0.5520817041397095, + "learning_rate": 1.719e-05, + "loss": 0.0739, + "step": 5736 + }, + { + "epoch": 4.517526585269791, + "grad_norm": 0.7175590395927429, + "learning_rate": 1.7193000000000003e-05, + "loss": 0.076, + "step": 5737 + }, + { + "epoch": 4.51831429696731, + "grad_norm": 0.5442875027656555, + "learning_rate": 1.7196000000000003e-05, + "loss": 0.058, + "step": 5738 + }, + { + "epoch": 4.519102008664829, + "grad_norm": 0.3517504334449768, + "learning_rate": 1.7199000000000003e-05, + "loss": 0.0358, + "step": 5739 + }, + { + "epoch": 4.519889720362348, + "grad_norm": 0.632746160030365, + "learning_rate": 1.7202000000000002e-05, + "loss": 0.0662, + "step": 5740 + }, + { + "epoch": 4.520677432059866, + "grad_norm": 0.531185507774353, + "learning_rate": 1.7205000000000002e-05, + "loss": 0.0419, + "step": 5741 + }, + { + "epoch": 4.521465143757385, + "grad_norm": 0.8108306527137756, + "learning_rate": 1.7208000000000002e-05, + "loss": 0.089, + "step": 5742 + }, + { + "epoch": 4.522252855454903, + "grad_norm": 0.8774417638778687, + "learning_rate": 1.7211000000000002e-05, + "loss": 0.1107, + "step": 5743 + }, + { + "epoch": 4.523040567152422, + "grad_norm": 0.6466221213340759, + "learning_rate": 1.7213999999999998e-05, + "loss": 0.0247, + "step": 5744 + }, + { + "epoch": 4.52382827884994, + "grad_norm": 0.4800305962562561, + "learning_rate": 1.7216999999999998e-05, + "loss": 0.0244, + "step": 5745 + }, + { + "epoch": 4.52461599054746, + "grad_norm": 0.32816749811172485, + "learning_rate": 1.7219999999999998e-05, + "loss": 0.0188, + "step": 5746 + }, + { + "epoch": 4.5254037022449785, + "grad_norm": 0.46812939643859863, + "learning_rate": 1.7223e-05, + "loss": 0.0295, + "step": 5747 + }, + { + "epoch": 4.526191413942497, + "grad_norm": 0.31909239292144775, + "learning_rate": 1.7226e-05, + "loss": 0.0173, + "step": 5748 + }, + { + "epoch": 4.526979125640016, + "grad_norm": 0.5626298785209656, + "learning_rate": 1.7229e-05, + "loss": 0.0464, + "step": 5749 + }, + { + "epoch": 4.527766837337534, + "grad_norm": 0.35416099429130554, + "learning_rate": 1.7232e-05, + "loss": 0.0182, + "step": 5750 + }, + { + "epoch": 4.528554549035054, + "grad_norm": 0.5092756152153015, + "learning_rate": 1.7235e-05, + "loss": 0.0293, + "step": 5751 + }, + { + "epoch": 4.529342260732572, + "grad_norm": 0.21012556552886963, + "learning_rate": 1.7238e-05, + "loss": 0.0131, + "step": 5752 + }, + { + "epoch": 4.530129972430091, + "grad_norm": 2.34135365486145, + "learning_rate": 1.7241e-05, + "loss": 0.0525, + "step": 5753 + }, + { + "epoch": 4.530917684127609, + "grad_norm": 1.086956262588501, + "learning_rate": 1.7244e-05, + "loss": 0.0389, + "step": 5754 + }, + { + "epoch": 4.531705395825128, + "grad_norm": 0.6052951812744141, + "learning_rate": 1.7247e-05, + "loss": 0.0431, + "step": 5755 + }, + { + "epoch": 4.5324931075226464, + "grad_norm": 0.37891513109207153, + "learning_rate": 1.725e-05, + "loss": 0.0215, + "step": 5756 + }, + { + "epoch": 4.533280819220165, + "grad_norm": 0.6081128120422363, + "learning_rate": 1.7253e-05, + "loss": 0.035, + "step": 5757 + }, + { + "epoch": 4.5340685309176845, + "grad_norm": 0.6718795895576477, + "learning_rate": 1.7256000000000002e-05, + "loss": 0.053, + "step": 5758 + }, + { + "epoch": 4.534856242615203, + "grad_norm": 0.39229732751846313, + "learning_rate": 1.7259000000000002e-05, + "loss": 0.0231, + "step": 5759 + }, + { + "epoch": 4.535643954312722, + 
"grad_norm": 0.7960717678070068, + "learning_rate": 1.7262000000000002e-05, + "loss": 0.0476, + "step": 5760 + }, + { + "epoch": 4.53643166601024, + "grad_norm": 0.5521798729896545, + "learning_rate": 1.7265e-05, + "loss": 0.0219, + "step": 5761 + }, + { + "epoch": 4.537219377707759, + "grad_norm": 1.1444835662841797, + "learning_rate": 1.7268e-05, + "loss": 0.0398, + "step": 5762 + }, + { + "epoch": 4.538007089405277, + "grad_norm": 0.9625500440597534, + "learning_rate": 1.7271e-05, + "loss": 0.0384, + "step": 5763 + }, + { + "epoch": 4.538794801102797, + "grad_norm": 0.6718111038208008, + "learning_rate": 1.7274e-05, + "loss": 0.0491, + "step": 5764 + }, + { + "epoch": 4.539582512800315, + "grad_norm": 0.621269941329956, + "learning_rate": 1.7277e-05, + "loss": 0.0321, + "step": 5765 + }, + { + "epoch": 4.540370224497834, + "grad_norm": 0.6033405661582947, + "learning_rate": 1.728e-05, + "loss": 0.0329, + "step": 5766 + }, + { + "epoch": 4.5411579361953525, + "grad_norm": 0.3415120542049408, + "learning_rate": 1.7283e-05, + "loss": 0.0263, + "step": 5767 + }, + { + "epoch": 4.541945647892871, + "grad_norm": 0.5148749947547913, + "learning_rate": 1.7286e-05, + "loss": 0.0344, + "step": 5768 + }, + { + "epoch": 4.54273335959039, + "grad_norm": 0.5803442001342773, + "learning_rate": 1.7289e-05, + "loss": 0.0374, + "step": 5769 + }, + { + "epoch": 4.543521071287909, + "grad_norm": 0.3924175798892975, + "learning_rate": 1.7292e-05, + "loss": 0.0346, + "step": 5770 + }, + { + "epoch": 4.544308782985428, + "grad_norm": 0.6841103434562683, + "learning_rate": 1.7295e-05, + "loss": 0.0391, + "step": 5771 + }, + { + "epoch": 4.545096494682946, + "grad_norm": 0.3460201919078827, + "learning_rate": 1.7298e-05, + "loss": 0.0222, + "step": 5772 + }, + { + "epoch": 4.545884206380465, + "grad_norm": 0.5031092166900635, + "learning_rate": 1.7301e-05, + "loss": 0.0265, + "step": 5773 + }, + { + "epoch": 4.546671918077983, + "grad_norm": 0.4734443724155426, + "learning_rate": 1.7304e-05, + "loss": 0.0361, + "step": 5774 + }, + { + "epoch": 4.547459629775502, + "grad_norm": 0.9932732582092285, + "learning_rate": 1.7307e-05, + "loss": 0.0399, + "step": 5775 + }, + { + "epoch": 4.5482473414730205, + "grad_norm": 0.4729170501232147, + "learning_rate": 1.731e-05, + "loss": 0.0471, + "step": 5776 + }, + { + "epoch": 4.54903505317054, + "grad_norm": 0.5124385356903076, + "learning_rate": 1.7313e-05, + "loss": 0.041, + "step": 5777 + }, + { + "epoch": 4.5498227648680585, + "grad_norm": 1.3751546144485474, + "learning_rate": 1.7316e-05, + "loss": 0.0359, + "step": 5778 + }, + { + "epoch": 4.550610476565577, + "grad_norm": 0.944659411907196, + "learning_rate": 1.7319e-05, + "loss": 0.0431, + "step": 5779 + }, + { + "epoch": 4.551398188263096, + "grad_norm": 1.2940387725830078, + "learning_rate": 1.7322e-05, + "loss": 0.0508, + "step": 5780 + }, + { + "epoch": 4.552185899960614, + "grad_norm": 1.1282494068145752, + "learning_rate": 1.7325e-05, + "loss": 0.355, + "step": 5781 + }, + { + "epoch": 4.552973611658133, + "grad_norm": 0.9558836817741394, + "learning_rate": 1.7328e-05, + "loss": 0.2217, + "step": 5782 + }, + { + "epoch": 4.553761323355652, + "grad_norm": 0.6273627281188965, + "learning_rate": 1.7331e-05, + "loss": 0.1639, + "step": 5783 + }, + { + "epoch": 4.554549035053171, + "grad_norm": 0.891825258731842, + "learning_rate": 1.7334e-05, + "loss": 0.1766, + "step": 5784 + }, + { + "epoch": 4.555336746750689, + "grad_norm": 0.9057517051696777, + "learning_rate": 1.7337e-05, + "loss": 0.1414, + "step": 5785 + 
}, + { + "epoch": 4.556124458448208, + "grad_norm": 0.7238613963127136, + "learning_rate": 1.734e-05, + "loss": 0.0913, + "step": 5786 + }, + { + "epoch": 4.5569121701457265, + "grad_norm": 0.9691334962844849, + "learning_rate": 1.7343e-05, + "loss": 0.0927, + "step": 5787 + }, + { + "epoch": 4.557699881843245, + "grad_norm": 0.5636436939239502, + "learning_rate": 1.7346000000000003e-05, + "loss": 0.0431, + "step": 5788 + }, + { + "epoch": 4.5584875935407645, + "grad_norm": 0.6849561333656311, + "learning_rate": 1.7349000000000003e-05, + "loss": 0.0517, + "step": 5789 + }, + { + "epoch": 4.559275305238283, + "grad_norm": 0.52740478515625, + "learning_rate": 1.7352000000000003e-05, + "loss": 0.0423, + "step": 5790 + }, + { + "epoch": 4.560063016935802, + "grad_norm": 0.6253882646560669, + "learning_rate": 1.7355000000000002e-05, + "loss": 0.0335, + "step": 5791 + }, + { + "epoch": 4.56085072863332, + "grad_norm": 0.5060365796089172, + "learning_rate": 1.7358000000000002e-05, + "loss": 0.0233, + "step": 5792 + }, + { + "epoch": 4.561638440330839, + "grad_norm": 0.4810250699520111, + "learning_rate": 1.7361e-05, + "loss": 0.0262, + "step": 5793 + }, + { + "epoch": 4.562426152028357, + "grad_norm": 0.5680013298988342, + "learning_rate": 1.7364e-05, + "loss": 0.0288, + "step": 5794 + }, + { + "epoch": 4.563213863725876, + "grad_norm": 0.34810671210289, + "learning_rate": 1.7366999999999998e-05, + "loss": 0.0172, + "step": 5795 + }, + { + "epoch": 4.564001575423395, + "grad_norm": 0.5186834335327148, + "learning_rate": 1.7369999999999998e-05, + "loss": 0.0225, + "step": 5796 + }, + { + "epoch": 4.564789287120914, + "grad_norm": 0.5483661890029907, + "learning_rate": 1.7372999999999998e-05, + "loss": 0.0307, + "step": 5797 + }, + { + "epoch": 4.5655769988184325, + "grad_norm": 0.455782949924469, + "learning_rate": 1.7376e-05, + "loss": 0.0158, + "step": 5798 + }, + { + "epoch": 4.566364710515951, + "grad_norm": 0.5971899032592773, + "learning_rate": 1.7379e-05, + "loss": 0.0281, + "step": 5799 + }, + { + "epoch": 4.56715242221347, + "grad_norm": 0.6590025424957275, + "learning_rate": 1.7382e-05, + "loss": 0.0202, + "step": 5800 + }, + { + "epoch": 4.567940133910989, + "grad_norm": 0.4834260642528534, + "learning_rate": 1.7385e-05, + "loss": 0.0238, + "step": 5801 + }, + { + "epoch": 4.568727845608508, + "grad_norm": 0.44774329662323, + "learning_rate": 1.7388e-05, + "loss": 0.0215, + "step": 5802 + }, + { + "epoch": 4.569515557306026, + "grad_norm": 0.5949686765670776, + "learning_rate": 1.7391e-05, + "loss": 0.0309, + "step": 5803 + }, + { + "epoch": 4.570303269003545, + "grad_norm": 0.6134808659553528, + "learning_rate": 1.7394e-05, + "loss": 0.0321, + "step": 5804 + }, + { + "epoch": 4.571090980701063, + "grad_norm": 0.5534802675247192, + "learning_rate": 1.7397e-05, + "loss": 0.0225, + "step": 5805 + }, + { + "epoch": 4.571878692398582, + "grad_norm": 0.4274972379207611, + "learning_rate": 1.74e-05, + "loss": 0.0285, + "step": 5806 + }, + { + "epoch": 4.5726664040961005, + "grad_norm": 0.3695841431617737, + "learning_rate": 1.7403e-05, + "loss": 0.0206, + "step": 5807 + }, + { + "epoch": 4.57345411579362, + "grad_norm": 0.42740052938461304, + "learning_rate": 1.7406000000000002e-05, + "loss": 0.0176, + "step": 5808 + }, + { + "epoch": 4.5742418274911385, + "grad_norm": 0.5084381103515625, + "learning_rate": 1.7409000000000002e-05, + "loss": 0.0242, + "step": 5809 + }, + { + "epoch": 4.575029539188657, + "grad_norm": 0.40454813838005066, + "learning_rate": 1.7412000000000002e-05, + "loss": 
0.0342, + "step": 5810 + }, + { + "epoch": 4.575817250886176, + "grad_norm": 0.6039962768554688, + "learning_rate": 1.7415000000000002e-05, + "loss": 0.046, + "step": 5811 + }, + { + "epoch": 4.576604962583694, + "grad_norm": 0.4289799630641937, + "learning_rate": 1.7418e-05, + "loss": 0.0204, + "step": 5812 + }, + { + "epoch": 4.577392674281213, + "grad_norm": 0.6624336838722229, + "learning_rate": 1.7421e-05, + "loss": 0.0216, + "step": 5813 + }, + { + "epoch": 4.578180385978731, + "grad_norm": 0.6545304656028748, + "learning_rate": 1.7424e-05, + "loss": 0.0355, + "step": 5814 + }, + { + "epoch": 4.578968097676251, + "grad_norm": 0.4733744263648987, + "learning_rate": 1.7427e-05, + "loss": 0.018, + "step": 5815 + }, + { + "epoch": 4.579755809373769, + "grad_norm": 2.143454074859619, + "learning_rate": 1.743e-05, + "loss": 0.0462, + "step": 5816 + }, + { + "epoch": 4.580543521071288, + "grad_norm": 0.4924386739730835, + "learning_rate": 1.7432999999999997e-05, + "loss": 0.0368, + "step": 5817 + }, + { + "epoch": 4.5813312327688065, + "grad_norm": 0.36209729313850403, + "learning_rate": 1.7436e-05, + "loss": 0.0176, + "step": 5818 + }, + { + "epoch": 4.582118944466325, + "grad_norm": 0.41130396723747253, + "learning_rate": 1.7439e-05, + "loss": 0.0337, + "step": 5819 + }, + { + "epoch": 4.5829066561638445, + "grad_norm": 0.5643187165260315, + "learning_rate": 1.7442e-05, + "loss": 0.0314, + "step": 5820 + }, + { + "epoch": 4.583694367861363, + "grad_norm": 0.4286465346813202, + "learning_rate": 1.7445e-05, + "loss": 0.0242, + "step": 5821 + }, + { + "epoch": 4.584482079558882, + "grad_norm": 0.5071697235107422, + "learning_rate": 1.7448e-05, + "loss": 0.0382, + "step": 5822 + }, + { + "epoch": 4.5852697912564, + "grad_norm": 0.6866102814674377, + "learning_rate": 1.7451e-05, + "loss": 0.0406, + "step": 5823 + }, + { + "epoch": 4.586057502953919, + "grad_norm": 0.5930393934249878, + "learning_rate": 1.7454e-05, + "loss": 0.035, + "step": 5824 + }, + { + "epoch": 4.586845214651437, + "grad_norm": 0.8617392778396606, + "learning_rate": 1.7457e-05, + "loss": 0.0557, + "step": 5825 + }, + { + "epoch": 4.587632926348956, + "grad_norm": 0.4732918441295624, + "learning_rate": 1.746e-05, + "loss": 0.0263, + "step": 5826 + }, + { + "epoch": 4.588420638046475, + "grad_norm": 0.9618333578109741, + "learning_rate": 1.7463e-05, + "loss": 0.0362, + "step": 5827 + }, + { + "epoch": 4.589208349743994, + "grad_norm": 0.7210561633110046, + "learning_rate": 1.7466000000000002e-05, + "loss": 0.0438, + "step": 5828 + }, + { + "epoch": 4.5899960614415125, + "grad_norm": 0.5722535252571106, + "learning_rate": 1.7469e-05, + "loss": 0.036, + "step": 5829 + }, + { + "epoch": 4.590783773139031, + "grad_norm": 0.6895626783370972, + "learning_rate": 1.7472e-05, + "loss": 0.0504, + "step": 5830 + }, + { + "epoch": 4.59157148483655, + "grad_norm": 1.2404530048370361, + "learning_rate": 1.7475e-05, + "loss": 0.2971, + "step": 5831 + }, + { + "epoch": 4.592359196534068, + "grad_norm": 0.9335232973098755, + "learning_rate": 1.7478e-05, + "loss": 0.2773, + "step": 5832 + }, + { + "epoch": 4.593146908231587, + "grad_norm": 1.5834721326828003, + "learning_rate": 1.7481e-05, + "loss": 0.2509, + "step": 5833 + }, + { + "epoch": 4.593934619929106, + "grad_norm": 1.0489479303359985, + "learning_rate": 1.7484e-05, + "loss": 0.2519, + "step": 5834 + }, + { + "epoch": 4.594722331626625, + "grad_norm": 0.6794953942298889, + "learning_rate": 1.7487e-05, + "loss": 0.0971, + "step": 5835 + }, + { + "epoch": 4.595510043324143, + 
"grad_norm": 0.5376094579696655, + "learning_rate": 1.749e-05, + "loss": 0.0616, + "step": 5836 + }, + { + "epoch": 4.596297755021662, + "grad_norm": 0.5470057725906372, + "learning_rate": 1.7493e-05, + "loss": 0.0628, + "step": 5837 + }, + { + "epoch": 4.5970854667191805, + "grad_norm": 0.41399380564689636, + "learning_rate": 1.7496000000000003e-05, + "loss": 0.0517, + "step": 5838 + }, + { + "epoch": 4.5978731784167, + "grad_norm": 0.3541390895843506, + "learning_rate": 1.7499000000000003e-05, + "loss": 0.0361, + "step": 5839 + }, + { + "epoch": 4.5986608901142185, + "grad_norm": 0.4776906371116638, + "learning_rate": 1.7502000000000003e-05, + "loss": 0.0394, + "step": 5840 + }, + { + "epoch": 4.599448601811737, + "grad_norm": 0.46643832325935364, + "learning_rate": 1.7505000000000003e-05, + "loss": 0.0284, + "step": 5841 + }, + { + "epoch": 4.600236313509256, + "grad_norm": 0.5089391469955444, + "learning_rate": 1.7508e-05, + "loss": 0.0387, + "step": 5842 + }, + { + "epoch": 4.601024025206774, + "grad_norm": 0.3779158592224121, + "learning_rate": 1.7511e-05, + "loss": 0.0278, + "step": 5843 + }, + { + "epoch": 4.601811736904293, + "grad_norm": 0.6690749526023865, + "learning_rate": 1.7514e-05, + "loss": 0.0334, + "step": 5844 + }, + { + "epoch": 4.602599448601811, + "grad_norm": 0.39456379413604736, + "learning_rate": 1.7517e-05, + "loss": 0.0222, + "step": 5845 + }, + { + "epoch": 4.603387160299331, + "grad_norm": 0.44750869274139404, + "learning_rate": 1.7519999999999998e-05, + "loss": 0.0293, + "step": 5846 + }, + { + "epoch": 4.604174871996849, + "grad_norm": 0.5192330479621887, + "learning_rate": 1.7522999999999998e-05, + "loss": 0.0375, + "step": 5847 + }, + { + "epoch": 4.604962583694368, + "grad_norm": 0.48773127794265747, + "learning_rate": 1.7526e-05, + "loss": 0.0392, + "step": 5848 + }, + { + "epoch": 4.6057502953918865, + "grad_norm": 0.5439900159835815, + "learning_rate": 1.7529e-05, + "loss": 0.0249, + "step": 5849 + }, + { + "epoch": 4.606538007089405, + "grad_norm": 0.6148094534873962, + "learning_rate": 1.7532e-05, + "loss": 0.0235, + "step": 5850 + }, + { + "epoch": 4.607325718786924, + "grad_norm": 0.4786026179790497, + "learning_rate": 1.7535e-05, + "loss": 0.0283, + "step": 5851 + }, + { + "epoch": 4.608113430484442, + "grad_norm": 0.5544691681861877, + "learning_rate": 1.7538e-05, + "loss": 0.0247, + "step": 5852 + }, + { + "epoch": 4.608901142181962, + "grad_norm": 0.47457966208457947, + "learning_rate": 1.7541e-05, + "loss": 0.021, + "step": 5853 + }, + { + "epoch": 4.60968885387948, + "grad_norm": 0.36286115646362305, + "learning_rate": 1.7544e-05, + "loss": 0.0181, + "step": 5854 + }, + { + "epoch": 4.610476565576999, + "grad_norm": 0.471979558467865, + "learning_rate": 1.7547e-05, + "loss": 0.0348, + "step": 5855 + }, + { + "epoch": 4.611264277274517, + "grad_norm": 0.5157586932182312, + "learning_rate": 1.755e-05, + "loss": 0.0389, + "step": 5856 + }, + { + "epoch": 4.612051988972036, + "grad_norm": 0.32847991585731506, + "learning_rate": 1.7553e-05, + "loss": 0.0217, + "step": 5857 + }, + { + "epoch": 4.612839700669555, + "grad_norm": 0.7548587322235107, + "learning_rate": 1.7556000000000003e-05, + "loss": 0.0468, + "step": 5858 + }, + { + "epoch": 4.613627412367074, + "grad_norm": 0.7923691272735596, + "learning_rate": 1.7559000000000002e-05, + "loss": 0.0258, + "step": 5859 + }, + { + "epoch": 4.6144151240645925, + "grad_norm": 0.8818202614784241, + "learning_rate": 1.7562000000000002e-05, + "loss": 0.0318, + "step": 5860 + }, + { + "epoch": 
4.615202835762111, + "grad_norm": 0.7137635350227356, + "learning_rate": 1.7565000000000002e-05, + "loss": 0.0321, + "step": 5861 + }, + { + "epoch": 4.61599054745963, + "grad_norm": 0.40278539061546326, + "learning_rate": 1.7568000000000002e-05, + "loss": 0.0241, + "step": 5862 + }, + { + "epoch": 4.616778259157148, + "grad_norm": 0.7905109524726868, + "learning_rate": 1.7571e-05, + "loss": 0.0343, + "step": 5863 + }, + { + "epoch": 4.617565970854667, + "grad_norm": 1.4037599563598633, + "learning_rate": 1.7574e-05, + "loss": 0.0405, + "step": 5864 + }, + { + "epoch": 4.618353682552186, + "grad_norm": 0.43894296884536743, + "learning_rate": 1.7577e-05, + "loss": 0.0314, + "step": 5865 + }, + { + "epoch": 4.619141394249705, + "grad_norm": 0.4983859062194824, + "learning_rate": 1.758e-05, + "loss": 0.0379, + "step": 5866 + }, + { + "epoch": 4.619929105947223, + "grad_norm": 0.7557739019393921, + "learning_rate": 1.7582999999999998e-05, + "loss": 0.0429, + "step": 5867 + }, + { + "epoch": 4.620716817644742, + "grad_norm": 0.7085744738578796, + "learning_rate": 1.7586e-05, + "loss": 0.0297, + "step": 5868 + }, + { + "epoch": 4.6215045293422605, + "grad_norm": 0.6791283488273621, + "learning_rate": 1.7589e-05, + "loss": 0.032, + "step": 5869 + }, + { + "epoch": 4.622292241039779, + "grad_norm": 0.45050251483917236, + "learning_rate": 1.7592e-05, + "loss": 0.0313, + "step": 5870 + }, + { + "epoch": 4.623079952737298, + "grad_norm": 0.35977956652641296, + "learning_rate": 1.7595e-05, + "loss": 0.0218, + "step": 5871 + }, + { + "epoch": 4.623867664434817, + "grad_norm": 0.42582985758781433, + "learning_rate": 1.7598e-05, + "loss": 0.036, + "step": 5872 + }, + { + "epoch": 4.624655376132336, + "grad_norm": 0.3764549195766449, + "learning_rate": 1.7601e-05, + "loss": 0.0211, + "step": 5873 + }, + { + "epoch": 4.625443087829854, + "grad_norm": 0.6544878482818604, + "learning_rate": 1.7604e-05, + "loss": 0.048, + "step": 5874 + }, + { + "epoch": 4.626230799527373, + "grad_norm": 0.5287384390830994, + "learning_rate": 1.7607e-05, + "loss": 0.0423, + "step": 5875 + }, + { + "epoch": 4.627018511224891, + "grad_norm": 0.6883998513221741, + "learning_rate": 1.761e-05, + "loss": 0.0275, + "step": 5876 + }, + { + "epoch": 4.627806222922411, + "grad_norm": 0.5674644708633423, + "learning_rate": 1.7613e-05, + "loss": 0.0234, + "step": 5877 + }, + { + "epoch": 4.628593934619929, + "grad_norm": 0.9651444554328918, + "learning_rate": 1.7616000000000002e-05, + "loss": 0.0425, + "step": 5878 + }, + { + "epoch": 4.629381646317448, + "grad_norm": 0.5411698818206787, + "learning_rate": 1.7619000000000002e-05, + "loss": 0.0296, + "step": 5879 + }, + { + "epoch": 4.6301693580149665, + "grad_norm": 0.6136542558670044, + "learning_rate": 1.7622000000000002e-05, + "loss": 0.0394, + "step": 5880 + }, + { + "epoch": 4.630957069712485, + "grad_norm": 1.5097368955612183, + "learning_rate": 1.7625e-05, + "loss": 0.4199, + "step": 5881 + }, + { + "epoch": 4.631744781410004, + "grad_norm": 2.370453119277954, + "learning_rate": 1.7628e-05, + "loss": 0.3241, + "step": 5882 + }, + { + "epoch": 4.632532493107522, + "grad_norm": 0.8687171339988708, + "learning_rate": 1.7631e-05, + "loss": 0.278, + "step": 5883 + }, + { + "epoch": 4.633320204805042, + "grad_norm": 0.7679188847541809, + "learning_rate": 1.7634e-05, + "loss": 0.179, + "step": 5884 + }, + { + "epoch": 4.63410791650256, + "grad_norm": 0.6463300585746765, + "learning_rate": 1.7637e-05, + "loss": 0.1468, + "step": 5885 + }, + { + "epoch": 4.634895628200079, + "grad_norm": 
0.5000786781311035, + "learning_rate": 1.764e-05, + "loss": 0.0953, + "step": 5886 + }, + { + "epoch": 4.635683339897597, + "grad_norm": 0.41258347034454346, + "learning_rate": 1.7643e-05, + "loss": 0.0378, + "step": 5887 + }, + { + "epoch": 4.636471051595116, + "grad_norm": 0.3191555440425873, + "learning_rate": 1.7646e-05, + "loss": 0.0456, + "step": 5888 + }, + { + "epoch": 4.6372587632926345, + "grad_norm": 0.43557146191596985, + "learning_rate": 1.7649000000000003e-05, + "loss": 0.0531, + "step": 5889 + }, + { + "epoch": 4.638046474990154, + "grad_norm": 0.45584574341773987, + "learning_rate": 1.7652000000000003e-05, + "loss": 0.0267, + "step": 5890 + }, + { + "epoch": 4.638834186687673, + "grad_norm": 0.4354211986064911, + "learning_rate": 1.7655e-05, + "loss": 0.0442, + "step": 5891 + }, + { + "epoch": 4.639621898385191, + "grad_norm": 0.5561385750770569, + "learning_rate": 1.7658e-05, + "loss": 0.033, + "step": 5892 + }, + { + "epoch": 4.64040961008271, + "grad_norm": 0.3518679141998291, + "learning_rate": 1.7661e-05, + "loss": 0.0347, + "step": 5893 + }, + { + "epoch": 4.641197321780228, + "grad_norm": 0.4132443964481354, + "learning_rate": 1.7664e-05, + "loss": 0.0245, + "step": 5894 + }, + { + "epoch": 4.641985033477747, + "grad_norm": 0.4827509820461273, + "learning_rate": 1.7667e-05, + "loss": 0.0212, + "step": 5895 + }, + { + "epoch": 4.642772745175266, + "grad_norm": 0.3185177147388458, + "learning_rate": 1.767e-05, + "loss": 0.0253, + "step": 5896 + }, + { + "epoch": 4.643560456872785, + "grad_norm": 0.3268641531467438, + "learning_rate": 1.7673e-05, + "loss": 0.0188, + "step": 5897 + }, + { + "epoch": 4.644348168570303, + "grad_norm": 0.3596215844154358, + "learning_rate": 1.7675999999999998e-05, + "loss": 0.0147, + "step": 5898 + }, + { + "epoch": 4.645135880267822, + "grad_norm": 0.31807830929756165, + "learning_rate": 1.7679e-05, + "loss": 0.0159, + "step": 5899 + }, + { + "epoch": 4.6459235919653405, + "grad_norm": 0.4094181954860687, + "learning_rate": 1.7682e-05, + "loss": 0.0361, + "step": 5900 + }, + { + "epoch": 4.646711303662859, + "grad_norm": 0.6390727758407593, + "learning_rate": 1.7685e-05, + "loss": 0.0317, + "step": 5901 + }, + { + "epoch": 4.647499015360378, + "grad_norm": 0.5873292684555054, + "learning_rate": 1.7688e-05, + "loss": 0.0249, + "step": 5902 + }, + { + "epoch": 4.648286727057897, + "grad_norm": 0.5238884091377258, + "learning_rate": 1.7691e-05, + "loss": 0.0228, + "step": 5903 + }, + { + "epoch": 4.649074438755416, + "grad_norm": 0.5312708616256714, + "learning_rate": 1.7694e-05, + "loss": 0.0363, + "step": 5904 + }, + { + "epoch": 4.649862150452934, + "grad_norm": 0.6867092251777649, + "learning_rate": 1.7697e-05, + "loss": 0.0236, + "step": 5905 + }, + { + "epoch": 4.650649862150453, + "grad_norm": 0.6762291193008423, + "learning_rate": 1.77e-05, + "loss": 0.0255, + "step": 5906 + }, + { + "epoch": 4.651437573847971, + "grad_norm": 0.38024601340293884, + "learning_rate": 1.7703e-05, + "loss": 0.0237, + "step": 5907 + }, + { + "epoch": 4.65222528554549, + "grad_norm": 0.7927206754684448, + "learning_rate": 1.7706e-05, + "loss": 0.0284, + "step": 5908 + }, + { + "epoch": 4.653012997243009, + "grad_norm": 0.478898286819458, + "learning_rate": 1.7709000000000003e-05, + "loss": 0.0199, + "step": 5909 + }, + { + "epoch": 4.653800708940528, + "grad_norm": 0.5977892279624939, + "learning_rate": 1.7712000000000003e-05, + "loss": 0.0327, + "step": 5910 + }, + { + "epoch": 4.654588420638047, + "grad_norm": 0.5493293404579163, + "learning_rate": 
1.7715000000000002e-05, + "loss": 0.0318, + "step": 5911 + }, + { + "epoch": 4.655376132335565, + "grad_norm": 0.41281092166900635, + "learning_rate": 1.7718000000000002e-05, + "loss": 0.0243, + "step": 5912 + }, + { + "epoch": 4.656163844033084, + "grad_norm": 0.5032139420509338, + "learning_rate": 1.7721000000000002e-05, + "loss": 0.035, + "step": 5913 + }, + { + "epoch": 4.656951555730602, + "grad_norm": 0.44693613052368164, + "learning_rate": 1.7724000000000002e-05, + "loss": 0.0306, + "step": 5914 + }, + { + "epoch": 4.657739267428122, + "grad_norm": 0.653003990650177, + "learning_rate": 1.7727e-05, + "loss": 0.0309, + "step": 5915 + }, + { + "epoch": 4.65852697912564, + "grad_norm": 0.423361599445343, + "learning_rate": 1.7729999999999998e-05, + "loss": 0.0433, + "step": 5916 + }, + { + "epoch": 4.659314690823159, + "grad_norm": 0.6584654450416565, + "learning_rate": 1.7732999999999998e-05, + "loss": 0.021, + "step": 5917 + }, + { + "epoch": 4.660102402520677, + "grad_norm": 0.5681963562965393, + "learning_rate": 1.7735999999999998e-05, + "loss": 0.034, + "step": 5918 + }, + { + "epoch": 4.660890114218196, + "grad_norm": 0.5134808421134949, + "learning_rate": 1.7739e-05, + "loss": 0.0326, + "step": 5919 + }, + { + "epoch": 4.661677825915715, + "grad_norm": 0.6056143641471863, + "learning_rate": 1.7742e-05, + "loss": 0.0288, + "step": 5920 + }, + { + "epoch": 4.662465537613233, + "grad_norm": 0.5205809473991394, + "learning_rate": 1.7745e-05, + "loss": 0.0353, + "step": 5921 + }, + { + "epoch": 4.663253249310753, + "grad_norm": 0.3192233741283417, + "learning_rate": 1.7748e-05, + "loss": 0.0249, + "step": 5922 + }, + { + "epoch": 4.664040961008271, + "grad_norm": 0.7145928740501404, + "learning_rate": 1.7751e-05, + "loss": 0.0473, + "step": 5923 + }, + { + "epoch": 4.66482867270579, + "grad_norm": 0.6215880513191223, + "learning_rate": 1.7754e-05, + "loss": 0.0296, + "step": 5924 + }, + { + "epoch": 4.665616384403308, + "grad_norm": 1.1529126167297363, + "learning_rate": 1.7757e-05, + "loss": 0.0514, + "step": 5925 + }, + { + "epoch": 4.666404096100827, + "grad_norm": 0.36237236857414246, + "learning_rate": 1.776e-05, + "loss": 0.0233, + "step": 5926 + }, + { + "epoch": 4.667191807798346, + "grad_norm": 0.5761795043945312, + "learning_rate": 1.7763e-05, + "loss": 0.0276, + "step": 5927 + }, + { + "epoch": 4.667979519495865, + "grad_norm": 0.4334849417209625, + "learning_rate": 1.7766e-05, + "loss": 0.0272, + "step": 5928 + }, + { + "epoch": 4.668767231193383, + "grad_norm": 0.42489635944366455, + "learning_rate": 1.7769000000000002e-05, + "loss": 0.033, + "step": 5929 + }, + { + "epoch": 4.669554942890902, + "grad_norm": 0.784480094909668, + "learning_rate": 1.7772000000000002e-05, + "loss": 0.0406, + "step": 5930 + }, + { + "epoch": 4.670342654588421, + "grad_norm": 1.0156117677688599, + "learning_rate": 1.7775000000000002e-05, + "loss": 0.3293, + "step": 5931 + }, + { + "epoch": 4.671130366285939, + "grad_norm": 0.7209593653678894, + "learning_rate": 1.7778e-05, + "loss": 0.1858, + "step": 5932 + }, + { + "epoch": 4.671918077983458, + "grad_norm": 0.9637007117271423, + "learning_rate": 1.7781e-05, + "loss": 0.1944, + "step": 5933 + }, + { + "epoch": 4.672705789680977, + "grad_norm": 1.2498488426208496, + "learning_rate": 1.7784e-05, + "loss": 0.1713, + "step": 5934 + }, + { + "epoch": 4.673493501378496, + "grad_norm": 0.6804729104042053, + "learning_rate": 1.7787e-05, + "loss": 0.0906, + "step": 5935 + }, + { + "epoch": 4.674281213076014, + "grad_norm": 0.44948092103004456, + 
"learning_rate": 1.779e-05, + "loss": 0.0639, + "step": 5936 + }, + { + "epoch": 4.675068924773533, + "grad_norm": 0.7956376671791077, + "learning_rate": 1.7793e-05, + "loss": 0.0769, + "step": 5937 + }, + { + "epoch": 4.675856636471051, + "grad_norm": 0.35452720522880554, + "learning_rate": 1.7796e-05, + "loss": 0.0298, + "step": 5938 + }, + { + "epoch": 4.67664434816857, + "grad_norm": 0.4990502893924713, + "learning_rate": 1.7799000000000004e-05, + "loss": 0.039, + "step": 5939 + }, + { + "epoch": 4.677432059866089, + "grad_norm": 0.5838979482650757, + "learning_rate": 1.7802e-05, + "loss": 0.028, + "step": 5940 + }, + { + "epoch": 4.678219771563608, + "grad_norm": 0.368379682302475, + "learning_rate": 1.7805e-05, + "loss": 0.026, + "step": 5941 + }, + { + "epoch": 4.679007483261127, + "grad_norm": 0.36758551001548767, + "learning_rate": 1.7808e-05, + "loss": 0.0302, + "step": 5942 + }, + { + "epoch": 4.679795194958645, + "grad_norm": 0.5574834942817688, + "learning_rate": 1.7811e-05, + "loss": 0.0363, + "step": 5943 + }, + { + "epoch": 4.680582906656164, + "grad_norm": 0.33717432618141174, + "learning_rate": 1.7814e-05, + "loss": 0.0219, + "step": 5944 + }, + { + "epoch": 4.681370618353682, + "grad_norm": 0.3814789950847626, + "learning_rate": 1.7817e-05, + "loss": 0.0352, + "step": 5945 + }, + { + "epoch": 4.682158330051202, + "grad_norm": 0.40640348196029663, + "learning_rate": 1.782e-05, + "loss": 0.0308, + "step": 5946 + }, + { + "epoch": 4.68294604174872, + "grad_norm": 0.5465270280838013, + "learning_rate": 1.7823e-05, + "loss": 0.0296, + "step": 5947 + }, + { + "epoch": 4.683733753446239, + "grad_norm": 0.5554477572441101, + "learning_rate": 1.7826e-05, + "loss": 0.0249, + "step": 5948 + }, + { + "epoch": 4.6845214651437574, + "grad_norm": 0.525469183921814, + "learning_rate": 1.7829e-05, + "loss": 0.0198, + "step": 5949 + }, + { + "epoch": 4.685309176841276, + "grad_norm": 0.4220438003540039, + "learning_rate": 1.7832e-05, + "loss": 0.0165, + "step": 5950 + }, + { + "epoch": 4.686096888538795, + "grad_norm": 0.7694118022918701, + "learning_rate": 1.7835e-05, + "loss": 0.0413, + "step": 5951 + }, + { + "epoch": 4.686884600236313, + "grad_norm": 1.1195706129074097, + "learning_rate": 1.7838e-05, + "loss": 0.0349, + "step": 5952 + }, + { + "epoch": 4.687672311933833, + "grad_norm": 0.5777347683906555, + "learning_rate": 1.7841e-05, + "loss": 0.038, + "step": 5953 + }, + { + "epoch": 4.688460023631351, + "grad_norm": 0.5318058729171753, + "learning_rate": 1.7844e-05, + "loss": 0.0309, + "step": 5954 + }, + { + "epoch": 4.68924773532887, + "grad_norm": 0.5445395112037659, + "learning_rate": 1.7847e-05, + "loss": 0.0381, + "step": 5955 + }, + { + "epoch": 4.690035447026388, + "grad_norm": 0.5996721386909485, + "learning_rate": 1.785e-05, + "loss": 0.0354, + "step": 5956 + }, + { + "epoch": 4.690823158723907, + "grad_norm": 0.43250906467437744, + "learning_rate": 1.7853e-05, + "loss": 0.0242, + "step": 5957 + }, + { + "epoch": 4.691610870421425, + "grad_norm": 0.3841816186904907, + "learning_rate": 1.7856e-05, + "loss": 0.0318, + "step": 5958 + }, + { + "epoch": 4.692398582118944, + "grad_norm": 0.6290112137794495, + "learning_rate": 1.7859000000000003e-05, + "loss": 0.0238, + "step": 5959 + }, + { + "epoch": 4.6931862938164635, + "grad_norm": 0.39145103096961975, + "learning_rate": 1.7862000000000003e-05, + "loss": 0.0248, + "step": 5960 + }, + { + "epoch": 4.693974005513982, + "grad_norm": 2.1988284587860107, + "learning_rate": 1.7865000000000003e-05, + "loss": 0.0324, + "step": 
5961 + }, + { + "epoch": 4.694761717211501, + "grad_norm": 0.5803236961364746, + "learning_rate": 1.7868000000000002e-05, + "loss": 0.0333, + "step": 5962 + }, + { + "epoch": 4.695549428909019, + "grad_norm": 0.6028640270233154, + "learning_rate": 1.7871000000000002e-05, + "loss": 0.0391, + "step": 5963 + }, + { + "epoch": 4.696337140606538, + "grad_norm": 0.6618253588676453, + "learning_rate": 1.7874000000000002e-05, + "loss": 0.0273, + "step": 5964 + }, + { + "epoch": 4.697124852304057, + "grad_norm": 0.39344537258148193, + "learning_rate": 1.7877e-05, + "loss": 0.0248, + "step": 5965 + }, + { + "epoch": 4.697912564001576, + "grad_norm": 0.6181957125663757, + "learning_rate": 1.7879999999999998e-05, + "loss": 0.0472, + "step": 5966 + }, + { + "epoch": 4.698700275699094, + "grad_norm": 0.523180365562439, + "learning_rate": 1.7882999999999998e-05, + "loss": 0.042, + "step": 5967 + }, + { + "epoch": 4.699487987396613, + "grad_norm": 0.4298816919326782, + "learning_rate": 1.7885999999999998e-05, + "loss": 0.0152, + "step": 5968 + }, + { + "epoch": 4.7002756990941315, + "grad_norm": 0.7427806258201599, + "learning_rate": 1.7889e-05, + "loss": 0.0526, + "step": 5969 + }, + { + "epoch": 4.70106341079165, + "grad_norm": 0.36145564913749695, + "learning_rate": 1.7892e-05, + "loss": 0.0351, + "step": 5970 + }, + { + "epoch": 4.701851122489169, + "grad_norm": 0.6114131212234497, + "learning_rate": 1.7895e-05, + "loss": 0.05, + "step": 5971 + }, + { + "epoch": 4.702638834186688, + "grad_norm": 0.6321021914482117, + "learning_rate": 1.7898e-05, + "loss": 0.0482, + "step": 5972 + }, + { + "epoch": 4.703426545884207, + "grad_norm": 1.024448037147522, + "learning_rate": 1.7901e-05, + "loss": 0.0433, + "step": 5973 + }, + { + "epoch": 4.704214257581725, + "grad_norm": 0.5815706849098206, + "learning_rate": 1.7904e-05, + "loss": 0.0324, + "step": 5974 + }, + { + "epoch": 4.705001969279244, + "grad_norm": 0.4383123517036438, + "learning_rate": 1.7907e-05, + "loss": 0.0292, + "step": 5975 + }, + { + "epoch": 4.705789680976762, + "grad_norm": 0.6480106115341187, + "learning_rate": 1.791e-05, + "loss": 0.0424, + "step": 5976 + }, + { + "epoch": 4.706577392674281, + "grad_norm": 1.0081455707550049, + "learning_rate": 1.7913e-05, + "loss": 0.0584, + "step": 5977 + }, + { + "epoch": 4.7073651043717994, + "grad_norm": 0.4051041901111603, + "learning_rate": 1.7916e-05, + "loss": 0.0233, + "step": 5978 + }, + { + "epoch": 4.708152816069319, + "grad_norm": 0.6057848334312439, + "learning_rate": 1.7919000000000002e-05, + "loss": 0.0403, + "step": 5979 + }, + { + "epoch": 4.7089405277668375, + "grad_norm": 0.8923109173774719, + "learning_rate": 1.7922000000000002e-05, + "loss": 0.0861, + "step": 5980 + }, + { + "epoch": 4.709728239464356, + "grad_norm": 1.1305121183395386, + "learning_rate": 1.7925000000000002e-05, + "loss": 0.3783, + "step": 5981 + }, + { + "epoch": 4.710515951161875, + "grad_norm": 0.963130533695221, + "learning_rate": 1.7928000000000002e-05, + "loss": 0.298, + "step": 5982 + }, + { + "epoch": 4.711303662859393, + "grad_norm": 0.6791464686393738, + "learning_rate": 1.7931e-05, + "loss": 0.1885, + "step": 5983 + }, + { + "epoch": 4.712091374556913, + "grad_norm": 0.8802827596664429, + "learning_rate": 1.7934e-05, + "loss": 0.2228, + "step": 5984 + }, + { + "epoch": 4.712879086254431, + "grad_norm": 0.6914811730384827, + "learning_rate": 1.7937e-05, + "loss": 0.1651, + "step": 5985 + }, + { + "epoch": 4.71366679795195, + "grad_norm": 0.7218813896179199, + "learning_rate": 1.794e-05, + "loss": 0.112, + 
"step": 5986 + }, + { + "epoch": 4.714454509649468, + "grad_norm": 0.31994500756263733, + "learning_rate": 1.7943e-05, + "loss": 0.0411, + "step": 5987 + }, + { + "epoch": 4.715242221346987, + "grad_norm": 0.5612471103668213, + "learning_rate": 1.7946e-05, + "loss": 0.0583, + "step": 5988 + }, + { + "epoch": 4.7160299330445055, + "grad_norm": 0.29539725184440613, + "learning_rate": 1.7949e-05, + "loss": 0.0189, + "step": 5989 + }, + { + "epoch": 4.716817644742024, + "grad_norm": 0.6857266426086426, + "learning_rate": 1.7952e-05, + "loss": 0.0507, + "step": 5990 + }, + { + "epoch": 4.7176053564395435, + "grad_norm": 0.5682223439216614, + "learning_rate": 1.7955e-05, + "loss": 0.0395, + "step": 5991 + }, + { + "epoch": 4.718393068137062, + "grad_norm": 0.6901745796203613, + "learning_rate": 1.7958e-05, + "loss": 0.081, + "step": 5992 + }, + { + "epoch": 4.719180779834581, + "grad_norm": 0.596569836139679, + "learning_rate": 1.7961e-05, + "loss": 0.0364, + "step": 5993 + }, + { + "epoch": 4.719968491532099, + "grad_norm": 0.40006953477859497, + "learning_rate": 1.7964e-05, + "loss": 0.0225, + "step": 5994 + }, + { + "epoch": 4.720756203229618, + "grad_norm": 0.3880821168422699, + "learning_rate": 1.7967e-05, + "loss": 0.0148, + "step": 5995 + }, + { + "epoch": 4.721543914927136, + "grad_norm": 0.4068918824195862, + "learning_rate": 1.797e-05, + "loss": 0.0403, + "step": 5996 + }, + { + "epoch": 4.722331626624655, + "grad_norm": 0.49963757395744324, + "learning_rate": 1.7973e-05, + "loss": 0.0232, + "step": 5997 + }, + { + "epoch": 4.723119338322174, + "grad_norm": 0.43195146322250366, + "learning_rate": 1.7976e-05, + "loss": 0.0256, + "step": 5998 + }, + { + "epoch": 4.723907050019693, + "grad_norm": 0.5277069211006165, + "learning_rate": 1.7979000000000002e-05, + "loss": 0.0244, + "step": 5999 + }, + { + "epoch": 4.7246947617172115, + "grad_norm": 0.9251888394355774, + "learning_rate": 1.7982e-05, + "loss": 0.0277, + "step": 6000 + }, + { + "epoch": 4.7246947617172115, + "eval_cer": 0.13865774943619255, + "eval_loss": 0.38799530267715454, + "eval_runtime": 16.8975, + "eval_samples_per_second": 17.991, + "eval_steps_per_second": 0.592, + "eval_wer": 0.464888718342287, + "step": 6000 + }, + { + "epoch": 4.72548247341473, + "grad_norm": 0.4245053827762604, + "learning_rate": 1.7985e-05, + "loss": 0.0292, + "step": 6001 + }, + { + "epoch": 4.726270185112249, + "grad_norm": 0.266401469707489, + "learning_rate": 1.7988e-05, + "loss": 0.01, + "step": 6002 + }, + { + "epoch": 4.727057896809768, + "grad_norm": 0.7757538557052612, + "learning_rate": 1.7991e-05, + "loss": 0.0313, + "step": 6003 + }, + { + "epoch": 4.727845608507287, + "grad_norm": 0.7079904079437256, + "learning_rate": 1.7994e-05, + "loss": 0.036, + "step": 6004 + }, + { + "epoch": 4.728633320204805, + "grad_norm": 0.3075886070728302, + "learning_rate": 1.7997e-05, + "loss": 0.0143, + "step": 6005 + }, + { + "epoch": 4.729421031902324, + "grad_norm": 0.5712287425994873, + "learning_rate": 1.8e-05, + "loss": 0.0329, + "step": 6006 + }, + { + "epoch": 4.730208743599842, + "grad_norm": 0.6885459423065186, + "learning_rate": 1.8003e-05, + "loss": 0.0873, + "step": 6007 + }, + { + "epoch": 4.730996455297361, + "grad_norm": 0.3618836998939514, + "learning_rate": 1.8006e-05, + "loss": 0.0138, + "step": 6008 + }, + { + "epoch": 4.7317841669948795, + "grad_norm": 0.44539716839790344, + "learning_rate": 1.8009e-05, + "loss": 0.0253, + "step": 6009 + }, + { + "epoch": 4.732571878692399, + "grad_norm": 0.5509175062179565, + "learning_rate": 
1.8012000000000003e-05, + "loss": 0.0273, + "step": 6010 + }, + { + "epoch": 4.7333595903899175, + "grad_norm": 0.4528714716434479, + "learning_rate": 1.8015000000000003e-05, + "loss": 0.0272, + "step": 6011 + }, + { + "epoch": 4.734147302087436, + "grad_norm": 0.35416820645332336, + "learning_rate": 1.8018000000000003e-05, + "loss": 0.019, + "step": 6012 + }, + { + "epoch": 4.734935013784955, + "grad_norm": 0.565456211566925, + "learning_rate": 1.8021000000000002e-05, + "loss": 0.0351, + "step": 6013 + }, + { + "epoch": 4.735722725482473, + "grad_norm": 0.47870081663131714, + "learning_rate": 1.8024e-05, + "loss": 0.0338, + "step": 6014 + }, + { + "epoch": 4.736510437179992, + "grad_norm": 0.32700449228286743, + "learning_rate": 1.8027e-05, + "loss": 0.0159, + "step": 6015 + }, + { + "epoch": 4.737298148877511, + "grad_norm": 0.4820171892642975, + "learning_rate": 1.803e-05, + "loss": 0.0284, + "step": 6016 + }, + { + "epoch": 4.73808586057503, + "grad_norm": 0.40730640292167664, + "learning_rate": 1.8032999999999998e-05, + "loss": 0.0117, + "step": 6017 + }, + { + "epoch": 4.738873572272548, + "grad_norm": 0.3898356258869171, + "learning_rate": 1.8035999999999998e-05, + "loss": 0.0242, + "step": 6018 + }, + { + "epoch": 4.739661283970067, + "grad_norm": 0.7347087264060974, + "learning_rate": 1.8038999999999998e-05, + "loss": 0.0443, + "step": 6019 + }, + { + "epoch": 4.7404489956675855, + "grad_norm": 1.4691461324691772, + "learning_rate": 1.8042e-05, + "loss": 0.0401, + "step": 6020 + }, + { + "epoch": 4.741236707365104, + "grad_norm": 0.5932475924491882, + "learning_rate": 1.8045e-05, + "loss": 0.0284, + "step": 6021 + }, + { + "epoch": 4.7420244190626235, + "grad_norm": 0.5255677103996277, + "learning_rate": 1.8048e-05, + "loss": 0.0589, + "step": 6022 + }, + { + "epoch": 4.742812130760142, + "grad_norm": 0.49045923352241516, + "learning_rate": 1.8051e-05, + "loss": 0.0299, + "step": 6023 + }, + { + "epoch": 4.743599842457661, + "grad_norm": 0.7130405902862549, + "learning_rate": 1.8054e-05, + "loss": 0.0446, + "step": 6024 + }, + { + "epoch": 4.744387554155179, + "grad_norm": 0.5337305665016174, + "learning_rate": 1.8057e-05, + "loss": 0.0272, + "step": 6025 + }, + { + "epoch": 4.745175265852698, + "grad_norm": 0.9863874912261963, + "learning_rate": 1.806e-05, + "loss": 0.0287, + "step": 6026 + }, + { + "epoch": 4.745962977550216, + "grad_norm": 0.6182156801223755, + "learning_rate": 1.8063e-05, + "loss": 0.036, + "step": 6027 + }, + { + "epoch": 4.746750689247735, + "grad_norm": 0.9637994766235352, + "learning_rate": 1.8066e-05, + "loss": 0.0272, + "step": 6028 + }, + { + "epoch": 4.747538400945254, + "grad_norm": 0.5377259850502014, + "learning_rate": 1.8069e-05, + "loss": 0.0301, + "step": 6029 + }, + { + "epoch": 4.748326112642773, + "grad_norm": 0.9820071458816528, + "learning_rate": 1.8072000000000002e-05, + "loss": 0.0525, + "step": 6030 + }, + { + "epoch": 4.7491138243402915, + "grad_norm": 1.2489289045333862, + "learning_rate": 1.8075000000000002e-05, + "loss": 0.3312, + "step": 6031 + }, + { + "epoch": 4.74990153603781, + "grad_norm": 0.882050633430481, + "learning_rate": 1.8078000000000002e-05, + "loss": 0.1951, + "step": 6032 + }, + { + "epoch": 4.750689247735329, + "grad_norm": 0.9062280654907227, + "learning_rate": 1.8081000000000002e-05, + "loss": 0.2146, + "step": 6033 + }, + { + "epoch": 4.751476959432847, + "grad_norm": 0.9102357625961304, + "learning_rate": 1.8084e-05, + "loss": 0.1801, + "step": 6034 + }, + { + "epoch": 4.752264671130367, + "grad_norm": 
0.8359081149101257, + "learning_rate": 1.8087e-05, + "loss": 0.1482, + "step": 6035 + }, + { + "epoch": 4.753052382827885, + "grad_norm": 0.9927305579185486, + "learning_rate": 1.809e-05, + "loss": 0.105, + "step": 6036 + }, + { + "epoch": 4.753840094525404, + "grad_norm": 0.9389183521270752, + "learning_rate": 1.8093e-05, + "loss": 0.0436, + "step": 6037 + }, + { + "epoch": 4.754627806222922, + "grad_norm": 0.6275421977043152, + "learning_rate": 1.8096e-05, + "loss": 0.045, + "step": 6038 + }, + { + "epoch": 4.755415517920441, + "grad_norm": 0.5909695029258728, + "learning_rate": 1.8098999999999997e-05, + "loss": 0.0528, + "step": 6039 + }, + { + "epoch": 4.7562032296179595, + "grad_norm": 0.4275064468383789, + "learning_rate": 1.8102e-05, + "loss": 0.0271, + "step": 6040 + }, + { + "epoch": 4.756990941315479, + "grad_norm": 0.4050431251525879, + "learning_rate": 1.8105e-05, + "loss": 0.0337, + "step": 6041 + }, + { + "epoch": 4.7577786530129975, + "grad_norm": 0.3417608141899109, + "learning_rate": 1.8108e-05, + "loss": 0.0203, + "step": 6042 + }, + { + "epoch": 4.758566364710516, + "grad_norm": 0.4719613492488861, + "learning_rate": 1.8111e-05, + "loss": 0.0287, + "step": 6043 + }, + { + "epoch": 4.759354076408035, + "grad_norm": 0.3475150465965271, + "learning_rate": 1.8114e-05, + "loss": 0.0339, + "step": 6044 + }, + { + "epoch": 4.760141788105553, + "grad_norm": 0.471908301115036, + "learning_rate": 1.8117e-05, + "loss": 0.0404, + "step": 6045 + }, + { + "epoch": 4.760929499803072, + "grad_norm": 0.591242253780365, + "learning_rate": 1.812e-05, + "loss": 0.0297, + "step": 6046 + }, + { + "epoch": 4.76171721150059, + "grad_norm": 0.3433912992477417, + "learning_rate": 1.8123e-05, + "loss": 0.029, + "step": 6047 + }, + { + "epoch": 4.76250492319811, + "grad_norm": 0.41590622067451477, + "learning_rate": 1.8126e-05, + "loss": 0.0321, + "step": 6048 + }, + { + "epoch": 4.763292634895628, + "grad_norm": 0.7470031380653381, + "learning_rate": 1.8129e-05, + "loss": 0.0375, + "step": 6049 + }, + { + "epoch": 4.764080346593147, + "grad_norm": 0.3892024755477905, + "learning_rate": 1.8132000000000002e-05, + "loss": 0.0199, + "step": 6050 + }, + { + "epoch": 4.7648680582906655, + "grad_norm": 0.4970307946205139, + "learning_rate": 1.8135000000000002e-05, + "loss": 0.0291, + "step": 6051 + }, + { + "epoch": 4.765655769988184, + "grad_norm": 0.4069007337093353, + "learning_rate": 1.8138e-05, + "loss": 0.0175, + "step": 6052 + }, + { + "epoch": 4.7664434816857035, + "grad_norm": 0.3314266502857208, + "learning_rate": 1.8141e-05, + "loss": 0.0289, + "step": 6053 + }, + { + "epoch": 4.767231193383222, + "grad_norm": 0.5604872107505798, + "learning_rate": 1.8144e-05, + "loss": 0.0273, + "step": 6054 + }, + { + "epoch": 4.768018905080741, + "grad_norm": 0.35674577951431274, + "learning_rate": 1.8147e-05, + "loss": 0.0286, + "step": 6055 + }, + { + "epoch": 4.768806616778259, + "grad_norm": 0.4467989504337311, + "learning_rate": 1.815e-05, + "loss": 0.0271, + "step": 6056 + }, + { + "epoch": 4.769594328475778, + "grad_norm": 0.5202931761741638, + "learning_rate": 1.8153e-05, + "loss": 0.0299, + "step": 6057 + }, + { + "epoch": 4.770382040173296, + "grad_norm": 0.46022775769233704, + "learning_rate": 1.8156e-05, + "loss": 0.0173, + "step": 6058 + }, + { + "epoch": 4.771169751870815, + "grad_norm": 1.0554533004760742, + "learning_rate": 1.8159e-05, + "loss": 0.0404, + "step": 6059 + }, + { + "epoch": 4.771957463568334, + "grad_norm": 0.5298424363136292, + "learning_rate": 1.8162000000000003e-05, + 
"loss": 0.0269, + "step": 6060 + }, + { + "epoch": 4.772745175265853, + "grad_norm": 0.8471173048019409, + "learning_rate": 1.8165000000000003e-05, + "loss": 0.0426, + "step": 6061 + }, + { + "epoch": 4.7735328869633715, + "grad_norm": 0.5286890268325806, + "learning_rate": 1.8168000000000003e-05, + "loss": 0.0266, + "step": 6062 + }, + { + "epoch": 4.77432059866089, + "grad_norm": 0.4910508692264557, + "learning_rate": 1.8171e-05, + "loss": 0.0356, + "step": 6063 + }, + { + "epoch": 4.775108310358409, + "grad_norm": 0.5298975110054016, + "learning_rate": 1.8174e-05, + "loss": 0.0212, + "step": 6064 + }, + { + "epoch": 4.775896022055927, + "grad_norm": 0.6665295958518982, + "learning_rate": 1.8177e-05, + "loss": 0.0365, + "step": 6065 + }, + { + "epoch": 4.776683733753446, + "grad_norm": 0.47534018754959106, + "learning_rate": 1.818e-05, + "loss": 0.0238, + "step": 6066 + }, + { + "epoch": 4.777471445450965, + "grad_norm": 0.5164831280708313, + "learning_rate": 1.8183e-05, + "loss": 0.033, + "step": 6067 + }, + { + "epoch": 4.778259157148484, + "grad_norm": 0.7872150540351868, + "learning_rate": 1.8186e-05, + "loss": 0.0331, + "step": 6068 + }, + { + "epoch": 4.779046868846002, + "grad_norm": 0.45642518997192383, + "learning_rate": 1.8188999999999998e-05, + "loss": 0.0222, + "step": 6069 + }, + { + "epoch": 4.779834580543521, + "grad_norm": 0.5582420229911804, + "learning_rate": 1.8192e-05, + "loss": 0.0306, + "step": 6070 + }, + { + "epoch": 4.7806222922410395, + "grad_norm": 0.4928562045097351, + "learning_rate": 1.8195e-05, + "loss": 0.0341, + "step": 6071 + }, + { + "epoch": 4.781410003938559, + "grad_norm": 0.3557049632072449, + "learning_rate": 1.8198e-05, + "loss": 0.0223, + "step": 6072 + }, + { + "epoch": 4.7821977156360775, + "grad_norm": 0.6053141951560974, + "learning_rate": 1.8201e-05, + "loss": 0.0584, + "step": 6073 + }, + { + "epoch": 4.782985427333596, + "grad_norm": 0.5304587483406067, + "learning_rate": 1.8204e-05, + "loss": 0.0283, + "step": 6074 + }, + { + "epoch": 4.783773139031115, + "grad_norm": 0.6282368898391724, + "learning_rate": 1.8207e-05, + "loss": 0.0475, + "step": 6075 + }, + { + "epoch": 4.784560850728633, + "grad_norm": 0.5032363533973694, + "learning_rate": 1.821e-05, + "loss": 0.0198, + "step": 6076 + }, + { + "epoch": 4.785348562426152, + "grad_norm": 0.6020894050598145, + "learning_rate": 1.8213e-05, + "loss": 0.0389, + "step": 6077 + }, + { + "epoch": 4.78613627412367, + "grad_norm": 0.4480798840522766, + "learning_rate": 1.8216e-05, + "loss": 0.0254, + "step": 6078 + }, + { + "epoch": 4.78692398582119, + "grad_norm": 0.8505048751831055, + "learning_rate": 1.8219e-05, + "loss": 0.0432, + "step": 6079 + }, + { + "epoch": 4.787711697518708, + "grad_norm": 0.7451234459877014, + "learning_rate": 1.8222000000000003e-05, + "loss": 0.0267, + "step": 6080 + }, + { + "epoch": 4.788499409216227, + "grad_norm": 1.5290881395339966, + "learning_rate": 1.8225000000000003e-05, + "loss": 0.351, + "step": 6081 + }, + { + "epoch": 4.7892871209137455, + "grad_norm": 1.1335688829421997, + "learning_rate": 1.8228000000000002e-05, + "loss": 0.2598, + "step": 6082 + }, + { + "epoch": 4.790074832611264, + "grad_norm": 0.897023618221283, + "learning_rate": 1.8231000000000002e-05, + "loss": 0.2206, + "step": 6083 + }, + { + "epoch": 4.790862544308783, + "grad_norm": 0.6368684768676758, + "learning_rate": 1.8234000000000002e-05, + "loss": 0.1352, + "step": 6084 + }, + { + "epoch": 4.791650256006301, + "grad_norm": 1.0186690092086792, + "learning_rate": 1.8237000000000002e-05, 
+ "loss": 0.2066, + "step": 6085 + }, + { + "epoch": 4.792437967703821, + "grad_norm": 0.9074774980545044, + "learning_rate": 1.824e-05, + "loss": 0.1154, + "step": 6086 + }, + { + "epoch": 4.793225679401339, + "grad_norm": 0.5295150279998779, + "learning_rate": 1.8243e-05, + "loss": 0.0792, + "step": 6087 + }, + { + "epoch": 4.794013391098858, + "grad_norm": 0.44234928488731384, + "learning_rate": 1.8245999999999998e-05, + "loss": 0.0335, + "step": 6088 + }, + { + "epoch": 4.794801102796376, + "grad_norm": 0.38598108291625977, + "learning_rate": 1.8248999999999998e-05, + "loss": 0.0224, + "step": 6089 + }, + { + "epoch": 4.795588814493895, + "grad_norm": 0.3824221193790436, + "learning_rate": 1.8252e-05, + "loss": 0.0257, + "step": 6090 + }, + { + "epoch": 4.796376526191414, + "grad_norm": 0.7109547853469849, + "learning_rate": 1.8255e-05, + "loss": 0.0414, + "step": 6091 + }, + { + "epoch": 4.797164237888933, + "grad_norm": 0.4062718152999878, + "learning_rate": 1.8258e-05, + "loss": 0.0252, + "step": 6092 + }, + { + "epoch": 4.7979519495864515, + "grad_norm": 0.25539180636405945, + "learning_rate": 1.8261e-05, + "loss": 0.02, + "step": 6093 + }, + { + "epoch": 4.79873966128397, + "grad_norm": 0.6615286469459534, + "learning_rate": 1.8264e-05, + "loss": 0.0358, + "step": 6094 + }, + { + "epoch": 4.799527372981489, + "grad_norm": 0.5172821283340454, + "learning_rate": 1.8267e-05, + "loss": 0.0284, + "step": 6095 + }, + { + "epoch": 4.800315084679007, + "grad_norm": 0.48832154273986816, + "learning_rate": 1.827e-05, + "loss": 0.0323, + "step": 6096 + }, + { + "epoch": 4.801102796376526, + "grad_norm": 0.4244465231895447, + "learning_rate": 1.8273e-05, + "loss": 0.0279, + "step": 6097 + }, + { + "epoch": 4.801890508074045, + "grad_norm": 0.45422041416168213, + "learning_rate": 1.8276e-05, + "loss": 0.0333, + "step": 6098 + }, + { + "epoch": 4.802678219771564, + "grad_norm": 0.5615123510360718, + "learning_rate": 1.8279e-05, + "loss": 0.0277, + "step": 6099 + }, + { + "epoch": 4.803465931469082, + "grad_norm": 0.5930907726287842, + "learning_rate": 1.8282000000000002e-05, + "loss": 0.05, + "step": 6100 + }, + { + "epoch": 4.804253643166601, + "grad_norm": 0.43764743208885193, + "learning_rate": 1.8285000000000002e-05, + "loss": 0.0222, + "step": 6101 + }, + { + "epoch": 4.8050413548641195, + "grad_norm": 0.48361948132514954, + "learning_rate": 1.8288000000000002e-05, + "loss": 0.0213, + "step": 6102 + }, + { + "epoch": 4.805829066561638, + "grad_norm": 0.3658788502216339, + "learning_rate": 1.8291e-05, + "loss": 0.017, + "step": 6103 + }, + { + "epoch": 4.806616778259157, + "grad_norm": 0.5514914989471436, + "learning_rate": 1.8294e-05, + "loss": 0.0261, + "step": 6104 + }, + { + "epoch": 4.807404489956676, + "grad_norm": 0.39278727769851685, + "learning_rate": 1.8297e-05, + "loss": 0.0224, + "step": 6105 + }, + { + "epoch": 4.808192201654195, + "grad_norm": 0.6817874312400818, + "learning_rate": 1.83e-05, + "loss": 0.0247, + "step": 6106 + }, + { + "epoch": 4.808979913351713, + "grad_norm": 0.34904754161834717, + "learning_rate": 1.8303e-05, + "loss": 0.0293, + "step": 6107 + }, + { + "epoch": 4.809767625049232, + "grad_norm": 0.6229877471923828, + "learning_rate": 1.8306e-05, + "loss": 0.0357, + "step": 6108 + }, + { + "epoch": 4.81055533674675, + "grad_norm": 0.825973391532898, + "learning_rate": 1.8309e-05, + "loss": 0.0333, + "step": 6109 + }, + { + "epoch": 4.81134304844427, + "grad_norm": 0.39821627736091614, + "learning_rate": 1.8312000000000004e-05, + "loss": 0.023, + "step": 6110 + 
}, + { + "epoch": 4.812130760141788, + "grad_norm": 0.5212709903717041, + "learning_rate": 1.8315000000000003e-05, + "loss": 0.0299, + "step": 6111 + }, + { + "epoch": 4.812918471839307, + "grad_norm": 0.47518160939216614, + "learning_rate": 1.8318e-05, + "loss": 0.0313, + "step": 6112 + }, + { + "epoch": 4.8137061835368256, + "grad_norm": 0.30549976229667664, + "learning_rate": 1.8321e-05, + "loss": 0.0168, + "step": 6113 + }, + { + "epoch": 4.814493895234344, + "grad_norm": 0.5577102303504944, + "learning_rate": 1.8324e-05, + "loss": 0.0203, + "step": 6114 + }, + { + "epoch": 4.815281606931863, + "grad_norm": 0.5254960656166077, + "learning_rate": 1.8327e-05, + "loss": 0.0426, + "step": 6115 + }, + { + "epoch": 4.816069318629381, + "grad_norm": 0.6110830903053284, + "learning_rate": 1.833e-05, + "loss": 0.03, + "step": 6116 + }, + { + "epoch": 4.816857030326901, + "grad_norm": 0.5631809830665588, + "learning_rate": 1.8333e-05, + "loss": 0.0364, + "step": 6117 + }, + { + "epoch": 4.817644742024419, + "grad_norm": 0.5222346782684326, + "learning_rate": 1.8336e-05, + "loss": 0.0332, + "step": 6118 + }, + { + "epoch": 4.818432453721938, + "grad_norm": 0.40415894985198975, + "learning_rate": 1.8339e-05, + "loss": 0.0251, + "step": 6119 + }, + { + "epoch": 4.819220165419456, + "grad_norm": 0.6548478007316589, + "learning_rate": 1.8342e-05, + "loss": 0.0317, + "step": 6120 + }, + { + "epoch": 4.820007877116975, + "grad_norm": 0.7469210028648376, + "learning_rate": 1.8345e-05, + "loss": 0.0306, + "step": 6121 + }, + { + "epoch": 4.8207955888144935, + "grad_norm": 0.6108599901199341, + "learning_rate": 1.8348e-05, + "loss": 0.0353, + "step": 6122 + }, + { + "epoch": 4.821583300512012, + "grad_norm": 0.4532783627510071, + "learning_rate": 1.8351e-05, + "loss": 0.0305, + "step": 6123 + }, + { + "epoch": 4.822371012209532, + "grad_norm": 0.4639834463596344, + "learning_rate": 1.8354e-05, + "loss": 0.0237, + "step": 6124 + }, + { + "epoch": 4.82315872390705, + "grad_norm": 0.33577173948287964, + "learning_rate": 1.8357e-05, + "loss": 0.0232, + "step": 6125 + }, + { + "epoch": 4.823946435604569, + "grad_norm": 0.6408082246780396, + "learning_rate": 1.836e-05, + "loss": 0.0219, + "step": 6126 + }, + { + "epoch": 4.824734147302087, + "grad_norm": 0.5876874327659607, + "learning_rate": 1.8363e-05, + "loss": 0.0363, + "step": 6127 + }, + { + "epoch": 4.825521858999606, + "grad_norm": 0.5584089756011963, + "learning_rate": 1.8366e-05, + "loss": 0.039, + "step": 6128 + }, + { + "epoch": 4.826309570697125, + "grad_norm": 1.8714033365249634, + "learning_rate": 1.8369e-05, + "loss": 0.0457, + "step": 6129 + }, + { + "epoch": 4.827097282394644, + "grad_norm": 1.1144394874572754, + "learning_rate": 1.8372000000000003e-05, + "loss": 0.0329, + "step": 6130 + }, + { + "epoch": 4.827884994092162, + "grad_norm": 1.0753424167633057, + "learning_rate": 1.8375000000000003e-05, + "loss": 0.3738, + "step": 6131 + }, + { + "epoch": 4.828672705789681, + "grad_norm": 0.8447405695915222, + "learning_rate": 1.8378000000000003e-05, + "loss": 0.2651, + "step": 6132 + }, + { + "epoch": 4.8294604174872, + "grad_norm": 0.9934149980545044, + "learning_rate": 1.8381000000000002e-05, + "loss": 0.2425, + "step": 6133 + }, + { + "epoch": 4.830248129184718, + "grad_norm": 0.8958812952041626, + "learning_rate": 1.8384000000000002e-05, + "loss": 0.1609, + "step": 6134 + }, + { + "epoch": 4.831035840882237, + "grad_norm": 0.6400876641273499, + "learning_rate": 1.8387000000000002e-05, + "loss": 0.1094, + "step": 6135 + }, + { + "epoch": 
4.831823552579756, + "grad_norm": 0.6448205709457397, + "learning_rate": 1.8390000000000002e-05, + "loss": 0.0614, + "step": 6136 + }, + { + "epoch": 4.832611264277275, + "grad_norm": 1.3004392385482788, + "learning_rate": 1.8392999999999998e-05, + "loss": 0.0516, + "step": 6137 + }, + { + "epoch": 4.833398975974793, + "grad_norm": 1.029314398765564, + "learning_rate": 1.8395999999999998e-05, + "loss": 0.1072, + "step": 6138 + }, + { + "epoch": 4.834186687672312, + "grad_norm": 0.573236882686615, + "learning_rate": 1.8398999999999998e-05, + "loss": 0.0398, + "step": 6139 + }, + { + "epoch": 4.83497439936983, + "grad_norm": 0.33203473687171936, + "learning_rate": 1.8401999999999998e-05, + "loss": 0.0298, + "step": 6140 + }, + { + "epoch": 4.835762111067349, + "grad_norm": 0.41083019971847534, + "learning_rate": 1.8405e-05, + "loss": 0.0236, + "step": 6141 + }, + { + "epoch": 4.8365498227648684, + "grad_norm": 0.312472403049469, + "learning_rate": 1.8408e-05, + "loss": 0.0265, + "step": 6142 + }, + { + "epoch": 4.837337534462387, + "grad_norm": 0.35932406783103943, + "learning_rate": 1.8411e-05, + "loss": 0.0181, + "step": 6143 + }, + { + "epoch": 4.838125246159906, + "grad_norm": 0.3714703321456909, + "learning_rate": 1.8414e-05, + "loss": 0.0263, + "step": 6144 + }, + { + "epoch": 4.838912957857424, + "grad_norm": 0.3185500204563141, + "learning_rate": 1.8417e-05, + "loss": 0.0186, + "step": 6145 + }, + { + "epoch": 4.839700669554943, + "grad_norm": 0.5009068250656128, + "learning_rate": 1.842e-05, + "loss": 0.0384, + "step": 6146 + }, + { + "epoch": 4.840488381252461, + "grad_norm": 0.5260659456253052, + "learning_rate": 1.8423e-05, + "loss": 0.0313, + "step": 6147 + }, + { + "epoch": 4.841276092949981, + "grad_norm": 0.47212284803390503, + "learning_rate": 1.8426e-05, + "loss": 0.0357, + "step": 6148 + }, + { + "epoch": 4.842063804647499, + "grad_norm": 0.4355666935443878, + "learning_rate": 1.8429e-05, + "loss": 0.0261, + "step": 6149 + }, + { + "epoch": 4.842851516345018, + "grad_norm": 0.4583481252193451, + "learning_rate": 1.8432e-05, + "loss": 0.0263, + "step": 6150 + }, + { + "epoch": 4.843639228042536, + "grad_norm": 0.2829788625240326, + "learning_rate": 1.8435000000000002e-05, + "loss": 0.0123, + "step": 6151 + }, + { + "epoch": 4.844426939740055, + "grad_norm": 0.6382331252098083, + "learning_rate": 1.8438000000000002e-05, + "loss": 0.0273, + "step": 6152 + }, + { + "epoch": 4.845214651437574, + "grad_norm": 0.38678213953971863, + "learning_rate": 1.8441000000000002e-05, + "loss": 0.0236, + "step": 6153 + }, + { + "epoch": 4.846002363135092, + "grad_norm": 0.43171289563179016, + "learning_rate": 1.8444e-05, + "loss": 0.0386, + "step": 6154 + }, + { + "epoch": 4.846790074832612, + "grad_norm": 0.6173561811447144, + "learning_rate": 1.8447e-05, + "loss": 0.0502, + "step": 6155 + }, + { + "epoch": 4.84757778653013, + "grad_norm": 0.3493225574493408, + "learning_rate": 1.845e-05, + "loss": 0.0218, + "step": 6156 + }, + { + "epoch": 4.848365498227649, + "grad_norm": 0.37621062994003296, + "learning_rate": 1.8453e-05, + "loss": 0.0145, + "step": 6157 + }, + { + "epoch": 4.849153209925167, + "grad_norm": 0.45430365204811096, + "learning_rate": 1.8456e-05, + "loss": 0.0211, + "step": 6158 + }, + { + "epoch": 4.849940921622686, + "grad_norm": 0.6047540903091431, + "learning_rate": 1.8459e-05, + "loss": 0.0327, + "step": 6159 + }, + { + "epoch": 4.850728633320204, + "grad_norm": 0.6079766750335693, + "learning_rate": 1.8462e-05, + "loss": 0.0257, + "step": 6160 + }, + { + "epoch": 
4.851516345017724, + "grad_norm": 0.5364305973052979, + "learning_rate": 1.8465e-05, + "loss": 0.0313, + "step": 6161 + }, + { + "epoch": 4.8523040567152425, + "grad_norm": 0.45507127046585083, + "learning_rate": 1.8468e-05, + "loss": 0.0277, + "step": 6162 + }, + { + "epoch": 4.853091768412761, + "grad_norm": 3.1443593502044678, + "learning_rate": 1.8471e-05, + "loss": 0.0352, + "step": 6163 + }, + { + "epoch": 4.85387948011028, + "grad_norm": 0.6643831729888916, + "learning_rate": 1.8474e-05, + "loss": 0.0368, + "step": 6164 + }, + { + "epoch": 4.854667191807798, + "grad_norm": 0.5002750754356384, + "learning_rate": 1.8477e-05, + "loss": 0.0325, + "step": 6165 + }, + { + "epoch": 4.855454903505317, + "grad_norm": 0.6275386214256287, + "learning_rate": 1.848e-05, + "loss": 0.0437, + "step": 6166 + }, + { + "epoch": 4.856242615202836, + "grad_norm": 0.4554417133331299, + "learning_rate": 1.8483e-05, + "loss": 0.0189, + "step": 6167 + }, + { + "epoch": 4.857030326900355, + "grad_norm": 0.3871784508228302, + "learning_rate": 1.8486e-05, + "loss": 0.0349, + "step": 6168 + }, + { + "epoch": 4.857818038597873, + "grad_norm": 0.5538287162780762, + "learning_rate": 1.8489e-05, + "loss": 0.0346, + "step": 6169 + }, + { + "epoch": 4.858605750295392, + "grad_norm": 0.4788346588611603, + "learning_rate": 1.8492e-05, + "loss": 0.0277, + "step": 6170 + }, + { + "epoch": 4.8593934619929104, + "grad_norm": 0.563859760761261, + "learning_rate": 1.8495e-05, + "loss": 0.0348, + "step": 6171 + }, + { + "epoch": 4.860181173690429, + "grad_norm": 0.5246302485466003, + "learning_rate": 1.8498e-05, + "loss": 0.0405, + "step": 6172 + }, + { + "epoch": 4.860968885387948, + "grad_norm": 0.636551022529602, + "learning_rate": 1.8501e-05, + "loss": 0.0329, + "step": 6173 + }, + { + "epoch": 4.861756597085467, + "grad_norm": 0.8346861004829407, + "learning_rate": 1.8504e-05, + "loss": 0.039, + "step": 6174 + }, + { + "epoch": 4.862544308782986, + "grad_norm": 0.6885434985160828, + "learning_rate": 1.8507e-05, + "loss": 0.0402, + "step": 6175 + }, + { + "epoch": 4.863332020480504, + "grad_norm": 0.4612361788749695, + "learning_rate": 1.851e-05, + "loss": 0.0364, + "step": 6176 + }, + { + "epoch": 4.864119732178023, + "grad_norm": 0.49034497141838074, + "learning_rate": 1.8513e-05, + "loss": 0.0315, + "step": 6177 + }, + { + "epoch": 4.864907443875541, + "grad_norm": 1.0579761266708374, + "learning_rate": 1.8516e-05, + "loss": 0.0309, + "step": 6178 + }, + { + "epoch": 4.865695155573061, + "grad_norm": 0.39697495102882385, + "learning_rate": 1.8519e-05, + "loss": 0.0214, + "step": 6179 + }, + { + "epoch": 4.866482867270579, + "grad_norm": 1.4534975290298462, + "learning_rate": 1.8522e-05, + "loss": 0.0694, + "step": 6180 + }, + { + "epoch": 4.867270578968098, + "grad_norm": 1.2022024393081665, + "learning_rate": 1.8525000000000003e-05, + "loss": 0.3252, + "step": 6181 + }, + { + "epoch": 4.8680582906656165, + "grad_norm": 0.8407143354415894, + "learning_rate": 1.8528000000000003e-05, + "loss": 0.2044, + "step": 6182 + }, + { + "epoch": 4.868846002363135, + "grad_norm": 0.9427636861801147, + "learning_rate": 1.8531000000000003e-05, + "loss": 0.2155, + "step": 6183 + }, + { + "epoch": 4.869633714060654, + "grad_norm": 0.6038057208061218, + "learning_rate": 1.8534000000000002e-05, + "loss": 0.1578, + "step": 6184 + }, + { + "epoch": 4.870421425758172, + "grad_norm": 0.7640621662139893, + "learning_rate": 1.8537000000000002e-05, + "loss": 0.087, + "step": 6185 + }, + { + "epoch": 4.871209137455692, + "grad_norm": 
0.601979672908783, + "learning_rate": 1.854e-05, + "loss": 0.0746, + "step": 6186 + }, + { + "epoch": 4.87199684915321, + "grad_norm": 1.625762939453125, + "learning_rate": 1.8543e-05, + "loss": 0.1007, + "step": 6187 + }, + { + "epoch": 4.872784560850729, + "grad_norm": 0.7630209922790527, + "learning_rate": 1.8545999999999998e-05, + "loss": 0.0544, + "step": 6188 + }, + { + "epoch": 4.873572272548247, + "grad_norm": 1.0034282207489014, + "learning_rate": 1.8548999999999998e-05, + "loss": 0.0496, + "step": 6189 + }, + { + "epoch": 4.874359984245766, + "grad_norm": 0.48484402894973755, + "learning_rate": 1.8551999999999998e-05, + "loss": 0.0413, + "step": 6190 + }, + { + "epoch": 4.8751476959432845, + "grad_norm": 0.5654627680778503, + "learning_rate": 1.8555e-05, + "loss": 0.0441, + "step": 6191 + }, + { + "epoch": 4.875935407640803, + "grad_norm": 0.5661333203315735, + "learning_rate": 1.8558e-05, + "loss": 0.0354, + "step": 6192 + }, + { + "epoch": 4.8767231193383225, + "grad_norm": 0.3846527636051178, + "learning_rate": 1.8561e-05, + "loss": 0.023, + "step": 6193 + }, + { + "epoch": 4.877510831035841, + "grad_norm": 0.5900328159332275, + "learning_rate": 1.8564e-05, + "loss": 0.0356, + "step": 6194 + }, + { + "epoch": 4.87829854273336, + "grad_norm": 0.4014792740345001, + "learning_rate": 1.8567e-05, + "loss": 0.0347, + "step": 6195 + }, + { + "epoch": 4.879086254430878, + "grad_norm": 1.1905931234359741, + "learning_rate": 1.857e-05, + "loss": 0.0258, + "step": 6196 + }, + { + "epoch": 4.879873966128397, + "grad_norm": 0.3872661888599396, + "learning_rate": 1.8573e-05, + "loss": 0.0239, + "step": 6197 + }, + { + "epoch": 4.880661677825916, + "grad_norm": 0.467746376991272, + "learning_rate": 1.8576e-05, + "loss": 0.0259, + "step": 6198 + }, + { + "epoch": 4.881449389523435, + "grad_norm": 0.46539315581321716, + "learning_rate": 1.8579e-05, + "loss": 0.0296, + "step": 6199 + }, + { + "epoch": 4.882237101220953, + "grad_norm": 0.5406379699707031, + "learning_rate": 1.8582e-05, + "loss": 0.0292, + "step": 6200 + }, + { + "epoch": 4.883024812918472, + "grad_norm": 0.28702324628829956, + "learning_rate": 1.8585000000000002e-05, + "loss": 0.0288, + "step": 6201 + }, + { + "epoch": 4.8838125246159905, + "grad_norm": 0.5521730780601501, + "learning_rate": 1.8588000000000002e-05, + "loss": 0.0426, + "step": 6202 + }, + { + "epoch": 4.884600236313509, + "grad_norm": 0.7764924168586731, + "learning_rate": 1.8591000000000002e-05, + "loss": 0.0173, + "step": 6203 + }, + { + "epoch": 4.885387948011028, + "grad_norm": 0.4734675884246826, + "learning_rate": 1.8594000000000002e-05, + "loss": 0.0243, + "step": 6204 + }, + { + "epoch": 4.886175659708547, + "grad_norm": 0.40547817945480347, + "learning_rate": 1.8597e-05, + "loss": 0.0257, + "step": 6205 + }, + { + "epoch": 4.886963371406066, + "grad_norm": 0.4689960181713104, + "learning_rate": 1.86e-05, + "loss": 0.0218, + "step": 6206 + }, + { + "epoch": 4.887751083103584, + "grad_norm": 0.40804189443588257, + "learning_rate": 1.8603e-05, + "loss": 0.0241, + "step": 6207 + }, + { + "epoch": 4.888538794801103, + "grad_norm": 0.43032440543174744, + "learning_rate": 1.8606e-05, + "loss": 0.0189, + "step": 6208 + }, + { + "epoch": 4.889326506498621, + "grad_norm": 0.48890528082847595, + "learning_rate": 1.8609e-05, + "loss": 0.0252, + "step": 6209 + }, + { + "epoch": 4.89011421819614, + "grad_norm": 0.39830997586250305, + "learning_rate": 1.8612e-05, + "loss": 0.0328, + "step": 6210 + }, + { + "epoch": 4.8909019298936585, + "grad_norm": 1.1282809972763062, 
+ "learning_rate": 1.8615e-05, + "loss": 0.032, + "step": 6211 + }, + { + "epoch": 4.891689641591178, + "grad_norm": 0.3829457461833954, + "learning_rate": 1.8618e-05, + "loss": 0.0214, + "step": 6212 + }, + { + "epoch": 4.8924773532886965, + "grad_norm": 0.45961710810661316, + "learning_rate": 1.8621e-05, + "loss": 0.0227, + "step": 6213 + }, + { + "epoch": 4.893265064986215, + "grad_norm": 0.9514200091362, + "learning_rate": 1.8624e-05, + "loss": 0.0331, + "step": 6214 + }, + { + "epoch": 4.894052776683734, + "grad_norm": 0.7137372493743896, + "learning_rate": 1.8627e-05, + "loss": 0.0262, + "step": 6215 + }, + { + "epoch": 4.894840488381252, + "grad_norm": 0.49115490913391113, + "learning_rate": 1.863e-05, + "loss": 0.0293, + "step": 6216 + }, + { + "epoch": 4.895628200078772, + "grad_norm": 0.6262810230255127, + "learning_rate": 1.8633e-05, + "loss": 0.0341, + "step": 6217 + }, + { + "epoch": 4.89641591177629, + "grad_norm": 0.6353373527526855, + "learning_rate": 1.8636e-05, + "loss": 0.0403, + "step": 6218 + }, + { + "epoch": 4.897203623473809, + "grad_norm": 0.29939791560173035, + "learning_rate": 1.8639e-05, + "loss": 0.0124, + "step": 6219 + }, + { + "epoch": 4.897991335171327, + "grad_norm": 0.7849637269973755, + "learning_rate": 1.8642e-05, + "loss": 0.0403, + "step": 6220 + }, + { + "epoch": 4.898779046868846, + "grad_norm": 0.5987557768821716, + "learning_rate": 1.8645000000000002e-05, + "loss": 0.0288, + "step": 6221 + }, + { + "epoch": 4.8995667585663645, + "grad_norm": 0.5937598347663879, + "learning_rate": 1.8648000000000002e-05, + "loss": 0.0376, + "step": 6222 + }, + { + "epoch": 4.900354470263883, + "grad_norm": 1.1570711135864258, + "learning_rate": 1.8651e-05, + "loss": 0.0391, + "step": 6223 + }, + { + "epoch": 4.9011421819614025, + "grad_norm": 0.5432996153831482, + "learning_rate": 1.8654e-05, + "loss": 0.0325, + "step": 6224 + }, + { + "epoch": 4.901929893658921, + "grad_norm": 0.5814566016197205, + "learning_rate": 1.8657e-05, + "loss": 0.0342, + "step": 6225 + }, + { + "epoch": 4.90271760535644, + "grad_norm": 0.38310712575912476, + "learning_rate": 1.866e-05, + "loss": 0.0198, + "step": 6226 + }, + { + "epoch": 4.903505317053958, + "grad_norm": 0.5501509308815002, + "learning_rate": 1.8663e-05, + "loss": 0.03, + "step": 6227 + }, + { + "epoch": 4.904293028751477, + "grad_norm": 0.4496670961380005, + "learning_rate": 1.8666e-05, + "loss": 0.0326, + "step": 6228 + }, + { + "epoch": 4.905080740448995, + "grad_norm": 0.6723899245262146, + "learning_rate": 1.8669e-05, + "loss": 0.0343, + "step": 6229 + }, + { + "epoch": 4.905868452146514, + "grad_norm": 0.48547327518463135, + "learning_rate": 1.8672e-05, + "loss": 0.0272, + "step": 6230 + }, + { + "epoch": 4.906656163844033, + "grad_norm": 1.007561445236206, + "learning_rate": 1.8675000000000003e-05, + "loss": 0.278, + "step": 6231 + }, + { + "epoch": 4.907443875541552, + "grad_norm": 1.187396764755249, + "learning_rate": 1.8678000000000003e-05, + "loss": 0.2832, + "step": 6232 + }, + { + "epoch": 4.9082315872390705, + "grad_norm": 0.652152955532074, + "learning_rate": 1.8681000000000003e-05, + "loss": 0.1647, + "step": 6233 + }, + { + "epoch": 4.909019298936589, + "grad_norm": 0.7440576553344727, + "learning_rate": 1.8684000000000003e-05, + "loss": 0.1393, + "step": 6234 + }, + { + "epoch": 4.909807010634108, + "grad_norm": 0.527349054813385, + "learning_rate": 1.8687e-05, + "loss": 0.0712, + "step": 6235 + }, + { + "epoch": 4.910594722331627, + "grad_norm": 0.5173664093017578, + "learning_rate": 1.869e-05, + 
"loss": 0.132, + "step": 6236 + }, + { + "epoch": 4.911382434029146, + "grad_norm": 0.41251856088638306, + "learning_rate": 1.8693e-05, + "loss": 0.0436, + "step": 6237 + }, + { + "epoch": 4.912170145726664, + "grad_norm": 0.6054829955101013, + "learning_rate": 1.8696e-05, + "loss": 0.0665, + "step": 6238 + }, + { + "epoch": 4.912957857424183, + "grad_norm": 0.2995811700820923, + "learning_rate": 1.8699e-05, + "loss": 0.0272, + "step": 6239 + }, + { + "epoch": 4.913745569121701, + "grad_norm": 0.3911952078342438, + "learning_rate": 1.8701999999999998e-05, + "loss": 0.0397, + "step": 6240 + }, + { + "epoch": 4.91453328081922, + "grad_norm": 0.41955330967903137, + "learning_rate": 1.8705e-05, + "loss": 0.0352, + "step": 6241 + }, + { + "epoch": 4.9153209925167385, + "grad_norm": 0.366879403591156, + "learning_rate": 1.8708e-05, + "loss": 0.0636, + "step": 6242 + }, + { + "epoch": 4.916108704214258, + "grad_norm": 0.5582929253578186, + "learning_rate": 1.8711e-05, + "loss": 0.0206, + "step": 6243 + }, + { + "epoch": 4.9168964159117765, + "grad_norm": 0.25861498713493347, + "learning_rate": 1.8714e-05, + "loss": 0.0195, + "step": 6244 + }, + { + "epoch": 4.917684127609295, + "grad_norm": 0.41691353917121887, + "learning_rate": 1.8717e-05, + "loss": 0.0414, + "step": 6245 + }, + { + "epoch": 4.918471839306814, + "grad_norm": 0.4438285231590271, + "learning_rate": 1.872e-05, + "loss": 0.0343, + "step": 6246 + }, + { + "epoch": 4.919259551004332, + "grad_norm": 0.24983447790145874, + "learning_rate": 1.8723e-05, + "loss": 0.0172, + "step": 6247 + }, + { + "epoch": 4.920047262701851, + "grad_norm": 0.38976627588272095, + "learning_rate": 1.8726e-05, + "loss": 0.019, + "step": 6248 + }, + { + "epoch": 4.920834974399369, + "grad_norm": 0.4057019352912903, + "learning_rate": 1.8729e-05, + "loss": 0.02, + "step": 6249 + }, + { + "epoch": 4.921622686096889, + "grad_norm": 0.39887356758117676, + "learning_rate": 1.8732e-05, + "loss": 0.0294, + "step": 6250 + }, + { + "epoch": 4.922410397794407, + "grad_norm": 0.3606346547603607, + "learning_rate": 1.8735000000000003e-05, + "loss": 0.0261, + "step": 6251 + }, + { + "epoch": 4.923198109491926, + "grad_norm": 0.606041669845581, + "learning_rate": 1.8738000000000003e-05, + "loss": 0.031, + "step": 6252 + }, + { + "epoch": 4.9239858211894445, + "grad_norm": 0.4288384020328522, + "learning_rate": 1.8741000000000002e-05, + "loss": 0.0204, + "step": 6253 + }, + { + "epoch": 4.924773532886963, + "grad_norm": 0.3459785580635071, + "learning_rate": 1.8744000000000002e-05, + "loss": 0.0162, + "step": 6254 + }, + { + "epoch": 4.9255612445844825, + "grad_norm": 0.392378032207489, + "learning_rate": 1.8747000000000002e-05, + "loss": 0.0195, + "step": 6255 + }, + { + "epoch": 4.926348956282001, + "grad_norm": 0.542898416519165, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.0557, + "step": 6256 + }, + { + "epoch": 4.92713666797952, + "grad_norm": 0.5634113550186157, + "learning_rate": 1.8753e-05, + "loss": 0.0333, + "step": 6257 + }, + { + "epoch": 4.927924379677038, + "grad_norm": 0.2936376929283142, + "learning_rate": 1.8756e-05, + "loss": 0.0195, + "step": 6258 + }, + { + "epoch": 4.928712091374557, + "grad_norm": 0.4962429106235504, + "learning_rate": 1.8759e-05, + "loss": 0.0268, + "step": 6259 + }, + { + "epoch": 4.929499803072075, + "grad_norm": 0.5275611281394958, + "learning_rate": 1.8761999999999998e-05, + "loss": 0.0245, + "step": 6260 + }, + { + "epoch": 4.930287514769594, + "grad_norm": 0.6492099761962891, + "learning_rate": 1.8764999999999997e-05, 
+ "loss": 0.0269, + "step": 6261 + }, + { + "epoch": 4.931075226467113, + "grad_norm": 0.43350738286972046, + "learning_rate": 1.8768e-05, + "loss": 0.0193, + "step": 6262 + }, + { + "epoch": 4.931862938164632, + "grad_norm": 0.4715873897075653, + "learning_rate": 1.8771e-05, + "loss": 0.0346, + "step": 6263 + }, + { + "epoch": 4.9326506498621505, + "grad_norm": 0.5973320603370667, + "learning_rate": 1.8774e-05, + "loss": 0.0235, + "step": 6264 + }, + { + "epoch": 4.933438361559669, + "grad_norm": 0.6531293392181396, + "learning_rate": 1.8777e-05, + "loss": 0.0272, + "step": 6265 + }, + { + "epoch": 4.934226073257188, + "grad_norm": 0.6851189136505127, + "learning_rate": 1.878e-05, + "loss": 0.0395, + "step": 6266 + }, + { + "epoch": 4.935013784954706, + "grad_norm": 0.5236697196960449, + "learning_rate": 1.8783e-05, + "loss": 0.0271, + "step": 6267 + }, + { + "epoch": 4.935801496652226, + "grad_norm": 0.44937869906425476, + "learning_rate": 1.8786e-05, + "loss": 0.0181, + "step": 6268 + }, + { + "epoch": 4.936589208349744, + "grad_norm": 0.5373101234436035, + "learning_rate": 1.8789e-05, + "loss": 0.0211, + "step": 6269 + }, + { + "epoch": 4.937376920047263, + "grad_norm": 0.5164479613304138, + "learning_rate": 1.8792e-05, + "loss": 0.0358, + "step": 6270 + }, + { + "epoch": 4.938164631744781, + "grad_norm": 0.5266591310501099, + "learning_rate": 1.8795e-05, + "loss": 0.038, + "step": 6271 + }, + { + "epoch": 4.9389523434423, + "grad_norm": 0.732795000076294, + "learning_rate": 1.8798000000000002e-05, + "loss": 0.0308, + "step": 6272 + }, + { + "epoch": 4.9397400551398185, + "grad_norm": 0.6122627258300781, + "learning_rate": 1.8801000000000002e-05, + "loss": 0.0277, + "step": 6273 + }, + { + "epoch": 4.940527766837338, + "grad_norm": 0.8207800388336182, + "learning_rate": 1.8804e-05, + "loss": 0.0453, + "step": 6274 + }, + { + "epoch": 4.9413154785348565, + "grad_norm": 0.7166847586631775, + "learning_rate": 1.8807e-05, + "loss": 0.0378, + "step": 6275 + }, + { + "epoch": 4.942103190232375, + "grad_norm": 0.7338752150535583, + "learning_rate": 1.881e-05, + "loss": 0.0181, + "step": 6276 + }, + { + "epoch": 4.942890901929894, + "grad_norm": 0.5399097800254822, + "learning_rate": 1.8813e-05, + "loss": 0.0264, + "step": 6277 + }, + { + "epoch": 4.943678613627412, + "grad_norm": 0.5986261963844299, + "learning_rate": 1.8816e-05, + "loss": 0.0339, + "step": 6278 + }, + { + "epoch": 4.944466325324931, + "grad_norm": 0.7587454915046692, + "learning_rate": 1.8819e-05, + "loss": 0.0462, + "step": 6279 + }, + { + "epoch": 4.945254037022449, + "grad_norm": 0.8565704822540283, + "learning_rate": 1.8822e-05, + "loss": 0.0511, + "step": 6280 + }, + { + "epoch": 4.946041748719969, + "grad_norm": 1.9199936389923096, + "learning_rate": 1.8825e-05, + "loss": 0.4046, + "step": 6281 + }, + { + "epoch": 4.946829460417487, + "grad_norm": 0.8617457747459412, + "learning_rate": 1.8828000000000003e-05, + "loss": 0.2628, + "step": 6282 + }, + { + "epoch": 4.947617172115006, + "grad_norm": 1.024232029914856, + "learning_rate": 1.8831000000000003e-05, + "loss": 0.2497, + "step": 6283 + }, + { + "epoch": 4.9484048838125245, + "grad_norm": 0.7027223706245422, + "learning_rate": 1.8834e-05, + "loss": 0.1623, + "step": 6284 + }, + { + "epoch": 4.949192595510043, + "grad_norm": 0.7301390767097473, + "learning_rate": 1.8837e-05, + "loss": 0.1145, + "step": 6285 + }, + { + "epoch": 4.949980307207562, + "grad_norm": 0.8562362790107727, + "learning_rate": 1.884e-05, + "loss": 0.0879, + "step": 6286 + }, + { + "epoch": 
4.950768018905081, + "grad_norm": 0.6251290440559387, + "learning_rate": 1.8843e-05, + "loss": 0.1006, + "step": 6287 + }, + { + "epoch": 4.9515557306026, + "grad_norm": 0.5933526754379272, + "learning_rate": 1.8846e-05, + "loss": 0.0847, + "step": 6288 + }, + { + "epoch": 4.952343442300118, + "grad_norm": 0.5131922960281372, + "learning_rate": 1.8849e-05, + "loss": 0.0646, + "step": 6289 + }, + { + "epoch": 4.953131153997637, + "grad_norm": 0.4042023718357086, + "learning_rate": 1.8852e-05, + "loss": 0.043, + "step": 6290 + }, + { + "epoch": 4.953918865695155, + "grad_norm": 0.3421362340450287, + "learning_rate": 1.8854999999999998e-05, + "loss": 0.0321, + "step": 6291 + }, + { + "epoch": 4.954706577392674, + "grad_norm": 0.6265801191329956, + "learning_rate": 1.8858e-05, + "loss": 0.0556, + "step": 6292 + }, + { + "epoch": 4.955494289090193, + "grad_norm": 0.32392415404319763, + "learning_rate": 1.8861e-05, + "loss": 0.0238, + "step": 6293 + }, + { + "epoch": 4.956282000787712, + "grad_norm": 0.4316980540752411, + "learning_rate": 1.8864e-05, + "loss": 0.0232, + "step": 6294 + }, + { + "epoch": 4.9570697124852305, + "grad_norm": 0.42665407061576843, + "learning_rate": 1.8867e-05, + "loss": 0.0369, + "step": 6295 + }, + { + "epoch": 4.957857424182749, + "grad_norm": 0.5348519682884216, + "learning_rate": 1.887e-05, + "loss": 0.028, + "step": 6296 + }, + { + "epoch": 4.958645135880268, + "grad_norm": 0.6229453086853027, + "learning_rate": 1.8873e-05, + "loss": 0.0477, + "step": 6297 + }, + { + "epoch": 4.959432847577786, + "grad_norm": 0.7168146967887878, + "learning_rate": 1.8876e-05, + "loss": 0.0371, + "step": 6298 + }, + { + "epoch": 4.960220559275305, + "grad_norm": 0.398895800113678, + "learning_rate": 1.8879e-05, + "loss": 0.024, + "step": 6299 + }, + { + "epoch": 4.961008270972824, + "grad_norm": 0.43013474345207214, + "learning_rate": 1.8882e-05, + "loss": 0.0354, + "step": 6300 + }, + { + "epoch": 4.961795982670343, + "grad_norm": 0.3066977858543396, + "learning_rate": 1.8885e-05, + "loss": 0.0186, + "step": 6301 + }, + { + "epoch": 4.962583694367861, + "grad_norm": 0.3709351420402527, + "learning_rate": 1.8888000000000003e-05, + "loss": 0.023, + "step": 6302 + }, + { + "epoch": 4.96337140606538, + "grad_norm": 0.9631217122077942, + "learning_rate": 1.8891000000000003e-05, + "loss": 0.0688, + "step": 6303 + }, + { + "epoch": 4.9641591177628985, + "grad_norm": 0.7236266136169434, + "learning_rate": 1.8894000000000002e-05, + "loss": 0.0218, + "step": 6304 + }, + { + "epoch": 4.964946829460417, + "grad_norm": 0.43018823862075806, + "learning_rate": 1.8897000000000002e-05, + "loss": 0.02, + "step": 6305 + }, + { + "epoch": 4.9657345411579366, + "grad_norm": 0.7658095359802246, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.0352, + "step": 6306 + }, + { + "epoch": 4.966522252855455, + "grad_norm": 0.5400773286819458, + "learning_rate": 1.8903000000000002e-05, + "loss": 0.0298, + "step": 6307 + }, + { + "epoch": 4.967309964552974, + "grad_norm": 0.4794906675815582, + "learning_rate": 1.8906e-05, + "loss": 0.0231, + "step": 6308 + }, + { + "epoch": 4.968097676250492, + "grad_norm": 0.9928862452507019, + "learning_rate": 1.8908999999999998e-05, + "loss": 0.0307, + "step": 6309 + }, + { + "epoch": 4.968885387948011, + "grad_norm": 0.38176271319389343, + "learning_rate": 1.8911999999999998e-05, + "loss": 0.0273, + "step": 6310 + }, + { + "epoch": 4.969673099645529, + "grad_norm": 0.6198738813400269, + "learning_rate": 1.8914999999999998e-05, + "loss": 0.0384, + "step": 6311 + }, + 
{ + "epoch": 4.970460811343049, + "grad_norm": 0.5194361805915833, + "learning_rate": 1.8918e-05, + "loss": 0.0361, + "step": 6312 + }, + { + "epoch": 4.971248523040567, + "grad_norm": 0.7370855212211609, + "learning_rate": 1.8921e-05, + "loss": 0.0228, + "step": 6313 + }, + { + "epoch": 4.972036234738086, + "grad_norm": 0.3937164545059204, + "learning_rate": 1.8924e-05, + "loss": 0.0212, + "step": 6314 + }, + { + "epoch": 4.9728239464356045, + "grad_norm": 0.46070247888565063, + "learning_rate": 1.8927e-05, + "loss": 0.0579, + "step": 6315 + }, + { + "epoch": 4.973611658133123, + "grad_norm": 0.398490309715271, + "learning_rate": 1.893e-05, + "loss": 0.0177, + "step": 6316 + }, + { + "epoch": 4.974399369830642, + "grad_norm": 0.5144492387771606, + "learning_rate": 1.8933e-05, + "loss": 0.0216, + "step": 6317 + }, + { + "epoch": 4.97518708152816, + "grad_norm": 0.5639150142669678, + "learning_rate": 1.8936e-05, + "loss": 0.0432, + "step": 6318 + }, + { + "epoch": 4.97597479322568, + "grad_norm": 0.9634998440742493, + "learning_rate": 1.8939e-05, + "loss": 0.0395, + "step": 6319 + }, + { + "epoch": 4.976762504923198, + "grad_norm": 0.40721064805984497, + "learning_rate": 1.8942e-05, + "loss": 0.0247, + "step": 6320 + }, + { + "epoch": 4.977550216620717, + "grad_norm": 0.7109009623527527, + "learning_rate": 1.8945e-05, + "loss": 0.0296, + "step": 6321 + }, + { + "epoch": 4.978337928318235, + "grad_norm": 0.9312862753868103, + "learning_rate": 1.8948000000000002e-05, + "loss": 0.0511, + "step": 6322 + }, + { + "epoch": 4.979125640015754, + "grad_norm": 0.4410506784915924, + "learning_rate": 1.8951000000000002e-05, + "loss": 0.0228, + "step": 6323 + }, + { + "epoch": 4.979913351713273, + "grad_norm": 0.48672381043434143, + "learning_rate": 1.8954000000000002e-05, + "loss": 0.0353, + "step": 6324 + }, + { + "epoch": 4.980701063410792, + "grad_norm": 0.6981456279754639, + "learning_rate": 1.8957e-05, + "loss": 0.0349, + "step": 6325 + }, + { + "epoch": 4.981488775108311, + "grad_norm": 0.3775022029876709, + "learning_rate": 1.896e-05, + "loss": 0.0204, + "step": 6326 + }, + { + "epoch": 4.982276486805829, + "grad_norm": 0.6492506861686707, + "learning_rate": 1.8963e-05, + "loss": 0.0355, + "step": 6327 + }, + { + "epoch": 4.983064198503348, + "grad_norm": 0.7907310724258423, + "learning_rate": 1.8966e-05, + "loss": 0.0362, + "step": 6328 + }, + { + "epoch": 4.983851910200866, + "grad_norm": 2.557741165161133, + "learning_rate": 1.8969e-05, + "loss": 0.0389, + "step": 6329 + }, + { + "epoch": 4.984639621898385, + "grad_norm": 1.0554275512695312, + "learning_rate": 1.8972e-05, + "loss": 0.0564, + "step": 6330 + }, + { + "epoch": 4.985427333595904, + "grad_norm": 1.774063229560852, + "learning_rate": 1.8975e-05, + "loss": 0.2834, + "step": 6331 + }, + { + "epoch": 4.986215045293423, + "grad_norm": 1.4582147598266602, + "learning_rate": 1.8978000000000004e-05, + "loss": 0.1672, + "step": 6332 + }, + { + "epoch": 4.987002756990941, + "grad_norm": 0.5865042209625244, + "learning_rate": 1.8981e-05, + "loss": 0.0376, + "step": 6333 + }, + { + "epoch": 4.98779046868846, + "grad_norm": 0.6404586434364319, + "learning_rate": 1.8984e-05, + "loss": 0.0461, + "step": 6334 + }, + { + "epoch": 4.9885781803859786, + "grad_norm": 0.31156066060066223, + "learning_rate": 1.8987e-05, + "loss": 0.0221, + "step": 6335 + }, + { + "epoch": 4.989365892083497, + "grad_norm": 0.6074519157409668, + "learning_rate": 1.899e-05, + "loss": 0.0318, + "step": 6336 + }, + { + "epoch": 4.990153603781016, + "grad_norm": 
0.9247211217880249, + "learning_rate": 1.8993e-05, + "loss": 0.0292, + "step": 6337 + }, + { + "epoch": 4.990941315478535, + "grad_norm": 0.5073791146278381, + "learning_rate": 1.8996e-05, + "loss": 0.0326, + "step": 6338 + }, + { + "epoch": 4.991729027176054, + "grad_norm": 0.6915385127067566, + "learning_rate": 1.8999e-05, + "loss": 0.0403, + "step": 6339 + }, + { + "epoch": 4.992516738873572, + "grad_norm": 0.30203723907470703, + "learning_rate": 1.9002e-05, + "loss": 0.0209, + "step": 6340 + }, + { + "epoch": 4.993304450571091, + "grad_norm": 0.38660117983818054, + "learning_rate": 1.9005e-05, + "loss": 0.0251, + "step": 6341 + }, + { + "epoch": 4.994092162268609, + "grad_norm": 0.4722723960876465, + "learning_rate": 1.9008e-05, + "loss": 0.0251, + "step": 6342 + }, + { + "epoch": 4.994879873966129, + "grad_norm": 0.5448219180107117, + "learning_rate": 1.9011e-05, + "loss": 0.0415, + "step": 6343 + }, + { + "epoch": 4.995667585663647, + "grad_norm": 0.4719877243041992, + "learning_rate": 1.9014e-05, + "loss": 0.0277, + "step": 6344 + }, + { + "epoch": 4.996455297361166, + "grad_norm": 0.44012925028800964, + "learning_rate": 1.9017e-05, + "loss": 0.0226, + "step": 6345 + }, + { + "epoch": 4.997243009058685, + "grad_norm": 0.4843500256538391, + "learning_rate": 1.902e-05, + "loss": 0.0238, + "step": 6346 + }, + { + "epoch": 4.998030720756203, + "grad_norm": 0.9667214751243591, + "learning_rate": 1.9023e-05, + "loss": 0.0635, + "step": 6347 + }, + { + "epoch": 4.998818432453722, + "grad_norm": 0.35403919219970703, + "learning_rate": 1.9026e-05, + "loss": 0.0233, + "step": 6348 + }, + { + "epoch": 4.99960614415124, + "grad_norm": 0.5509705543518066, + "learning_rate": 1.9029e-05, + "loss": 0.0449, + "step": 6349 + }, + { + "epoch": 5.0, + "grad_norm": 0.4344549775123596, + "learning_rate": 1.9032e-05, + "loss": 0.0096, + "step": 6350 + }, + { + "epoch": 5.000787711697519, + "grad_norm": 3.2354485988616943, + "learning_rate": 1.9035e-05, + "loss": 0.4256, + "step": 6351 + }, + { + "epoch": 5.001575423395037, + "grad_norm": 1.2887864112854004, + "learning_rate": 1.9038000000000003e-05, + "loss": 0.2211, + "step": 6352 + }, + { + "epoch": 5.002363135092556, + "grad_norm": 1.2338526248931885, + "learning_rate": 1.9041000000000003e-05, + "loss": 0.3282, + "step": 6353 + }, + { + "epoch": 5.003150846790075, + "grad_norm": 0.9118005633354187, + "learning_rate": 1.9044000000000003e-05, + "loss": 0.1728, + "step": 6354 + }, + { + "epoch": 5.003938558487594, + "grad_norm": 0.7376360893249512, + "learning_rate": 1.9047000000000002e-05, + "loss": 0.143, + "step": 6355 + }, + { + "epoch": 5.004726270185112, + "grad_norm": 0.56898432970047, + "learning_rate": 1.9050000000000002e-05, + "loss": 0.1102, + "step": 6356 + }, + { + "epoch": 5.005513981882631, + "grad_norm": 0.46542590856552124, + "learning_rate": 1.9053000000000002e-05, + "loss": 0.0596, + "step": 6357 + }, + { + "epoch": 5.006301693580149, + "grad_norm": 0.6403869390487671, + "learning_rate": 1.9056e-05, + "loss": 0.0802, + "step": 6358 + }, + { + "epoch": 5.007089405277668, + "grad_norm": 0.5471826791763306, + "learning_rate": 1.9058999999999998e-05, + "loss": 0.0314, + "step": 6359 + }, + { + "epoch": 5.0078771169751874, + "grad_norm": 0.4497816860675812, + "learning_rate": 1.9061999999999998e-05, + "loss": 0.0374, + "step": 6360 + }, + { + "epoch": 5.008664828672706, + "grad_norm": 0.36782345175743103, + "learning_rate": 1.9064999999999998e-05, + "loss": 0.0241, + "step": 6361 + }, + { + "epoch": 5.009452540370225, + "grad_norm": 
0.47944366931915283, + "learning_rate": 1.9068e-05, + "loss": 0.0431, + "step": 6362 + }, + { + "epoch": 5.010240252067743, + "grad_norm": 11.542998313903809, + "learning_rate": 1.9071e-05, + "loss": 0.108, + "step": 6363 + }, + { + "epoch": 5.011027963765262, + "grad_norm": 0.22927209734916687, + "learning_rate": 1.9074e-05, + "loss": 0.0125, + "step": 6364 + }, + { + "epoch": 5.01181567546278, + "grad_norm": 0.7151424288749695, + "learning_rate": 1.9077e-05, + "loss": 0.0216, + "step": 6365 + }, + { + "epoch": 5.0126033871603, + "grad_norm": 0.26932787895202637, + "learning_rate": 1.908e-05, + "loss": 0.0191, + "step": 6366 + }, + { + "epoch": 5.013391098857818, + "grad_norm": 0.847450852394104, + "learning_rate": 1.9083e-05, + "loss": 0.0301, + "step": 6367 + }, + { + "epoch": 5.014178810555337, + "grad_norm": 0.3263017535209656, + "learning_rate": 1.9086e-05, + "loss": 0.0175, + "step": 6368 + }, + { + "epoch": 5.014966522252855, + "grad_norm": 1.1481329202651978, + "learning_rate": 1.9089e-05, + "loss": 0.0237, + "step": 6369 + }, + { + "epoch": 5.015754233950374, + "grad_norm": 0.47993364930152893, + "learning_rate": 1.9092e-05, + "loss": 0.0257, + "step": 6370 + }, + { + "epoch": 5.016541945647893, + "grad_norm": 0.515917181968689, + "learning_rate": 1.9095e-05, + "loss": 0.0268, + "step": 6371 + }, + { + "epoch": 5.017329657345411, + "grad_norm": 0.5790119767189026, + "learning_rate": 1.9098000000000002e-05, + "loss": 0.026, + "step": 6372 + }, + { + "epoch": 5.018117369042931, + "grad_norm": 0.4332146942615509, + "learning_rate": 1.9101000000000002e-05, + "loss": 0.0235, + "step": 6373 + }, + { + "epoch": 5.018905080740449, + "grad_norm": 0.5305008888244629, + "learning_rate": 1.9104000000000002e-05, + "loss": 0.035, + "step": 6374 + }, + { + "epoch": 5.019692792437968, + "grad_norm": 0.6045200824737549, + "learning_rate": 1.9107000000000002e-05, + "loss": 0.0222, + "step": 6375 + }, + { + "epoch": 5.020480504135486, + "grad_norm": 0.7450364232063293, + "learning_rate": 1.911e-05, + "loss": 0.0725, + "step": 6376 + }, + { + "epoch": 5.021268215833005, + "grad_norm": 0.4780820906162262, + "learning_rate": 1.9113e-05, + "loss": 0.0223, + "step": 6377 + }, + { + "epoch": 5.022055927530523, + "grad_norm": 0.4023146629333496, + "learning_rate": 1.9116e-05, + "loss": 0.0271, + "step": 6378 + }, + { + "epoch": 5.022843639228043, + "grad_norm": 0.6154473423957825, + "learning_rate": 1.9119e-05, + "loss": 0.0218, + "step": 6379 + }, + { + "epoch": 5.0236313509255615, + "grad_norm": 0.7734260559082031, + "learning_rate": 1.9122e-05, + "loss": 0.0245, + "step": 6380 + }, + { + "epoch": 5.02441906262308, + "grad_norm": 0.5071365833282471, + "learning_rate": 1.9125e-05, + "loss": 0.0311, + "step": 6381 + }, + { + "epoch": 5.025206774320599, + "grad_norm": 0.6351560354232788, + "learning_rate": 1.9128e-05, + "loss": 0.0281, + "step": 6382 + }, + { + "epoch": 5.025994486018117, + "grad_norm": 0.38551589846611023, + "learning_rate": 1.9131e-05, + "loss": 0.0176, + "step": 6383 + }, + { + "epoch": 5.026782197715636, + "grad_norm": 0.34784165024757385, + "learning_rate": 1.9134e-05, + "loss": 0.0261, + "step": 6384 + }, + { + "epoch": 5.027569909413155, + "grad_norm": 0.47193610668182373, + "learning_rate": 1.9137e-05, + "loss": 0.0212, + "step": 6385 + }, + { + "epoch": 5.028357621110674, + "grad_norm": 0.5202264785766602, + "learning_rate": 1.914e-05, + "loss": 0.0252, + "step": 6386 + }, + { + "epoch": 5.029145332808192, + "grad_norm": 0.45397159457206726, + "learning_rate": 1.9143e-05, + 
"loss": 0.0193, + "step": 6387 + }, + { + "epoch": 5.029933044505711, + "grad_norm": 0.5807051658630371, + "learning_rate": 1.9146e-05, + "loss": 0.0188, + "step": 6388 + }, + { + "epoch": 5.0307207562032294, + "grad_norm": 0.3302978277206421, + "learning_rate": 1.9149e-05, + "loss": 0.0275, + "step": 6389 + }, + { + "epoch": 5.031508467900748, + "grad_norm": 0.5147168636322021, + "learning_rate": 1.9152e-05, + "loss": 0.0171, + "step": 6390 + }, + { + "epoch": 5.0322961795982675, + "grad_norm": 0.5396860241889954, + "learning_rate": 1.9155e-05, + "loss": 0.017, + "step": 6391 + }, + { + "epoch": 5.033083891295786, + "grad_norm": 0.4392514228820801, + "learning_rate": 1.9158e-05, + "loss": 0.0222, + "step": 6392 + }, + { + "epoch": 5.033871602993305, + "grad_norm": 0.4411775469779968, + "learning_rate": 1.9161000000000002e-05, + "loss": 0.0189, + "step": 6393 + }, + { + "epoch": 5.034659314690823, + "grad_norm": 0.651892364025116, + "learning_rate": 1.9164e-05, + "loss": 0.0308, + "step": 6394 + }, + { + "epoch": 5.035447026388342, + "grad_norm": 0.5669786930084229, + "learning_rate": 1.9167e-05, + "loss": 0.0327, + "step": 6395 + }, + { + "epoch": 5.03623473808586, + "grad_norm": 0.4451357126235962, + "learning_rate": 1.917e-05, + "loss": 0.0278, + "step": 6396 + }, + { + "epoch": 5.037022449783379, + "grad_norm": 0.5933238863945007, + "learning_rate": 1.9173e-05, + "loss": 0.024, + "step": 6397 + }, + { + "epoch": 5.037810161480898, + "grad_norm": 0.6212496161460876, + "learning_rate": 1.9176e-05, + "loss": 0.0424, + "step": 6398 + }, + { + "epoch": 5.038597873178417, + "grad_norm": 0.5432877540588379, + "learning_rate": 1.9179e-05, + "loss": 0.027, + "step": 6399 + }, + { + "epoch": 5.0393855848759355, + "grad_norm": 2.7797012329101562, + "learning_rate": 1.9182e-05, + "loss": 0.0421, + "step": 6400 + }, + { + "epoch": 5.040173296573454, + "grad_norm": 1.696364402770996, + "learning_rate": 1.9185e-05, + "loss": 0.3442, + "step": 6401 + }, + { + "epoch": 5.040961008270973, + "grad_norm": 0.9662817716598511, + "learning_rate": 1.9188e-05, + "loss": 0.1701, + "step": 6402 + }, + { + "epoch": 5.041748719968491, + "grad_norm": 0.7712262272834778, + "learning_rate": 1.9191000000000003e-05, + "loss": 0.1675, + "step": 6403 + }, + { + "epoch": 5.042536431666011, + "grad_norm": 1.134250521659851, + "learning_rate": 1.9194000000000003e-05, + "loss": 0.1551, + "step": 6404 + }, + { + "epoch": 5.043324143363529, + "grad_norm": 0.5955461859703064, + "learning_rate": 1.9197000000000003e-05, + "loss": 0.1005, + "step": 6405 + }, + { + "epoch": 5.044111855061048, + "grad_norm": 0.41494229435920715, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0407, + "step": 6406 + }, + { + "epoch": 5.044899566758566, + "grad_norm": 0.3972199857234955, + "learning_rate": 1.9203e-05, + "loss": 0.0341, + "step": 6407 + }, + { + "epoch": 5.045687278456085, + "grad_norm": 0.5160945057868958, + "learning_rate": 1.9206e-05, + "loss": 0.0406, + "step": 6408 + }, + { + "epoch": 5.0464749901536035, + "grad_norm": 0.4194311499595642, + "learning_rate": 1.9209e-05, + "loss": 0.0333, + "step": 6409 + }, + { + "epoch": 5.047262701851123, + "grad_norm": 0.3676459491252899, + "learning_rate": 1.9212e-05, + "loss": 0.0295, + "step": 6410 + }, + { + "epoch": 5.0480504135486415, + "grad_norm": 0.3400542140007019, + "learning_rate": 1.9214999999999998e-05, + "loss": 0.023, + "step": 6411 + }, + { + "epoch": 5.04883812524616, + "grad_norm": 0.4174777567386627, + "learning_rate": 1.9217999999999998e-05, + "loss": 0.0241, + 
"step": 6412 + }, + { + "epoch": 5.049625836943679, + "grad_norm": 1.519208312034607, + "learning_rate": 1.9221e-05, + "loss": 0.0511, + "step": 6413 + }, + { + "epoch": 5.050413548641197, + "grad_norm": 0.22019913792610168, + "learning_rate": 1.9224e-05, + "loss": 0.0201, + "step": 6414 + }, + { + "epoch": 5.051201260338716, + "grad_norm": 0.3855326771736145, + "learning_rate": 1.9227e-05, + "loss": 0.023, + "step": 6415 + }, + { + "epoch": 5.051988972036234, + "grad_norm": 0.396984338760376, + "learning_rate": 1.923e-05, + "loss": 0.0212, + "step": 6416 + }, + { + "epoch": 5.052776683733754, + "grad_norm": 0.34099987149238586, + "learning_rate": 1.9233e-05, + "loss": 0.0185, + "step": 6417 + }, + { + "epoch": 5.053564395431272, + "grad_norm": 0.28897035121917725, + "learning_rate": 1.9236e-05, + "loss": 0.0122, + "step": 6418 + }, + { + "epoch": 5.054352107128791, + "grad_norm": 0.9169447422027588, + "learning_rate": 1.9239e-05, + "loss": 0.0274, + "step": 6419 + }, + { + "epoch": 5.0551398188263095, + "grad_norm": 0.25046423077583313, + "learning_rate": 1.9242e-05, + "loss": 0.0121, + "step": 6420 + }, + { + "epoch": 5.055927530523828, + "grad_norm": 0.7637526392936707, + "learning_rate": 1.9245e-05, + "loss": 0.0199, + "step": 6421 + }, + { + "epoch": 5.056715242221347, + "grad_norm": 0.5059244632720947, + "learning_rate": 1.9248e-05, + "loss": 0.0371, + "step": 6422 + }, + { + "epoch": 5.057502953918866, + "grad_norm": 0.49506625533103943, + "learning_rate": 1.9251000000000003e-05, + "loss": 0.0296, + "step": 6423 + }, + { + "epoch": 5.058290665616385, + "grad_norm": 0.5470033288002014, + "learning_rate": 1.9254000000000002e-05, + "loss": 0.0273, + "step": 6424 + }, + { + "epoch": 5.059078377313903, + "grad_norm": 0.30358046293258667, + "learning_rate": 1.9257000000000002e-05, + "loss": 0.0214, + "step": 6425 + }, + { + "epoch": 5.059866089011422, + "grad_norm": 0.5493741035461426, + "learning_rate": 1.9260000000000002e-05, + "loss": 0.0133, + "step": 6426 + }, + { + "epoch": 5.06065380070894, + "grad_norm": 0.6113781332969666, + "learning_rate": 1.9263000000000002e-05, + "loss": 0.0199, + "step": 6427 + }, + { + "epoch": 5.061441512406459, + "grad_norm": 0.5029371976852417, + "learning_rate": 1.9266e-05, + "loss": 0.0137, + "step": 6428 + }, + { + "epoch": 5.062229224103978, + "grad_norm": 0.43905141949653625, + "learning_rate": 1.9269e-05, + "loss": 0.0241, + "step": 6429 + }, + { + "epoch": 5.063016935801497, + "grad_norm": 0.5146620869636536, + "learning_rate": 1.9272e-05, + "loss": 0.0197, + "step": 6430 + }, + { + "epoch": 5.0638046474990155, + "grad_norm": 0.31609466671943665, + "learning_rate": 1.9275e-05, + "loss": 0.0232, + "step": 6431 + }, + { + "epoch": 5.064592359196534, + "grad_norm": 0.2784948945045471, + "learning_rate": 1.9277999999999997e-05, + "loss": 0.0143, + "step": 6432 + }, + { + "epoch": 5.065380070894053, + "grad_norm": 0.40016812086105347, + "learning_rate": 1.9281e-05, + "loss": 0.0202, + "step": 6433 + }, + { + "epoch": 5.066167782591571, + "grad_norm": 0.7947181463241577, + "learning_rate": 1.9284e-05, + "loss": 0.0228, + "step": 6434 + }, + { + "epoch": 5.06695549428909, + "grad_norm": 0.6452949643135071, + "learning_rate": 1.9287e-05, + "loss": 0.0261, + "step": 6435 + }, + { + "epoch": 5.067743205986609, + "grad_norm": 1.7395093441009521, + "learning_rate": 1.929e-05, + "loss": 0.0213, + "step": 6436 + }, + { + "epoch": 5.068530917684128, + "grad_norm": 0.5589245557785034, + "learning_rate": 1.9293e-05, + "loss": 0.0222, + "step": 6437 + }, + { + 
"epoch": 5.069318629381646, + "grad_norm": 0.487759530544281, + "learning_rate": 1.9296e-05, + "loss": 0.0135, + "step": 6438 + }, + { + "epoch": 5.070106341079165, + "grad_norm": 0.6985639333724976, + "learning_rate": 1.9299e-05, + "loss": 0.0239, + "step": 6439 + }, + { + "epoch": 5.0708940527766835, + "grad_norm": 0.5826373100280762, + "learning_rate": 1.9302e-05, + "loss": 0.0283, + "step": 6440 + }, + { + "epoch": 5.071681764474202, + "grad_norm": 0.5492955446243286, + "learning_rate": 1.9305e-05, + "loss": 0.0261, + "step": 6441 + }, + { + "epoch": 5.0724694761717215, + "grad_norm": 0.6678163409233093, + "learning_rate": 1.9308e-05, + "loss": 0.0527, + "step": 6442 + }, + { + "epoch": 5.07325718786924, + "grad_norm": 0.43667086958885193, + "learning_rate": 1.9311000000000002e-05, + "loss": 0.028, + "step": 6443 + }, + { + "epoch": 5.074044899566759, + "grad_norm": 0.41527074575424194, + "learning_rate": 1.9314000000000002e-05, + "loss": 0.0259, + "step": 6444 + }, + { + "epoch": 5.074832611264277, + "grad_norm": 0.6604272723197937, + "learning_rate": 1.9317e-05, + "loss": 0.0212, + "step": 6445 + }, + { + "epoch": 5.075620322961796, + "grad_norm": 0.4491996169090271, + "learning_rate": 1.932e-05, + "loss": 0.0214, + "step": 6446 + }, + { + "epoch": 5.076408034659314, + "grad_norm": 0.5304321050643921, + "learning_rate": 1.9323e-05, + "loss": 0.0229, + "step": 6447 + }, + { + "epoch": 5.077195746356834, + "grad_norm": 0.6506685018539429, + "learning_rate": 1.9326e-05, + "loss": 0.0277, + "step": 6448 + }, + { + "epoch": 5.077983458054352, + "grad_norm": 0.480733186006546, + "learning_rate": 1.9329e-05, + "loss": 0.038, + "step": 6449 + }, + { + "epoch": 5.078771169751871, + "grad_norm": 0.678496778011322, + "learning_rate": 1.9332e-05, + "loss": 0.0457, + "step": 6450 + }, + { + "epoch": 5.0795588814493895, + "grad_norm": 1.3677153587341309, + "learning_rate": 1.9335e-05, + "loss": 0.2858, + "step": 6451 + }, + { + "epoch": 5.080346593146908, + "grad_norm": 0.8318579196929932, + "learning_rate": 1.9338e-05, + "loss": 0.2106, + "step": 6452 + }, + { + "epoch": 5.081134304844427, + "grad_norm": 0.785123884677887, + "learning_rate": 1.9341000000000003e-05, + "loss": 0.2305, + "step": 6453 + }, + { + "epoch": 5.081922016541945, + "grad_norm": 0.9116855263710022, + "learning_rate": 1.9344000000000003e-05, + "loss": 0.1999, + "step": 6454 + }, + { + "epoch": 5.082709728239465, + "grad_norm": 0.8889244198799133, + "learning_rate": 1.9347000000000003e-05, + "loss": 0.1478, + "step": 6455 + }, + { + "epoch": 5.083497439936983, + "grad_norm": 0.5734280943870544, + "learning_rate": 1.935e-05, + "loss": 0.1039, + "step": 6456 + }, + { + "epoch": 5.084285151634502, + "grad_norm": 0.7855738401412964, + "learning_rate": 1.9353e-05, + "loss": 0.0527, + "step": 6457 + }, + { + "epoch": 5.08507286333202, + "grad_norm": 0.5848625898361206, + "learning_rate": 1.9356e-05, + "loss": 0.0501, + "step": 6458 + }, + { + "epoch": 5.085860575029539, + "grad_norm": 0.4225146472454071, + "learning_rate": 1.9359e-05, + "loss": 0.0286, + "step": 6459 + }, + { + "epoch": 5.0866482867270575, + "grad_norm": 0.36082547903060913, + "learning_rate": 1.9362e-05, + "loss": 0.029, + "step": 6460 + }, + { + "epoch": 5.087435998424577, + "grad_norm": 0.4459780752658844, + "learning_rate": 1.9365e-05, + "loss": 0.0333, + "step": 6461 + }, + { + "epoch": 5.0882237101220955, + "grad_norm": 0.4854123294353485, + "learning_rate": 1.9367999999999998e-05, + "loss": 0.0326, + "step": 6462 + }, + { + "epoch": 5.089011421819614, + 
"grad_norm": 0.5053470730781555, + "learning_rate": 1.9371e-05, + "loss": 0.0349, + "step": 6463 + }, + { + "epoch": 5.089799133517133, + "grad_norm": 0.7351279258728027, + "learning_rate": 1.9374e-05, + "loss": 0.0407, + "step": 6464 + }, + { + "epoch": 5.090586845214651, + "grad_norm": 0.3382734954357147, + "learning_rate": 1.9377e-05, + "loss": 0.0211, + "step": 6465 + }, + { + "epoch": 5.09137455691217, + "grad_norm": 0.4439674913883209, + "learning_rate": 1.938e-05, + "loss": 0.0302, + "step": 6466 + }, + { + "epoch": 5.092162268609689, + "grad_norm": 0.386506587266922, + "learning_rate": 1.9383e-05, + "loss": 0.0215, + "step": 6467 + }, + { + "epoch": 5.092949980307208, + "grad_norm": 0.43422892689704895, + "learning_rate": 1.9386e-05, + "loss": 0.0288, + "step": 6468 + }, + { + "epoch": 5.093737692004726, + "grad_norm": 0.43394583463668823, + "learning_rate": 1.9389e-05, + "loss": 0.0255, + "step": 6469 + }, + { + "epoch": 5.094525403702245, + "grad_norm": 0.5181490182876587, + "learning_rate": 1.9392e-05, + "loss": 0.0181, + "step": 6470 + }, + { + "epoch": 5.0953131153997635, + "grad_norm": 0.5091977715492249, + "learning_rate": 1.9395e-05, + "loss": 0.029, + "step": 6471 + }, + { + "epoch": 5.096100827097282, + "grad_norm": 0.5050637125968933, + "learning_rate": 1.9398e-05, + "loss": 0.0248, + "step": 6472 + }, + { + "epoch": 5.0968885387948015, + "grad_norm": 0.30751940608024597, + "learning_rate": 1.9401000000000003e-05, + "loss": 0.0232, + "step": 6473 + }, + { + "epoch": 5.09767625049232, + "grad_norm": 0.4170430600643158, + "learning_rate": 1.9404000000000003e-05, + "loss": 0.0236, + "step": 6474 + }, + { + "epoch": 5.098463962189839, + "grad_norm": 0.5938129425048828, + "learning_rate": 1.9407000000000002e-05, + "loss": 0.0183, + "step": 6475 + }, + { + "epoch": 5.099251673887357, + "grad_norm": 0.49277397990226746, + "learning_rate": 1.9410000000000002e-05, + "loss": 0.0249, + "step": 6476 + }, + { + "epoch": 5.100039385584876, + "grad_norm": 0.7886565327644348, + "learning_rate": 1.9413000000000002e-05, + "loss": 0.0345, + "step": 6477 + }, + { + "epoch": 5.100827097282394, + "grad_norm": 0.4907863438129425, + "learning_rate": 1.9416000000000002e-05, + "loss": 0.0201, + "step": 6478 + }, + { + "epoch": 5.101614808979913, + "grad_norm": 0.5672430992126465, + "learning_rate": 1.9419e-05, + "loss": 0.0205, + "step": 6479 + }, + { + "epoch": 5.102402520677432, + "grad_norm": 1.886879563331604, + "learning_rate": 1.9422e-05, + "loss": 0.0186, + "step": 6480 + }, + { + "epoch": 5.103190232374951, + "grad_norm": 0.41975638270378113, + "learning_rate": 1.9424999999999998e-05, + "loss": 0.0262, + "step": 6481 + }, + { + "epoch": 5.1039779440724695, + "grad_norm": 0.64447021484375, + "learning_rate": 1.9427999999999998e-05, + "loss": 0.0267, + "step": 6482 + }, + { + "epoch": 5.104765655769988, + "grad_norm": 0.4689834415912628, + "learning_rate": 1.9431e-05, + "loss": 0.0269, + "step": 6483 + }, + { + "epoch": 5.105553367467507, + "grad_norm": 0.6759003400802612, + "learning_rate": 1.9434e-05, + "loss": 0.0406, + "step": 6484 + }, + { + "epoch": 5.106341079165025, + "grad_norm": 0.45199698209762573, + "learning_rate": 1.9437e-05, + "loss": 0.0237, + "step": 6485 + }, + { + "epoch": 5.107128790862545, + "grad_norm": 0.5652632117271423, + "learning_rate": 1.944e-05, + "loss": 0.0206, + "step": 6486 + }, + { + "epoch": 5.107916502560063, + "grad_norm": 0.47344475984573364, + "learning_rate": 1.9443e-05, + "loss": 0.0243, + "step": 6487 + }, + { + "epoch": 5.108704214257582, + 
"grad_norm": 0.528028666973114, + "learning_rate": 1.9446e-05, + "loss": 0.0185, + "step": 6488 + }, + { + "epoch": 5.1094919259551, + "grad_norm": 0.5754517316818237, + "learning_rate": 1.9449e-05, + "loss": 0.0369, + "step": 6489 + }, + { + "epoch": 5.110279637652619, + "grad_norm": 0.7129673957824707, + "learning_rate": 1.9452e-05, + "loss": 0.0377, + "step": 6490 + }, + { + "epoch": 5.1110673493501375, + "grad_norm": 0.5907636880874634, + "learning_rate": 1.9455e-05, + "loss": 0.0219, + "step": 6491 + }, + { + "epoch": 5.111855061047657, + "grad_norm": 0.4087851345539093, + "learning_rate": 1.9458e-05, + "loss": 0.0293, + "step": 6492 + }, + { + "epoch": 5.1126427727451755, + "grad_norm": 0.3692830801010132, + "learning_rate": 1.9461000000000002e-05, + "loss": 0.0228, + "step": 6493 + }, + { + "epoch": 5.113430484442694, + "grad_norm": 0.6251417398452759, + "learning_rate": 1.9464000000000002e-05, + "loss": 0.0249, + "step": 6494 + }, + { + "epoch": 5.114218196140213, + "grad_norm": 0.5116733908653259, + "learning_rate": 1.9467000000000002e-05, + "loss": 0.0269, + "step": 6495 + }, + { + "epoch": 5.115005907837731, + "grad_norm": 0.5197828412055969, + "learning_rate": 1.947e-05, + "loss": 0.0397, + "step": 6496 + }, + { + "epoch": 5.11579361953525, + "grad_norm": 0.6025605797767639, + "learning_rate": 1.9473e-05, + "loss": 0.0244, + "step": 6497 + }, + { + "epoch": 5.116581331232768, + "grad_norm": 0.5635461211204529, + "learning_rate": 1.9476e-05, + "loss": 0.0375, + "step": 6498 + }, + { + "epoch": 5.117369042930288, + "grad_norm": 0.8844696283340454, + "learning_rate": 1.9479e-05, + "loss": 0.0284, + "step": 6499 + }, + { + "epoch": 5.118156754627806, + "grad_norm": 0.44176825881004333, + "learning_rate": 1.9482e-05, + "loss": 0.0323, + "step": 6500 + }, + { + "epoch": 5.118944466325325, + "grad_norm": 0.9646864533424377, + "learning_rate": 1.9485e-05, + "loss": 0.327, + "step": 6501 + }, + { + "epoch": 5.1197321780228435, + "grad_norm": 0.7977321147918701, + "learning_rate": 1.9488e-05, + "loss": 0.1928, + "step": 6502 + }, + { + "epoch": 5.120519889720362, + "grad_norm": 0.6503126621246338, + "learning_rate": 1.9491000000000004e-05, + "loss": 0.1714, + "step": 6503 + }, + { + "epoch": 5.121307601417881, + "grad_norm": 0.6960678100585938, + "learning_rate": 1.9494000000000003e-05, + "loss": 0.1264, + "step": 6504 + }, + { + "epoch": 5.1220953131154, + "grad_norm": 6.677678108215332, + "learning_rate": 1.9497e-05, + "loss": 0.0874, + "step": 6505 + }, + { + "epoch": 5.122883024812919, + "grad_norm": 0.34866949915885925, + "learning_rate": 1.95e-05, + "loss": 0.0744, + "step": 6506 + }, + { + "epoch": 5.123670736510437, + "grad_norm": 0.6410354375839233, + "learning_rate": 1.9503e-05, + "loss": 0.087, + "step": 6507 + }, + { + "epoch": 5.124458448207956, + "grad_norm": 0.4254361689090729, + "learning_rate": 1.9506e-05, + "loss": 0.0318, + "step": 6508 + }, + { + "epoch": 5.125246159905474, + "grad_norm": 0.5346801280975342, + "learning_rate": 1.9509e-05, + "loss": 0.0228, + "step": 6509 + }, + { + "epoch": 5.126033871602993, + "grad_norm": 0.421388179063797, + "learning_rate": 1.9512e-05, + "loss": 0.0337, + "step": 6510 + }, + { + "epoch": 5.126821583300512, + "grad_norm": 0.36113640666007996, + "learning_rate": 1.9515e-05, + "loss": 0.0443, + "step": 6511 + }, + { + "epoch": 5.127609294998031, + "grad_norm": 0.3706303834915161, + "learning_rate": 1.9518e-05, + "loss": 0.0311, + "step": 6512 + }, + { + "epoch": 5.1283970066955495, + "grad_norm": 0.3215770721435547, + 
"learning_rate": 1.9520999999999998e-05, + "loss": 0.0265, + "step": 6513 + }, + { + "epoch": 5.129184718393068, + "grad_norm": 0.31250137090682983, + "learning_rate": 1.9524e-05, + "loss": 0.0217, + "step": 6514 + }, + { + "epoch": 5.129972430090587, + "grad_norm": 0.38260772824287415, + "learning_rate": 1.9527e-05, + "loss": 0.0246, + "step": 6515 + }, + { + "epoch": 5.130760141788105, + "grad_norm": 0.31939059495925903, + "learning_rate": 1.953e-05, + "loss": 0.0317, + "step": 6516 + }, + { + "epoch": 5.131547853485625, + "grad_norm": 0.41069090366363525, + "learning_rate": 1.9533e-05, + "loss": 0.0238, + "step": 6517 + }, + { + "epoch": 5.132335565183143, + "grad_norm": 0.23604273796081543, + "learning_rate": 1.9536e-05, + "loss": 0.0153, + "step": 6518 + }, + { + "epoch": 5.133123276880662, + "grad_norm": 0.20935404300689697, + "learning_rate": 1.9539e-05, + "loss": 0.0186, + "step": 6519 + }, + { + "epoch": 5.13391098857818, + "grad_norm": 0.17846930027008057, + "learning_rate": 1.9542e-05, + "loss": 0.0083, + "step": 6520 + }, + { + "epoch": 5.134698700275699, + "grad_norm": 0.4018693268299103, + "learning_rate": 1.9545e-05, + "loss": 0.0164, + "step": 6521 + }, + { + "epoch": 5.1354864119732175, + "grad_norm": 0.3130512237548828, + "learning_rate": 1.9548e-05, + "loss": 0.0179, + "step": 6522 + }, + { + "epoch": 5.136274123670736, + "grad_norm": 0.6380085349082947, + "learning_rate": 1.9551e-05, + "loss": 0.0243, + "step": 6523 + }, + { + "epoch": 5.137061835368256, + "grad_norm": 0.21846356987953186, + "learning_rate": 1.9554000000000003e-05, + "loss": 0.0136, + "step": 6524 + }, + { + "epoch": 5.137849547065774, + "grad_norm": 0.3173801004886627, + "learning_rate": 1.9557000000000003e-05, + "loss": 0.014, + "step": 6525 + }, + { + "epoch": 5.138637258763293, + "grad_norm": 0.3954959511756897, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.0147, + "step": 6526 + }, + { + "epoch": 5.139424970460811, + "grad_norm": 1.1338170766830444, + "learning_rate": 1.9563000000000002e-05, + "loss": 0.0138, + "step": 6527 + }, + { + "epoch": 5.14021268215833, + "grad_norm": 0.5114110112190247, + "learning_rate": 1.9566000000000002e-05, + "loss": 0.0281, + "step": 6528 + }, + { + "epoch": 5.141000393855848, + "grad_norm": 0.4906635284423828, + "learning_rate": 1.9569000000000002e-05, + "loss": 0.0299, + "step": 6529 + }, + { + "epoch": 5.141788105553368, + "grad_norm": 0.3463699221611023, + "learning_rate": 1.9571999999999998e-05, + "loss": 0.0258, + "step": 6530 + }, + { + "epoch": 5.142575817250886, + "grad_norm": 0.6276180148124695, + "learning_rate": 1.9574999999999998e-05, + "loss": 0.0265, + "step": 6531 + }, + { + "epoch": 5.143363528948405, + "grad_norm": 0.397575318813324, + "learning_rate": 1.9577999999999998e-05, + "loss": 0.015, + "step": 6532 + }, + { + "epoch": 5.1441512406459236, + "grad_norm": 0.4734208583831787, + "learning_rate": 1.9580999999999998e-05, + "loss": 0.0354, + "step": 6533 + }, + { + "epoch": 5.144938952343442, + "grad_norm": 0.6222858428955078, + "learning_rate": 1.9584e-05, + "loss": 0.0304, + "step": 6534 + }, + { + "epoch": 5.145726664040961, + "grad_norm": 0.32856670022010803, + "learning_rate": 1.9587e-05, + "loss": 0.0197, + "step": 6535 + }, + { + "epoch": 5.14651437573848, + "grad_norm": 0.6118961572647095, + "learning_rate": 1.959e-05, + "loss": 0.0309, + "step": 6536 + }, + { + "epoch": 5.147302087435999, + "grad_norm": 0.39173272252082825, + "learning_rate": 1.9593e-05, + "loss": 0.0149, + "step": 6537 + }, + { + "epoch": 5.148089799133517, + 
"grad_norm": 0.522091269493103, + "learning_rate": 1.9596e-05, + "loss": 0.0197, + "step": 6538 + }, + { + "epoch": 5.148877510831036, + "grad_norm": 0.6092731952667236, + "learning_rate": 1.9599e-05, + "loss": 0.023, + "step": 6539 + }, + { + "epoch": 5.149665222528554, + "grad_norm": 0.43604913353919983, + "learning_rate": 1.9602e-05, + "loss": 0.0256, + "step": 6540 + }, + { + "epoch": 5.150452934226073, + "grad_norm": 0.31468290090560913, + "learning_rate": 1.9605e-05, + "loss": 0.0195, + "step": 6541 + }, + { + "epoch": 5.1512406459235915, + "grad_norm": 0.321628600358963, + "learning_rate": 1.9608e-05, + "loss": 0.0175, + "step": 6542 + }, + { + "epoch": 5.152028357621111, + "grad_norm": 0.7361129522323608, + "learning_rate": 1.9611e-05, + "loss": 0.0368, + "step": 6543 + }, + { + "epoch": 5.15281606931863, + "grad_norm": 0.6251514554023743, + "learning_rate": 1.9614000000000002e-05, + "loss": 0.0294, + "step": 6544 + }, + { + "epoch": 5.153603781016148, + "grad_norm": 0.42691609263420105, + "learning_rate": 1.9617000000000002e-05, + "loss": 0.0214, + "step": 6545 + }, + { + "epoch": 5.154391492713667, + "grad_norm": 1.2720496654510498, + "learning_rate": 1.9620000000000002e-05, + "loss": 0.0209, + "step": 6546 + }, + { + "epoch": 5.155179204411185, + "grad_norm": 0.6556458473205566, + "learning_rate": 1.9623e-05, + "loss": 0.0199, + "step": 6547 + }, + { + "epoch": 5.155966916108704, + "grad_norm": 0.5219025015830994, + "learning_rate": 1.9626e-05, + "loss": 0.0273, + "step": 6548 + }, + { + "epoch": 5.156754627806223, + "grad_norm": 0.569627583026886, + "learning_rate": 1.9629e-05, + "loss": 0.0347, + "step": 6549 + }, + { + "epoch": 5.157542339503742, + "grad_norm": 0.9691979885101318, + "learning_rate": 1.9632e-05, + "loss": 0.0251, + "step": 6550 + }, + { + "epoch": 5.15833005120126, + "grad_norm": 1.2404744625091553, + "learning_rate": 1.9635e-05, + "loss": 0.2917, + "step": 6551 + }, + { + "epoch": 5.159117762898779, + "grad_norm": 0.7211242318153381, + "learning_rate": 1.9638e-05, + "loss": 0.1666, + "step": 6552 + }, + { + "epoch": 5.159905474596298, + "grad_norm": 0.6613093018531799, + "learning_rate": 1.9641e-05, + "loss": 0.1758, + "step": 6553 + }, + { + "epoch": 5.160693186293816, + "grad_norm": 2.038548707962036, + "learning_rate": 1.9644e-05, + "loss": 0.2005, + "step": 6554 + }, + { + "epoch": 5.161480897991336, + "grad_norm": 0.669731855392456, + "learning_rate": 1.9647e-05, + "loss": 0.1452, + "step": 6555 + }, + { + "epoch": 5.162268609688854, + "grad_norm": 0.6466328501701355, + "learning_rate": 1.965e-05, + "loss": 0.0756, + "step": 6556 + }, + { + "epoch": 5.163056321386373, + "grad_norm": 0.45868438482284546, + "learning_rate": 1.9653e-05, + "loss": 0.047, + "step": 6557 + }, + { + "epoch": 5.163844033083891, + "grad_norm": 0.4482192397117615, + "learning_rate": 1.9656e-05, + "loss": 0.0394, + "step": 6558 + }, + { + "epoch": 5.16463174478141, + "grad_norm": 0.7641374468803406, + "learning_rate": 1.9659e-05, + "loss": 0.0712, + "step": 6559 + }, + { + "epoch": 5.165419456478928, + "grad_norm": 0.42385512590408325, + "learning_rate": 1.9662e-05, + "loss": 0.023, + "step": 6560 + }, + { + "epoch": 5.166207168176447, + "grad_norm": 0.31262412667274475, + "learning_rate": 1.9665e-05, + "loss": 0.0255, + "step": 6561 + }, + { + "epoch": 5.166994879873966, + "grad_norm": 0.2968474328517914, + "learning_rate": 1.9668e-05, + "loss": 0.0245, + "step": 6562 + }, + { + "epoch": 5.167782591571485, + "grad_norm": 0.7126638293266296, + "learning_rate": 1.9671e-05, + "loss": 
0.0218, + "step": 6563 + }, + { + "epoch": 5.168570303269004, + "grad_norm": 0.37515002489089966, + "learning_rate": 1.9674000000000002e-05, + "loss": 0.0234, + "step": 6564 + }, + { + "epoch": 5.169358014966522, + "grad_norm": 0.4726357161998749, + "learning_rate": 1.9677e-05, + "loss": 0.0369, + "step": 6565 + }, + { + "epoch": 5.170145726664041, + "grad_norm": 0.5647336840629578, + "learning_rate": 1.968e-05, + "loss": 0.0216, + "step": 6566 + }, + { + "epoch": 5.170933438361559, + "grad_norm": 0.5069419145584106, + "learning_rate": 1.9683e-05, + "loss": 0.041, + "step": 6567 + }, + { + "epoch": 5.171721150059079, + "grad_norm": 0.23687505722045898, + "learning_rate": 1.9686e-05, + "loss": 0.0123, + "step": 6568 + }, + { + "epoch": 5.172508861756597, + "grad_norm": 0.17503157258033752, + "learning_rate": 1.9689e-05, + "loss": 0.0102, + "step": 6569 + }, + { + "epoch": 5.173296573454116, + "grad_norm": 0.6155155897140503, + "learning_rate": 1.9692e-05, + "loss": 0.0257, + "step": 6570 + }, + { + "epoch": 5.174084285151634, + "grad_norm": 0.6955252885818481, + "learning_rate": 1.9695e-05, + "loss": 0.0354, + "step": 6571 + }, + { + "epoch": 5.174871996849153, + "grad_norm": 0.5285279154777527, + "learning_rate": 1.9698e-05, + "loss": 0.0267, + "step": 6572 + }, + { + "epoch": 5.175659708546672, + "grad_norm": 0.49389857053756714, + "learning_rate": 1.9701e-05, + "loss": 0.0249, + "step": 6573 + }, + { + "epoch": 5.176447420244191, + "grad_norm": 0.7455452084541321, + "learning_rate": 1.9704000000000003e-05, + "loss": 0.0289, + "step": 6574 + }, + { + "epoch": 5.17723513194171, + "grad_norm": 0.3355548679828644, + "learning_rate": 1.9707000000000003e-05, + "loss": 0.0213, + "step": 6575 + }, + { + "epoch": 5.178022843639228, + "grad_norm": 0.4199022650718689, + "learning_rate": 1.9710000000000003e-05, + "loss": 0.0209, + "step": 6576 + }, + { + "epoch": 5.178810555336747, + "grad_norm": 0.2591583728790283, + "learning_rate": 1.9713000000000003e-05, + "loss": 0.0135, + "step": 6577 + }, + { + "epoch": 5.179598267034265, + "grad_norm": 0.4960693120956421, + "learning_rate": 1.9716000000000002e-05, + "loss": 0.027, + "step": 6578 + }, + { + "epoch": 5.180385978731784, + "grad_norm": 0.5364500284194946, + "learning_rate": 1.9719e-05, + "loss": 0.0329, + "step": 6579 + }, + { + "epoch": 5.181173690429302, + "grad_norm": 0.3068733513355255, + "learning_rate": 1.9722e-05, + "loss": 0.0188, + "step": 6580 + }, + { + "epoch": 5.181961402126822, + "grad_norm": 0.45265069603919983, + "learning_rate": 1.9725e-05, + "loss": 0.0177, + "step": 6581 + }, + { + "epoch": 5.1827491138243404, + "grad_norm": 0.9982959628105164, + "learning_rate": 1.9727999999999998e-05, + "loss": 0.0388, + "step": 6582 + }, + { + "epoch": 5.183536825521859, + "grad_norm": 0.42076072096824646, + "learning_rate": 1.9730999999999998e-05, + "loss": 0.0192, + "step": 6583 + }, + { + "epoch": 5.184324537219378, + "grad_norm": 0.6034077405929565, + "learning_rate": 1.9734e-05, + "loss": 0.0377, + "step": 6584 + }, + { + "epoch": 5.185112248916896, + "grad_norm": 0.28859472274780273, + "learning_rate": 1.9737e-05, + "loss": 0.0171, + "step": 6585 + }, + { + "epoch": 5.185899960614415, + "grad_norm": 0.66802579164505, + "learning_rate": 1.974e-05, + "loss": 0.0416, + "step": 6586 + }, + { + "epoch": 5.186687672311934, + "grad_norm": 0.7261447906494141, + "learning_rate": 1.9743e-05, + "loss": 0.0373, + "step": 6587 + }, + { + "epoch": 5.187475384009453, + "grad_norm": 0.7395941019058228, + "learning_rate": 1.9746e-05, + "loss": 0.0381, 
+ "step": 6588 + }, + { + "epoch": 5.188263095706971, + "grad_norm": 0.40663641691207886, + "learning_rate": 1.9749e-05, + "loss": 0.0163, + "step": 6589 + }, + { + "epoch": 5.18905080740449, + "grad_norm": 0.4851546883583069, + "learning_rate": 1.9752e-05, + "loss": 0.0183, + "step": 6590 + }, + { + "epoch": 5.189838519102008, + "grad_norm": 0.36447569727897644, + "learning_rate": 1.9755e-05, + "loss": 0.0123, + "step": 6591 + }, + { + "epoch": 5.190626230799527, + "grad_norm": 0.5360107421875, + "learning_rate": 1.9758e-05, + "loss": 0.0204, + "step": 6592 + }, + { + "epoch": 5.1914139424970465, + "grad_norm": 0.40971648693084717, + "learning_rate": 1.9761e-05, + "loss": 0.013, + "step": 6593 + }, + { + "epoch": 5.192201654194565, + "grad_norm": 0.5152776837348938, + "learning_rate": 1.9764000000000003e-05, + "loss": 0.0276, + "step": 6594 + }, + { + "epoch": 5.192989365892084, + "grad_norm": 0.9006367325782776, + "learning_rate": 1.9767000000000002e-05, + "loss": 0.0338, + "step": 6595 + }, + { + "epoch": 5.193777077589602, + "grad_norm": 0.3035190999507904, + "learning_rate": 1.9770000000000002e-05, + "loss": 0.0204, + "step": 6596 + }, + { + "epoch": 5.194564789287121, + "grad_norm": 0.7923668622970581, + "learning_rate": 1.9773000000000002e-05, + "loss": 0.0458, + "step": 6597 + }, + { + "epoch": 5.195352500984639, + "grad_norm": 0.5853260159492493, + "learning_rate": 1.9776000000000002e-05, + "loss": 0.0396, + "step": 6598 + }, + { + "epoch": 5.196140212682159, + "grad_norm": 0.74616539478302, + "learning_rate": 1.9779e-05, + "loss": 0.0399, + "step": 6599 + }, + { + "epoch": 5.196927924379677, + "grad_norm": 0.5773661136627197, + "learning_rate": 1.9782e-05, + "loss": 0.0328, + "step": 6600 + }, + { + "epoch": 5.197715636077196, + "grad_norm": 19.77264404296875, + "learning_rate": 1.9785e-05, + "loss": 0.4293, + "step": 6601 + }, + { + "epoch": 5.1985033477747145, + "grad_norm": 0.9270626902580261, + "learning_rate": 1.9788e-05, + "loss": 0.257, + "step": 6602 + }, + { + "epoch": 5.199291059472233, + "grad_norm": 1.2571176290512085, + "learning_rate": 1.9791e-05, + "loss": 0.2313, + "step": 6603 + }, + { + "epoch": 5.200078771169752, + "grad_norm": 0.5830099582672119, + "learning_rate": 1.9794e-05, + "loss": 0.1403, + "step": 6604 + }, + { + "epoch": 5.20086648286727, + "grad_norm": 0.7502968907356262, + "learning_rate": 1.9797e-05, + "loss": 0.112, + "step": 6605 + }, + { + "epoch": 5.20165419456479, + "grad_norm": 0.942430853843689, + "learning_rate": 1.98e-05, + "loss": 0.0578, + "step": 6606 + }, + { + "epoch": 5.202441906262308, + "grad_norm": 0.5401706695556641, + "learning_rate": 1.9803e-05, + "loss": 0.0482, + "step": 6607 + }, + { + "epoch": 5.203229617959827, + "grad_norm": 0.3875891864299774, + "learning_rate": 1.9806e-05, + "loss": 0.0434, + "step": 6608 + }, + { + "epoch": 5.204017329657345, + "grad_norm": 0.4753909111022949, + "learning_rate": 1.9809e-05, + "loss": 0.0374, + "step": 6609 + }, + { + "epoch": 5.204805041354864, + "grad_norm": 0.2908848226070404, + "learning_rate": 1.9812e-05, + "loss": 0.0187, + "step": 6610 + }, + { + "epoch": 5.2055927530523824, + "grad_norm": 0.7710670828819275, + "learning_rate": 1.9815e-05, + "loss": 0.0383, + "step": 6611 + }, + { + "epoch": 5.206380464749902, + "grad_norm": 0.531575083732605, + "learning_rate": 1.9818e-05, + "loss": 0.0323, + "step": 6612 + }, + { + "epoch": 5.2071681764474205, + "grad_norm": 0.3662647306919098, + "learning_rate": 1.9821e-05, + "loss": 0.0166, + "step": 6613 + }, + { + "epoch": 5.207955888144939, 
+ "grad_norm": 0.5852437019348145, + "learning_rate": 1.9824000000000002e-05, + "loss": 0.0226, + "step": 6614 + }, + { + "epoch": 5.208743599842458, + "grad_norm": 0.25356829166412354, + "learning_rate": 1.9827000000000002e-05, + "loss": 0.0176, + "step": 6615 + }, + { + "epoch": 5.209531311539976, + "grad_norm": 0.41860076785087585, + "learning_rate": 1.983e-05, + "loss": 0.0292, + "step": 6616 + }, + { + "epoch": 5.210319023237495, + "grad_norm": 0.34257325530052185, + "learning_rate": 1.9833e-05, + "loss": 0.0179, + "step": 6617 + }, + { + "epoch": 5.211106734935014, + "grad_norm": 0.34735307097435, + "learning_rate": 1.9836e-05, + "loss": 0.0223, + "step": 6618 + }, + { + "epoch": 5.211894446632533, + "grad_norm": 0.3042084276676178, + "learning_rate": 1.9839e-05, + "loss": 0.0192, + "step": 6619 + }, + { + "epoch": 5.212682158330051, + "grad_norm": 0.4872615337371826, + "learning_rate": 1.9842e-05, + "loss": 0.0205, + "step": 6620 + }, + { + "epoch": 5.21346987002757, + "grad_norm": 0.43416959047317505, + "learning_rate": 1.9845e-05, + "loss": 0.0297, + "step": 6621 + }, + { + "epoch": 5.2142575817250885, + "grad_norm": 0.7774988412857056, + "learning_rate": 1.9848e-05, + "loss": 0.0205, + "step": 6622 + }, + { + "epoch": 5.215045293422607, + "grad_norm": 0.6046254634857178, + "learning_rate": 1.9851e-05, + "loss": 0.0265, + "step": 6623 + }, + { + "epoch": 5.2158330051201265, + "grad_norm": 0.23449143767356873, + "learning_rate": 1.9854000000000003e-05, + "loss": 0.0134, + "step": 6624 + }, + { + "epoch": 5.216620716817645, + "grad_norm": 0.3320689797401428, + "learning_rate": 1.9857000000000003e-05, + "loss": 0.0192, + "step": 6625 + }, + { + "epoch": 5.217408428515164, + "grad_norm": 0.6936871409416199, + "learning_rate": 1.9860000000000003e-05, + "loss": 0.0294, + "step": 6626 + }, + { + "epoch": 5.218196140212682, + "grad_norm": 0.3885018229484558, + "learning_rate": 1.9863000000000003e-05, + "loss": 0.025, + "step": 6627 + }, + { + "epoch": 5.218983851910201, + "grad_norm": 0.3902534544467926, + "learning_rate": 1.9866e-05, + "loss": 0.0159, + "step": 6628 + }, + { + "epoch": 5.219771563607719, + "grad_norm": 0.6178284287452698, + "learning_rate": 1.9869e-05, + "loss": 0.0151, + "step": 6629 + }, + { + "epoch": 5.220559275305238, + "grad_norm": 0.20246589183807373, + "learning_rate": 1.9872e-05, + "loss": 0.0101, + "step": 6630 + }, + { + "epoch": 5.221346987002757, + "grad_norm": 0.42059120535850525, + "learning_rate": 1.9875e-05, + "loss": 0.0256, + "step": 6631 + }, + { + "epoch": 5.222134698700276, + "grad_norm": 0.35442033410072327, + "learning_rate": 1.9878e-05, + "loss": 0.0126, + "step": 6632 + }, + { + "epoch": 5.2229224103977945, + "grad_norm": 0.4266911745071411, + "learning_rate": 1.9880999999999998e-05, + "loss": 0.0222, + "step": 6633 + }, + { + "epoch": 5.223710122095313, + "grad_norm": 0.44459569454193115, + "learning_rate": 1.9883999999999998e-05, + "loss": 0.0253, + "step": 6634 + }, + { + "epoch": 5.224497833792832, + "grad_norm": 0.7389732003211975, + "learning_rate": 1.9887e-05, + "loss": 0.0324, + "step": 6635 + }, + { + "epoch": 5.22528554549035, + "grad_norm": 0.6707794070243835, + "learning_rate": 1.989e-05, + "loss": 0.0297, + "step": 6636 + }, + { + "epoch": 5.22607325718787, + "grad_norm": 0.5213596224784851, + "learning_rate": 1.9893e-05, + "loss": 0.0204, + "step": 6637 + }, + { + "epoch": 5.226860968885388, + "grad_norm": 0.5442511439323425, + "learning_rate": 1.9896e-05, + "loss": 0.029, + "step": 6638 + }, + { + "epoch": 5.227648680582907, + 
"grad_norm": 0.4153422713279724, + "learning_rate": 1.9899e-05, + "loss": 0.0176, + "step": 6639 + }, + { + "epoch": 5.228436392280425, + "grad_norm": 0.5211325287818909, + "learning_rate": 1.9902e-05, + "loss": 0.0374, + "step": 6640 + }, + { + "epoch": 5.229224103977944, + "grad_norm": 0.624576210975647, + "learning_rate": 1.9905e-05, + "loss": 0.0299, + "step": 6641 + }, + { + "epoch": 5.2300118156754625, + "grad_norm": 0.5306465029716492, + "learning_rate": 1.9908e-05, + "loss": 0.0184, + "step": 6642 + }, + { + "epoch": 5.230799527372982, + "grad_norm": 0.7094345688819885, + "learning_rate": 1.9911e-05, + "loss": 0.0321, + "step": 6643 + }, + { + "epoch": 5.2315872390705005, + "grad_norm": 0.6792567372322083, + "learning_rate": 1.9914e-05, + "loss": 0.0448, + "step": 6644 + }, + { + "epoch": 5.232374950768019, + "grad_norm": 0.5179941654205322, + "learning_rate": 1.9917000000000003e-05, + "loss": 0.0319, + "step": 6645 + }, + { + "epoch": 5.233162662465538, + "grad_norm": 0.41771867871284485, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.0203, + "step": 6646 + }, + { + "epoch": 5.233950374163056, + "grad_norm": 0.6116568446159363, + "learning_rate": 1.9923000000000002e-05, + "loss": 0.0232, + "step": 6647 + }, + { + "epoch": 5.234738085860575, + "grad_norm": 1.0263093709945679, + "learning_rate": 1.9926000000000002e-05, + "loss": 0.0475, + "step": 6648 + }, + { + "epoch": 5.235525797558093, + "grad_norm": 0.5488086342811584, + "learning_rate": 1.9929000000000002e-05, + "loss": 0.0273, + "step": 6649 + }, + { + "epoch": 5.236313509255613, + "grad_norm": 0.497331440448761, + "learning_rate": 1.9932e-05, + "loss": 0.0176, + "step": 6650 + }, + { + "epoch": 5.237101220953131, + "grad_norm": 1.1843721866607666, + "learning_rate": 1.9935e-05, + "loss": 0.3615, + "step": 6651 + }, + { + "epoch": 5.23788893265065, + "grad_norm": 0.9135732650756836, + "learning_rate": 1.9938e-05, + "loss": 0.2306, + "step": 6652 + }, + { + "epoch": 5.2386766443481685, + "grad_norm": 0.8542720079421997, + "learning_rate": 1.9940999999999998e-05, + "loss": 0.2124, + "step": 6653 + }, + { + "epoch": 5.239464356045687, + "grad_norm": 0.9019399881362915, + "learning_rate": 1.9943999999999997e-05, + "loss": 0.1492, + "step": 6654 + }, + { + "epoch": 5.240252067743206, + "grad_norm": 0.7887457609176636, + "learning_rate": 1.9947e-05, + "loss": 0.136, + "step": 6655 + }, + { + "epoch": 5.241039779440725, + "grad_norm": 0.4083545207977295, + "learning_rate": 1.995e-05, + "loss": 0.0651, + "step": 6656 + }, + { + "epoch": 5.241827491138244, + "grad_norm": 0.43596476316452026, + "learning_rate": 1.9953e-05, + "loss": 0.0491, + "step": 6657 + }, + { + "epoch": 5.242615202835762, + "grad_norm": 0.948398768901825, + "learning_rate": 1.9956e-05, + "loss": 0.0565, + "step": 6658 + }, + { + "epoch": 5.243402914533281, + "grad_norm": 0.535526692867279, + "learning_rate": 1.9959e-05, + "loss": 0.0689, + "step": 6659 + }, + { + "epoch": 5.244190626230799, + "grad_norm": 0.5170639753341675, + "learning_rate": 1.9962e-05, + "loss": 0.039, + "step": 6660 + }, + { + "epoch": 5.244978337928318, + "grad_norm": 0.4851924777030945, + "learning_rate": 1.9965e-05, + "loss": 0.0467, + "step": 6661 + }, + { + "epoch": 5.245766049625837, + "grad_norm": 0.3491824269294739, + "learning_rate": 1.9968e-05, + "loss": 0.0309, + "step": 6662 + }, + { + "epoch": 5.246553761323356, + "grad_norm": 0.36737260222435, + "learning_rate": 1.9971e-05, + "loss": 0.0335, + "step": 6663 + }, + { + "epoch": 5.2473414730208745, + "grad_norm": 
0.4136730432510376, + "learning_rate": 1.9974e-05, + "loss": 0.0227, + "step": 6664 + }, + { + "epoch": 5.248129184718393, + "grad_norm": 0.47748735547065735, + "learning_rate": 1.9977000000000002e-05, + "loss": 0.0291, + "step": 6665 + }, + { + "epoch": 5.248916896415912, + "grad_norm": 0.2697191834449768, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0168, + "step": 6666 + }, + { + "epoch": 5.24970460811343, + "grad_norm": 0.7939308285713196, + "learning_rate": 1.9983e-05, + "loss": 0.0263, + "step": 6667 + }, + { + "epoch": 5.250492319810949, + "grad_norm": 0.452045202255249, + "learning_rate": 1.9986e-05, + "loss": 0.0343, + "step": 6668 + }, + { + "epoch": 5.251280031508468, + "grad_norm": 0.5271660089492798, + "learning_rate": 1.9989e-05, + "loss": 0.0245, + "step": 6669 + }, + { + "epoch": 5.252067743205987, + "grad_norm": 0.36857128143310547, + "learning_rate": 1.9992e-05, + "loss": 0.0256, + "step": 6670 + }, + { + "epoch": 5.252855454903505, + "grad_norm": 0.41814377903938293, + "learning_rate": 1.9995e-05, + "loss": 0.0309, + "step": 6671 + }, + { + "epoch": 5.253643166601024, + "grad_norm": 0.7846832275390625, + "learning_rate": 1.9998e-05, + "loss": 0.0234, + "step": 6672 + }, + { + "epoch": 5.2544308782985425, + "grad_norm": 0.44591763615608215, + "learning_rate": 2.0001e-05, + "loss": 0.0245, + "step": 6673 + }, + { + "epoch": 5.255218589996061, + "grad_norm": 0.48182812333106995, + "learning_rate": 2.0004e-05, + "loss": 0.0347, + "step": 6674 + }, + { + "epoch": 5.2560063016935805, + "grad_norm": 0.43452754616737366, + "learning_rate": 2.0007000000000003e-05, + "loss": 0.0231, + "step": 6675 + }, + { + "epoch": 5.256794013391099, + "grad_norm": 0.7097524404525757, + "learning_rate": 2.0010000000000003e-05, + "loss": 0.0194, + "step": 6676 + }, + { + "epoch": 5.257581725088618, + "grad_norm": 0.5317888259887695, + "learning_rate": 2.0013e-05, + "loss": 0.0267, + "step": 6677 + }, + { + "epoch": 5.258369436786136, + "grad_norm": 0.2624645233154297, + "learning_rate": 2.0016e-05, + "loss": 0.0165, + "step": 6678 + }, + { + "epoch": 5.259157148483655, + "grad_norm": 0.27429917454719543, + "learning_rate": 2.0019e-05, + "loss": 0.0166, + "step": 6679 + }, + { + "epoch": 5.259944860181173, + "grad_norm": 0.6552389860153198, + "learning_rate": 2.0022e-05, + "loss": 0.0312, + "step": 6680 + }, + { + "epoch": 5.260732571878693, + "grad_norm": 0.39730337262153625, + "learning_rate": 2.0025e-05, + "loss": 0.024, + "step": 6681 + }, + { + "epoch": 5.261520283576211, + "grad_norm": 0.49435821175575256, + "learning_rate": 2.0028e-05, + "loss": 0.0209, + "step": 6682 + }, + { + "epoch": 5.26230799527373, + "grad_norm": 0.5214632153511047, + "learning_rate": 2.0031e-05, + "loss": 0.027, + "step": 6683 + }, + { + "epoch": 5.2630957069712485, + "grad_norm": 0.41015756130218506, + "learning_rate": 2.0033999999999998e-05, + "loss": 0.0239, + "step": 6684 + }, + { + "epoch": 5.263883418668767, + "grad_norm": 0.3229021430015564, + "learning_rate": 2.0037e-05, + "loss": 0.0222, + "step": 6685 + }, + { + "epoch": 5.264671130366286, + "grad_norm": 0.33227112889289856, + "learning_rate": 2.004e-05, + "loss": 0.0204, + "step": 6686 + }, + { + "epoch": 5.265458842063804, + "grad_norm": 0.33793050050735474, + "learning_rate": 2.0043e-05, + "loss": 0.0243, + "step": 6687 + }, + { + "epoch": 5.266246553761324, + "grad_norm": 0.21621251106262207, + "learning_rate": 2.0046e-05, + "loss": 0.0152, + "step": 6688 + }, + { + "epoch": 5.267034265458842, + "grad_norm": 0.5732802152633667, + 
"learning_rate": 2.0049e-05, + "loss": 0.0148, + "step": 6689 + }, + { + "epoch": 5.267821977156361, + "grad_norm": 0.2999030649662018, + "learning_rate": 2.0052e-05, + "loss": 0.0162, + "step": 6690 + }, + { + "epoch": 5.268609688853879, + "grad_norm": 0.4971160590648651, + "learning_rate": 2.0055e-05, + "loss": 0.0324, + "step": 6691 + }, + { + "epoch": 5.269397400551398, + "grad_norm": 0.33170121908187866, + "learning_rate": 2.0058e-05, + "loss": 0.0133, + "step": 6692 + }, + { + "epoch": 5.2701851122489165, + "grad_norm": 0.8443450927734375, + "learning_rate": 2.0061e-05, + "loss": 0.0227, + "step": 6693 + }, + { + "epoch": 5.270972823946436, + "grad_norm": 0.4775572717189789, + "learning_rate": 2.0064e-05, + "loss": 0.0293, + "step": 6694 + }, + { + "epoch": 5.2717605356439545, + "grad_norm": 0.651276707649231, + "learning_rate": 2.0067000000000003e-05, + "loss": 0.0295, + "step": 6695 + }, + { + "epoch": 5.272548247341473, + "grad_norm": 0.41113972663879395, + "learning_rate": 2.0070000000000003e-05, + "loss": 0.0137, + "step": 6696 + }, + { + "epoch": 5.273335959038992, + "grad_norm": 0.47092360258102417, + "learning_rate": 2.0073000000000002e-05, + "loss": 0.0214, + "step": 6697 + }, + { + "epoch": 5.27412367073651, + "grad_norm": 0.6019486784934998, + "learning_rate": 2.0076000000000002e-05, + "loss": 0.0222, + "step": 6698 + }, + { + "epoch": 5.274911382434029, + "grad_norm": 0.7160693407058716, + "learning_rate": 2.0079000000000002e-05, + "loss": 0.0385, + "step": 6699 + }, + { + "epoch": 5.275699094131548, + "grad_norm": 2.507117986679077, + "learning_rate": 2.0082000000000002e-05, + "loss": 0.0523, + "step": 6700 + }, + { + "epoch": 5.276486805829067, + "grad_norm": 1.1657277345657349, + "learning_rate": 2.0085e-05, + "loss": 0.3079, + "step": 6701 + }, + { + "epoch": 5.277274517526585, + "grad_norm": 1.3332735300064087, + "learning_rate": 2.0087999999999998e-05, + "loss": 0.3025, + "step": 6702 + }, + { + "epoch": 5.278062229224104, + "grad_norm": 1.790379524230957, + "learning_rate": 2.0090999999999998e-05, + "loss": 0.1642, + "step": 6703 + }, + { + "epoch": 5.2788499409216225, + "grad_norm": 0.5900543332099915, + "learning_rate": 2.0093999999999998e-05, + "loss": 0.0989, + "step": 6704 + }, + { + "epoch": 5.279637652619141, + "grad_norm": 1.074318289756775, + "learning_rate": 2.0097e-05, + "loss": 0.0945, + "step": 6705 + }, + { + "epoch": 5.28042536431666, + "grad_norm": 0.6171978116035461, + "learning_rate": 2.01e-05, + "loss": 0.1047, + "step": 6706 + }, + { + "epoch": 5.281213076014179, + "grad_norm": 0.8045344352722168, + "learning_rate": 2.0103e-05, + "loss": 0.0516, + "step": 6707 + }, + { + "epoch": 5.282000787711698, + "grad_norm": 0.44022467732429504, + "learning_rate": 2.0106e-05, + "loss": 0.0274, + "step": 6708 + }, + { + "epoch": 5.282788499409216, + "grad_norm": 0.8153945207595825, + "learning_rate": 2.0109e-05, + "loss": 0.0316, + "step": 6709 + }, + { + "epoch": 5.283576211106735, + "grad_norm": 0.4162556529045105, + "learning_rate": 2.0112e-05, + "loss": 0.032, + "step": 6710 + }, + { + "epoch": 5.284363922804253, + "grad_norm": 0.2321617156267166, + "learning_rate": 2.0115e-05, + "loss": 0.022, + "step": 6711 + }, + { + "epoch": 5.285151634501772, + "grad_norm": 0.4180595576763153, + "learning_rate": 2.0118e-05, + "loss": 0.0242, + "step": 6712 + }, + { + "epoch": 5.285939346199291, + "grad_norm": 0.4907284379005432, + "learning_rate": 2.0121e-05, + "loss": 0.0283, + "step": 6713 + }, + { + "epoch": 5.28672705789681, + "grad_norm": 0.8341971039772034, + 
"learning_rate": 2.0124e-05, + "loss": 0.0141, + "step": 6714 + }, + { + "epoch": 5.2875147695943285, + "grad_norm": 0.3686416745185852, + "learning_rate": 2.0127000000000002e-05, + "loss": 0.0156, + "step": 6715 + }, + { + "epoch": 5.288302481291847, + "grad_norm": 0.5728181004524231, + "learning_rate": 2.0130000000000002e-05, + "loss": 0.0185, + "step": 6716 + }, + { + "epoch": 5.289090192989366, + "grad_norm": 0.33105456829071045, + "learning_rate": 2.0133000000000002e-05, + "loss": 0.0283, + "step": 6717 + }, + { + "epoch": 5.289877904686884, + "grad_norm": 0.33762791752815247, + "learning_rate": 2.0136e-05, + "loss": 0.0183, + "step": 6718 + }, + { + "epoch": 5.290665616384404, + "grad_norm": 0.5337520241737366, + "learning_rate": 2.0139e-05, + "loss": 0.0218, + "step": 6719 + }, + { + "epoch": 5.291453328081922, + "grad_norm": 0.23243287205696106, + "learning_rate": 2.0142e-05, + "loss": 0.0188, + "step": 6720 + }, + { + "epoch": 5.292241039779441, + "grad_norm": 0.38807255029678345, + "learning_rate": 2.0145e-05, + "loss": 0.0245, + "step": 6721 + }, + { + "epoch": 5.293028751476959, + "grad_norm": 0.6184014081954956, + "learning_rate": 2.0148e-05, + "loss": 0.0298, + "step": 6722 + }, + { + "epoch": 5.293816463174478, + "grad_norm": 0.3568713366985321, + "learning_rate": 2.0151e-05, + "loss": 0.0222, + "step": 6723 + }, + { + "epoch": 5.2946041748719965, + "grad_norm": 0.5234447717666626, + "learning_rate": 2.0154e-05, + "loss": 0.0439, + "step": 6724 + }, + { + "epoch": 5.295391886569515, + "grad_norm": 0.43279170989990234, + "learning_rate": 2.0157000000000004e-05, + "loss": 0.0226, + "step": 6725 + }, + { + "epoch": 5.2961795982670345, + "grad_norm": 0.2659114599227905, + "learning_rate": 2.016e-05, + "loss": 0.0224, + "step": 6726 + }, + { + "epoch": 5.296967309964553, + "grad_norm": 0.5251427888870239, + "learning_rate": 2.0163e-05, + "loss": 0.0616, + "step": 6727 + }, + { + "epoch": 5.297755021662072, + "grad_norm": 0.41287243366241455, + "learning_rate": 2.0166e-05, + "loss": 0.036, + "step": 6728 + }, + { + "epoch": 5.29854273335959, + "grad_norm": 0.36435264348983765, + "learning_rate": 2.0169e-05, + "loss": 0.0194, + "step": 6729 + }, + { + "epoch": 5.299330445057109, + "grad_norm": 0.6132906675338745, + "learning_rate": 2.0172e-05, + "loss": 0.0283, + "step": 6730 + }, + { + "epoch": 5.300118156754628, + "grad_norm": 0.33695188164711, + "learning_rate": 2.0175e-05, + "loss": 0.0385, + "step": 6731 + }, + { + "epoch": 5.300905868452147, + "grad_norm": 0.5005443692207336, + "learning_rate": 2.0178e-05, + "loss": 0.0208, + "step": 6732 + }, + { + "epoch": 5.301693580149665, + "grad_norm": 0.32791954278945923, + "learning_rate": 2.0181e-05, + "loss": 0.0195, + "step": 6733 + }, + { + "epoch": 5.302481291847184, + "grad_norm": 0.3757956624031067, + "learning_rate": 2.0184e-05, + "loss": 0.0207, + "step": 6734 + }, + { + "epoch": 5.3032690035447025, + "grad_norm": 0.44920867681503296, + "learning_rate": 2.0187000000000002e-05, + "loss": 0.0312, + "step": 6735 + }, + { + "epoch": 5.304056715242221, + "grad_norm": 0.29254084825515747, + "learning_rate": 2.019e-05, + "loss": 0.0158, + "step": 6736 + }, + { + "epoch": 5.30484442693974, + "grad_norm": 0.41461607813835144, + "learning_rate": 2.0193e-05, + "loss": 0.018, + "step": 6737 + }, + { + "epoch": 5.305632138637259, + "grad_norm": 0.3962373435497284, + "learning_rate": 2.0196e-05, + "loss": 0.0208, + "step": 6738 + }, + { + "epoch": 5.306419850334778, + "grad_norm": 1.1131190061569214, + "learning_rate": 2.0199e-05, + 
"loss": 0.026, + "step": 6739 + }, + { + "epoch": 5.307207562032296, + "grad_norm": 0.6725017428398132, + "learning_rate": 2.0202e-05, + "loss": 0.0409, + "step": 6740 + }, + { + "epoch": 5.307995273729815, + "grad_norm": 0.3830849230289459, + "learning_rate": 2.0205e-05, + "loss": 0.0227, + "step": 6741 + }, + { + "epoch": 5.308782985427333, + "grad_norm": 0.6911083459854126, + "learning_rate": 2.0208e-05, + "loss": 0.0309, + "step": 6742 + }, + { + "epoch": 5.309570697124852, + "grad_norm": 0.39602333307266235, + "learning_rate": 2.0211e-05, + "loss": 0.0211, + "step": 6743 + }, + { + "epoch": 5.310358408822371, + "grad_norm": 0.5591397881507874, + "learning_rate": 2.0214e-05, + "loss": 0.0155, + "step": 6744 + }, + { + "epoch": 5.31114612051989, + "grad_norm": 0.6961808800697327, + "learning_rate": 2.0217000000000003e-05, + "loss": 0.0188, + "step": 6745 + }, + { + "epoch": 5.311933832217409, + "grad_norm": 0.44124752283096313, + "learning_rate": 2.0220000000000003e-05, + "loss": 0.0335, + "step": 6746 + }, + { + "epoch": 5.312721543914927, + "grad_norm": 0.5616042017936707, + "learning_rate": 2.0223000000000003e-05, + "loss": 0.0314, + "step": 6747 + }, + { + "epoch": 5.313509255612446, + "grad_norm": 0.5309929847717285, + "learning_rate": 2.0226000000000003e-05, + "loss": 0.0279, + "step": 6748 + }, + { + "epoch": 5.314296967309964, + "grad_norm": 0.5834824442863464, + "learning_rate": 2.0229000000000002e-05, + "loss": 0.0232, + "step": 6749 + }, + { + "epoch": 5.315084679007484, + "grad_norm": 0.7123326659202576, + "learning_rate": 2.0232000000000002e-05, + "loss": 0.0267, + "step": 6750 + }, + { + "epoch": 5.315872390705002, + "grad_norm": 1.1513468027114868, + "learning_rate": 2.0235e-05, + "loss": 0.2846, + "step": 6751 + }, + { + "epoch": 5.316660102402521, + "grad_norm": 0.7543259859085083, + "learning_rate": 2.0238e-05, + "loss": 0.197, + "step": 6752 + }, + { + "epoch": 5.317447814100039, + "grad_norm": 0.8950716257095337, + "learning_rate": 2.0240999999999998e-05, + "loss": 0.2201, + "step": 6753 + }, + { + "epoch": 5.318235525797558, + "grad_norm": 0.6262131929397583, + "learning_rate": 2.0243999999999998e-05, + "loss": 0.1447, + "step": 6754 + }, + { + "epoch": 5.3190232374950766, + "grad_norm": 0.9897583723068237, + "learning_rate": 2.0247e-05, + "loss": 0.132, + "step": 6755 + }, + { + "epoch": 5.319810949192595, + "grad_norm": 0.4118022918701172, + "learning_rate": 2.025e-05, + "loss": 0.0476, + "step": 6756 + }, + { + "epoch": 5.320598660890115, + "grad_norm": 0.5170925259590149, + "learning_rate": 2.0253e-05, + "loss": 0.0486, + "step": 6757 + }, + { + "epoch": 5.321386372587633, + "grad_norm": 1.5753406286239624, + "learning_rate": 2.0256e-05, + "loss": 0.0753, + "step": 6758 + }, + { + "epoch": 5.322174084285152, + "grad_norm": 0.4124357998371124, + "learning_rate": 2.0259e-05, + "loss": 0.0312, + "step": 6759 + }, + { + "epoch": 5.32296179598267, + "grad_norm": 0.43910351395606995, + "learning_rate": 2.0262e-05, + "loss": 0.0276, + "step": 6760 + }, + { + "epoch": 5.323749507680189, + "grad_norm": 0.23350495100021362, + "learning_rate": 2.0265e-05, + "loss": 0.0225, + "step": 6761 + }, + { + "epoch": 5.324537219377707, + "grad_norm": 0.4426940679550171, + "learning_rate": 2.0268e-05, + "loss": 0.0614, + "step": 6762 + }, + { + "epoch": 5.325324931075227, + "grad_norm": 0.5034719109535217, + "learning_rate": 2.0271e-05, + "loss": 0.0285, + "step": 6763 + }, + { + "epoch": 5.326112642772745, + "grad_norm": 0.39832162857055664, + "learning_rate": 2.0274e-05, + "loss": 
0.022, + "step": 6764 + }, + { + "epoch": 5.326900354470264, + "grad_norm": 0.40173259377479553, + "learning_rate": 2.0277e-05, + "loss": 0.0309, + "step": 6765 + }, + { + "epoch": 5.327688066167783, + "grad_norm": 0.5702373385429382, + "learning_rate": 2.0280000000000002e-05, + "loss": 0.0208, + "step": 6766 + }, + { + "epoch": 5.328475777865301, + "grad_norm": 0.4567936360836029, + "learning_rate": 2.0283000000000002e-05, + "loss": 0.0313, + "step": 6767 + }, + { + "epoch": 5.32926348956282, + "grad_norm": 0.5825759768486023, + "learning_rate": 2.0286000000000002e-05, + "loss": 0.029, + "step": 6768 + }, + { + "epoch": 5.330051201260339, + "grad_norm": 0.6660692095756531, + "learning_rate": 2.0289000000000002e-05, + "loss": 0.0352, + "step": 6769 + }, + { + "epoch": 5.330838912957858, + "grad_norm": 0.8842588663101196, + "learning_rate": 2.0292e-05, + "loss": 0.0291, + "step": 6770 + }, + { + "epoch": 5.331626624655376, + "grad_norm": 0.4541008770465851, + "learning_rate": 2.0295e-05, + "loss": 0.033, + "step": 6771 + }, + { + "epoch": 5.332414336352895, + "grad_norm": 0.5337404608726501, + "learning_rate": 2.0298e-05, + "loss": 0.0187, + "step": 6772 + }, + { + "epoch": 5.333202048050413, + "grad_norm": 0.45474159717559814, + "learning_rate": 2.0301e-05, + "loss": 0.0239, + "step": 6773 + }, + { + "epoch": 5.333989759747932, + "grad_norm": 0.5788844227790833, + "learning_rate": 2.0304e-05, + "loss": 0.0322, + "step": 6774 + }, + { + "epoch": 5.334777471445451, + "grad_norm": 0.671467661857605, + "learning_rate": 2.0307e-05, + "loss": 0.0324, + "step": 6775 + }, + { + "epoch": 5.33556518314297, + "grad_norm": 0.6476321220397949, + "learning_rate": 2.031e-05, + "loss": 0.038, + "step": 6776 + }, + { + "epoch": 5.336352894840489, + "grad_norm": 0.47812968492507935, + "learning_rate": 2.0313e-05, + "loss": 0.0208, + "step": 6777 + }, + { + "epoch": 5.337140606538007, + "grad_norm": 0.3851991295814514, + "learning_rate": 2.0316e-05, + "loss": 0.0192, + "step": 6778 + }, + { + "epoch": 5.337928318235526, + "grad_norm": 0.2778598368167877, + "learning_rate": 2.0319e-05, + "loss": 0.014, + "step": 6779 + }, + { + "epoch": 5.338716029933044, + "grad_norm": 0.4413051903247833, + "learning_rate": 2.0322e-05, + "loss": 0.024, + "step": 6780 + }, + { + "epoch": 5.339503741630563, + "grad_norm": 0.5602917075157166, + "learning_rate": 2.0325e-05, + "loss": 0.0402, + "step": 6781 + }, + { + "epoch": 5.340291453328082, + "grad_norm": 0.5723835825920105, + "learning_rate": 2.0328e-05, + "loss": 0.021, + "step": 6782 + }, + { + "epoch": 5.341079165025601, + "grad_norm": 0.4659479260444641, + "learning_rate": 2.0331e-05, + "loss": 0.0239, + "step": 6783 + }, + { + "epoch": 5.341866876723119, + "grad_norm": 0.7153133749961853, + "learning_rate": 2.0334e-05, + "loss": 0.0269, + "step": 6784 + }, + { + "epoch": 5.342654588420638, + "grad_norm": 0.6527928113937378, + "learning_rate": 2.0337e-05, + "loss": 0.0705, + "step": 6785 + }, + { + "epoch": 5.343442300118157, + "grad_norm": 0.5004417896270752, + "learning_rate": 2.0340000000000002e-05, + "loss": 0.0214, + "step": 6786 + }, + { + "epoch": 5.344230011815675, + "grad_norm": 0.5350567698478699, + "learning_rate": 2.0343e-05, + "loss": 0.0251, + "step": 6787 + }, + { + "epoch": 5.345017723513195, + "grad_norm": 0.4975443184375763, + "learning_rate": 2.0346e-05, + "loss": 0.0212, + "step": 6788 + }, + { + "epoch": 5.345805435210713, + "grad_norm": 0.18340344727039337, + "learning_rate": 2.0349e-05, + "loss": 0.0123, + "step": 6789 + }, + { + "epoch": 
5.346593146908232, + "grad_norm": 0.4352981746196747, + "learning_rate": 2.0352e-05, + "loss": 0.0379, + "step": 6790 + }, + { + "epoch": 5.34738085860575, + "grad_norm": 0.4214443266391754, + "learning_rate": 2.0355e-05, + "loss": 0.03, + "step": 6791 + }, + { + "epoch": 5.348168570303269, + "grad_norm": 0.691584050655365, + "learning_rate": 2.0358e-05, + "loss": 0.0428, + "step": 6792 + }, + { + "epoch": 5.348956282000787, + "grad_norm": 0.6729466319084167, + "learning_rate": 2.0361e-05, + "loss": 0.0301, + "step": 6793 + }, + { + "epoch": 5.349743993698306, + "grad_norm": 0.6688470840454102, + "learning_rate": 2.0364e-05, + "loss": 0.0492, + "step": 6794 + }, + { + "epoch": 5.3505317053958255, + "grad_norm": 0.7301015257835388, + "learning_rate": 2.0367e-05, + "loss": 0.0457, + "step": 6795 + }, + { + "epoch": 5.351319417093344, + "grad_norm": 0.5114631056785583, + "learning_rate": 2.0370000000000003e-05, + "loss": 0.0243, + "step": 6796 + }, + { + "epoch": 5.352107128790863, + "grad_norm": 0.42538580298423767, + "learning_rate": 2.0373000000000003e-05, + "loss": 0.0157, + "step": 6797 + }, + { + "epoch": 5.352894840488381, + "grad_norm": 0.8584946393966675, + "learning_rate": 2.0376000000000003e-05, + "loss": 0.0362, + "step": 6798 + }, + { + "epoch": 5.3536825521859, + "grad_norm": 0.6147838234901428, + "learning_rate": 2.0379000000000003e-05, + "loss": 0.0466, + "step": 6799 + }, + { + "epoch": 5.354470263883418, + "grad_norm": 0.6254687905311584, + "learning_rate": 2.0382e-05, + "loss": 0.0285, + "step": 6800 + }, + { + "epoch": 5.355257975580938, + "grad_norm": 1.9125734567642212, + "learning_rate": 2.0385e-05, + "loss": 0.3865, + "step": 6801 + }, + { + "epoch": 5.356045687278456, + "grad_norm": 1.4190363883972168, + "learning_rate": 2.0388e-05, + "loss": 0.2949, + "step": 6802 + }, + { + "epoch": 5.356833398975975, + "grad_norm": 0.7257781624794006, + "learning_rate": 2.0391e-05, + "loss": 0.1439, + "step": 6803 + }, + { + "epoch": 5.3576211106734934, + "grad_norm": 0.7661374807357788, + "learning_rate": 2.0393999999999998e-05, + "loss": 0.1139, + "step": 6804 + }, + { + "epoch": 5.358408822371012, + "grad_norm": 0.7996437549591064, + "learning_rate": 2.0396999999999998e-05, + "loss": 0.0651, + "step": 6805 + }, + { + "epoch": 5.359196534068531, + "grad_norm": 0.8214867115020752, + "learning_rate": 2.04e-05, + "loss": 0.1299, + "step": 6806 + }, + { + "epoch": 5.35998424576605, + "grad_norm": 0.5180482268333435, + "learning_rate": 2.0403e-05, + "loss": 0.0526, + "step": 6807 + }, + { + "epoch": 5.360771957463569, + "grad_norm": 0.48425206542015076, + "learning_rate": 2.0406e-05, + "loss": 0.0238, + "step": 6808 + }, + { + "epoch": 5.361559669161087, + "grad_norm": 0.8364032506942749, + "learning_rate": 2.0409e-05, + "loss": 0.098, + "step": 6809 + }, + { + "epoch": 5.362347380858606, + "grad_norm": 0.3279147744178772, + "learning_rate": 2.0412e-05, + "loss": 0.0222, + "step": 6810 + }, + { + "epoch": 5.363135092556124, + "grad_norm": 0.9848613739013672, + "learning_rate": 2.0415e-05, + "loss": 0.0388, + "step": 6811 + }, + { + "epoch": 5.363922804253643, + "grad_norm": 0.4244019687175751, + "learning_rate": 2.0418e-05, + "loss": 0.0304, + "step": 6812 + }, + { + "epoch": 5.364710515951161, + "grad_norm": 0.45612117648124695, + "learning_rate": 2.0421e-05, + "loss": 0.0338, + "step": 6813 + }, + { + "epoch": 5.365498227648681, + "grad_norm": 0.30265113711357117, + "learning_rate": 2.0424e-05, + "loss": 0.0194, + "step": 6814 + }, + { + "epoch": 5.3662859393461995, + "grad_norm": 
0.286160409450531, + "learning_rate": 2.0427e-05, + "loss": 0.0276, + "step": 6815 + }, + { + "epoch": 5.367073651043718, + "grad_norm": 0.27844786643981934, + "learning_rate": 2.0430000000000003e-05, + "loss": 0.0193, + "step": 6816 + }, + { + "epoch": 5.367861362741237, + "grad_norm": 0.2024909406900406, + "learning_rate": 2.0433000000000002e-05, + "loss": 0.0131, + "step": 6817 + }, + { + "epoch": 5.368649074438755, + "grad_norm": 0.4745576083660126, + "learning_rate": 2.0436000000000002e-05, + "loss": 0.0211, + "step": 6818 + }, + { + "epoch": 5.369436786136274, + "grad_norm": 0.431742787361145, + "learning_rate": 2.0439000000000002e-05, + "loss": 0.0209, + "step": 6819 + }, + { + "epoch": 5.370224497833793, + "grad_norm": 0.39664366841316223, + "learning_rate": 2.0442000000000002e-05, + "loss": 0.0564, + "step": 6820 + }, + { + "epoch": 5.371012209531312, + "grad_norm": 0.43720585107803345, + "learning_rate": 2.0445e-05, + "loss": 0.0274, + "step": 6821 + }, + { + "epoch": 5.37179992122883, + "grad_norm": 0.38260695338249207, + "learning_rate": 2.0448e-05, + "loss": 0.0231, + "step": 6822 + }, + { + "epoch": 5.372587632926349, + "grad_norm": 0.31389927864074707, + "learning_rate": 2.0451e-05, + "loss": 0.0194, + "step": 6823 + }, + { + "epoch": 5.3733753446238675, + "grad_norm": 0.39454227685928345, + "learning_rate": 2.0454e-05, + "loss": 0.0295, + "step": 6824 + }, + { + "epoch": 5.374163056321386, + "grad_norm": 0.852311909198761, + "learning_rate": 2.0456999999999997e-05, + "loss": 0.023, + "step": 6825 + }, + { + "epoch": 5.3749507680189055, + "grad_norm": 0.3808186948299408, + "learning_rate": 2.046e-05, + "loss": 0.015, + "step": 6826 + }, + { + "epoch": 5.375738479716424, + "grad_norm": 0.257826566696167, + "learning_rate": 2.0463e-05, + "loss": 0.0128, + "step": 6827 + }, + { + "epoch": 5.376526191413943, + "grad_norm": 0.6893274784088135, + "learning_rate": 2.0466e-05, + "loss": 0.0277, + "step": 6828 + }, + { + "epoch": 5.377313903111461, + "grad_norm": 0.3699823021888733, + "learning_rate": 2.0469e-05, + "loss": 0.02, + "step": 6829 + }, + { + "epoch": 5.37810161480898, + "grad_norm": 0.588147759437561, + "learning_rate": 2.0472e-05, + "loss": 0.0251, + "step": 6830 + }, + { + "epoch": 5.378889326506498, + "grad_norm": 0.5273740291595459, + "learning_rate": 2.0475e-05, + "loss": 0.0294, + "step": 6831 + }, + { + "epoch": 5.379677038204017, + "grad_norm": 0.43486881256103516, + "learning_rate": 2.0478e-05, + "loss": 0.0259, + "step": 6832 + }, + { + "epoch": 5.380464749901536, + "grad_norm": 0.8220149278640747, + "learning_rate": 2.0481e-05, + "loss": 0.0309, + "step": 6833 + }, + { + "epoch": 5.381252461599055, + "grad_norm": 0.4444347620010376, + "learning_rate": 2.0484e-05, + "loss": 0.016, + "step": 6834 + }, + { + "epoch": 5.3820401732965735, + "grad_norm": 0.6590021252632141, + "learning_rate": 2.0487e-05, + "loss": 0.0478, + "step": 6835 + }, + { + "epoch": 5.382827884994092, + "grad_norm": 0.46598145365715027, + "learning_rate": 2.0490000000000002e-05, + "loss": 0.0208, + "step": 6836 + }, + { + "epoch": 5.383615596691611, + "grad_norm": 0.5803776979446411, + "learning_rate": 2.0493000000000002e-05, + "loss": 0.0278, + "step": 6837 + }, + { + "epoch": 5.384403308389129, + "grad_norm": 0.5135135054588318, + "learning_rate": 2.0496e-05, + "loss": 0.025, + "step": 6838 + }, + { + "epoch": 5.385191020086649, + "grad_norm": 0.38314929604530334, + "learning_rate": 2.0499e-05, + "loss": 0.0305, + "step": 6839 + }, + { + "epoch": 5.385978731784167, + "grad_norm": 
0.3594721853733063, + "learning_rate": 2.0502e-05, + "loss": 0.0173, + "step": 6840 + }, + { + "epoch": 5.386766443481686, + "grad_norm": 0.33813369274139404, + "learning_rate": 2.0505e-05, + "loss": 0.02, + "step": 6841 + }, + { + "epoch": 5.387554155179204, + "grad_norm": 0.6739007234573364, + "learning_rate": 2.0508e-05, + "loss": 0.0385, + "step": 6842 + }, + { + "epoch": 5.388341866876723, + "grad_norm": 0.44281435012817383, + "learning_rate": 2.0511e-05, + "loss": 0.0245, + "step": 6843 + }, + { + "epoch": 5.3891295785742415, + "grad_norm": 0.38991233706474304, + "learning_rate": 2.0514e-05, + "loss": 0.0284, + "step": 6844 + }, + { + "epoch": 5.389917290271761, + "grad_norm": 0.5532455444335938, + "learning_rate": 2.0517e-05, + "loss": 0.0423, + "step": 6845 + }, + { + "epoch": 5.3907050019692795, + "grad_norm": 0.508659303188324, + "learning_rate": 2.0520000000000003e-05, + "loss": 0.029, + "step": 6846 + }, + { + "epoch": 5.391492713666798, + "grad_norm": 0.44379085302352905, + "learning_rate": 2.0523000000000003e-05, + "loss": 0.0227, + "step": 6847 + }, + { + "epoch": 5.392280425364317, + "grad_norm": 0.6265698075294495, + "learning_rate": 2.0526000000000003e-05, + "loss": 0.0271, + "step": 6848 + }, + { + "epoch": 5.393068137061835, + "grad_norm": 0.5975472927093506, + "learning_rate": 2.0529e-05, + "loss": 0.0359, + "step": 6849 + }, + { + "epoch": 5.393855848759354, + "grad_norm": 1.4424527883529663, + "learning_rate": 2.0532e-05, + "loss": 0.0504, + "step": 6850 + }, + { + "epoch": 5.394643560456872, + "grad_norm": 1.9071357250213623, + "learning_rate": 2.0535e-05, + "loss": 0.4143, + "step": 6851 + }, + { + "epoch": 5.395431272154392, + "grad_norm": 1.0731110572814941, + "learning_rate": 2.0538e-05, + "loss": 0.2754, + "step": 6852 + }, + { + "epoch": 5.39621898385191, + "grad_norm": 0.7611952424049377, + "learning_rate": 2.0541e-05, + "loss": 0.1591, + "step": 6853 + }, + { + "epoch": 5.397006695549429, + "grad_norm": 0.7137765288352966, + "learning_rate": 2.0544e-05, + "loss": 0.1096, + "step": 6854 + }, + { + "epoch": 5.3977944072469475, + "grad_norm": 0.9052755236625671, + "learning_rate": 2.0546999999999998e-05, + "loss": 0.1589, + "step": 6855 + }, + { + "epoch": 5.398582118944466, + "grad_norm": 0.6297799944877625, + "learning_rate": 2.055e-05, + "loss": 0.0801, + "step": 6856 + }, + { + "epoch": 5.3993698306419855, + "grad_norm": 0.4380655288696289, + "learning_rate": 2.0553e-05, + "loss": 0.0658, + "step": 6857 + }, + { + "epoch": 5.400157542339504, + "grad_norm": 0.4615722894668579, + "learning_rate": 2.0556e-05, + "loss": 0.0293, + "step": 6858 + }, + { + "epoch": 5.400945254037023, + "grad_norm": 0.3929922878742218, + "learning_rate": 2.0559e-05, + "loss": 0.0535, + "step": 6859 + }, + { + "epoch": 5.401732965734541, + "grad_norm": 0.5532612204551697, + "learning_rate": 2.0562e-05, + "loss": 0.0385, + "step": 6860 + }, + { + "epoch": 5.40252067743206, + "grad_norm": 0.3853796720504761, + "learning_rate": 2.0565e-05, + "loss": 0.0225, + "step": 6861 + }, + { + "epoch": 5.403308389129578, + "grad_norm": 0.4413861334323883, + "learning_rate": 2.0568e-05, + "loss": 0.0454, + "step": 6862 + }, + { + "epoch": 5.404096100827097, + "grad_norm": 0.37083736062049866, + "learning_rate": 2.0571e-05, + "loss": 0.0318, + "step": 6863 + }, + { + "epoch": 5.404883812524616, + "grad_norm": 0.31430429220199585, + "learning_rate": 2.0574e-05, + "loss": 0.0145, + "step": 6864 + }, + { + "epoch": 5.405671524222135, + "grad_norm": 0.3106938302516937, + "learning_rate": 2.0577e-05, + 
"loss": 0.0401, + "step": 6865 + }, + { + "epoch": 5.4064592359196535, + "grad_norm": 0.3831808567047119, + "learning_rate": 2.0580000000000003e-05, + "loss": 0.0177, + "step": 6866 + }, + { + "epoch": 5.407246947617172, + "grad_norm": 0.3160546123981476, + "learning_rate": 2.0583000000000003e-05, + "loss": 0.0164, + "step": 6867 + }, + { + "epoch": 5.408034659314691, + "grad_norm": 0.3254889249801636, + "learning_rate": 2.0586000000000002e-05, + "loss": 0.0178, + "step": 6868 + }, + { + "epoch": 5.408822371012209, + "grad_norm": 0.4324004650115967, + "learning_rate": 2.0589000000000002e-05, + "loss": 0.016, + "step": 6869 + }, + { + "epoch": 5.409610082709729, + "grad_norm": 0.6751062273979187, + "learning_rate": 2.0592000000000002e-05, + "loss": 0.0295, + "step": 6870 + }, + { + "epoch": 5.410397794407247, + "grad_norm": 0.3888750970363617, + "learning_rate": 2.0595000000000002e-05, + "loss": 0.0239, + "step": 6871 + }, + { + "epoch": 5.411185506104766, + "grad_norm": 0.45406386256217957, + "learning_rate": 2.0598e-05, + "loss": 0.0221, + "step": 6872 + }, + { + "epoch": 5.411973217802284, + "grad_norm": 0.4046320617198944, + "learning_rate": 2.0601e-05, + "loss": 0.0317, + "step": 6873 + }, + { + "epoch": 5.412760929499803, + "grad_norm": 0.3398888111114502, + "learning_rate": 2.0603999999999998e-05, + "loss": 0.0176, + "step": 6874 + }, + { + "epoch": 5.4135486411973215, + "grad_norm": 0.2974160313606262, + "learning_rate": 2.0606999999999998e-05, + "loss": 0.0188, + "step": 6875 + }, + { + "epoch": 5.414336352894841, + "grad_norm": 0.3545845150947571, + "learning_rate": 2.061e-05, + "loss": 0.0297, + "step": 6876 + }, + { + "epoch": 5.4151240645923595, + "grad_norm": 0.5222240090370178, + "learning_rate": 2.0613e-05, + "loss": 0.017, + "step": 6877 + }, + { + "epoch": 5.415911776289878, + "grad_norm": 0.33213070034980774, + "learning_rate": 2.0616e-05, + "loss": 0.021, + "step": 6878 + }, + { + "epoch": 5.416699487987397, + "grad_norm": 0.44430625438690186, + "learning_rate": 2.0619e-05, + "loss": 0.0285, + "step": 6879 + }, + { + "epoch": 5.417487199684915, + "grad_norm": 0.6890137195587158, + "learning_rate": 2.0622e-05, + "loss": 0.0228, + "step": 6880 + }, + { + "epoch": 5.418274911382434, + "grad_norm": 0.4729720652103424, + "learning_rate": 2.0625e-05, + "loss": 0.0193, + "step": 6881 + }, + { + "epoch": 5.419062623079952, + "grad_norm": 0.26608070731163025, + "learning_rate": 2.0628e-05, + "loss": 0.013, + "step": 6882 + }, + { + "epoch": 5.419850334777472, + "grad_norm": 0.4304906129837036, + "learning_rate": 2.0631e-05, + "loss": 0.0132, + "step": 6883 + }, + { + "epoch": 5.42063804647499, + "grad_norm": 0.9545592069625854, + "learning_rate": 2.0634e-05, + "loss": 0.0271, + "step": 6884 + }, + { + "epoch": 5.421425758172509, + "grad_norm": 0.3782521188259125, + "learning_rate": 2.0637e-05, + "loss": 0.0125, + "step": 6885 + }, + { + "epoch": 5.4222134698700275, + "grad_norm": 0.5684723854064941, + "learning_rate": 2.064e-05, + "loss": 0.0188, + "step": 6886 + }, + { + "epoch": 5.423001181567546, + "grad_norm": 0.3686605393886566, + "learning_rate": 2.0643000000000002e-05, + "loss": 0.0285, + "step": 6887 + }, + { + "epoch": 5.423788893265065, + "grad_norm": 0.6500065326690674, + "learning_rate": 2.0646000000000002e-05, + "loss": 0.0299, + "step": 6888 + }, + { + "epoch": 5.424576604962584, + "grad_norm": 0.6942087411880493, + "learning_rate": 2.0649e-05, + "loss": 0.0308, + "step": 6889 + }, + { + "epoch": 5.425364316660103, + "grad_norm": 0.5005639791488647, + 
"learning_rate": 2.0652e-05, + "loss": 0.0298, + "step": 6890 + }, + { + "epoch": 5.426152028357621, + "grad_norm": 0.35634079575538635, + "learning_rate": 2.0655e-05, + "loss": 0.0218, + "step": 6891 + }, + { + "epoch": 5.42693974005514, + "grad_norm": 0.42023155093193054, + "learning_rate": 2.0658e-05, + "loss": 0.0293, + "step": 6892 + }, + { + "epoch": 5.427727451752658, + "grad_norm": 0.6448933482170105, + "learning_rate": 2.0661e-05, + "loss": 0.02, + "step": 6893 + }, + { + "epoch": 5.428515163450177, + "grad_norm": 1.070968747138977, + "learning_rate": 2.0664e-05, + "loss": 0.0247, + "step": 6894 + }, + { + "epoch": 5.429302875147696, + "grad_norm": 0.5085862874984741, + "learning_rate": 2.0667e-05, + "loss": 0.0334, + "step": 6895 + }, + { + "epoch": 5.430090586845215, + "grad_norm": 0.9896718859672546, + "learning_rate": 2.067e-05, + "loss": 0.0425, + "step": 6896 + }, + { + "epoch": 5.4308782985427335, + "grad_norm": 0.5194793343544006, + "learning_rate": 2.0673000000000003e-05, + "loss": 0.0174, + "step": 6897 + }, + { + "epoch": 5.431666010240252, + "grad_norm": 0.5317810773849487, + "learning_rate": 2.0676e-05, + "loss": 0.0324, + "step": 6898 + }, + { + "epoch": 5.432453721937771, + "grad_norm": 0.4246738851070404, + "learning_rate": 2.0679e-05, + "loss": 0.022, + "step": 6899 + }, + { + "epoch": 5.433241433635289, + "grad_norm": 0.6162951588630676, + "learning_rate": 2.0682e-05, + "loss": 0.032, + "step": 6900 + }, + { + "epoch": 5.434029145332808, + "grad_norm": 0.906912624835968, + "learning_rate": 2.0685e-05, + "loss": 0.281, + "step": 6901 + }, + { + "epoch": 5.434816857030327, + "grad_norm": 1.792242169380188, + "learning_rate": 2.0688e-05, + "loss": 0.2928, + "step": 6902 + }, + { + "epoch": 5.435604568727846, + "grad_norm": 0.6295994520187378, + "learning_rate": 2.0691e-05, + "loss": 0.1514, + "step": 6903 + }, + { + "epoch": 5.436392280425364, + "grad_norm": 2.561995267868042, + "learning_rate": 2.0694e-05, + "loss": 0.2714, + "step": 6904 + }, + { + "epoch": 5.437179992122883, + "grad_norm": 1.0510973930358887, + "learning_rate": 2.0697e-05, + "loss": 0.1553, + "step": 6905 + }, + { + "epoch": 5.4379677038204015, + "grad_norm": 0.8258733749389648, + "learning_rate": 2.07e-05, + "loss": 0.122, + "step": 6906 + }, + { + "epoch": 5.43875541551792, + "grad_norm": 0.5017773509025574, + "learning_rate": 2.0703e-05, + "loss": 0.0858, + "step": 6907 + }, + { + "epoch": 5.4395431272154395, + "grad_norm": 0.6590108275413513, + "learning_rate": 2.0706e-05, + "loss": 0.0794, + "step": 6908 + }, + { + "epoch": 5.440330838912958, + "grad_norm": 0.5111071467399597, + "learning_rate": 2.0709e-05, + "loss": 0.0707, + "step": 6909 + }, + { + "epoch": 5.441118550610477, + "grad_norm": 0.6520637273788452, + "learning_rate": 2.0712e-05, + "loss": 0.0368, + "step": 6910 + }, + { + "epoch": 5.441906262307995, + "grad_norm": 0.2938183844089508, + "learning_rate": 2.0715e-05, + "loss": 0.0216, + "step": 6911 + }, + { + "epoch": 5.442693974005514, + "grad_norm": 0.4814879894256592, + "learning_rate": 2.0718e-05, + "loss": 0.0294, + "step": 6912 + }, + { + "epoch": 5.443481685703032, + "grad_norm": 0.31409573554992676, + "learning_rate": 2.0721e-05, + "loss": 0.0197, + "step": 6913 + }, + { + "epoch": 5.444269397400552, + "grad_norm": 0.5319311618804932, + "learning_rate": 2.0724e-05, + "loss": 0.0339, + "step": 6914 + }, + { + "epoch": 5.44505710909807, + "grad_norm": 0.46919581294059753, + "learning_rate": 2.0727e-05, + "loss": 0.053, + "step": 6915 + }, + { + "epoch": 5.445844820795589, + 
"grad_norm": 0.5676921010017395, + "learning_rate": 2.073e-05, + "loss": 0.0227, + "step": 6916 + }, + { + "epoch": 5.4466325324931075, + "grad_norm": 0.4479576051235199, + "learning_rate": 2.0733000000000003e-05, + "loss": 0.0253, + "step": 6917 + }, + { + "epoch": 5.447420244190626, + "grad_norm": 0.25911957025527954, + "learning_rate": 2.0736000000000003e-05, + "loss": 0.0172, + "step": 6918 + }, + { + "epoch": 5.448207955888145, + "grad_norm": 0.4336715638637543, + "learning_rate": 2.0739000000000003e-05, + "loss": 0.02, + "step": 6919 + }, + { + "epoch": 5.448995667585663, + "grad_norm": 1.6972404718399048, + "learning_rate": 2.0742000000000002e-05, + "loss": 0.027, + "step": 6920 + }, + { + "epoch": 5.449783379283183, + "grad_norm": 0.5437182188034058, + "learning_rate": 2.0745000000000002e-05, + "loss": 0.0345, + "step": 6921 + }, + { + "epoch": 5.450571090980701, + "grad_norm": 0.38046130537986755, + "learning_rate": 2.0748000000000002e-05, + "loss": 0.0167, + "step": 6922 + }, + { + "epoch": 5.45135880267822, + "grad_norm": 0.7299495339393616, + "learning_rate": 2.0751e-05, + "loss": 0.0282, + "step": 6923 + }, + { + "epoch": 5.452146514375738, + "grad_norm": 0.7324561476707458, + "learning_rate": 2.0753999999999998e-05, + "loss": 0.0341, + "step": 6924 + }, + { + "epoch": 5.452934226073257, + "grad_norm": 0.3468732237815857, + "learning_rate": 2.0756999999999998e-05, + "loss": 0.0161, + "step": 6925 + }, + { + "epoch": 5.4537219377707755, + "grad_norm": 0.47409743070602417, + "learning_rate": 2.0759999999999998e-05, + "loss": 0.0314, + "step": 6926 + }, + { + "epoch": 5.454509649468295, + "grad_norm": 0.4709130823612213, + "learning_rate": 2.0763e-05, + "loss": 0.0264, + "step": 6927 + }, + { + "epoch": 5.4552973611658135, + "grad_norm": 0.38764429092407227, + "learning_rate": 2.0766e-05, + "loss": 0.0234, + "step": 6928 + }, + { + "epoch": 5.456085072863332, + "grad_norm": 0.5427576303482056, + "learning_rate": 2.0769e-05, + "loss": 0.0293, + "step": 6929 + }, + { + "epoch": 5.456872784560851, + "grad_norm": 0.3369499146938324, + "learning_rate": 2.0772e-05, + "loss": 0.0151, + "step": 6930 + }, + { + "epoch": 5.457660496258369, + "grad_norm": 0.5900097489356995, + "learning_rate": 2.0775e-05, + "loss": 0.0289, + "step": 6931 + }, + { + "epoch": 5.458448207955888, + "grad_norm": 0.7891398668289185, + "learning_rate": 2.0778e-05, + "loss": 0.0385, + "step": 6932 + }, + { + "epoch": 5.459235919653407, + "grad_norm": 0.3820435106754303, + "learning_rate": 2.0781e-05, + "loss": 0.0296, + "step": 6933 + }, + { + "epoch": 5.460023631350926, + "grad_norm": 0.47130343317985535, + "learning_rate": 2.0784e-05, + "loss": 0.0181, + "step": 6934 + }, + { + "epoch": 5.460811343048444, + "grad_norm": 0.5665684938430786, + "learning_rate": 2.0787e-05, + "loss": 0.0304, + "step": 6935 + }, + { + "epoch": 5.461599054745963, + "grad_norm": 0.5066754221916199, + "learning_rate": 2.079e-05, + "loss": 0.0203, + "step": 6936 + }, + { + "epoch": 5.4623867664434815, + "grad_norm": 0.47782307863235474, + "learning_rate": 2.0793000000000002e-05, + "loss": 0.0247, + "step": 6937 + }, + { + "epoch": 5.463174478141, + "grad_norm": 0.82486891746521, + "learning_rate": 2.0796000000000002e-05, + "loss": 0.037, + "step": 6938 + }, + { + "epoch": 5.463962189838519, + "grad_norm": 0.5653901696205139, + "learning_rate": 2.0799000000000002e-05, + "loss": 0.0319, + "step": 6939 + }, + { + "epoch": 5.464749901536038, + "grad_norm": 0.4854590892791748, + "learning_rate": 2.0802000000000002e-05, + "loss": 0.0219, + 
"step": 6940 + }, + { + "epoch": 5.465537613233557, + "grad_norm": 0.38489747047424316, + "learning_rate": 2.0805e-05, + "loss": 0.0208, + "step": 6941 + }, + { + "epoch": 5.466325324931075, + "grad_norm": 2.207014560699463, + "learning_rate": 2.0808e-05, + "loss": 0.0383, + "step": 6942 + }, + { + "epoch": 5.467113036628594, + "grad_norm": 0.9362514019012451, + "learning_rate": 2.0811e-05, + "loss": 0.0226, + "step": 6943 + }, + { + "epoch": 5.467900748326112, + "grad_norm": 0.7335556149482727, + "learning_rate": 2.0814e-05, + "loss": 0.0489, + "step": 6944 + }, + { + "epoch": 5.468688460023631, + "grad_norm": 0.312296062707901, + "learning_rate": 2.0817e-05, + "loss": 0.0194, + "step": 6945 + }, + { + "epoch": 5.46947617172115, + "grad_norm": 0.4097836911678314, + "learning_rate": 2.082e-05, + "loss": 0.0186, + "step": 6946 + }, + { + "epoch": 5.470263883418669, + "grad_norm": 0.6395778059959412, + "learning_rate": 2.0823e-05, + "loss": 0.0698, + "step": 6947 + }, + { + "epoch": 5.4710515951161875, + "grad_norm": 0.32984209060668945, + "learning_rate": 2.0826e-05, + "loss": 0.0197, + "step": 6948 + }, + { + "epoch": 5.471839306813706, + "grad_norm": 2.6325771808624268, + "learning_rate": 2.0829e-05, + "loss": 0.0305, + "step": 6949 + }, + { + "epoch": 5.472627018511225, + "grad_norm": 0.7974812984466553, + "learning_rate": 2.0832e-05, + "loss": 0.0424, + "step": 6950 + }, + { + "epoch": 5.473414730208743, + "grad_norm": 1.2094793319702148, + "learning_rate": 2.0835e-05, + "loss": 0.2903, + "step": 6951 + }, + { + "epoch": 5.474202441906263, + "grad_norm": 0.9292858242988586, + "learning_rate": 2.0838e-05, + "loss": 0.2378, + "step": 6952 + }, + { + "epoch": 5.474990153603781, + "grad_norm": 0.7993447184562683, + "learning_rate": 2.0841e-05, + "loss": 0.2172, + "step": 6953 + }, + { + "epoch": 5.4757778653013, + "grad_norm": 0.57109135389328, + "learning_rate": 2.0844e-05, + "loss": 0.0971, + "step": 6954 + }, + { + "epoch": 5.476565576998818, + "grad_norm": 0.6326733231544495, + "learning_rate": 2.0847e-05, + "loss": 0.1205, + "step": 6955 + }, + { + "epoch": 5.477353288696337, + "grad_norm": 0.36726832389831543, + "learning_rate": 2.085e-05, + "loss": 0.0606, + "step": 6956 + }, + { + "epoch": 5.4781410003938555, + "grad_norm": 0.454076886177063, + "learning_rate": 2.0853000000000002e-05, + "loss": 0.0313, + "step": 6957 + }, + { + "epoch": 5.478928712091374, + "grad_norm": 0.3536953926086426, + "learning_rate": 2.0856e-05, + "loss": 0.0463, + "step": 6958 + }, + { + "epoch": 5.479716423788894, + "grad_norm": 0.3215349018573761, + "learning_rate": 2.0859e-05, + "loss": 0.0217, + "step": 6959 + }, + { + "epoch": 5.480504135486412, + "grad_norm": 0.52034991979599, + "learning_rate": 2.0862e-05, + "loss": 0.0235, + "step": 6960 + }, + { + "epoch": 5.481291847183931, + "grad_norm": 0.3318585753440857, + "learning_rate": 2.0865e-05, + "loss": 0.0496, + "step": 6961 + }, + { + "epoch": 5.482079558881449, + "grad_norm": 0.2900337278842926, + "learning_rate": 2.0868e-05, + "loss": 0.0212, + "step": 6962 + }, + { + "epoch": 5.482867270578968, + "grad_norm": 0.3577899932861328, + "learning_rate": 2.0871e-05, + "loss": 0.0269, + "step": 6963 + }, + { + "epoch": 5.483654982276486, + "grad_norm": 0.659993052482605, + "learning_rate": 2.0874e-05, + "loss": 0.02, + "step": 6964 + }, + { + "epoch": 5.484442693974006, + "grad_norm": 0.4519389867782593, + "learning_rate": 2.0877e-05, + "loss": 0.0289, + "step": 6965 + }, + { + "epoch": 5.485230405671524, + "grad_norm": 0.5766428709030151, + 
"learning_rate": 2.088e-05, + "loss": 0.0243, + "step": 6966 + }, + { + "epoch": 5.486018117369043, + "grad_norm": 0.5853443741798401, + "learning_rate": 2.0883000000000003e-05, + "loss": 0.0368, + "step": 6967 + }, + { + "epoch": 5.486805829066562, + "grad_norm": 0.33551228046417236, + "learning_rate": 2.0886000000000003e-05, + "loss": 0.0191, + "step": 6968 + }, + { + "epoch": 5.48759354076408, + "grad_norm": 0.4055820107460022, + "learning_rate": 2.0889000000000003e-05, + "loss": 0.0237, + "step": 6969 + }, + { + "epoch": 5.488381252461599, + "grad_norm": 0.41821739077568054, + "learning_rate": 2.0892000000000003e-05, + "loss": 0.0284, + "step": 6970 + }, + { + "epoch": 5.489168964159118, + "grad_norm": 0.3371080160140991, + "learning_rate": 2.0895000000000002e-05, + "loss": 0.0173, + "step": 6971 + }, + { + "epoch": 5.489956675856637, + "grad_norm": 0.777405858039856, + "learning_rate": 2.0898e-05, + "loss": 0.0388, + "step": 6972 + }, + { + "epoch": 5.490744387554155, + "grad_norm": 0.3331267833709717, + "learning_rate": 2.0901e-05, + "loss": 0.016, + "step": 6973 + }, + { + "epoch": 5.491532099251674, + "grad_norm": 0.6350351572036743, + "learning_rate": 2.0904e-05, + "loss": 0.0244, + "step": 6974 + }, + { + "epoch": 5.492319810949192, + "grad_norm": 0.5456006526947021, + "learning_rate": 2.0906999999999998e-05, + "loss": 0.0367, + "step": 6975 + }, + { + "epoch": 5.493107522646711, + "grad_norm": 0.34385251998901367, + "learning_rate": 2.0909999999999998e-05, + "loss": 0.0306, + "step": 6976 + }, + { + "epoch": 5.4938952343442296, + "grad_norm": 0.3622317314147949, + "learning_rate": 2.0913e-05, + "loss": 0.0257, + "step": 6977 + }, + { + "epoch": 5.494682946041749, + "grad_norm": 0.43278175592422485, + "learning_rate": 2.0916e-05, + "loss": 0.0356, + "step": 6978 + }, + { + "epoch": 5.495470657739268, + "grad_norm": 0.9489392042160034, + "learning_rate": 2.0919e-05, + "loss": 0.0352, + "step": 6979 + }, + { + "epoch": 5.496258369436786, + "grad_norm": 0.3366234004497528, + "learning_rate": 2.0922e-05, + "loss": 0.0264, + "step": 6980 + }, + { + "epoch": 5.497046081134305, + "grad_norm": 0.38550469279289246, + "learning_rate": 2.0925e-05, + "loss": 0.0224, + "step": 6981 + }, + { + "epoch": 5.497833792831823, + "grad_norm": 0.5701372027397156, + "learning_rate": 2.0928e-05, + "loss": 0.0304, + "step": 6982 + }, + { + "epoch": 5.498621504529343, + "grad_norm": 0.23970668017864227, + "learning_rate": 2.0931e-05, + "loss": 0.0209, + "step": 6983 + }, + { + "epoch": 5.499409216226861, + "grad_norm": 0.5504406690597534, + "learning_rate": 2.0934e-05, + "loss": 0.0341, + "step": 6984 + }, + { + "epoch": 5.50019692792438, + "grad_norm": 0.49303022027015686, + "learning_rate": 2.0937e-05, + "loss": 0.031, + "step": 6985 + }, + { + "epoch": 5.500984639621898, + "grad_norm": 0.7187022566795349, + "learning_rate": 2.094e-05, + "loss": 0.0288, + "step": 6986 + }, + { + "epoch": 5.501772351319417, + "grad_norm": 0.4547847509384155, + "learning_rate": 2.0943000000000003e-05, + "loss": 0.0203, + "step": 6987 + }, + { + "epoch": 5.502560063016936, + "grad_norm": 0.46817293763160706, + "learning_rate": 2.0946000000000002e-05, + "loss": 0.0324, + "step": 6988 + }, + { + "epoch": 5.503347774714454, + "grad_norm": 0.5512637495994568, + "learning_rate": 2.0949000000000002e-05, + "loss": 0.0329, + "step": 6989 + }, + { + "epoch": 5.504135486411974, + "grad_norm": 0.23591463267803192, + "learning_rate": 2.0952000000000002e-05, + "loss": 0.0189, + "step": 6990 + }, + { + "epoch": 5.504923198109492, + 
"grad_norm": 0.4788447320461273, + "learning_rate": 2.0955000000000002e-05, + "loss": 0.0331, + "step": 6991 + }, + { + "epoch": 5.505710909807011, + "grad_norm": 0.3440135419368744, + "learning_rate": 2.0958e-05, + "loss": 0.0166, + "step": 6992 + }, + { + "epoch": 5.506498621504529, + "grad_norm": 0.48760494589805603, + "learning_rate": 2.0961e-05, + "loss": 0.0324, + "step": 6993 + }, + { + "epoch": 5.507286333202048, + "grad_norm": 0.4540562629699707, + "learning_rate": 2.0964e-05, + "loss": 0.0289, + "step": 6994 + }, + { + "epoch": 5.508074044899566, + "grad_norm": 0.5843654870986938, + "learning_rate": 2.0967e-05, + "loss": 0.0306, + "step": 6995 + }, + { + "epoch": 5.508861756597085, + "grad_norm": 0.717517614364624, + "learning_rate": 2.097e-05, + "loss": 0.0257, + "step": 6996 + }, + { + "epoch": 5.5096494682946044, + "grad_norm": 0.6004114151000977, + "learning_rate": 2.0973e-05, + "loss": 0.0292, + "step": 6997 + }, + { + "epoch": 5.510437179992123, + "grad_norm": 0.47758498787879944, + "learning_rate": 2.0976e-05, + "loss": 0.02, + "step": 6998 + }, + { + "epoch": 5.511224891689642, + "grad_norm": 0.7219842076301575, + "learning_rate": 2.0979e-05, + "loss": 0.0281, + "step": 6999 + }, + { + "epoch": 5.51201260338716, + "grad_norm": 0.9073770046234131, + "learning_rate": 2.0982e-05, + "loss": 0.0441, + "step": 7000 + }, + { + "epoch": 5.51201260338716, + "eval_cer": 0.1293776084195246, + "eval_loss": 0.40152251720428467, + "eval_runtime": 16.2649, + "eval_samples_per_second": 18.691, + "eval_steps_per_second": 0.615, + "eval_wer": 0.4460859554873369, + "step": 7000 + }, + { + "epoch": 5.512800315084679, + "grad_norm": 1.6457337141036987, + "learning_rate": 2.0985e-05, + "loss": 0.2854, + "step": 7001 + }, + { + "epoch": 5.513588026782198, + "grad_norm": 1.2578688859939575, + "learning_rate": 2.0988e-05, + "loss": 0.2862, + "step": 7002 + }, + { + "epoch": 5.514375738479717, + "grad_norm": 1.149498701095581, + "learning_rate": 2.0991e-05, + "loss": 0.2492, + "step": 7003 + }, + { + "epoch": 5.515163450177235, + "grad_norm": 1.2760318517684937, + "learning_rate": 2.0994e-05, + "loss": 0.1988, + "step": 7004 + }, + { + "epoch": 5.515951161874754, + "grad_norm": 0.6911354660987854, + "learning_rate": 2.0997e-05, + "loss": 0.1556, + "step": 7005 + }, + { + "epoch": 5.516738873572272, + "grad_norm": 1.207025408744812, + "learning_rate": 2.1e-05, + "loss": 0.1083, + "step": 7006 + }, + { + "epoch": 5.517526585269791, + "grad_norm": 0.42789071798324585, + "learning_rate": 2.1003e-05, + "loss": 0.0639, + "step": 7007 + }, + { + "epoch": 5.51831429696731, + "grad_norm": 0.4126545488834381, + "learning_rate": 2.1006000000000002e-05, + "loss": 0.045, + "step": 7008 + }, + { + "epoch": 5.519102008664829, + "grad_norm": 0.39140936732292175, + "learning_rate": 2.1009e-05, + "loss": 0.0337, + "step": 7009 + }, + { + "epoch": 5.519889720362348, + "grad_norm": 0.32190635800361633, + "learning_rate": 2.1012e-05, + "loss": 0.0424, + "step": 7010 + }, + { + "epoch": 5.520677432059866, + "grad_norm": 0.2656075656414032, + "learning_rate": 2.1015e-05, + "loss": 0.0179, + "step": 7011 + }, + { + "epoch": 5.521465143757385, + "grad_norm": 0.4448200464248657, + "learning_rate": 2.1018e-05, + "loss": 0.0271, + "step": 7012 + }, + { + "epoch": 5.522252855454903, + "grad_norm": 0.2407776564359665, + "learning_rate": 2.1021e-05, + "loss": 0.0208, + "step": 7013 + }, + { + "epoch": 5.523040567152422, + "grad_norm": 0.36249151825904846, + "learning_rate": 2.1024e-05, + "loss": 0.0225, + "step": 7014 + }, + { 
+ "epoch": 5.52382827884994, + "grad_norm": 0.4967881739139557, + "learning_rate": 2.1027e-05, + "loss": 0.0147, + "step": 7015 + }, + { + "epoch": 5.52461599054746, + "grad_norm": 0.3117586076259613, + "learning_rate": 2.103e-05, + "loss": 0.0142, + "step": 7016 + }, + { + "epoch": 5.5254037022449785, + "grad_norm": 0.7568888664245605, + "learning_rate": 2.1033e-05, + "loss": 0.0326, + "step": 7017 + }, + { + "epoch": 5.526191413942497, + "grad_norm": 0.3683883547782898, + "learning_rate": 2.1036000000000003e-05, + "loss": 0.0217, + "step": 7018 + }, + { + "epoch": 5.526979125640016, + "grad_norm": 0.534612774848938, + "learning_rate": 2.1039000000000003e-05, + "loss": 0.0327, + "step": 7019 + }, + { + "epoch": 5.527766837337534, + "grad_norm": 0.6526352763175964, + "learning_rate": 2.1042000000000003e-05, + "loss": 0.0238, + "step": 7020 + }, + { + "epoch": 5.528554549035054, + "grad_norm": 0.5658441185951233, + "learning_rate": 2.1045e-05, + "loss": 0.0146, + "step": 7021 + }, + { + "epoch": 5.529342260732572, + "grad_norm": 0.8049973845481873, + "learning_rate": 2.1048e-05, + "loss": 0.016, + "step": 7022 + }, + { + "epoch": 5.530129972430091, + "grad_norm": 0.4498137831687927, + "learning_rate": 2.1051e-05, + "loss": 0.0207, + "step": 7023 + }, + { + "epoch": 5.530917684127609, + "grad_norm": 0.8456543684005737, + "learning_rate": 2.1054e-05, + "loss": 0.0205, + "step": 7024 + }, + { + "epoch": 5.531705395825128, + "grad_norm": 0.3627668023109436, + "learning_rate": 2.1057e-05, + "loss": 0.0184, + "step": 7025 + }, + { + "epoch": 5.5324931075226464, + "grad_norm": 0.3571755290031433, + "learning_rate": 2.1059999999999998e-05, + "loss": 0.016, + "step": 7026 + }, + { + "epoch": 5.533280819220165, + "grad_norm": 0.20711691677570343, + "learning_rate": 2.1062999999999998e-05, + "loss": 0.0125, + "step": 7027 + }, + { + "epoch": 5.5340685309176845, + "grad_norm": 0.47851112484931946, + "learning_rate": 2.1066e-05, + "loss": 0.0215, + "step": 7028 + }, + { + "epoch": 5.534856242615203, + "grad_norm": 0.3743222653865814, + "learning_rate": 2.1069e-05, + "loss": 0.0297, + "step": 7029 + }, + { + "epoch": 5.535643954312722, + "grad_norm": 0.6916783452033997, + "learning_rate": 2.1072e-05, + "loss": 0.0293, + "step": 7030 + }, + { + "epoch": 5.53643166601024, + "grad_norm": 0.47285839915275574, + "learning_rate": 2.1075e-05, + "loss": 0.0273, + "step": 7031 + }, + { + "epoch": 5.537219377707759, + "grad_norm": 0.30711302161216736, + "learning_rate": 2.1078e-05, + "loss": 0.0262, + "step": 7032 + }, + { + "epoch": 5.538007089405277, + "grad_norm": 0.528069257736206, + "learning_rate": 2.1081e-05, + "loss": 0.0312, + "step": 7033 + }, + { + "epoch": 5.538794801102797, + "grad_norm": 0.5474472641944885, + "learning_rate": 2.1084e-05, + "loss": 0.028, + "step": 7034 + }, + { + "epoch": 5.539582512800315, + "grad_norm": 0.5410220623016357, + "learning_rate": 2.1087e-05, + "loss": 0.0213, + "step": 7035 + }, + { + "epoch": 5.540370224497834, + "grad_norm": 0.44917160272598267, + "learning_rate": 2.109e-05, + "loss": 0.0321, + "step": 7036 + }, + { + "epoch": 5.5411579361953525, + "grad_norm": 0.3177996575832367, + "learning_rate": 2.1093e-05, + "loss": 0.0193, + "step": 7037 + }, + { + "epoch": 5.541945647892871, + "grad_norm": 0.6378876566886902, + "learning_rate": 2.1096000000000003e-05, + "loss": 0.0319, + "step": 7038 + }, + { + "epoch": 5.54273335959039, + "grad_norm": 0.38920843601226807, + "learning_rate": 2.1099000000000002e-05, + "loss": 0.0227, + "step": 7039 + }, + { + "epoch": 
5.543521071287909, + "grad_norm": 0.49395501613616943, + "learning_rate": 2.1102000000000002e-05, + "loss": 0.0355, + "step": 7040 + }, + { + "epoch": 5.544308782985428, + "grad_norm": 0.37903812527656555, + "learning_rate": 2.1105000000000002e-05, + "loss": 0.0307, + "step": 7041 + }, + { + "epoch": 5.545096494682946, + "grad_norm": 0.360254168510437, + "learning_rate": 2.1108000000000002e-05, + "loss": 0.016, + "step": 7042 + }, + { + "epoch": 5.545884206380465, + "grad_norm": 0.7641535401344299, + "learning_rate": 2.1111e-05, + "loss": 0.0246, + "step": 7043 + }, + { + "epoch": 5.546671918077983, + "grad_norm": 0.9368769526481628, + "learning_rate": 2.1114e-05, + "loss": 0.038, + "step": 7044 + }, + { + "epoch": 5.547459629775502, + "grad_norm": 0.4087802767753601, + "learning_rate": 2.1117e-05, + "loss": 0.031, + "step": 7045 + }, + { + "epoch": 5.5482473414730205, + "grad_norm": 0.5280166268348694, + "learning_rate": 2.1119999999999998e-05, + "loss": 0.0292, + "step": 7046 + }, + { + "epoch": 5.54903505317054, + "grad_norm": 0.482792466878891, + "learning_rate": 2.1122999999999997e-05, + "loss": 0.0199, + "step": 7047 + }, + { + "epoch": 5.5498227648680585, + "grad_norm": 0.579125165939331, + "learning_rate": 2.1126e-05, + "loss": 0.0277, + "step": 7048 + }, + { + "epoch": 5.550610476565577, + "grad_norm": 0.734113872051239, + "learning_rate": 2.1129e-05, + "loss": 0.0454, + "step": 7049 + }, + { + "epoch": 5.551398188263096, + "grad_norm": 0.6013332009315491, + "learning_rate": 2.1132e-05, + "loss": 0.0356, + "step": 7050 + }, + { + "epoch": 5.552185899960614, + "grad_norm": 1.6361263990402222, + "learning_rate": 2.1135e-05, + "loss": 0.3306, + "step": 7051 + }, + { + "epoch": 5.552973611658133, + "grad_norm": 1.0779446363449097, + "learning_rate": 2.1138e-05, + "loss": 0.2685, + "step": 7052 + }, + { + "epoch": 5.553761323355652, + "grad_norm": 1.4067671298980713, + "learning_rate": 2.1141e-05, + "loss": 0.1848, + "step": 7053 + }, + { + "epoch": 5.554549035053171, + "grad_norm": 0.8741398453712463, + "learning_rate": 2.1144e-05, + "loss": 0.1771, + "step": 7054 + }, + { + "epoch": 5.555336746750689, + "grad_norm": 0.6630369424819946, + "learning_rate": 2.1147e-05, + "loss": 0.1072, + "step": 7055 + }, + { + "epoch": 5.556124458448208, + "grad_norm": 0.442925363779068, + "learning_rate": 2.115e-05, + "loss": 0.1106, + "step": 7056 + }, + { + "epoch": 5.5569121701457265, + "grad_norm": 0.5386802554130554, + "learning_rate": 2.1153e-05, + "loss": 0.0471, + "step": 7057 + }, + { + "epoch": 5.557699881843245, + "grad_norm": 0.3594805598258972, + "learning_rate": 2.1156000000000002e-05, + "loss": 0.0456, + "step": 7058 + }, + { + "epoch": 5.5584875935407645, + "grad_norm": 0.34260088205337524, + "learning_rate": 2.1159000000000002e-05, + "loss": 0.0306, + "step": 7059 + }, + { + "epoch": 5.559275305238283, + "grad_norm": 0.4387231171131134, + "learning_rate": 2.1162e-05, + "loss": 0.0287, + "step": 7060 + }, + { + "epoch": 5.560063016935802, + "grad_norm": 0.3862529993057251, + "learning_rate": 2.1165e-05, + "loss": 0.0224, + "step": 7061 + }, + { + "epoch": 5.56085072863332, + "grad_norm": 0.45310837030410767, + "learning_rate": 2.1168e-05, + "loss": 0.0265, + "step": 7062 + }, + { + "epoch": 5.561638440330839, + "grad_norm": 0.32069042325019836, + "learning_rate": 2.1171e-05, + "loss": 0.0269, + "step": 7063 + }, + { + "epoch": 5.562426152028357, + "grad_norm": 0.3408018946647644, + "learning_rate": 2.1174e-05, + "loss": 0.0152, + "step": 7064 + }, + { + "epoch": 5.563213863725876, + 
"grad_norm": 0.26270216703414917, + "learning_rate": 2.1177e-05, + "loss": 0.0237, + "step": 7065 + }, + { + "epoch": 5.564001575423395, + "grad_norm": 0.25714096426963806, + "learning_rate": 2.118e-05, + "loss": 0.0171, + "step": 7066 + }, + { + "epoch": 5.564789287120914, + "grad_norm": 0.2899719178676605, + "learning_rate": 2.1183e-05, + "loss": 0.0129, + "step": 7067 + }, + { + "epoch": 5.5655769988184325, + "grad_norm": 0.3701445162296295, + "learning_rate": 2.1186000000000003e-05, + "loss": 0.0227, + "step": 7068 + }, + { + "epoch": 5.566364710515951, + "grad_norm": 0.5463692545890808, + "learning_rate": 2.1189000000000003e-05, + "loss": 0.0332, + "step": 7069 + }, + { + "epoch": 5.56715242221347, + "grad_norm": 0.5459867715835571, + "learning_rate": 2.1192e-05, + "loss": 0.0321, + "step": 7070 + }, + { + "epoch": 5.567940133910989, + "grad_norm": 0.5581908226013184, + "learning_rate": 2.1195e-05, + "loss": 0.0261, + "step": 7071 + }, + { + "epoch": 5.568727845608508, + "grad_norm": 0.33199676871299744, + "learning_rate": 2.1198e-05, + "loss": 0.0172, + "step": 7072 + }, + { + "epoch": 5.569515557306026, + "grad_norm": 0.3328258991241455, + "learning_rate": 2.1201e-05, + "loss": 0.0188, + "step": 7073 + }, + { + "epoch": 5.570303269003545, + "grad_norm": 0.4445253610610962, + "learning_rate": 2.1204e-05, + "loss": 0.0206, + "step": 7074 + }, + { + "epoch": 5.571090980701063, + "grad_norm": 0.5046467781066895, + "learning_rate": 2.1207e-05, + "loss": 0.0256, + "step": 7075 + }, + { + "epoch": 5.571878692398582, + "grad_norm": 0.7148515582084656, + "learning_rate": 2.121e-05, + "loss": 0.0207, + "step": 7076 + }, + { + "epoch": 5.5726664040961005, + "grad_norm": 0.31251809000968933, + "learning_rate": 2.1213e-05, + "loss": 0.0185, + "step": 7077 + }, + { + "epoch": 5.57345411579362, + "grad_norm": 0.38265642523765564, + "learning_rate": 2.1216e-05, + "loss": 0.0254, + "step": 7078 + }, + { + "epoch": 5.5742418274911385, + "grad_norm": 0.5246134996414185, + "learning_rate": 2.1219e-05, + "loss": 0.0264, + "step": 7079 + }, + { + "epoch": 5.575029539188657, + "grad_norm": 0.38794174790382385, + "learning_rate": 2.1222e-05, + "loss": 0.0222, + "step": 7080 + }, + { + "epoch": 5.575817250886176, + "grad_norm": 0.6987031698226929, + "learning_rate": 2.1225e-05, + "loss": 0.0298, + "step": 7081 + }, + { + "epoch": 5.576604962583694, + "grad_norm": 0.5657626986503601, + "learning_rate": 2.1228e-05, + "loss": 0.0278, + "step": 7082 + }, + { + "epoch": 5.577392674281213, + "grad_norm": 0.2651216387748718, + "learning_rate": 2.1231e-05, + "loss": 0.0187, + "step": 7083 + }, + { + "epoch": 5.578180385978731, + "grad_norm": 0.28233104944229126, + "learning_rate": 2.1234e-05, + "loss": 0.023, + "step": 7084 + }, + { + "epoch": 5.578968097676251, + "grad_norm": 0.39436760544776917, + "learning_rate": 2.1237e-05, + "loss": 0.0184, + "step": 7085 + }, + { + "epoch": 5.579755809373769, + "grad_norm": 0.31633561849594116, + "learning_rate": 2.124e-05, + "loss": 0.0225, + "step": 7086 + }, + { + "epoch": 5.580543521071288, + "grad_norm": 0.3140537440776825, + "learning_rate": 2.1243e-05, + "loss": 0.0217, + "step": 7087 + }, + { + "epoch": 5.5813312327688065, + "grad_norm": 0.5200657844543457, + "learning_rate": 2.1246000000000003e-05, + "loss": 0.0363, + "step": 7088 + }, + { + "epoch": 5.582118944466325, + "grad_norm": 0.8604832291603088, + "learning_rate": 2.1249000000000003e-05, + "loss": 0.0352, + "step": 7089 + }, + { + "epoch": 5.5829066561638445, + "grad_norm": 0.7048577666282654, + 
"learning_rate": 2.1252000000000003e-05, + "loss": 0.0524, + "step": 7090 + }, + { + "epoch": 5.583694367861363, + "grad_norm": 0.3456583321094513, + "learning_rate": 2.1255000000000002e-05, + "loss": 0.0229, + "step": 7091 + }, + { + "epoch": 5.584482079558882, + "grad_norm": 0.7231417894363403, + "learning_rate": 2.1258000000000002e-05, + "loss": 0.0253, + "step": 7092 + }, + { + "epoch": 5.5852697912564, + "grad_norm": 0.3258662819862366, + "learning_rate": 2.1261000000000002e-05, + "loss": 0.0255, + "step": 7093 + }, + { + "epoch": 5.586057502953919, + "grad_norm": 0.4420015513896942, + "learning_rate": 2.1264000000000002e-05, + "loss": 0.0237, + "step": 7094 + }, + { + "epoch": 5.586845214651437, + "grad_norm": 0.8909416198730469, + "learning_rate": 2.1266999999999998e-05, + "loss": 0.0353, + "step": 7095 + }, + { + "epoch": 5.587632926348956, + "grad_norm": 0.7293686270713806, + "learning_rate": 2.1269999999999998e-05, + "loss": 0.0218, + "step": 7096 + }, + { + "epoch": 5.588420638046475, + "grad_norm": 0.5245540142059326, + "learning_rate": 2.1272999999999998e-05, + "loss": 0.0192, + "step": 7097 + }, + { + "epoch": 5.589208349743994, + "grad_norm": 0.5090667605400085, + "learning_rate": 2.1276e-05, + "loss": 0.0328, + "step": 7098 + }, + { + "epoch": 5.5899960614415125, + "grad_norm": 0.7607643604278564, + "learning_rate": 2.1279e-05, + "loss": 0.046, + "step": 7099 + }, + { + "epoch": 5.590783773139031, + "grad_norm": 0.4896757900714874, + "learning_rate": 2.1282e-05, + "loss": 0.0283, + "step": 7100 + }, + { + "epoch": 5.59157148483655, + "grad_norm": 1.2498040199279785, + "learning_rate": 2.1285e-05, + "loss": 0.3371, + "step": 7101 + }, + { + "epoch": 5.592359196534068, + "grad_norm": 1.061330795288086, + "learning_rate": 2.1288e-05, + "loss": 0.2877, + "step": 7102 + }, + { + "epoch": 5.593146908231587, + "grad_norm": 0.8926878571510315, + "learning_rate": 2.1291e-05, + "loss": 0.1464, + "step": 7103 + }, + { + "epoch": 5.593934619929106, + "grad_norm": 0.5855798125267029, + "learning_rate": 2.1294e-05, + "loss": 0.1348, + "step": 7104 + }, + { + "epoch": 5.594722331626625, + "grad_norm": 0.5689395070075989, + "learning_rate": 2.1297e-05, + "loss": 0.1424, + "step": 7105 + }, + { + "epoch": 5.595510043324143, + "grad_norm": 0.4570690989494324, + "learning_rate": 2.13e-05, + "loss": 0.0924, + "step": 7106 + }, + { + "epoch": 5.596297755021662, + "grad_norm": 0.6209364533424377, + "learning_rate": 2.1303e-05, + "loss": 0.0365, + "step": 7107 + }, + { + "epoch": 5.5970854667191805, + "grad_norm": 0.393737256526947, + "learning_rate": 2.1306000000000002e-05, + "loss": 0.0525, + "step": 7108 + }, + { + "epoch": 5.5978731784167, + "grad_norm": 0.5345342755317688, + "learning_rate": 2.1309000000000002e-05, + "loss": 0.0622, + "step": 7109 + }, + { + "epoch": 5.5986608901142185, + "grad_norm": 0.47317004203796387, + "learning_rate": 2.1312000000000002e-05, + "loss": 0.0339, + "step": 7110 + }, + { + "epoch": 5.599448601811737, + "grad_norm": 0.325152724981308, + "learning_rate": 2.1315000000000002e-05, + "loss": 0.024, + "step": 7111 + }, + { + "epoch": 5.600236313509256, + "grad_norm": 0.5486708283424377, + "learning_rate": 2.1318e-05, + "loss": 0.0445, + "step": 7112 + }, + { + "epoch": 5.601024025206774, + "grad_norm": 0.6820536255836487, + "learning_rate": 2.1321e-05, + "loss": 0.0746, + "step": 7113 + }, + { + "epoch": 5.601811736904293, + "grad_norm": 0.19536688923835754, + "learning_rate": 2.1324e-05, + "loss": 0.0125, + "step": 7114 + }, + { + "epoch": 5.602599448601811, + 
"grad_norm": 0.2370947003364563, + "learning_rate": 2.1327e-05, + "loss": 0.0203, + "step": 7115 + }, + { + "epoch": 5.603387160299331, + "grad_norm": 1.9506608247756958, + "learning_rate": 2.133e-05, + "loss": 0.0409, + "step": 7116 + }, + { + "epoch": 5.604174871996849, + "grad_norm": 0.6157649755477905, + "learning_rate": 2.1333e-05, + "loss": 0.0268, + "step": 7117 + }, + { + "epoch": 5.604962583694368, + "grad_norm": 0.29739949107170105, + "learning_rate": 2.1336000000000004e-05, + "loss": 0.0225, + "step": 7118 + }, + { + "epoch": 5.6057502953918865, + "grad_norm": 0.40158843994140625, + "learning_rate": 2.1339e-05, + "loss": 0.0296, + "step": 7119 + }, + { + "epoch": 5.606538007089405, + "grad_norm": 0.357453852891922, + "learning_rate": 2.1342e-05, + "loss": 0.0281, + "step": 7120 + }, + { + "epoch": 5.607325718786924, + "grad_norm": 0.44792529940605164, + "learning_rate": 2.1345e-05, + "loss": 0.0232, + "step": 7121 + }, + { + "epoch": 5.608113430484442, + "grad_norm": 0.39004477858543396, + "learning_rate": 2.1348e-05, + "loss": 0.0189, + "step": 7122 + }, + { + "epoch": 5.608901142181962, + "grad_norm": 0.30492204427719116, + "learning_rate": 2.1351e-05, + "loss": 0.018, + "step": 7123 + }, + { + "epoch": 5.60968885387948, + "grad_norm": 0.5297372937202454, + "learning_rate": 2.1354e-05, + "loss": 0.0455, + "step": 7124 + }, + { + "epoch": 5.610476565576999, + "grad_norm": 0.4256686568260193, + "learning_rate": 2.1357e-05, + "loss": 0.0333, + "step": 7125 + }, + { + "epoch": 5.611264277274517, + "grad_norm": 0.8737795352935791, + "learning_rate": 2.136e-05, + "loss": 0.0211, + "step": 7126 + }, + { + "epoch": 5.612051988972036, + "grad_norm": 0.37534233927726746, + "learning_rate": 2.1363e-05, + "loss": 0.0323, + "step": 7127 + }, + { + "epoch": 5.612839700669555, + "grad_norm": 0.8971350193023682, + "learning_rate": 2.1366000000000002e-05, + "loss": 0.0214, + "step": 7128 + }, + { + "epoch": 5.613627412367074, + "grad_norm": 0.35542356967926025, + "learning_rate": 2.1369e-05, + "loss": 0.0205, + "step": 7129 + }, + { + "epoch": 5.6144151240645925, + "grad_norm": 0.4912213683128357, + "learning_rate": 2.1372e-05, + "loss": 0.0149, + "step": 7130 + }, + { + "epoch": 5.615202835762111, + "grad_norm": 0.2815951704978943, + "learning_rate": 2.1375e-05, + "loss": 0.0159, + "step": 7131 + }, + { + "epoch": 5.61599054745963, + "grad_norm": 0.4582551419734955, + "learning_rate": 2.1378e-05, + "loss": 0.0333, + "step": 7132 + }, + { + "epoch": 5.616778259157148, + "grad_norm": 0.22863811254501343, + "learning_rate": 2.1381e-05, + "loss": 0.0151, + "step": 7133 + }, + { + "epoch": 5.617565970854667, + "grad_norm": 0.46167802810668945, + "learning_rate": 2.1384e-05, + "loss": 0.0386, + "step": 7134 + }, + { + "epoch": 5.618353682552186, + "grad_norm": 0.38074371218681335, + "learning_rate": 2.1387e-05, + "loss": 0.0211, + "step": 7135 + }, + { + "epoch": 5.619141394249705, + "grad_norm": 0.4403972327709198, + "learning_rate": 2.139e-05, + "loss": 0.0197, + "step": 7136 + }, + { + "epoch": 5.619929105947223, + "grad_norm": 0.39985042810440063, + "learning_rate": 2.1393e-05, + "loss": 0.0227, + "step": 7137 + }, + { + "epoch": 5.620716817644742, + "grad_norm": 0.5477603077888489, + "learning_rate": 2.1396e-05, + "loss": 0.0366, + "step": 7138 + }, + { + "epoch": 5.6215045293422605, + "grad_norm": 0.558581531047821, + "learning_rate": 2.1399000000000003e-05, + "loss": 0.0335, + "step": 7139 + }, + { + "epoch": 5.622292241039779, + "grad_norm": 0.42597606778144836, + "learning_rate": 
2.1402000000000003e-05, + "loss": 0.0341, + "step": 7140 + }, + { + "epoch": 5.623079952737298, + "grad_norm": 0.5268921852111816, + "learning_rate": 2.1405000000000003e-05, + "loss": 0.0325, + "step": 7141 + }, + { + "epoch": 5.623867664434817, + "grad_norm": 0.9854416847229004, + "learning_rate": 2.1408000000000002e-05, + "loss": 0.0674, + "step": 7142 + }, + { + "epoch": 5.624655376132336, + "grad_norm": 0.7552934288978577, + "learning_rate": 2.1411000000000002e-05, + "loss": 0.0297, + "step": 7143 + }, + { + "epoch": 5.625443087829854, + "grad_norm": 0.9845038056373596, + "learning_rate": 2.1414e-05, + "loss": 0.0467, + "step": 7144 + }, + { + "epoch": 5.626230799527373, + "grad_norm": 0.45457738637924194, + "learning_rate": 2.1417e-05, + "loss": 0.0249, + "step": 7145 + }, + { + "epoch": 5.627018511224891, + "grad_norm": 0.42581692337989807, + "learning_rate": 2.1419999999999998e-05, + "loss": 0.0376, + "step": 7146 + }, + { + "epoch": 5.627806222922411, + "grad_norm": 0.3468382656574249, + "learning_rate": 2.1422999999999998e-05, + "loss": 0.0234, + "step": 7147 + }, + { + "epoch": 5.628593934619929, + "grad_norm": 0.594038724899292, + "learning_rate": 2.1425999999999998e-05, + "loss": 0.037, + "step": 7148 + }, + { + "epoch": 5.629381646317448, + "grad_norm": 0.5178315043449402, + "learning_rate": 2.1429e-05, + "loss": 0.0334, + "step": 7149 + }, + { + "epoch": 5.6301693580149665, + "grad_norm": 1.5058547258377075, + "learning_rate": 2.1432e-05, + "loss": 0.0726, + "step": 7150 + }, + { + "epoch": 5.630957069712485, + "grad_norm": 1.9123409986495972, + "learning_rate": 2.1435e-05, + "loss": 0.3833, + "step": 7151 + }, + { + "epoch": 5.631744781410004, + "grad_norm": 0.7413482666015625, + "learning_rate": 2.1438e-05, + "loss": 0.2383, + "step": 7152 + }, + { + "epoch": 5.632532493107522, + "grad_norm": 0.544226884841919, + "learning_rate": 2.1441e-05, + "loss": 0.1322, + "step": 7153 + }, + { + "epoch": 5.633320204805042, + "grad_norm": 0.8508384823799133, + "learning_rate": 2.1444e-05, + "loss": 0.1515, + "step": 7154 + }, + { + "epoch": 5.63410791650256, + "grad_norm": 0.915497899055481, + "learning_rate": 2.1447e-05, + "loss": 0.141, + "step": 7155 + }, + { + "epoch": 5.634895628200079, + "grad_norm": 0.6740108728408813, + "learning_rate": 2.145e-05, + "loss": 0.0784, + "step": 7156 + }, + { + "epoch": 5.635683339897597, + "grad_norm": 0.4716634452342987, + "learning_rate": 2.1453e-05, + "loss": 0.0617, + "step": 7157 + }, + { + "epoch": 5.636471051595116, + "grad_norm": 0.4805566668510437, + "learning_rate": 2.1456e-05, + "loss": 0.04, + "step": 7158 + }, + { + "epoch": 5.6372587632926345, + "grad_norm": 0.42492425441741943, + "learning_rate": 2.1459000000000002e-05, + "loss": 0.0404, + "step": 7159 + }, + { + "epoch": 5.638046474990154, + "grad_norm": 0.3663583993911743, + "learning_rate": 2.1462000000000002e-05, + "loss": 0.0225, + "step": 7160 + }, + { + "epoch": 5.638834186687673, + "grad_norm": 0.60567307472229, + "learning_rate": 2.1465000000000002e-05, + "loss": 0.0319, + "step": 7161 + }, + { + "epoch": 5.639621898385191, + "grad_norm": 0.3693927526473999, + "learning_rate": 2.1468000000000002e-05, + "loss": 0.0261, + "step": 7162 + }, + { + "epoch": 5.64040961008271, + "grad_norm": 0.3433874845504761, + "learning_rate": 2.1471e-05, + "loss": 0.0193, + "step": 7163 + }, + { + "epoch": 5.641197321780228, + "grad_norm": 0.3470400869846344, + "learning_rate": 2.1474e-05, + "loss": 0.0256, + "step": 7164 + }, + { + "epoch": 5.641985033477747, + "grad_norm": 
0.43097782135009766, + "learning_rate": 2.1477e-05, + "loss": 0.0267, + "step": 7165 + }, + { + "epoch": 5.642772745175266, + "grad_norm": 0.37969812750816345, + "learning_rate": 2.148e-05, + "loss": 0.0485, + "step": 7166 + }, + { + "epoch": 5.643560456872785, + "grad_norm": 0.4669159948825836, + "learning_rate": 2.1483e-05, + "loss": 0.0329, + "step": 7167 + }, + { + "epoch": 5.644348168570303, + "grad_norm": 0.47004756331443787, + "learning_rate": 2.1486e-05, + "loss": 0.0205, + "step": 7168 + }, + { + "epoch": 5.645135880267822, + "grad_norm": 0.39901918172836304, + "learning_rate": 2.1489e-05, + "loss": 0.029, + "step": 7169 + }, + { + "epoch": 5.6459235919653405, + "grad_norm": 0.8064060211181641, + "learning_rate": 2.1492e-05, + "loss": 0.0416, + "step": 7170 + }, + { + "epoch": 5.646711303662859, + "grad_norm": 0.45331814885139465, + "learning_rate": 2.1495e-05, + "loss": 0.0433, + "step": 7171 + }, + { + "epoch": 5.647499015360378, + "grad_norm": 0.4111799895763397, + "learning_rate": 2.1498e-05, + "loss": 0.0189, + "step": 7172 + }, + { + "epoch": 5.648286727057897, + "grad_norm": 0.5457608699798584, + "learning_rate": 2.1501e-05, + "loss": 0.0374, + "step": 7173 + }, + { + "epoch": 5.649074438755416, + "grad_norm": 0.5601869821548462, + "learning_rate": 2.1504e-05, + "loss": 0.0233, + "step": 7174 + }, + { + "epoch": 5.649862150452934, + "grad_norm": 1.146532654762268, + "learning_rate": 2.1507e-05, + "loss": 0.031, + "step": 7175 + }, + { + "epoch": 5.650649862150453, + "grad_norm": 0.5560693740844727, + "learning_rate": 2.151e-05, + "loss": 0.0741, + "step": 7176 + }, + { + "epoch": 5.651437573847971, + "grad_norm": 0.5726747512817383, + "learning_rate": 2.1513e-05, + "loss": 0.0197, + "step": 7177 + }, + { + "epoch": 5.65222528554549, + "grad_norm": 0.33045390248298645, + "learning_rate": 2.1516e-05, + "loss": 0.0224, + "step": 7178 + }, + { + "epoch": 5.653012997243009, + "grad_norm": 0.5021311640739441, + "learning_rate": 2.1519000000000002e-05, + "loss": 0.0297, + "step": 7179 + }, + { + "epoch": 5.653800708940528, + "grad_norm": 0.599353551864624, + "learning_rate": 2.1522e-05, + "loss": 0.0365, + "step": 7180 + }, + { + "epoch": 5.654588420638047, + "grad_norm": 0.9003356099128723, + "learning_rate": 2.1525e-05, + "loss": 0.0311, + "step": 7181 + }, + { + "epoch": 5.655376132335565, + "grad_norm": 1.1978681087493896, + "learning_rate": 2.1528e-05, + "loss": 0.0224, + "step": 7182 + }, + { + "epoch": 5.656163844033084, + "grad_norm": 0.44419237971305847, + "learning_rate": 2.1531e-05, + "loss": 0.0304, + "step": 7183 + }, + { + "epoch": 5.656951555730602, + "grad_norm": 0.6415567994117737, + "learning_rate": 2.1534e-05, + "loss": 0.0271, + "step": 7184 + }, + { + "epoch": 5.657739267428122, + "grad_norm": 0.5661152005195618, + "learning_rate": 2.1537e-05, + "loss": 0.0205, + "step": 7185 + }, + { + "epoch": 5.65852697912564, + "grad_norm": 0.4010232388973236, + "learning_rate": 2.154e-05, + "loss": 0.0326, + "step": 7186 + }, + { + "epoch": 5.659314690823159, + "grad_norm": 0.32343000173568726, + "learning_rate": 2.1543e-05, + "loss": 0.0291, + "step": 7187 + }, + { + "epoch": 5.660102402520677, + "grad_norm": 0.6757152676582336, + "learning_rate": 2.1546e-05, + "loss": 0.0411, + "step": 7188 + }, + { + "epoch": 5.660890114218196, + "grad_norm": 0.8532548546791077, + "learning_rate": 2.1549000000000003e-05, + "loss": 0.0258, + "step": 7189 + }, + { + "epoch": 5.661677825915715, + "grad_norm": 0.6403523087501526, + "learning_rate": 2.1552000000000003e-05, + "loss": 0.0363, 
+ "step": 7190 + }, + { + "epoch": 5.662465537613233, + "grad_norm": 0.49880659580230713, + "learning_rate": 2.1555000000000003e-05, + "loss": 0.0376, + "step": 7191 + }, + { + "epoch": 5.663253249310753, + "grad_norm": 0.6470566391944885, + "learning_rate": 2.1558000000000003e-05, + "loss": 0.0322, + "step": 7192 + }, + { + "epoch": 5.664040961008271, + "grad_norm": 0.37153714895248413, + "learning_rate": 2.1561e-05, + "loss": 0.0205, + "step": 7193 + }, + { + "epoch": 5.66482867270579, + "grad_norm": 0.4700193703174591, + "learning_rate": 2.1564e-05, + "loss": 0.0261, + "step": 7194 + }, + { + "epoch": 5.665616384403308, + "grad_norm": 0.3505166172981262, + "learning_rate": 2.1567e-05, + "loss": 0.0284, + "step": 7195 + }, + { + "epoch": 5.666404096100827, + "grad_norm": 0.47388893365859985, + "learning_rate": 2.157e-05, + "loss": 0.0285, + "step": 7196 + }, + { + "epoch": 5.667191807798346, + "grad_norm": 0.6622537970542908, + "learning_rate": 2.1572999999999998e-05, + "loss": 0.0329, + "step": 7197 + }, + { + "epoch": 5.667979519495865, + "grad_norm": 0.38256603479385376, + "learning_rate": 2.1575999999999998e-05, + "loss": 0.0142, + "step": 7198 + }, + { + "epoch": 5.668767231193383, + "grad_norm": 0.5837087631225586, + "learning_rate": 2.1579e-05, + "loss": 0.0362, + "step": 7199 + }, + { + "epoch": 5.669554942890902, + "grad_norm": 0.6487307548522949, + "learning_rate": 2.1582e-05, + "loss": 0.0256, + "step": 7200 + }, + { + "epoch": 5.670342654588421, + "grad_norm": 1.2338632345199585, + "learning_rate": 2.1585e-05, + "loss": 0.2578, + "step": 7201 + }, + { + "epoch": 5.671130366285939, + "grad_norm": 0.9618111252784729, + "learning_rate": 2.1588e-05, + "loss": 0.243, + "step": 7202 + }, + { + "epoch": 5.671918077983458, + "grad_norm": 1.1507399082183838, + "learning_rate": 2.1591e-05, + "loss": 0.1786, + "step": 7203 + }, + { + "epoch": 5.672705789680977, + "grad_norm": 0.8426865339279175, + "learning_rate": 2.1594e-05, + "loss": 0.1592, + "step": 7204 + }, + { + "epoch": 5.673493501378496, + "grad_norm": 0.9927628636360168, + "learning_rate": 2.1597e-05, + "loss": 0.1542, + "step": 7205 + }, + { + "epoch": 5.674281213076014, + "grad_norm": 0.44428855180740356, + "learning_rate": 2.16e-05, + "loss": 0.0671, + "step": 7206 + }, + { + "epoch": 5.675068924773533, + "grad_norm": 0.5280542373657227, + "learning_rate": 2.1603e-05, + "loss": 0.0433, + "step": 7207 + }, + { + "epoch": 5.675856636471051, + "grad_norm": 0.45438647270202637, + "learning_rate": 2.1606e-05, + "loss": 0.0275, + "step": 7208 + }, + { + "epoch": 5.67664434816857, + "grad_norm": 0.3029531240463257, + "learning_rate": 2.1609000000000003e-05, + "loss": 0.0248, + "step": 7209 + }, + { + "epoch": 5.677432059866089, + "grad_norm": 0.4367668926715851, + "learning_rate": 2.1612000000000002e-05, + "loss": 0.0369, + "step": 7210 + }, + { + "epoch": 5.678219771563608, + "grad_norm": 0.6240731477737427, + "learning_rate": 2.1615000000000002e-05, + "loss": 0.0175, + "step": 7211 + }, + { + "epoch": 5.679007483261127, + "grad_norm": 0.8200759887695312, + "learning_rate": 2.1618000000000002e-05, + "loss": 0.03, + "step": 7212 + }, + { + "epoch": 5.679795194958645, + "grad_norm": 0.44862326979637146, + "learning_rate": 2.1621000000000002e-05, + "loss": 0.0331, + "step": 7213 + }, + { + "epoch": 5.680582906656164, + "grad_norm": 0.21881887316703796, + "learning_rate": 2.1624e-05, + "loss": 0.0121, + "step": 7214 + }, + { + "epoch": 5.681370618353682, + "grad_norm": 0.6099089980125427, + "learning_rate": 2.1627e-05, + "loss": 
0.0285, + "step": 7215 + }, + { + "epoch": 5.682158330051202, + "grad_norm": 0.24675367772579193, + "learning_rate": 2.163e-05, + "loss": 0.02, + "step": 7216 + }, + { + "epoch": 5.68294604174872, + "grad_norm": 0.3134476840496063, + "learning_rate": 2.1633e-05, + "loss": 0.0237, + "step": 7217 + }, + { + "epoch": 5.683733753446239, + "grad_norm": 0.2937740981578827, + "learning_rate": 2.1635999999999997e-05, + "loss": 0.019, + "step": 7218 + }, + { + "epoch": 5.6845214651437574, + "grad_norm": 0.4449385106563568, + "learning_rate": 2.1639e-05, + "loss": 0.0309, + "step": 7219 + }, + { + "epoch": 5.685309176841276, + "grad_norm": 0.5568476319313049, + "learning_rate": 2.1642e-05, + "loss": 0.0205, + "step": 7220 + }, + { + "epoch": 5.686096888538795, + "grad_norm": 0.547945499420166, + "learning_rate": 2.1645e-05, + "loss": 0.0384, + "step": 7221 + }, + { + "epoch": 5.686884600236313, + "grad_norm": 0.2458963841199875, + "learning_rate": 2.1648e-05, + "loss": 0.0139, + "step": 7222 + }, + { + "epoch": 5.687672311933833, + "grad_norm": 0.4156986176967621, + "learning_rate": 2.1651e-05, + "loss": 0.0185, + "step": 7223 + }, + { + "epoch": 5.688460023631351, + "grad_norm": 0.7534115314483643, + "learning_rate": 2.1654e-05, + "loss": 0.0266, + "step": 7224 + }, + { + "epoch": 5.68924773532887, + "grad_norm": 0.6483034491539001, + "learning_rate": 2.1657e-05, + "loss": 0.0272, + "step": 7225 + }, + { + "epoch": 5.690035447026388, + "grad_norm": 0.4015558063983917, + "learning_rate": 2.166e-05, + "loss": 0.0311, + "step": 7226 + }, + { + "epoch": 5.690823158723907, + "grad_norm": 0.3861445486545563, + "learning_rate": 2.1663e-05, + "loss": 0.0196, + "step": 7227 + }, + { + "epoch": 5.691610870421425, + "grad_norm": 0.6336590647697449, + "learning_rate": 2.1666e-05, + "loss": 0.0183, + "step": 7228 + }, + { + "epoch": 5.692398582118944, + "grad_norm": 0.4626050889492035, + "learning_rate": 2.1669000000000002e-05, + "loss": 0.0211, + "step": 7229 + }, + { + "epoch": 5.6931862938164635, + "grad_norm": 0.5636785626411438, + "learning_rate": 2.1672000000000002e-05, + "loss": 0.0185, + "step": 7230 + }, + { + "epoch": 5.693974005513982, + "grad_norm": 0.42994022369384766, + "learning_rate": 2.1675e-05, + "loss": 0.0236, + "step": 7231 + }, + { + "epoch": 5.694761717211501, + "grad_norm": 0.46665555238723755, + "learning_rate": 2.1678e-05, + "loss": 0.0309, + "step": 7232 + }, + { + "epoch": 5.695549428909019, + "grad_norm": 0.4936205744743347, + "learning_rate": 2.1681e-05, + "loss": 0.0261, + "step": 7233 + }, + { + "epoch": 5.696337140606538, + "grad_norm": 0.29554495215415955, + "learning_rate": 2.1684e-05, + "loss": 0.0177, + "step": 7234 + }, + { + "epoch": 5.697124852304057, + "grad_norm": 0.38423603773117065, + "learning_rate": 2.1687e-05, + "loss": 0.0318, + "step": 7235 + }, + { + "epoch": 5.697912564001576, + "grad_norm": 0.452688604593277, + "learning_rate": 2.169e-05, + "loss": 0.0345, + "step": 7236 + }, + { + "epoch": 5.698700275699094, + "grad_norm": 0.5746718049049377, + "learning_rate": 2.1693e-05, + "loss": 0.0379, + "step": 7237 + }, + { + "epoch": 5.699487987396613, + "grad_norm": 0.4834388494491577, + "learning_rate": 2.1696e-05, + "loss": 0.0274, + "step": 7238 + }, + { + "epoch": 5.7002756990941315, + "grad_norm": 0.8409642577171326, + "learning_rate": 2.1699000000000003e-05, + "loss": 0.0394, + "step": 7239 + }, + { + "epoch": 5.70106341079165, + "grad_norm": 0.307026207447052, + "learning_rate": 2.1702000000000003e-05, + "loss": 0.0205, + "step": 7240 + }, + { + "epoch": 
5.701851122489169, + "grad_norm": 0.45513826608657837, + "learning_rate": 2.1705000000000003e-05, + "loss": 0.0266, + "step": 7241 + }, + { + "epoch": 5.702638834186688, + "grad_norm": 0.5660565495491028, + "learning_rate": 2.1708e-05, + "loss": 0.0247, + "step": 7242 + }, + { + "epoch": 5.703426545884207, + "grad_norm": 0.7779843211174011, + "learning_rate": 2.1711e-05, + "loss": 0.0395, + "step": 7243 + }, + { + "epoch": 5.704214257581725, + "grad_norm": 0.7380728721618652, + "learning_rate": 2.1714e-05, + "loss": 0.0241, + "step": 7244 + }, + { + "epoch": 5.705001969279244, + "grad_norm": 0.3493494391441345, + "learning_rate": 2.1717e-05, + "loss": 0.0259, + "step": 7245 + }, + { + "epoch": 5.705789680976762, + "grad_norm": 0.6011685132980347, + "learning_rate": 2.172e-05, + "loss": 0.0335, + "step": 7246 + }, + { + "epoch": 5.706577392674281, + "grad_norm": 0.390348881483078, + "learning_rate": 2.1723e-05, + "loss": 0.0177, + "step": 7247 + }, + { + "epoch": 5.7073651043717994, + "grad_norm": 0.5387747883796692, + "learning_rate": 2.1726e-05, + "loss": 0.0396, + "step": 7248 + }, + { + "epoch": 5.708152816069319, + "grad_norm": 0.7680004239082336, + "learning_rate": 2.1729e-05, + "loss": 0.0461, + "step": 7249 + }, + { + "epoch": 5.7089405277668375, + "grad_norm": 0.6690549254417419, + "learning_rate": 2.1732e-05, + "loss": 0.0489, + "step": 7250 + }, + { + "epoch": 5.709728239464356, + "grad_norm": 1.5017637014389038, + "learning_rate": 2.1735e-05, + "loss": 0.2956, + "step": 7251 + }, + { + "epoch": 5.710515951161875, + "grad_norm": 1.2077947854995728, + "learning_rate": 2.1738e-05, + "loss": 0.2311, + "step": 7252 + }, + { + "epoch": 5.711303662859393, + "grad_norm": 1.1016381978988647, + "learning_rate": 2.1741e-05, + "loss": 0.2399, + "step": 7253 + }, + { + "epoch": 5.712091374556913, + "grad_norm": 1.261164665222168, + "learning_rate": 2.1744e-05, + "loss": 0.1626, + "step": 7254 + }, + { + "epoch": 5.712879086254431, + "grad_norm": 0.7548021674156189, + "learning_rate": 2.1747e-05, + "loss": 0.1082, + "step": 7255 + }, + { + "epoch": 5.71366679795195, + "grad_norm": 0.9556264281272888, + "learning_rate": 2.175e-05, + "loss": 0.0519, + "step": 7256 + }, + { + "epoch": 5.714454509649468, + "grad_norm": 0.5148378610610962, + "learning_rate": 2.1753e-05, + "loss": 0.0513, + "step": 7257 + }, + { + "epoch": 5.715242221346987, + "grad_norm": 0.5259606242179871, + "learning_rate": 2.1756e-05, + "loss": 0.0432, + "step": 7258 + }, + { + "epoch": 5.7160299330445055, + "grad_norm": 0.2779618203639984, + "learning_rate": 2.1759e-05, + "loss": 0.0214, + "step": 7259 + }, + { + "epoch": 5.716817644742024, + "grad_norm": 0.5672035813331604, + "learning_rate": 2.1762000000000003e-05, + "loss": 0.0468, + "step": 7260 + }, + { + "epoch": 5.7176053564395435, + "grad_norm": 0.4830072820186615, + "learning_rate": 2.1765000000000003e-05, + "loss": 0.0336, + "step": 7261 + }, + { + "epoch": 5.718393068137062, + "grad_norm": 0.3291081488132477, + "learning_rate": 2.1768000000000002e-05, + "loss": 0.0315, + "step": 7262 + }, + { + "epoch": 5.719180779834581, + "grad_norm": 0.4203551709651947, + "learning_rate": 2.1771000000000002e-05, + "loss": 0.0265, + "step": 7263 + }, + { + "epoch": 5.719968491532099, + "grad_norm": 0.2752735912799835, + "learning_rate": 2.1774000000000002e-05, + "loss": 0.0139, + "step": 7264 + }, + { + "epoch": 5.720756203229618, + "grad_norm": 0.3229789137840271, + "learning_rate": 2.1777000000000002e-05, + "loss": 0.032, + "step": 7265 + }, + { + "epoch": 5.721543914927136, + 
"grad_norm": 0.6758624315261841, + "learning_rate": 2.178e-05, + "loss": 0.0228, + "step": 7266 + }, + { + "epoch": 5.722331626624655, + "grad_norm": 0.3350953459739685, + "learning_rate": 2.1782999999999998e-05, + "loss": 0.0286, + "step": 7267 + }, + { + "epoch": 5.723119338322174, + "grad_norm": 0.27284544706344604, + "learning_rate": 2.1785999999999998e-05, + "loss": 0.0159, + "step": 7268 + }, + { + "epoch": 5.723907050019693, + "grad_norm": 0.3182239830493927, + "learning_rate": 2.1788999999999998e-05, + "loss": 0.0221, + "step": 7269 + }, + { + "epoch": 5.7246947617172115, + "grad_norm": 0.304507851600647, + "learning_rate": 2.1792e-05, + "loss": 0.0188, + "step": 7270 + }, + { + "epoch": 5.72548247341473, + "grad_norm": 0.33882710337638855, + "learning_rate": 2.1795e-05, + "loss": 0.0199, + "step": 7271 + }, + { + "epoch": 5.726270185112249, + "grad_norm": 0.6391003131866455, + "learning_rate": 2.1798e-05, + "loss": 0.044, + "step": 7272 + }, + { + "epoch": 5.727057896809768, + "grad_norm": 0.42515116930007935, + "learning_rate": 2.1801e-05, + "loss": 0.0262, + "step": 7273 + }, + { + "epoch": 5.727845608507287, + "grad_norm": 0.3198474943637848, + "learning_rate": 2.1804e-05, + "loss": 0.0184, + "step": 7274 + }, + { + "epoch": 5.728633320204805, + "grad_norm": 0.40513208508491516, + "learning_rate": 2.1807e-05, + "loss": 0.0424, + "step": 7275 + }, + { + "epoch": 5.729421031902324, + "grad_norm": 0.39810362458229065, + "learning_rate": 2.181e-05, + "loss": 0.0197, + "step": 7276 + }, + { + "epoch": 5.730208743599842, + "grad_norm": 0.3858497440814972, + "learning_rate": 2.1813e-05, + "loss": 0.0236, + "step": 7277 + }, + { + "epoch": 5.730996455297361, + "grad_norm": 0.33155015110969543, + "learning_rate": 2.1816e-05, + "loss": 0.0175, + "step": 7278 + }, + { + "epoch": 5.7317841669948795, + "grad_norm": 0.44739750027656555, + "learning_rate": 2.1819e-05, + "loss": 0.0338, + "step": 7279 + }, + { + "epoch": 5.732571878692399, + "grad_norm": 0.39206305146217346, + "learning_rate": 2.1822000000000002e-05, + "loss": 0.0253, + "step": 7280 + }, + { + "epoch": 5.7333595903899175, + "grad_norm": 0.6963793635368347, + "learning_rate": 2.1825000000000002e-05, + "loss": 0.0297, + "step": 7281 + }, + { + "epoch": 5.734147302087436, + "grad_norm": 0.4806087017059326, + "learning_rate": 2.1828000000000002e-05, + "loss": 0.0172, + "step": 7282 + }, + { + "epoch": 5.734935013784955, + "grad_norm": 0.498068630695343, + "learning_rate": 2.1831e-05, + "loss": 0.0323, + "step": 7283 + }, + { + "epoch": 5.735722725482473, + "grad_norm": 0.633056640625, + "learning_rate": 2.1834e-05, + "loss": 0.0534, + "step": 7284 + }, + { + "epoch": 5.736510437179992, + "grad_norm": 0.5134871006011963, + "learning_rate": 2.1837e-05, + "loss": 0.0269, + "step": 7285 + }, + { + "epoch": 5.737298148877511, + "grad_norm": 0.44968557357788086, + "learning_rate": 2.184e-05, + "loss": 0.0331, + "step": 7286 + }, + { + "epoch": 5.73808586057503, + "grad_norm": 0.7400898337364197, + "learning_rate": 2.1843e-05, + "loss": 0.0255, + "step": 7287 + }, + { + "epoch": 5.738873572272548, + "grad_norm": 0.6399885416030884, + "learning_rate": 2.1846e-05, + "loss": 0.0516, + "step": 7288 + }, + { + "epoch": 5.739661283970067, + "grad_norm": 0.5349054932594299, + "learning_rate": 2.1849e-05, + "loss": 0.0296, + "step": 7289 + }, + { + "epoch": 5.7404489956675855, + "grad_norm": 0.567317545413971, + "learning_rate": 2.1852000000000004e-05, + "loss": 0.032, + "step": 7290 + }, + { + "epoch": 5.741236707365104, + "grad_norm": 
0.6183437705039978, + "learning_rate": 2.1855e-05, + "loss": 0.0261, + "step": 7291 + }, + { + "epoch": 5.7420244190626235, + "grad_norm": 0.435428261756897, + "learning_rate": 2.1858e-05, + "loss": 0.0274, + "step": 7292 + }, + { + "epoch": 5.742812130760142, + "grad_norm": 0.5603861212730408, + "learning_rate": 2.1861e-05, + "loss": 0.0259, + "step": 7293 + }, + { + "epoch": 5.743599842457661, + "grad_norm": 0.5556600689888, + "learning_rate": 2.1864e-05, + "loss": 0.0267, + "step": 7294 + }, + { + "epoch": 5.744387554155179, + "grad_norm": 0.6991695761680603, + "learning_rate": 2.1867e-05, + "loss": 0.0343, + "step": 7295 + }, + { + "epoch": 5.745175265852698, + "grad_norm": 0.29437416791915894, + "learning_rate": 2.187e-05, + "loss": 0.0186, + "step": 7296 + }, + { + "epoch": 5.745962977550216, + "grad_norm": 0.4316202998161316, + "learning_rate": 2.1873e-05, + "loss": 0.0265, + "step": 7297 + }, + { + "epoch": 5.746750689247735, + "grad_norm": 0.38594356179237366, + "learning_rate": 2.1876e-05, + "loss": 0.0211, + "step": 7298 + }, + { + "epoch": 5.747538400945254, + "grad_norm": 0.814647912979126, + "learning_rate": 2.1879e-05, + "loss": 0.0498, + "step": 7299 + }, + { + "epoch": 5.748326112642773, + "grad_norm": 1.0280232429504395, + "learning_rate": 2.1882e-05, + "loss": 0.0305, + "step": 7300 + }, + { + "epoch": 5.7491138243402915, + "grad_norm": 0.9952058792114258, + "learning_rate": 2.1885e-05, + "loss": 0.333, + "step": 7301 + }, + { + "epoch": 5.74990153603781, + "grad_norm": 1.0158400535583496, + "learning_rate": 2.1888e-05, + "loss": 0.3396, + "step": 7302 + }, + { + "epoch": 5.750689247735329, + "grad_norm": 0.7803139090538025, + "learning_rate": 2.1891e-05, + "loss": 0.2373, + "step": 7303 + }, + { + "epoch": 5.751476959432847, + "grad_norm": 0.6360378265380859, + "learning_rate": 2.1894e-05, + "loss": 0.1235, + "step": 7304 + }, + { + "epoch": 5.752264671130367, + "grad_norm": 1.2586941719055176, + "learning_rate": 2.1897e-05, + "loss": 0.1478, + "step": 7305 + }, + { + "epoch": 5.753052382827885, + "grad_norm": 0.4357336759567261, + "learning_rate": 2.19e-05, + "loss": 0.0641, + "step": 7306 + }, + { + "epoch": 5.753840094525404, + "grad_norm": 0.38267236948013306, + "learning_rate": 2.1903e-05, + "loss": 0.0463, + "step": 7307 + }, + { + "epoch": 5.754627806222922, + "grad_norm": 0.3178431987762451, + "learning_rate": 2.1906e-05, + "loss": 0.0304, + "step": 7308 + }, + { + "epoch": 5.755415517920441, + "grad_norm": 0.28822076320648193, + "learning_rate": 2.1909e-05, + "loss": 0.0313, + "step": 7309 + }, + { + "epoch": 5.7562032296179595, + "grad_norm": 0.21870344877243042, + "learning_rate": 2.1912000000000003e-05, + "loss": 0.0216, + "step": 7310 + }, + { + "epoch": 5.756990941315479, + "grad_norm": 0.577383279800415, + "learning_rate": 2.1915000000000003e-05, + "loss": 0.0688, + "step": 7311 + }, + { + "epoch": 5.7577786530129975, + "grad_norm": 0.45632365345954895, + "learning_rate": 2.1918000000000003e-05, + "loss": 0.0306, + "step": 7312 + }, + { + "epoch": 5.758566364710516, + "grad_norm": 0.6014730334281921, + "learning_rate": 2.1921000000000002e-05, + "loss": 0.0251, + "step": 7313 + }, + { + "epoch": 5.759354076408035, + "grad_norm": 0.5378854870796204, + "learning_rate": 2.1924000000000002e-05, + "loss": 0.0261, + "step": 7314 + }, + { + "epoch": 5.760141788105553, + "grad_norm": 0.7341826558113098, + "learning_rate": 2.1927000000000002e-05, + "loss": 0.0272, + "step": 7315 + }, + { + "epoch": 5.760929499803072, + "grad_norm": 0.35334455966949463, + 
"learning_rate": 2.193e-05, + "loss": 0.025, + "step": 7316 + }, + { + "epoch": 5.76171721150059, + "grad_norm": 0.8707945346832275, + "learning_rate": 2.1932999999999998e-05, + "loss": 0.0256, + "step": 7317 + }, + { + "epoch": 5.76250492319811, + "grad_norm": 0.3362422585487366, + "learning_rate": 2.1935999999999998e-05, + "loss": 0.0259, + "step": 7318 + }, + { + "epoch": 5.763292634895628, + "grad_norm": 0.4296465218067169, + "learning_rate": 2.1938999999999998e-05, + "loss": 0.0225, + "step": 7319 + }, + { + "epoch": 5.764080346593147, + "grad_norm": 0.5887699127197266, + "learning_rate": 2.1942e-05, + "loss": 0.0253, + "step": 7320 + }, + { + "epoch": 5.7648680582906655, + "grad_norm": 0.5839806795120239, + "learning_rate": 2.1945e-05, + "loss": 0.0339, + "step": 7321 + }, + { + "epoch": 5.765655769988184, + "grad_norm": 0.3646068274974823, + "learning_rate": 2.1948e-05, + "loss": 0.0229, + "step": 7322 + }, + { + "epoch": 5.7664434816857035, + "grad_norm": 0.3527778089046478, + "learning_rate": 2.1951e-05, + "loss": 0.0215, + "step": 7323 + }, + { + "epoch": 5.767231193383222, + "grad_norm": 0.5565947890281677, + "learning_rate": 2.1954e-05, + "loss": 0.0341, + "step": 7324 + }, + { + "epoch": 5.768018905080741, + "grad_norm": 0.40638643503189087, + "learning_rate": 2.1957e-05, + "loss": 0.0278, + "step": 7325 + }, + { + "epoch": 5.768806616778259, + "grad_norm": 0.5188766717910767, + "learning_rate": 2.196e-05, + "loss": 0.0391, + "step": 7326 + }, + { + "epoch": 5.769594328475778, + "grad_norm": 0.44620710611343384, + "learning_rate": 2.1963e-05, + "loss": 0.0365, + "step": 7327 + }, + { + "epoch": 5.770382040173296, + "grad_norm": 0.37898316979408264, + "learning_rate": 2.1966e-05, + "loss": 0.0145, + "step": 7328 + }, + { + "epoch": 5.771169751870815, + "grad_norm": 0.3551992177963257, + "learning_rate": 2.1969e-05, + "loss": 0.0144, + "step": 7329 + }, + { + "epoch": 5.771957463568334, + "grad_norm": 0.7654078006744385, + "learning_rate": 2.1972000000000002e-05, + "loss": 0.0333, + "step": 7330 + }, + { + "epoch": 5.772745175265853, + "grad_norm": 0.1857636719942093, + "learning_rate": 2.1975000000000002e-05, + "loss": 0.0092, + "step": 7331 + }, + { + "epoch": 5.7735328869633715, + "grad_norm": 0.7003113627433777, + "learning_rate": 2.1978000000000002e-05, + "loss": 0.0288, + "step": 7332 + }, + { + "epoch": 5.77432059866089, + "grad_norm": 0.421414852142334, + "learning_rate": 2.1981000000000002e-05, + "loss": 0.0176, + "step": 7333 + }, + { + "epoch": 5.775108310358409, + "grad_norm": 0.7399498224258423, + "learning_rate": 2.1984e-05, + "loss": 0.0315, + "step": 7334 + }, + { + "epoch": 5.775896022055927, + "grad_norm": 0.4797098934650421, + "learning_rate": 2.1987e-05, + "loss": 0.0344, + "step": 7335 + }, + { + "epoch": 5.776683733753446, + "grad_norm": 0.27724748849868774, + "learning_rate": 2.199e-05, + "loss": 0.027, + "step": 7336 + }, + { + "epoch": 5.777471445450965, + "grad_norm": 0.39822903275489807, + "learning_rate": 2.1993e-05, + "loss": 0.0225, + "step": 7337 + }, + { + "epoch": 5.778259157148484, + "grad_norm": 0.5029299259185791, + "learning_rate": 2.1996e-05, + "loss": 0.0211, + "step": 7338 + }, + { + "epoch": 5.779046868846002, + "grad_norm": 0.7739003300666809, + "learning_rate": 2.1999e-05, + "loss": 0.0395, + "step": 7339 + }, + { + "epoch": 5.779834580543521, + "grad_norm": 0.5437313318252563, + "learning_rate": 2.2002e-05, + "loss": 0.0355, + "step": 7340 + }, + { + "epoch": 5.7806222922410395, + "grad_norm": 0.9596534371376038, + "learning_rate": 
2.2005e-05, + "loss": 0.0257, + "step": 7341 + }, + { + "epoch": 5.781410003938559, + "grad_norm": 0.5861401557922363, + "learning_rate": 2.2008e-05, + "loss": 0.0307, + "step": 7342 + }, + { + "epoch": 5.7821977156360775, + "grad_norm": 0.5524150729179382, + "learning_rate": 2.2011e-05, + "loss": 0.0214, + "step": 7343 + }, + { + "epoch": 5.782985427333596, + "grad_norm": 0.7154013514518738, + "learning_rate": 2.2014e-05, + "loss": 0.0297, + "step": 7344 + }, + { + "epoch": 5.783773139031115, + "grad_norm": 0.41433265805244446, + "learning_rate": 2.2017e-05, + "loss": 0.0388, + "step": 7345 + }, + { + "epoch": 5.784560850728633, + "grad_norm": 0.27038413286209106, + "learning_rate": 2.202e-05, + "loss": 0.0135, + "step": 7346 + }, + { + "epoch": 5.785348562426152, + "grad_norm": 0.612205445766449, + "learning_rate": 2.2023e-05, + "loss": 0.0334, + "step": 7347 + }, + { + "epoch": 5.78613627412367, + "grad_norm": 0.43179553747177124, + "learning_rate": 2.2026e-05, + "loss": 0.0249, + "step": 7348 + }, + { + "epoch": 5.78692398582119, + "grad_norm": 0.3834361732006073, + "learning_rate": 2.2029e-05, + "loss": 0.0212, + "step": 7349 + }, + { + "epoch": 5.787711697518708, + "grad_norm": 1.0008469820022583, + "learning_rate": 2.2032000000000002e-05, + "loss": 0.0467, + "step": 7350 + }, + { + "epoch": 5.788499409216227, + "grad_norm": 1.6268280744552612, + "learning_rate": 2.2035e-05, + "loss": 0.4099, + "step": 7351 + }, + { + "epoch": 5.7892871209137455, + "grad_norm": 0.8463324308395386, + "learning_rate": 2.2038e-05, + "loss": 0.1958, + "step": 7352 + }, + { + "epoch": 5.790074832611264, + "grad_norm": 0.7806091904640198, + "learning_rate": 2.2041e-05, + "loss": 0.1708, + "step": 7353 + }, + { + "epoch": 5.790862544308783, + "grad_norm": 0.6220914125442505, + "learning_rate": 2.2044e-05, + "loss": 0.1418, + "step": 7354 + }, + { + "epoch": 5.791650256006301, + "grad_norm": 0.6499823927879333, + "learning_rate": 2.2047e-05, + "loss": 0.1297, + "step": 7355 + }, + { + "epoch": 5.792437967703821, + "grad_norm": 0.9861825108528137, + "learning_rate": 2.205e-05, + "loss": 0.1143, + "step": 7356 + }, + { + "epoch": 5.793225679401339, + "grad_norm": 0.5784850716590881, + "learning_rate": 2.2053e-05, + "loss": 0.0569, + "step": 7357 + }, + { + "epoch": 5.794013391098858, + "grad_norm": 0.3654801547527313, + "learning_rate": 2.2056e-05, + "loss": 0.05, + "step": 7358 + }, + { + "epoch": 5.794801102796376, + "grad_norm": 0.47842249274253845, + "learning_rate": 2.2059e-05, + "loss": 0.041, + "step": 7359 + }, + { + "epoch": 5.795588814493895, + "grad_norm": 0.5971513986587524, + "learning_rate": 2.2062000000000003e-05, + "loss": 0.0342, + "step": 7360 + }, + { + "epoch": 5.796376526191414, + "grad_norm": 0.24803581833839417, + "learning_rate": 2.2065000000000003e-05, + "loss": 0.0228, + "step": 7361 + }, + { + "epoch": 5.797164237888933, + "grad_norm": 0.49955108761787415, + "learning_rate": 2.2068000000000003e-05, + "loss": 0.0326, + "step": 7362 + }, + { + "epoch": 5.7979519495864515, + "grad_norm": 0.24619373679161072, + "learning_rate": 2.2071000000000003e-05, + "loss": 0.0237, + "step": 7363 + }, + { + "epoch": 5.79873966128397, + "grad_norm": 0.372414231300354, + "learning_rate": 2.2074000000000002e-05, + "loss": 0.0226, + "step": 7364 + }, + { + "epoch": 5.799527372981489, + "grad_norm": 0.3279469311237335, + "learning_rate": 2.2077e-05, + "loss": 0.022, + "step": 7365 + }, + { + "epoch": 5.800315084679007, + "grad_norm": 0.3421505391597748, + "learning_rate": 2.208e-05, + "loss": 0.0152, + 
"step": 7366 + }, + { + "epoch": 5.801102796376526, + "grad_norm": 0.3805878758430481, + "learning_rate": 2.2083e-05, + "loss": 0.0294, + "step": 7367 + }, + { + "epoch": 5.801890508074045, + "grad_norm": 0.3084288537502289, + "learning_rate": 2.2085999999999998e-05, + "loss": 0.0216, + "step": 7368 + }, + { + "epoch": 5.802678219771564, + "grad_norm": 0.8778548240661621, + "learning_rate": 2.2088999999999998e-05, + "loss": 0.0348, + "step": 7369 + }, + { + "epoch": 5.803465931469082, + "grad_norm": 0.46456000208854675, + "learning_rate": 2.2092e-05, + "loss": 0.0264, + "step": 7370 + }, + { + "epoch": 5.804253643166601, + "grad_norm": 0.2713642120361328, + "learning_rate": 2.2095e-05, + "loss": 0.0174, + "step": 7371 + }, + { + "epoch": 5.8050413548641195, + "grad_norm": 0.6433964967727661, + "learning_rate": 2.2098e-05, + "loss": 0.0169, + "step": 7372 + }, + { + "epoch": 5.805829066561638, + "grad_norm": 0.433463990688324, + "learning_rate": 2.2101e-05, + "loss": 0.0241, + "step": 7373 + }, + { + "epoch": 5.806616778259157, + "grad_norm": 0.5695592761039734, + "learning_rate": 2.2104e-05, + "loss": 0.0247, + "step": 7374 + }, + { + "epoch": 5.807404489956676, + "grad_norm": 0.5994040966033936, + "learning_rate": 2.2107e-05, + "loss": 0.032, + "step": 7375 + }, + { + "epoch": 5.808192201654195, + "grad_norm": 0.589667558670044, + "learning_rate": 2.211e-05, + "loss": 0.0274, + "step": 7376 + }, + { + "epoch": 5.808979913351713, + "grad_norm": 0.9874478578567505, + "learning_rate": 2.2113e-05, + "loss": 0.0327, + "step": 7377 + }, + { + "epoch": 5.809767625049232, + "grad_norm": 0.589371919631958, + "learning_rate": 2.2116e-05, + "loss": 0.032, + "step": 7378 + }, + { + "epoch": 5.81055533674675, + "grad_norm": 0.41893884539604187, + "learning_rate": 2.2119e-05, + "loss": 0.0258, + "step": 7379 + }, + { + "epoch": 5.81134304844427, + "grad_norm": 0.43212851881980896, + "learning_rate": 2.2122000000000003e-05, + "loss": 0.02, + "step": 7380 + }, + { + "epoch": 5.812130760141788, + "grad_norm": 0.46935391426086426, + "learning_rate": 2.2125000000000002e-05, + "loss": 0.0447, + "step": 7381 + }, + { + "epoch": 5.812918471839307, + "grad_norm": 0.4502299129962921, + "learning_rate": 2.2128000000000002e-05, + "loss": 0.0273, + "step": 7382 + }, + { + "epoch": 5.8137061835368256, + "grad_norm": 0.445507287979126, + "learning_rate": 2.2131000000000002e-05, + "loss": 0.0366, + "step": 7383 + }, + { + "epoch": 5.814493895234344, + "grad_norm": 0.4307246208190918, + "learning_rate": 2.2134000000000002e-05, + "loss": 0.0271, + "step": 7384 + }, + { + "epoch": 5.815281606931863, + "grad_norm": 0.37356215715408325, + "learning_rate": 2.2137e-05, + "loss": 0.02, + "step": 7385 + }, + { + "epoch": 5.816069318629381, + "grad_norm": 0.6024392247200012, + "learning_rate": 2.214e-05, + "loss": 0.0213, + "step": 7386 + }, + { + "epoch": 5.816857030326901, + "grad_norm": 0.5386610627174377, + "learning_rate": 2.2143e-05, + "loss": 0.0361, + "step": 7387 + }, + { + "epoch": 5.817644742024419, + "grad_norm": 0.6806403994560242, + "learning_rate": 2.2146e-05, + "loss": 0.028, + "step": 7388 + }, + { + "epoch": 5.818432453721938, + "grad_norm": 0.40486401319503784, + "learning_rate": 2.2149e-05, + "loss": 0.0242, + "step": 7389 + }, + { + "epoch": 5.819220165419456, + "grad_norm": 0.7096560597419739, + "learning_rate": 2.2151999999999997e-05, + "loss": 0.023, + "step": 7390 + }, + { + "epoch": 5.820007877116975, + "grad_norm": 0.3856709599494934, + "learning_rate": 2.2155e-05, + "loss": 0.023, + "step": 7391 + }, 
+ { + "epoch": 5.8207955888144935, + "grad_norm": 0.2781676650047302, + "learning_rate": 2.2158e-05, + "loss": 0.0147, + "step": 7392 + }, + { + "epoch": 5.821583300512012, + "grad_norm": 0.7281113266944885, + "learning_rate": 2.2161e-05, + "loss": 0.0237, + "step": 7393 + }, + { + "epoch": 5.822371012209532, + "grad_norm": 0.6914578676223755, + "learning_rate": 2.2164e-05, + "loss": 0.0451, + "step": 7394 + }, + { + "epoch": 5.82315872390705, + "grad_norm": 0.5990017056465149, + "learning_rate": 2.2167e-05, + "loss": 0.0441, + "step": 7395 + }, + { + "epoch": 5.823946435604569, + "grad_norm": 0.7633236050605774, + "learning_rate": 2.217e-05, + "loss": 0.0207, + "step": 7396 + }, + { + "epoch": 5.824734147302087, + "grad_norm": 0.35484132170677185, + "learning_rate": 2.2173e-05, + "loss": 0.0229, + "step": 7397 + }, + { + "epoch": 5.825521858999606, + "grad_norm": 0.6412415504455566, + "learning_rate": 2.2176e-05, + "loss": 0.0163, + "step": 7398 + }, + { + "epoch": 5.826309570697125, + "grad_norm": 1.0399835109710693, + "learning_rate": 2.2179e-05, + "loss": 0.0372, + "step": 7399 + }, + { + "epoch": 5.827097282394644, + "grad_norm": 0.9738124012947083, + "learning_rate": 2.2182e-05, + "loss": 0.0285, + "step": 7400 + }, + { + "epoch": 5.827884994092162, + "grad_norm": 1.4233146905899048, + "learning_rate": 2.2185000000000002e-05, + "loss": 0.3448, + "step": 7401 + }, + { + "epoch": 5.828672705789681, + "grad_norm": 0.9484218955039978, + "learning_rate": 2.2188e-05, + "loss": 0.3105, + "step": 7402 + }, + { + "epoch": 5.8294604174872, + "grad_norm": 1.148545265197754, + "learning_rate": 2.2191e-05, + "loss": 0.2274, + "step": 7403 + }, + { + "epoch": 5.830248129184718, + "grad_norm": 0.9787518382072449, + "learning_rate": 2.2194e-05, + "loss": 0.134, + "step": 7404 + }, + { + "epoch": 5.831035840882237, + "grad_norm": 0.49197283387184143, + "learning_rate": 2.2197e-05, + "loss": 0.1057, + "step": 7405 + }, + { + "epoch": 5.831823552579756, + "grad_norm": 0.5233865976333618, + "learning_rate": 2.22e-05, + "loss": 0.0874, + "step": 7406 + }, + { + "epoch": 5.832611264277275, + "grad_norm": 0.29698437452316284, + "learning_rate": 2.2203e-05, + "loss": 0.03, + "step": 7407 + }, + { + "epoch": 5.833398975974793, + "grad_norm": 0.5982612371444702, + "learning_rate": 2.2206e-05, + "loss": 0.0842, + "step": 7408 + }, + { + "epoch": 5.834186687672312, + "grad_norm": 0.29005807638168335, + "learning_rate": 2.2209e-05, + "loss": 0.0266, + "step": 7409 + }, + { + "epoch": 5.83497439936983, + "grad_norm": 0.3334808945655823, + "learning_rate": 2.2212e-05, + "loss": 0.031, + "step": 7410 + }, + { + "epoch": 5.835762111067349, + "grad_norm": 0.36559754610061646, + "learning_rate": 2.2215000000000003e-05, + "loss": 0.025, + "step": 7411 + }, + { + "epoch": 5.8365498227648684, + "grad_norm": 0.43305617570877075, + "learning_rate": 2.2218000000000003e-05, + "loss": 0.0327, + "step": 7412 + }, + { + "epoch": 5.837337534462387, + "grad_norm": 0.5100217461585999, + "learning_rate": 2.2221000000000003e-05, + "loss": 0.0365, + "step": 7413 + }, + { + "epoch": 5.838125246159906, + "grad_norm": 0.5346847772598267, + "learning_rate": 2.2224e-05, + "loss": 0.0181, + "step": 7414 + }, + { + "epoch": 5.838912957857424, + "grad_norm": 0.49139124155044556, + "learning_rate": 2.2227e-05, + "loss": 0.0348, + "step": 7415 + }, + { + "epoch": 5.839700669554943, + "grad_norm": 0.9075154066085815, + "learning_rate": 2.223e-05, + "loss": 0.026, + "step": 7416 + }, + { + "epoch": 5.840488381252461, + "grad_norm": 
0.7097296714782715, + "learning_rate": 2.2233e-05, + "loss": 0.0379, + "step": 7417 + }, + { + "epoch": 5.841276092949981, + "grad_norm": 0.3849387466907501, + "learning_rate": 2.2236e-05, + "loss": 0.0206, + "step": 7418 + }, + { + "epoch": 5.842063804647499, + "grad_norm": 0.32814741134643555, + "learning_rate": 2.2239e-05, + "loss": 0.0218, + "step": 7419 + }, + { + "epoch": 5.842851516345018, + "grad_norm": 1.169305443763733, + "learning_rate": 2.2241999999999998e-05, + "loss": 0.0402, + "step": 7420 + }, + { + "epoch": 5.843639228042536, + "grad_norm": 0.33328455686569214, + "learning_rate": 2.2245e-05, + "loss": 0.0183, + "step": 7421 + }, + { + "epoch": 5.844426939740055, + "grad_norm": 0.3647632598876953, + "learning_rate": 2.2248e-05, + "loss": 0.0174, + "step": 7422 + }, + { + "epoch": 5.845214651437574, + "grad_norm": 0.36341798305511475, + "learning_rate": 2.2251e-05, + "loss": 0.0194, + "step": 7423 + }, + { + "epoch": 5.846002363135092, + "grad_norm": 0.3465370535850525, + "learning_rate": 2.2254e-05, + "loss": 0.0313, + "step": 7424 + }, + { + "epoch": 5.846790074832612, + "grad_norm": 0.6212738752365112, + "learning_rate": 2.2257e-05, + "loss": 0.0177, + "step": 7425 + }, + { + "epoch": 5.84757778653013, + "grad_norm": 0.46266159415245056, + "learning_rate": 2.226e-05, + "loss": 0.0347, + "step": 7426 + }, + { + "epoch": 5.848365498227649, + "grad_norm": 0.3819383680820465, + "learning_rate": 2.2263e-05, + "loss": 0.0201, + "step": 7427 + }, + { + "epoch": 5.849153209925167, + "grad_norm": 0.4961303472518921, + "learning_rate": 2.2266e-05, + "loss": 0.0166, + "step": 7428 + }, + { + "epoch": 5.849940921622686, + "grad_norm": 0.36250999569892883, + "learning_rate": 2.2269e-05, + "loss": 0.0265, + "step": 7429 + }, + { + "epoch": 5.850728633320204, + "grad_norm": 0.4163123369216919, + "learning_rate": 2.2272e-05, + "loss": 0.0265, + "step": 7430 + }, + { + "epoch": 5.851516345017724, + "grad_norm": 0.8202512264251709, + "learning_rate": 2.2275000000000003e-05, + "loss": 0.0307, + "step": 7431 + }, + { + "epoch": 5.8523040567152425, + "grad_norm": 0.3271998465061188, + "learning_rate": 2.2278000000000003e-05, + "loss": 0.0201, + "step": 7432 + }, + { + "epoch": 5.853091768412761, + "grad_norm": 0.48259615898132324, + "learning_rate": 2.2281000000000002e-05, + "loss": 0.0319, + "step": 7433 + }, + { + "epoch": 5.85387948011028, + "grad_norm": 0.6534935235977173, + "learning_rate": 2.2284000000000002e-05, + "loss": 0.021, + "step": 7434 + }, + { + "epoch": 5.854667191807798, + "grad_norm": 0.5293385982513428, + "learning_rate": 2.2287000000000002e-05, + "loss": 0.026, + "step": 7435 + }, + { + "epoch": 5.855454903505317, + "grad_norm": 0.7085530757904053, + "learning_rate": 2.2290000000000002e-05, + "loss": 0.0323, + "step": 7436 + }, + { + "epoch": 5.856242615202836, + "grad_norm": 0.8263840079307556, + "learning_rate": 2.2293e-05, + "loss": 0.0249, + "step": 7437 + }, + { + "epoch": 5.857030326900355, + "grad_norm": 0.3020738959312439, + "learning_rate": 2.2296e-05, + "loss": 0.0154, + "step": 7438 + }, + { + "epoch": 5.857818038597873, + "grad_norm": 0.5487253069877625, + "learning_rate": 2.2298999999999998e-05, + "loss": 0.0447, + "step": 7439 + }, + { + "epoch": 5.858605750295392, + "grad_norm": 0.3991488218307495, + "learning_rate": 2.2301999999999998e-05, + "loss": 0.0243, + "step": 7440 + }, + { + "epoch": 5.8593934619929104, + "grad_norm": 0.4843483567237854, + "learning_rate": 2.2305e-05, + "loss": 0.0182, + "step": 7441 + }, + { + "epoch": 5.860181173690429, + 
"grad_norm": 0.670274019241333, + "learning_rate": 2.2308e-05, + "loss": 0.037, + "step": 7442 + }, + { + "epoch": 5.860968885387948, + "grad_norm": 0.8237044811248779, + "learning_rate": 2.2311e-05, + "loss": 0.0341, + "step": 7443 + }, + { + "epoch": 5.861756597085467, + "grad_norm": 0.6752669811248779, + "learning_rate": 2.2314e-05, + "loss": 0.0423, + "step": 7444 + }, + { + "epoch": 5.862544308782986, + "grad_norm": 0.38504543900489807, + "learning_rate": 2.2317e-05, + "loss": 0.0276, + "step": 7445 + }, + { + "epoch": 5.863332020480504, + "grad_norm": 0.9826382994651794, + "learning_rate": 2.232e-05, + "loss": 0.0364, + "step": 7446 + }, + { + "epoch": 5.864119732178023, + "grad_norm": 0.4001007080078125, + "learning_rate": 2.2323e-05, + "loss": 0.0151, + "step": 7447 + }, + { + "epoch": 5.864907443875541, + "grad_norm": 0.6151981353759766, + "learning_rate": 2.2326e-05, + "loss": 0.0465, + "step": 7448 + }, + { + "epoch": 5.865695155573061, + "grad_norm": 0.5018827319145203, + "learning_rate": 2.2329e-05, + "loss": 0.034, + "step": 7449 + }, + { + "epoch": 5.866482867270579, + "grad_norm": 0.7870573401451111, + "learning_rate": 2.2332e-05, + "loss": 0.0415, + "step": 7450 + }, + { + "epoch": 5.867270578968098, + "grad_norm": 1.0789068937301636, + "learning_rate": 2.2335000000000002e-05, + "loss": 0.255, + "step": 7451 + }, + { + "epoch": 5.8680582906656165, + "grad_norm": 0.954352855682373, + "learning_rate": 2.2338000000000002e-05, + "loss": 0.2151, + "step": 7452 + }, + { + "epoch": 5.868846002363135, + "grad_norm": 0.9227737784385681, + "learning_rate": 2.2341000000000002e-05, + "loss": 0.2429, + "step": 7453 + }, + { + "epoch": 5.869633714060654, + "grad_norm": 0.5633801817893982, + "learning_rate": 2.2344e-05, + "loss": 0.1686, + "step": 7454 + }, + { + "epoch": 5.870421425758172, + "grad_norm": 0.7079812288284302, + "learning_rate": 2.2347e-05, + "loss": 0.1128, + "step": 7455 + }, + { + "epoch": 5.871209137455692, + "grad_norm": 0.6807801723480225, + "learning_rate": 2.235e-05, + "loss": 0.0636, + "step": 7456 + }, + { + "epoch": 5.87199684915321, + "grad_norm": 0.5686448812484741, + "learning_rate": 2.2353e-05, + "loss": 0.0438, + "step": 7457 + }, + { + "epoch": 5.872784560850729, + "grad_norm": 0.304775595664978, + "learning_rate": 2.2356e-05, + "loss": 0.0367, + "step": 7458 + }, + { + "epoch": 5.873572272548247, + "grad_norm": 0.36933666467666626, + "learning_rate": 2.2359e-05, + "loss": 0.0245, + "step": 7459 + }, + { + "epoch": 5.874359984245766, + "grad_norm": 0.420253723859787, + "learning_rate": 2.2362e-05, + "loss": 0.0256, + "step": 7460 + }, + { + "epoch": 5.8751476959432845, + "grad_norm": 0.22283393144607544, + "learning_rate": 2.2365000000000004e-05, + "loss": 0.0157, + "step": 7461 + }, + { + "epoch": 5.875935407640803, + "grad_norm": 0.38942790031433105, + "learning_rate": 2.2368000000000003e-05, + "loss": 0.0266, + "step": 7462 + }, + { + "epoch": 5.8767231193383225, + "grad_norm": 0.8641586899757385, + "learning_rate": 2.2371e-05, + "loss": 0.0298, + "step": 7463 + }, + { + "epoch": 5.877510831035841, + "grad_norm": 0.4621396064758301, + "learning_rate": 2.2374e-05, + "loss": 0.0572, + "step": 7464 + }, + { + "epoch": 5.87829854273336, + "grad_norm": 0.49407944083213806, + "learning_rate": 2.2377e-05, + "loss": 0.024, + "step": 7465 + }, + { + "epoch": 5.879086254430878, + "grad_norm": 0.5221455693244934, + "learning_rate": 2.238e-05, + "loss": 0.0383, + "step": 7466 + }, + { + "epoch": 5.879873966128397, + "grad_norm": 0.3943987786769867, + 
"learning_rate": 2.2383e-05, + "loss": 0.0303, + "step": 7467 + }, + { + "epoch": 5.880661677825916, + "grad_norm": 0.4249272048473358, + "learning_rate": 2.2386e-05, + "loss": 0.0284, + "step": 7468 + }, + { + "epoch": 5.881449389523435, + "grad_norm": 0.29982471466064453, + "learning_rate": 2.2389e-05, + "loss": 0.015, + "step": 7469 + }, + { + "epoch": 5.882237101220953, + "grad_norm": 0.7988577485084534, + "learning_rate": 2.2392e-05, + "loss": 0.051, + "step": 7470 + }, + { + "epoch": 5.883024812918472, + "grad_norm": 0.4013335704803467, + "learning_rate": 2.2395e-05, + "loss": 0.0301, + "step": 7471 + }, + { + "epoch": 5.8838125246159905, + "grad_norm": 0.21624992787837982, + "learning_rate": 2.2398e-05, + "loss": 0.0116, + "step": 7472 + }, + { + "epoch": 5.884600236313509, + "grad_norm": 0.40894192457199097, + "learning_rate": 2.2401e-05, + "loss": 0.0226, + "step": 7473 + }, + { + "epoch": 5.885387948011028, + "grad_norm": 0.5085041522979736, + "learning_rate": 2.2404e-05, + "loss": 0.0165, + "step": 7474 + }, + { + "epoch": 5.886175659708547, + "grad_norm": 0.3367408514022827, + "learning_rate": 2.2407e-05, + "loss": 0.0255, + "step": 7475 + }, + { + "epoch": 5.886963371406066, + "grad_norm": 0.40392130613327026, + "learning_rate": 2.241e-05, + "loss": 0.0176, + "step": 7476 + }, + { + "epoch": 5.887751083103584, + "grad_norm": 0.33298051357269287, + "learning_rate": 2.2413e-05, + "loss": 0.0286, + "step": 7477 + }, + { + "epoch": 5.888538794801103, + "grad_norm": 0.35251760482788086, + "learning_rate": 2.2416e-05, + "loss": 0.0233, + "step": 7478 + }, + { + "epoch": 5.889326506498621, + "grad_norm": 0.39662325382232666, + "learning_rate": 2.2419e-05, + "loss": 0.0261, + "step": 7479 + }, + { + "epoch": 5.89011421819614, + "grad_norm": 0.644556999206543, + "learning_rate": 2.2422e-05, + "loss": 0.038, + "step": 7480 + }, + { + "epoch": 5.8909019298936585, + "grad_norm": 0.5760177373886108, + "learning_rate": 2.2425000000000003e-05, + "loss": 0.0332, + "step": 7481 + }, + { + "epoch": 5.891689641591178, + "grad_norm": 0.27291199564933777, + "learning_rate": 2.2428000000000003e-05, + "loss": 0.0167, + "step": 7482 + }, + { + "epoch": 5.8924773532886965, + "grad_norm": 0.6056340336799622, + "learning_rate": 2.2431000000000003e-05, + "loss": 0.029, + "step": 7483 + }, + { + "epoch": 5.893265064986215, + "grad_norm": 0.3051922619342804, + "learning_rate": 2.2434000000000002e-05, + "loss": 0.0342, + "step": 7484 + }, + { + "epoch": 5.894052776683734, + "grad_norm": 0.5807642340660095, + "learning_rate": 2.2437000000000002e-05, + "loss": 0.0298, + "step": 7485 + }, + { + "epoch": 5.894840488381252, + "grad_norm": 0.6131526231765747, + "learning_rate": 2.2440000000000002e-05, + "loss": 0.0262, + "step": 7486 + }, + { + "epoch": 5.895628200078772, + "grad_norm": 1.2293950319290161, + "learning_rate": 2.2443000000000002e-05, + "loss": 0.0295, + "step": 7487 + }, + { + "epoch": 5.89641591177629, + "grad_norm": 0.41855889558792114, + "learning_rate": 2.2445999999999998e-05, + "loss": 0.0234, + "step": 7488 + }, + { + "epoch": 5.897203623473809, + "grad_norm": 0.4823336601257324, + "learning_rate": 2.2448999999999998e-05, + "loss": 0.032, + "step": 7489 + }, + { + "epoch": 5.897991335171327, + "grad_norm": 0.6877893805503845, + "learning_rate": 2.2451999999999998e-05, + "loss": 0.0208, + "step": 7490 + }, + { + "epoch": 5.898779046868846, + "grad_norm": 0.5274639129638672, + "learning_rate": 2.2455e-05, + "loss": 0.0372, + "step": 7491 + }, + { + "epoch": 5.8995667585663645, + "grad_norm": 
0.3773907423019409, + "learning_rate": 2.2458e-05, + "loss": 0.0323, + "step": 7492 + }, + { + "epoch": 5.900354470263883, + "grad_norm": 0.5414057970046997, + "learning_rate": 2.2461e-05, + "loss": 0.0236, + "step": 7493 + }, + { + "epoch": 5.9011421819614025, + "grad_norm": 0.42957448959350586, + "learning_rate": 2.2464e-05, + "loss": 0.0328, + "step": 7494 + }, + { + "epoch": 5.901929893658921, + "grad_norm": 0.9481053948402405, + "learning_rate": 2.2467e-05, + "loss": 0.0362, + "step": 7495 + }, + { + "epoch": 5.90271760535644, + "grad_norm": 0.5513505935668945, + "learning_rate": 2.247e-05, + "loss": 0.0237, + "step": 7496 + }, + { + "epoch": 5.903505317053958, + "grad_norm": 0.7468547224998474, + "learning_rate": 2.2473e-05, + "loss": 0.0231, + "step": 7497 + }, + { + "epoch": 5.904293028751477, + "grad_norm": 0.6024125814437866, + "learning_rate": 2.2476e-05, + "loss": 0.0388, + "step": 7498 + }, + { + "epoch": 5.905080740448995, + "grad_norm": 0.48916855454444885, + "learning_rate": 2.2479e-05, + "loss": 0.0345, + "step": 7499 + }, + { + "epoch": 5.905868452146514, + "grad_norm": 0.6602763533592224, + "learning_rate": 2.2482e-05, + "loss": 0.0238, + "step": 7500 + }, + { + "epoch": 5.906656163844033, + "grad_norm": 1.2968257665634155, + "learning_rate": 2.2485000000000002e-05, + "loss": 0.3031, + "step": 7501 + }, + { + "epoch": 5.907443875541552, + "grad_norm": 0.9160588383674622, + "learning_rate": 2.2488000000000002e-05, + "loss": 0.2206, + "step": 7502 + }, + { + "epoch": 5.9082315872390705, + "grad_norm": 0.6059646010398865, + "learning_rate": 2.2491000000000002e-05, + "loss": 0.158, + "step": 7503 + }, + { + "epoch": 5.909019298936589, + "grad_norm": 0.8596699237823486, + "learning_rate": 2.2494000000000002e-05, + "loss": 0.1803, + "step": 7504 + }, + { + "epoch": 5.909807010634108, + "grad_norm": 0.7256431579589844, + "learning_rate": 2.2497e-05, + "loss": 0.1118, + "step": 7505 + }, + { + "epoch": 5.910594722331627, + "grad_norm": 0.9387232065200806, + "learning_rate": 2.25e-05, + "loss": 0.1744, + "step": 7506 + }, + { + "epoch": 5.911382434029146, + "grad_norm": 0.40107154846191406, + "learning_rate": 2.2503e-05, + "loss": 0.0599, + "step": 7507 + }, + { + "epoch": 5.912170145726664, + "grad_norm": 0.4490979015827179, + "learning_rate": 2.2506e-05, + "loss": 0.0456, + "step": 7508 + }, + { + "epoch": 5.912957857424183, + "grad_norm": 0.31791654229164124, + "learning_rate": 2.2509e-05, + "loss": 0.0324, + "step": 7509 + }, + { + "epoch": 5.913745569121701, + "grad_norm": 0.2589089870452881, + "learning_rate": 2.2512e-05, + "loss": 0.022, + "step": 7510 + }, + { + "epoch": 5.91453328081922, + "grad_norm": 0.356998473405838, + "learning_rate": 2.2515e-05, + "loss": 0.0184, + "step": 7511 + }, + { + "epoch": 5.9153209925167385, + "grad_norm": 0.21311962604522705, + "learning_rate": 2.2518e-05, + "loss": 0.0205, + "step": 7512 + }, + { + "epoch": 5.916108704214258, + "grad_norm": 0.28978580236434937, + "learning_rate": 2.2521e-05, + "loss": 0.0239, + "step": 7513 + }, + { + "epoch": 5.9168964159117765, + "grad_norm": 0.31131863594055176, + "learning_rate": 2.2524e-05, + "loss": 0.0174, + "step": 7514 + }, + { + "epoch": 5.917684127609295, + "grad_norm": 0.2946123778820038, + "learning_rate": 2.2527e-05, + "loss": 0.0223, + "step": 7515 + }, + { + "epoch": 5.918471839306814, + "grad_norm": 0.4058450758457184, + "learning_rate": 2.253e-05, + "loss": 0.0249, + "step": 7516 + }, + { + "epoch": 5.919259551004332, + "grad_norm": 0.30797046422958374, + "learning_rate": 2.2533e-05, + 
"loss": 0.0127, + "step": 7517 + }, + { + "epoch": 5.920047262701851, + "grad_norm": 0.5011530518531799, + "learning_rate": 2.2536e-05, + "loss": 0.0261, + "step": 7518 + }, + { + "epoch": 5.920834974399369, + "grad_norm": 0.2372819185256958, + "learning_rate": 2.2539e-05, + "loss": 0.0222, + "step": 7519 + }, + { + "epoch": 5.921622686096889, + "grad_norm": 1.5212897062301636, + "learning_rate": 2.2542e-05, + "loss": 0.0156, + "step": 7520 + }, + { + "epoch": 5.922410397794407, + "grad_norm": 0.3417063355445862, + "learning_rate": 2.2545e-05, + "loss": 0.016, + "step": 7521 + }, + { + "epoch": 5.923198109491926, + "grad_norm": 0.41114309430122375, + "learning_rate": 2.2548e-05, + "loss": 0.0181, + "step": 7522 + }, + { + "epoch": 5.9239858211894445, + "grad_norm": 0.6646881103515625, + "learning_rate": 2.2551e-05, + "loss": 0.041, + "step": 7523 + }, + { + "epoch": 5.924773532886963, + "grad_norm": 0.3222334384918213, + "learning_rate": 2.2554e-05, + "loss": 0.0246, + "step": 7524 + }, + { + "epoch": 5.9255612445844825, + "grad_norm": 0.5753511786460876, + "learning_rate": 2.2557e-05, + "loss": 0.0342, + "step": 7525 + }, + { + "epoch": 5.926348956282001, + "grad_norm": 0.4946962296962738, + "learning_rate": 2.256e-05, + "loss": 0.029, + "step": 7526 + }, + { + "epoch": 5.92713666797952, + "grad_norm": 0.5911245346069336, + "learning_rate": 2.2563e-05, + "loss": 0.0284, + "step": 7527 + }, + { + "epoch": 5.927924379677038, + "grad_norm": 0.6208152174949646, + "learning_rate": 2.2566e-05, + "loss": 0.0246, + "step": 7528 + }, + { + "epoch": 5.928712091374557, + "grad_norm": 0.40132850408554077, + "learning_rate": 2.2569e-05, + "loss": 0.0259, + "step": 7529 + }, + { + "epoch": 5.929499803072075, + "grad_norm": 0.5009105205535889, + "learning_rate": 2.2572e-05, + "loss": 0.0208, + "step": 7530 + }, + { + "epoch": 5.930287514769594, + "grad_norm": 0.6232486367225647, + "learning_rate": 2.2575e-05, + "loss": 0.0313, + "step": 7531 + }, + { + "epoch": 5.931075226467113, + "grad_norm": 0.5384225845336914, + "learning_rate": 2.2578000000000003e-05, + "loss": 0.0233, + "step": 7532 + }, + { + "epoch": 5.931862938164632, + "grad_norm": 0.26130369305610657, + "learning_rate": 2.2581000000000003e-05, + "loss": 0.0146, + "step": 7533 + }, + { + "epoch": 5.9326506498621505, + "grad_norm": 0.40286943316459656, + "learning_rate": 2.2584000000000003e-05, + "loss": 0.0211, + "step": 7534 + }, + { + "epoch": 5.933438361559669, + "grad_norm": 0.3121725916862488, + "learning_rate": 2.2587000000000002e-05, + "loss": 0.0184, + "step": 7535 + }, + { + "epoch": 5.934226073257188, + "grad_norm": 0.5301908850669861, + "learning_rate": 2.2590000000000002e-05, + "loss": 0.0266, + "step": 7536 + }, + { + "epoch": 5.935013784954706, + "grad_norm": 0.6161260008811951, + "learning_rate": 2.2593e-05, + "loss": 0.0208, + "step": 7537 + }, + { + "epoch": 5.935801496652226, + "grad_norm": 0.5400674343109131, + "learning_rate": 2.2596e-05, + "loss": 0.0314, + "step": 7538 + }, + { + "epoch": 5.936589208349744, + "grad_norm": 0.33066654205322266, + "learning_rate": 2.2598999999999998e-05, + "loss": 0.0252, + "step": 7539 + }, + { + "epoch": 5.937376920047263, + "grad_norm": 0.6301922798156738, + "learning_rate": 2.2601999999999998e-05, + "loss": 0.0341, + "step": 7540 + }, + { + "epoch": 5.938164631744781, + "grad_norm": 0.46629396080970764, + "learning_rate": 2.2604999999999998e-05, + "loss": 0.0256, + "step": 7541 + }, + { + "epoch": 5.9389523434423, + "grad_norm": 0.5838980078697205, + "learning_rate": 2.2608e-05, + 
"loss": 0.0266, + "step": 7542 + }, + { + "epoch": 5.9397400551398185, + "grad_norm": 0.46251413226127625, + "learning_rate": 2.2611e-05, + "loss": 0.0257, + "step": 7543 + }, + { + "epoch": 5.940527766837338, + "grad_norm": 0.7339991927146912, + "learning_rate": 2.2614e-05, + "loss": 0.0253, + "step": 7544 + }, + { + "epoch": 5.9413154785348565, + "grad_norm": 0.622410237789154, + "learning_rate": 2.2617e-05, + "loss": 0.038, + "step": 7545 + }, + { + "epoch": 5.942103190232375, + "grad_norm": 0.5667653679847717, + "learning_rate": 2.262e-05, + "loss": 0.0316, + "step": 7546 + }, + { + "epoch": 5.942890901929894, + "grad_norm": 0.5870182514190674, + "learning_rate": 2.2623e-05, + "loss": 0.0331, + "step": 7547 + }, + { + "epoch": 5.943678613627412, + "grad_norm": 0.35484716296195984, + "learning_rate": 2.2626e-05, + "loss": 0.0291, + "step": 7548 + }, + { + "epoch": 5.944466325324931, + "grad_norm": 0.7331976890563965, + "learning_rate": 2.2629e-05, + "loss": 0.0234, + "step": 7549 + }, + { + "epoch": 5.945254037022449, + "grad_norm": 0.6111581325531006, + "learning_rate": 2.2632e-05, + "loss": 0.0548, + "step": 7550 + }, + { + "epoch": 5.946041748719969, + "grad_norm": 1.0486671924591064, + "learning_rate": 2.2635e-05, + "loss": 0.3275, + "step": 7551 + }, + { + "epoch": 5.946829460417487, + "grad_norm": 0.811720073223114, + "learning_rate": 2.2638000000000002e-05, + "loss": 0.1898, + "step": 7552 + }, + { + "epoch": 5.947617172115006, + "grad_norm": 0.6250370740890503, + "learning_rate": 2.2641000000000002e-05, + "loss": 0.1892, + "step": 7553 + }, + { + "epoch": 5.9484048838125245, + "grad_norm": 0.9465884566307068, + "learning_rate": 2.2644000000000002e-05, + "loss": 0.1671, + "step": 7554 + }, + { + "epoch": 5.949192595510043, + "grad_norm": 0.8927658796310425, + "learning_rate": 2.2647000000000002e-05, + "loss": 0.0884, + "step": 7555 + }, + { + "epoch": 5.949980307207562, + "grad_norm": 0.7173064351081848, + "learning_rate": 2.265e-05, + "loss": 0.1485, + "step": 7556 + }, + { + "epoch": 5.950768018905081, + "grad_norm": 0.5049932599067688, + "learning_rate": 2.2653e-05, + "loss": 0.0391, + "step": 7557 + }, + { + "epoch": 5.9515557306026, + "grad_norm": 0.41346633434295654, + "learning_rate": 2.2656e-05, + "loss": 0.0326, + "step": 7558 + }, + { + "epoch": 5.952343442300118, + "grad_norm": 0.287440687417984, + "learning_rate": 2.2659e-05, + "loss": 0.0251, + "step": 7559 + }, + { + "epoch": 5.953131153997637, + "grad_norm": 0.34060966968536377, + "learning_rate": 2.2662e-05, + "loss": 0.0309, + "step": 7560 + }, + { + "epoch": 5.953918865695155, + "grad_norm": 0.33498528599739075, + "learning_rate": 2.2665e-05, + "loss": 0.0216, + "step": 7561 + }, + { + "epoch": 5.954706577392674, + "grad_norm": 0.3967617452144623, + "learning_rate": 2.2668e-05, + "loss": 0.0254, + "step": 7562 + }, + { + "epoch": 5.955494289090193, + "grad_norm": 0.5297648906707764, + "learning_rate": 2.2671e-05, + "loss": 0.0216, + "step": 7563 + }, + { + "epoch": 5.956282000787712, + "grad_norm": 0.8104897141456604, + "learning_rate": 2.2674e-05, + "loss": 0.0316, + "step": 7564 + }, + { + "epoch": 5.9570697124852305, + "grad_norm": 0.5127218961715698, + "learning_rate": 2.2677e-05, + "loss": 0.0179, + "step": 7565 + }, + { + "epoch": 5.957857424182749, + "grad_norm": 0.492701917886734, + "learning_rate": 2.268e-05, + "loss": 0.0246, + "step": 7566 + }, + { + "epoch": 5.958645135880268, + "grad_norm": 0.49930086731910706, + "learning_rate": 2.2683e-05, + "loss": 0.0218, + "step": 7567 + }, + { + "epoch": 
5.959432847577786, + "grad_norm": 0.7389442920684814, + "learning_rate": 2.2686e-05, + "loss": 0.0285, + "step": 7568 + }, + { + "epoch": 5.960220559275305, + "grad_norm": 0.6677248477935791, + "learning_rate": 2.2689e-05, + "loss": 0.045, + "step": 7569 + }, + { + "epoch": 5.961008270972824, + "grad_norm": 0.4697619080543518, + "learning_rate": 2.2692e-05, + "loss": 0.024, + "step": 7570 + }, + { + "epoch": 5.961795982670343, + "grad_norm": 0.4799104630947113, + "learning_rate": 2.2695e-05, + "loss": 0.0341, + "step": 7571 + }, + { + "epoch": 5.962583694367861, + "grad_norm": 0.41168010234832764, + "learning_rate": 2.2698000000000002e-05, + "loss": 0.0581, + "step": 7572 + }, + { + "epoch": 5.96337140606538, + "grad_norm": 0.3499285876750946, + "learning_rate": 2.2701000000000002e-05, + "loss": 0.0148, + "step": 7573 + }, + { + "epoch": 5.9641591177628985, + "grad_norm": 0.5093154311180115, + "learning_rate": 2.2704e-05, + "loss": 0.023, + "step": 7574 + }, + { + "epoch": 5.964946829460417, + "grad_norm": 0.6148359775543213, + "learning_rate": 2.2707e-05, + "loss": 0.0242, + "step": 7575 + }, + { + "epoch": 5.9657345411579366, + "grad_norm": 0.5371955633163452, + "learning_rate": 2.271e-05, + "loss": 0.0184, + "step": 7576 + }, + { + "epoch": 5.966522252855455, + "grad_norm": 0.42654702067375183, + "learning_rate": 2.2713e-05, + "loss": 0.0343, + "step": 7577 + }, + { + "epoch": 5.967309964552974, + "grad_norm": 0.3875792920589447, + "learning_rate": 2.2716e-05, + "loss": 0.0236, + "step": 7578 + }, + { + "epoch": 5.968097676250492, + "grad_norm": 0.5190200209617615, + "learning_rate": 2.2719e-05, + "loss": 0.0205, + "step": 7579 + }, + { + "epoch": 5.968885387948011, + "grad_norm": 0.3857458531856537, + "learning_rate": 2.2722e-05, + "loss": 0.02, + "step": 7580 + }, + { + "epoch": 5.969673099645529, + "grad_norm": 0.565773069858551, + "learning_rate": 2.2725e-05, + "loss": 0.0302, + "step": 7581 + }, + { + "epoch": 5.970460811343049, + "grad_norm": 0.2882916033267975, + "learning_rate": 2.2728000000000003e-05, + "loss": 0.0164, + "step": 7582 + }, + { + "epoch": 5.971248523040567, + "grad_norm": 0.38042208552360535, + "learning_rate": 2.2731000000000003e-05, + "loss": 0.0216, + "step": 7583 + }, + { + "epoch": 5.972036234738086, + "grad_norm": 0.9930114150047302, + "learning_rate": 2.2734000000000003e-05, + "loss": 0.0487, + "step": 7584 + }, + { + "epoch": 5.9728239464356045, + "grad_norm": 0.9388958811759949, + "learning_rate": 2.2737000000000003e-05, + "loss": 0.0199, + "step": 7585 + }, + { + "epoch": 5.973611658133123, + "grad_norm": 0.6600744724273682, + "learning_rate": 2.274e-05, + "loss": 0.0205, + "step": 7586 + }, + { + "epoch": 5.974399369830642, + "grad_norm": 0.5282280445098877, + "learning_rate": 2.2743e-05, + "loss": 0.0237, + "step": 7587 + }, + { + "epoch": 5.97518708152816, + "grad_norm": 0.5327697396278381, + "learning_rate": 2.2746e-05, + "loss": 0.0181, + "step": 7588 + }, + { + "epoch": 5.97597479322568, + "grad_norm": 0.5351530313491821, + "learning_rate": 2.2749e-05, + "loss": 0.0305, + "step": 7589 + }, + { + "epoch": 5.976762504923198, + "grad_norm": 0.41856053471565247, + "learning_rate": 2.2752e-05, + "loss": 0.0265, + "step": 7590 + }, + { + "epoch": 5.977550216620717, + "grad_norm": 0.5013870596885681, + "learning_rate": 2.2754999999999998e-05, + "loss": 0.0298, + "step": 7591 + }, + { + "epoch": 5.978337928318235, + "grad_norm": 0.8064817786216736, + "learning_rate": 2.2758e-05, + "loss": 0.0369, + "step": 7592 + }, + { + "epoch": 5.979125640015754, + 
"grad_norm": 0.5113141536712646, + "learning_rate": 2.2761e-05, + "loss": 0.0306, + "step": 7593 + }, + { + "epoch": 5.979913351713273, + "grad_norm": 0.41400569677352905, + "learning_rate": 2.2764e-05, + "loss": 0.0263, + "step": 7594 + }, + { + "epoch": 5.980701063410792, + "grad_norm": 0.25450223684310913, + "learning_rate": 2.2767e-05, + "loss": 0.0165, + "step": 7595 + }, + { + "epoch": 5.981488775108311, + "grad_norm": 0.7016476988792419, + "learning_rate": 2.277e-05, + "loss": 0.0345, + "step": 7596 + }, + { + "epoch": 5.982276486805829, + "grad_norm": 0.6484482884407043, + "learning_rate": 2.2773e-05, + "loss": 0.0364, + "step": 7597 + }, + { + "epoch": 5.983064198503348, + "grad_norm": 0.5382499098777771, + "learning_rate": 2.2776e-05, + "loss": 0.0279, + "step": 7598 + }, + { + "epoch": 5.983851910200866, + "grad_norm": 0.7107750773429871, + "learning_rate": 2.2779e-05, + "loss": 0.0307, + "step": 7599 + }, + { + "epoch": 5.984639621898385, + "grad_norm": 1.9659613370895386, + "learning_rate": 2.2782e-05, + "loss": 0.0636, + "step": 7600 + }, + { + "epoch": 5.985427333595904, + "grad_norm": 1.2589417695999146, + "learning_rate": 2.2785e-05, + "loss": 0.2817, + "step": 7601 + }, + { + "epoch": 5.986215045293423, + "grad_norm": 0.9580874443054199, + "learning_rate": 2.2788000000000003e-05, + "loss": 0.1538, + "step": 7602 + }, + { + "epoch": 5.987002756990941, + "grad_norm": 0.4390586316585541, + "learning_rate": 2.2791000000000003e-05, + "loss": 0.039, + "step": 7603 + }, + { + "epoch": 5.98779046868846, + "grad_norm": 0.5490836501121521, + "learning_rate": 2.2794000000000002e-05, + "loss": 0.0577, + "step": 7604 + }, + { + "epoch": 5.9885781803859786, + "grad_norm": 0.6104032397270203, + "learning_rate": 2.2797000000000002e-05, + "loss": 0.0431, + "step": 7605 + }, + { + "epoch": 5.989365892083497, + "grad_norm": 0.27964261174201965, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.0259, + "step": 7606 + }, + { + "epoch": 5.990153603781016, + "grad_norm": 0.7721052169799805, + "learning_rate": 2.2803000000000002e-05, + "loss": 0.0341, + "step": 7607 + }, + { + "epoch": 5.990941315478535, + "grad_norm": 0.3529788553714752, + "learning_rate": 2.2806e-05, + "loss": 0.0249, + "step": 7608 + }, + { + "epoch": 5.991729027176054, + "grad_norm": 0.43809735774993896, + "learning_rate": 2.2809e-05, + "loss": 0.0254, + "step": 7609 + }, + { + "epoch": 5.992516738873572, + "grad_norm": 0.42944061756134033, + "learning_rate": 2.2812e-05, + "loss": 0.0317, + "step": 7610 + }, + { + "epoch": 5.993304450571091, + "grad_norm": 0.398857444524765, + "learning_rate": 2.2814999999999998e-05, + "loss": 0.025, + "step": 7611 + }, + { + "epoch": 5.994092162268609, + "grad_norm": 0.3694613575935364, + "learning_rate": 2.2818e-05, + "loss": 0.025, + "step": 7612 + }, + { + "epoch": 5.994879873966129, + "grad_norm": 0.2983225882053375, + "learning_rate": 2.2821e-05, + "loss": 0.0235, + "step": 7613 + }, + { + "epoch": 5.995667585663647, + "grad_norm": 0.350710928440094, + "learning_rate": 2.2824e-05, + "loss": 0.0208, + "step": 7614 + }, + { + "epoch": 5.996455297361166, + "grad_norm": 0.43092554807662964, + "learning_rate": 2.2827e-05, + "loss": 0.0443, + "step": 7615 + }, + { + "epoch": 5.997243009058685, + "grad_norm": 0.8082802295684814, + "learning_rate": 2.283e-05, + "loss": 0.0459, + "step": 7616 + }, + { + "epoch": 5.998030720756203, + "grad_norm": 0.5327081084251404, + "learning_rate": 2.2833e-05, + "loss": 0.0298, + "step": 7617 + }, + { + "epoch": 5.998818432453722, + "grad_norm": 
0.38381683826446533, + "learning_rate": 2.2836e-05, + "loss": 0.0189, + "step": 7618 + }, + { + "epoch": 5.99960614415124, + "grad_norm": 0.9939479827880859, + "learning_rate": 2.2839e-05, + "loss": 0.0268, + "step": 7619 + }, + { + "epoch": 6.0, + "grad_norm": 0.6553249955177307, + "learning_rate": 2.2842e-05, + "loss": 0.0108, + "step": 7620 + }, + { + "epoch": 6.000787711697519, + "grad_norm": 1.9541267156600952, + "learning_rate": 2.2845e-05, + "loss": 0.4412, + "step": 7621 + }, + { + "epoch": 6.001575423395037, + "grad_norm": 1.075101613998413, + "learning_rate": 2.2848000000000002e-05, + "loss": 0.2422, + "step": 7622 + }, + { + "epoch": 6.002363135092556, + "grad_norm": 0.784930944442749, + "learning_rate": 2.2851000000000002e-05, + "loss": 0.1547, + "step": 7623 + }, + { + "epoch": 6.003150846790075, + "grad_norm": 0.7275795340538025, + "learning_rate": 2.2854000000000002e-05, + "loss": 0.1321, + "step": 7624 + }, + { + "epoch": 6.003938558487594, + "grad_norm": 0.9636182188987732, + "learning_rate": 2.2857e-05, + "loss": 0.1389, + "step": 7625 + }, + { + "epoch": 6.004726270185112, + "grad_norm": 0.7168843746185303, + "learning_rate": 2.286e-05, + "loss": 0.0846, + "step": 7626 + }, + { + "epoch": 6.005513981882631, + "grad_norm": 0.29273176193237305, + "learning_rate": 2.2863e-05, + "loss": 0.0414, + "step": 7627 + }, + { + "epoch": 6.006301693580149, + "grad_norm": 0.5079658031463623, + "learning_rate": 2.2866e-05, + "loss": 0.0293, + "step": 7628 + }, + { + "epoch": 6.007089405277668, + "grad_norm": 0.3227942883968353, + "learning_rate": 2.2869e-05, + "loss": 0.0255, + "step": 7629 + }, + { + "epoch": 6.0078771169751874, + "grad_norm": 0.43143168091773987, + "learning_rate": 2.2872e-05, + "loss": 0.0311, + "step": 7630 + }, + { + "epoch": 6.008664828672706, + "grad_norm": 0.3467210531234741, + "learning_rate": 2.2875e-05, + "loss": 0.0267, + "step": 7631 + }, + { + "epoch": 6.009452540370225, + "grad_norm": 0.3681434988975525, + "learning_rate": 2.2878e-05, + "loss": 0.0295, + "step": 7632 + }, + { + "epoch": 6.010240252067743, + "grad_norm": 0.575013279914856, + "learning_rate": 2.2881000000000003e-05, + "loss": 0.0728, + "step": 7633 + }, + { + "epoch": 6.011027963765262, + "grad_norm": 0.47420021891593933, + "learning_rate": 2.2884000000000003e-05, + "loss": 0.0309, + "step": 7634 + }, + { + "epoch": 6.01181567546278, + "grad_norm": 0.3787539005279541, + "learning_rate": 2.2887e-05, + "loss": 0.0219, + "step": 7635 + }, + { + "epoch": 6.0126033871603, + "grad_norm": 0.398490846157074, + "learning_rate": 2.289e-05, + "loss": 0.0224, + "step": 7636 + }, + { + "epoch": 6.013391098857818, + "grad_norm": 0.5073296427726746, + "learning_rate": 2.2893e-05, + "loss": 0.0142, + "step": 7637 + }, + { + "epoch": 6.014178810555337, + "grad_norm": 1.006945252418518, + "learning_rate": 2.2896e-05, + "loss": 0.0205, + "step": 7638 + }, + { + "epoch": 6.014966522252855, + "grad_norm": 0.3439183235168457, + "learning_rate": 2.2899e-05, + "loss": 0.0129, + "step": 7639 + }, + { + "epoch": 6.015754233950374, + "grad_norm": 0.5594606399536133, + "learning_rate": 2.2902e-05, + "loss": 0.0204, + "step": 7640 + }, + { + "epoch": 6.016541945647893, + "grad_norm": 0.2997545301914215, + "learning_rate": 2.2905e-05, + "loss": 0.0206, + "step": 7641 + }, + { + "epoch": 6.017329657345411, + "grad_norm": 0.3904585540294647, + "learning_rate": 2.2907999999999998e-05, + "loss": 0.0242, + "step": 7642 + }, + { + "epoch": 6.018117369042931, + "grad_norm": 0.5120677351951599, + "learning_rate": 2.2911e-05, + 
"loss": 0.0278, + "step": 7643 + }, + { + "epoch": 6.018905080740449, + "grad_norm": 0.4072013199329376, + "learning_rate": 2.2914e-05, + "loss": 0.023, + "step": 7644 + }, + { + "epoch": 6.019692792437968, + "grad_norm": 0.4236587584018707, + "learning_rate": 2.2917e-05, + "loss": 0.029, + "step": 7645 + }, + { + "epoch": 6.020480504135486, + "grad_norm": 0.5583325028419495, + "learning_rate": 2.292e-05, + "loss": 0.0285, + "step": 7646 + }, + { + "epoch": 6.021268215833005, + "grad_norm": 0.22957488894462585, + "learning_rate": 2.2923e-05, + "loss": 0.0215, + "step": 7647 + }, + { + "epoch": 6.022055927530523, + "grad_norm": 0.2490191012620926, + "learning_rate": 2.2926e-05, + "loss": 0.0169, + "step": 7648 + }, + { + "epoch": 6.022843639228043, + "grad_norm": 0.5116975903511047, + "learning_rate": 2.2929e-05, + "loss": 0.0165, + "step": 7649 + }, + { + "epoch": 6.0236313509255615, + "grad_norm": 0.39632317423820496, + "learning_rate": 2.2932e-05, + "loss": 0.0162, + "step": 7650 + }, + { + "epoch": 6.02441906262308, + "grad_norm": 0.2930530607700348, + "learning_rate": 2.2935e-05, + "loss": 0.0173, + "step": 7651 + }, + { + "epoch": 6.025206774320599, + "grad_norm": 1.0048574209213257, + "learning_rate": 2.2938e-05, + "loss": 0.0462, + "step": 7652 + }, + { + "epoch": 6.025994486018117, + "grad_norm": 0.23530393838882446, + "learning_rate": 2.2941000000000003e-05, + "loss": 0.014, + "step": 7653 + }, + { + "epoch": 6.026782197715636, + "grad_norm": 0.30272164940834045, + "learning_rate": 2.2944000000000003e-05, + "loss": 0.014, + "step": 7654 + }, + { + "epoch": 6.027569909413155, + "grad_norm": 0.35008886456489563, + "learning_rate": 2.2947000000000002e-05, + "loss": 0.0109, + "step": 7655 + }, + { + "epoch": 6.028357621110674, + "grad_norm": 0.6612917184829712, + "learning_rate": 2.2950000000000002e-05, + "loss": 0.0212, + "step": 7656 + }, + { + "epoch": 6.029145332808192, + "grad_norm": 0.3757241368293762, + "learning_rate": 2.2953000000000002e-05, + "loss": 0.02, + "step": 7657 + }, + { + "epoch": 6.029933044505711, + "grad_norm": 0.42388930916786194, + "learning_rate": 2.2956000000000002e-05, + "loss": 0.0225, + "step": 7658 + }, + { + "epoch": 6.0307207562032294, + "grad_norm": 0.5561228394508362, + "learning_rate": 2.2959e-05, + "loss": 0.0316, + "step": 7659 + }, + { + "epoch": 6.031508467900748, + "grad_norm": 0.5288693904876709, + "learning_rate": 2.2961999999999998e-05, + "loss": 0.0263, + "step": 7660 + }, + { + "epoch": 6.0322961795982675, + "grad_norm": 0.2782362401485443, + "learning_rate": 2.2964999999999998e-05, + "loss": 0.0144, + "step": 7661 + }, + { + "epoch": 6.033083891295786, + "grad_norm": 0.31150153279304504, + "learning_rate": 2.2967999999999998e-05, + "loss": 0.0189, + "step": 7662 + }, + { + "epoch": 6.033871602993305, + "grad_norm": 0.5037028789520264, + "learning_rate": 2.2971e-05, + "loss": 0.022, + "step": 7663 + }, + { + "epoch": 6.034659314690823, + "grad_norm": 0.8580628037452698, + "learning_rate": 2.2974e-05, + "loss": 0.038, + "step": 7664 + }, + { + "epoch": 6.035447026388342, + "grad_norm": 0.26799213886260986, + "learning_rate": 2.2977e-05, + "loss": 0.0137, + "step": 7665 + }, + { + "epoch": 6.03623473808586, + "grad_norm": 0.49061429500579834, + "learning_rate": 2.298e-05, + "loss": 0.023, + "step": 7666 + }, + { + "epoch": 6.037022449783379, + "grad_norm": 0.6453385353088379, + "learning_rate": 2.2983e-05, + "loss": 0.0297, + "step": 7667 + }, + { + "epoch": 6.037810161480898, + "grad_norm": 1.0254312753677368, + "learning_rate": 2.2986e-05, 
+ "loss": 0.0213, + "step": 7668 + }, + { + "epoch": 6.038597873178417, + "grad_norm": 0.7206034064292908, + "learning_rate": 2.2989e-05, + "loss": 0.0204, + "step": 7669 + }, + { + "epoch": 6.0393855848759355, + "grad_norm": 0.7542174458503723, + "learning_rate": 2.2992e-05, + "loss": 0.0238, + "step": 7670 + }, + { + "epoch": 6.040173296573454, + "grad_norm": 1.032448649406433, + "learning_rate": 2.2995e-05, + "loss": 0.279, + "step": 7671 + }, + { + "epoch": 6.040961008270973, + "grad_norm": 1.9735047817230225, + "learning_rate": 2.2998e-05, + "loss": 0.2882, + "step": 7672 + }, + { + "epoch": 6.041748719968491, + "grad_norm": 0.7739889025688171, + "learning_rate": 2.3001000000000002e-05, + "loss": 0.1573, + "step": 7673 + }, + { + "epoch": 6.042536431666011, + "grad_norm": 0.8069956302642822, + "learning_rate": 2.3004000000000002e-05, + "loss": 0.1881, + "step": 7674 + }, + { + "epoch": 6.043324143363529, + "grad_norm": 0.4344964325428009, + "learning_rate": 2.3007000000000002e-05, + "loss": 0.0465, + "step": 7675 + }, + { + "epoch": 6.044111855061048, + "grad_norm": 0.4824860095977783, + "learning_rate": 2.301e-05, + "loss": 0.0518, + "step": 7676 + }, + { + "epoch": 6.044899566758566, + "grad_norm": 0.3154263198375702, + "learning_rate": 2.3013e-05, + "loss": 0.0339, + "step": 7677 + }, + { + "epoch": 6.045687278456085, + "grad_norm": 0.5777984857559204, + "learning_rate": 2.3016e-05, + "loss": 0.0612, + "step": 7678 + }, + { + "epoch": 6.0464749901536035, + "grad_norm": 0.2597048580646515, + "learning_rate": 2.3019e-05, + "loss": 0.0186, + "step": 7679 + }, + { + "epoch": 6.047262701851123, + "grad_norm": 0.3237766623497009, + "learning_rate": 2.3022e-05, + "loss": 0.0191, + "step": 7680 + }, + { + "epoch": 6.0480504135486415, + "grad_norm": 0.6227539777755737, + "learning_rate": 2.3025e-05, + "loss": 0.0357, + "step": 7681 + }, + { + "epoch": 6.04883812524616, + "grad_norm": 0.5060641765594482, + "learning_rate": 2.3028e-05, + "loss": 0.0934, + "step": 7682 + }, + { + "epoch": 6.049625836943679, + "grad_norm": 0.3550703525543213, + "learning_rate": 2.3031000000000004e-05, + "loss": 0.0254, + "step": 7683 + }, + { + "epoch": 6.050413548641197, + "grad_norm": 0.4383516013622284, + "learning_rate": 2.3034e-05, + "loss": 0.023, + "step": 7684 + }, + { + "epoch": 6.051201260338716, + "grad_norm": 0.36449551582336426, + "learning_rate": 2.3037e-05, + "loss": 0.0204, + "step": 7685 + }, + { + "epoch": 6.051988972036234, + "grad_norm": 0.35194486379623413, + "learning_rate": 2.304e-05, + "loss": 0.026, + "step": 7686 + }, + { + "epoch": 6.052776683733754, + "grad_norm": 0.3370625674724579, + "learning_rate": 2.3043e-05, + "loss": 0.0251, + "step": 7687 + }, + { + "epoch": 6.053564395431272, + "grad_norm": 0.46794435381889343, + "learning_rate": 2.3046e-05, + "loss": 0.0337, + "step": 7688 + }, + { + "epoch": 6.054352107128791, + "grad_norm": 0.24105626344680786, + "learning_rate": 2.3049e-05, + "loss": 0.0147, + "step": 7689 + }, + { + "epoch": 6.0551398188263095, + "grad_norm": 0.444987952709198, + "learning_rate": 2.3052e-05, + "loss": 0.0291, + "step": 7690 + }, + { + "epoch": 6.055927530523828, + "grad_norm": 0.3015430271625519, + "learning_rate": 2.3055e-05, + "loss": 0.0111, + "step": 7691 + }, + { + "epoch": 6.056715242221347, + "grad_norm": 0.30496376752853394, + "learning_rate": 2.3058e-05, + "loss": 0.0194, + "step": 7692 + }, + { + "epoch": 6.057502953918866, + "grad_norm": 0.38958048820495605, + "learning_rate": 2.3061e-05, + "loss": 0.0182, + "step": 7693 + }, + { + "epoch": 
6.058290665616385, + "grad_norm": 0.43366894125938416, + "learning_rate": 2.3064e-05, + "loss": 0.0192, + "step": 7694 + }, + { + "epoch": 6.059078377313903, + "grad_norm": 0.480649471282959, + "learning_rate": 2.3067e-05, + "loss": 0.0239, + "step": 7695 + }, + { + "epoch": 6.059866089011422, + "grad_norm": 0.25862032175064087, + "learning_rate": 2.307e-05, + "loss": 0.0098, + "step": 7696 + }, + { + "epoch": 6.06065380070894, + "grad_norm": 0.35655832290649414, + "learning_rate": 2.3073e-05, + "loss": 0.0164, + "step": 7697 + }, + { + "epoch": 6.061441512406459, + "grad_norm": 0.3809155821800232, + "learning_rate": 2.3076e-05, + "loss": 0.0144, + "step": 7698 + }, + { + "epoch": 6.062229224103978, + "grad_norm": 1.0425140857696533, + "learning_rate": 2.3079e-05, + "loss": 0.0363, + "step": 7699 + }, + { + "epoch": 6.063016935801497, + "grad_norm": 0.3864046633243561, + "learning_rate": 2.3082e-05, + "loss": 0.0214, + "step": 7700 + }, + { + "epoch": 6.0638046474990155, + "grad_norm": 0.7235077023506165, + "learning_rate": 2.3085e-05, + "loss": 0.0225, + "step": 7701 + }, + { + "epoch": 6.064592359196534, + "grad_norm": 0.4380795359611511, + "learning_rate": 2.3088e-05, + "loss": 0.0292, + "step": 7702 + }, + { + "epoch": 6.065380070894053, + "grad_norm": 0.8251463174819946, + "learning_rate": 2.3091000000000003e-05, + "loss": 0.0225, + "step": 7703 + }, + { + "epoch": 6.066167782591571, + "grad_norm": 0.4789700210094452, + "learning_rate": 2.3094000000000003e-05, + "loss": 0.0333, + "step": 7704 + }, + { + "epoch": 6.06695549428909, + "grad_norm": 0.5799053907394409, + "learning_rate": 2.3097000000000003e-05, + "loss": 0.0296, + "step": 7705 + }, + { + "epoch": 6.067743205986609, + "grad_norm": 0.3882097005844116, + "learning_rate": 2.3100000000000002e-05, + "loss": 0.023, + "step": 7706 + }, + { + "epoch": 6.068530917684128, + "grad_norm": 0.3882692754268646, + "learning_rate": 2.3103000000000002e-05, + "loss": 0.0252, + "step": 7707 + }, + { + "epoch": 6.069318629381646, + "grad_norm": 0.48816630244255066, + "learning_rate": 2.3106000000000002e-05, + "loss": 0.0201, + "step": 7708 + }, + { + "epoch": 6.070106341079165, + "grad_norm": 0.3649718165397644, + "learning_rate": 2.3109e-05, + "loss": 0.021, + "step": 7709 + }, + { + "epoch": 6.0708940527766835, + "grad_norm": 0.5552560091018677, + "learning_rate": 2.3111999999999998e-05, + "loss": 0.0287, + "step": 7710 + }, + { + "epoch": 6.071681764474202, + "grad_norm": 0.40127789974212646, + "learning_rate": 2.3114999999999998e-05, + "loss": 0.02, + "step": 7711 + }, + { + "epoch": 6.0724694761717215, + "grad_norm": 0.7461314797401428, + "learning_rate": 2.3117999999999998e-05, + "loss": 0.0396, + "step": 7712 + }, + { + "epoch": 6.07325718786924, + "grad_norm": 0.4511371850967407, + "learning_rate": 2.3121e-05, + "loss": 0.0117, + "step": 7713 + }, + { + "epoch": 6.074044899566759, + "grad_norm": 0.4413197636604309, + "learning_rate": 2.3124e-05, + "loss": 0.0232, + "step": 7714 + }, + { + "epoch": 6.074832611264277, + "grad_norm": 0.5408613085746765, + "learning_rate": 2.3127e-05, + "loss": 0.025, + "step": 7715 + }, + { + "epoch": 6.075620322961796, + "grad_norm": 1.980278730392456, + "learning_rate": 2.313e-05, + "loss": 0.021, + "step": 7716 + }, + { + "epoch": 6.076408034659314, + "grad_norm": 0.40312764048576355, + "learning_rate": 2.3133e-05, + "loss": 0.0231, + "step": 7717 + }, + { + "epoch": 6.077195746356834, + "grad_norm": 0.3091520369052887, + "learning_rate": 2.3136e-05, + "loss": 0.019, + "step": 7718 + }, + { + "epoch": 
6.077983458054352, + "grad_norm": 0.5983188152313232, + "learning_rate": 2.3139e-05, + "loss": 0.0288, + "step": 7719 + }, + { + "epoch": 6.078771169751871, + "grad_norm": 0.6458439230918884, + "learning_rate": 2.3142e-05, + "loss": 0.0263, + "step": 7720 + }, + { + "epoch": 6.0795588814493895, + "grad_norm": 1.4563790559768677, + "learning_rate": 2.3145e-05, + "loss": 0.3558, + "step": 7721 + }, + { + "epoch": 6.080346593146908, + "grad_norm": 0.8415952920913696, + "learning_rate": 2.3148e-05, + "loss": 0.2608, + "step": 7722 + }, + { + "epoch": 6.081134304844427, + "grad_norm": 0.7501044869422913, + "learning_rate": 2.3151000000000002e-05, + "loss": 0.2167, + "step": 7723 + }, + { + "epoch": 6.081922016541945, + "grad_norm": 0.5636917948722839, + "learning_rate": 2.3154000000000002e-05, + "loss": 0.1426, + "step": 7724 + }, + { + "epoch": 6.082709728239465, + "grad_norm": 0.6122381687164307, + "learning_rate": 2.3157000000000002e-05, + "loss": 0.137, + "step": 7725 + }, + { + "epoch": 6.083497439936983, + "grad_norm": 0.6225518584251404, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.0922, + "step": 7726 + }, + { + "epoch": 6.084285151634502, + "grad_norm": 0.5375517010688782, + "learning_rate": 2.3163e-05, + "loss": 0.0382, + "step": 7727 + }, + { + "epoch": 6.08507286333202, + "grad_norm": 1.0604263544082642, + "learning_rate": 2.3166e-05, + "loss": 0.0529, + "step": 7728 + }, + { + "epoch": 6.085860575029539, + "grad_norm": 0.33098748326301575, + "learning_rate": 2.3169e-05, + "loss": 0.015, + "step": 7729 + }, + { + "epoch": 6.0866482867270575, + "grad_norm": 0.4936716854572296, + "learning_rate": 2.3172e-05, + "loss": 0.0526, + "step": 7730 + }, + { + "epoch": 6.087435998424577, + "grad_norm": 0.2227405607700348, + "learning_rate": 2.3175e-05, + "loss": 0.0183, + "step": 7731 + }, + { + "epoch": 6.0882237101220955, + "grad_norm": 0.22956015169620514, + "learning_rate": 2.3178e-05, + "loss": 0.014, + "step": 7732 + }, + { + "epoch": 6.089011421819614, + "grad_norm": 0.2889116704463959, + "learning_rate": 2.3181000000000004e-05, + "loss": 0.0249, + "step": 7733 + }, + { + "epoch": 6.089799133517133, + "grad_norm": 0.5321547985076904, + "learning_rate": 2.3184e-05, + "loss": 0.0315, + "step": 7734 + }, + { + "epoch": 6.090586845214651, + "grad_norm": 0.5782608985900879, + "learning_rate": 2.3187e-05, + "loss": 0.0236, + "step": 7735 + }, + { + "epoch": 6.09137455691217, + "grad_norm": 0.18334460258483887, + "learning_rate": 2.319e-05, + "loss": 0.0114, + "step": 7736 + }, + { + "epoch": 6.092162268609689, + "grad_norm": 0.31693553924560547, + "learning_rate": 2.3193e-05, + "loss": 0.0147, + "step": 7737 + }, + { + "epoch": 6.092949980307208, + "grad_norm": 1.525753378868103, + "learning_rate": 2.3196e-05, + "loss": 0.0217, + "step": 7738 + }, + { + "epoch": 6.093737692004726, + "grad_norm": 0.49405568838119507, + "learning_rate": 2.3199e-05, + "loss": 0.0177, + "step": 7739 + }, + { + "epoch": 6.094525403702245, + "grad_norm": 0.2603946924209595, + "learning_rate": 2.3202e-05, + "loss": 0.0145, + "step": 7740 + }, + { + "epoch": 6.0953131153997635, + "grad_norm": 0.3722802996635437, + "learning_rate": 2.3205e-05, + "loss": 0.0191, + "step": 7741 + }, + { + "epoch": 6.096100827097282, + "grad_norm": 0.39019066095352173, + "learning_rate": 2.3208e-05, + "loss": 0.0122, + "step": 7742 + }, + { + "epoch": 6.0968885387948015, + "grad_norm": 0.48828762769699097, + "learning_rate": 2.3211000000000002e-05, + "loss": 0.0196, + "step": 7743 + }, + { + "epoch": 6.09767625049232, + 
"grad_norm": 0.6238077878952026, + "learning_rate": 2.3214000000000002e-05, + "loss": 0.0234, + "step": 7744 + }, + { + "epoch": 6.098463962189839, + "grad_norm": 0.35097864270210266, + "learning_rate": 2.3217e-05, + "loss": 0.0245, + "step": 7745 + }, + { + "epoch": 6.099251673887357, + "grad_norm": 0.660934329032898, + "learning_rate": 2.322e-05, + "loss": 0.0237, + "step": 7746 + }, + { + "epoch": 6.100039385584876, + "grad_norm": 0.34716472029685974, + "learning_rate": 2.3223e-05, + "loss": 0.0271, + "step": 7747 + }, + { + "epoch": 6.100827097282394, + "grad_norm": 0.5405645370483398, + "learning_rate": 2.3226e-05, + "loss": 0.0257, + "step": 7748 + }, + { + "epoch": 6.101614808979913, + "grad_norm": 0.27557075023651123, + "learning_rate": 2.3229e-05, + "loss": 0.0122, + "step": 7749 + }, + { + "epoch": 6.102402520677432, + "grad_norm": 0.4074194133281708, + "learning_rate": 2.3232e-05, + "loss": 0.0097, + "step": 7750 + }, + { + "epoch": 6.103190232374951, + "grad_norm": 0.6526117920875549, + "learning_rate": 2.3235e-05, + "loss": 0.032, + "step": 7751 + }, + { + "epoch": 6.1039779440724695, + "grad_norm": 0.35382091999053955, + "learning_rate": 2.3238e-05, + "loss": 0.0098, + "step": 7752 + }, + { + "epoch": 6.104765655769988, + "grad_norm": 0.39443230628967285, + "learning_rate": 2.3241000000000003e-05, + "loss": 0.0204, + "step": 7753 + }, + { + "epoch": 6.105553367467507, + "grad_norm": 0.5083954930305481, + "learning_rate": 2.3244000000000003e-05, + "loss": 0.0339, + "step": 7754 + }, + { + "epoch": 6.106341079165025, + "grad_norm": 0.6107825636863708, + "learning_rate": 2.3247000000000003e-05, + "loss": 0.0432, + "step": 7755 + }, + { + "epoch": 6.107128790862545, + "grad_norm": 0.41930535435676575, + "learning_rate": 2.3250000000000003e-05, + "loss": 0.0296, + "step": 7756 + }, + { + "epoch": 6.107916502560063, + "grad_norm": 0.21906185150146484, + "learning_rate": 2.3253000000000003e-05, + "loss": 0.0107, + "step": 7757 + }, + { + "epoch": 6.108704214257582, + "grad_norm": 0.3362185060977936, + "learning_rate": 2.3256e-05, + "loss": 0.0341, + "step": 7758 + }, + { + "epoch": 6.1094919259551, + "grad_norm": 0.35771092772483826, + "learning_rate": 2.3259e-05, + "loss": 0.015, + "step": 7759 + }, + { + "epoch": 6.110279637652619, + "grad_norm": 0.7734946012496948, + "learning_rate": 2.3262e-05, + "loss": 0.0276, + "step": 7760 + }, + { + "epoch": 6.1110673493501375, + "grad_norm": 0.663244366645813, + "learning_rate": 2.3265e-05, + "loss": 0.0219, + "step": 7761 + }, + { + "epoch": 6.111855061047657, + "grad_norm": 0.30704575777053833, + "learning_rate": 2.3267999999999998e-05, + "loss": 0.0165, + "step": 7762 + }, + { + "epoch": 6.1126427727451755, + "grad_norm": 0.49614372849464417, + "learning_rate": 2.3270999999999998e-05, + "loss": 0.0144, + "step": 7763 + }, + { + "epoch": 6.113430484442694, + "grad_norm": 0.32210874557495117, + "learning_rate": 2.3274e-05, + "loss": 0.0203, + "step": 7764 + }, + { + "epoch": 6.114218196140213, + "grad_norm": 0.3816727101802826, + "learning_rate": 2.3277e-05, + "loss": 0.027, + "step": 7765 + }, + { + "epoch": 6.115005907837731, + "grad_norm": 0.19267410039901733, + "learning_rate": 2.328e-05, + "loss": 0.009, + "step": 7766 + }, + { + "epoch": 6.11579361953525, + "grad_norm": 0.9708350896835327, + "learning_rate": 2.3283e-05, + "loss": 0.0242, + "step": 7767 + }, + { + "epoch": 6.116581331232768, + "grad_norm": 0.43417075276374817, + "learning_rate": 2.3286e-05, + "loss": 0.0221, + "step": 7768 + }, + { + "epoch": 6.117369042930288, + 
"grad_norm": 0.6329561471939087, + "learning_rate": 2.3289e-05, + "loss": 0.0282, + "step": 7769 + }, + { + "epoch": 6.118156754627806, + "grad_norm": 0.6207032799720764, + "learning_rate": 2.3292e-05, + "loss": 0.028, + "step": 7770 + }, + { + "epoch": 6.118944466325325, + "grad_norm": 1.0622371435165405, + "learning_rate": 2.3295e-05, + "loss": 0.2492, + "step": 7771 + }, + { + "epoch": 6.1197321780228435, + "grad_norm": 0.834469735622406, + "learning_rate": 2.3298e-05, + "loss": 0.1865, + "step": 7772 + }, + { + "epoch": 6.120519889720362, + "grad_norm": 0.5618965029716492, + "learning_rate": 2.3301e-05, + "loss": 0.1448, + "step": 7773 + }, + { + "epoch": 6.121307601417881, + "grad_norm": 0.6537894606590271, + "learning_rate": 2.3304000000000003e-05, + "loss": 0.1136, + "step": 7774 + }, + { + "epoch": 6.1220953131154, + "grad_norm": 0.6371846795082092, + "learning_rate": 2.3307000000000002e-05, + "loss": 0.0876, + "step": 7775 + }, + { + "epoch": 6.122883024812919, + "grad_norm": 0.8166736364364624, + "learning_rate": 2.3310000000000002e-05, + "loss": 0.0799, + "step": 7776 + }, + { + "epoch": 6.123670736510437, + "grad_norm": 0.510674774646759, + "learning_rate": 2.3313000000000002e-05, + "loss": 0.0575, + "step": 7777 + }, + { + "epoch": 6.124458448207956, + "grad_norm": 0.3865259885787964, + "learning_rate": 2.3316000000000002e-05, + "loss": 0.0435, + "step": 7778 + }, + { + "epoch": 6.125246159905474, + "grad_norm": 0.40345633029937744, + "learning_rate": 2.3319e-05, + "loss": 0.0505, + "step": 7779 + }, + { + "epoch": 6.126033871602993, + "grad_norm": 0.41099873185157776, + "learning_rate": 2.3322e-05, + "loss": 0.0192, + "step": 7780 + }, + { + "epoch": 6.126821583300512, + "grad_norm": 0.47643667459487915, + "learning_rate": 2.3325e-05, + "loss": 0.0247, + "step": 7781 + }, + { + "epoch": 6.127609294998031, + "grad_norm": 0.4686391353607178, + "learning_rate": 2.3328e-05, + "loss": 0.0146, + "step": 7782 + }, + { + "epoch": 6.1283970066955495, + "grad_norm": 0.49262312054634094, + "learning_rate": 2.3330999999999997e-05, + "loss": 0.0164, + "step": 7783 + }, + { + "epoch": 6.129184718393068, + "grad_norm": 0.4204620122909546, + "learning_rate": 2.3334e-05, + "loss": 0.0235, + "step": 7784 + }, + { + "epoch": 6.129972430090587, + "grad_norm": 0.31958481669425964, + "learning_rate": 2.3337e-05, + "loss": 0.022, + "step": 7785 + }, + { + "epoch": 6.130760141788105, + "grad_norm": 0.6051145195960999, + "learning_rate": 2.334e-05, + "loss": 0.0221, + "step": 7786 + }, + { + "epoch": 6.131547853485625, + "grad_norm": 0.2721211314201355, + "learning_rate": 2.3343e-05, + "loss": 0.0236, + "step": 7787 + }, + { + "epoch": 6.132335565183143, + "grad_norm": 0.2591617703437805, + "learning_rate": 2.3346e-05, + "loss": 0.0176, + "step": 7788 + }, + { + "epoch": 6.133123276880662, + "grad_norm": 0.4969123303890228, + "learning_rate": 2.3349e-05, + "loss": 0.0164, + "step": 7789 + }, + { + "epoch": 6.13391098857818, + "grad_norm": 0.3057115375995636, + "learning_rate": 2.3352e-05, + "loss": 0.0151, + "step": 7790 + }, + { + "epoch": 6.134698700275699, + "grad_norm": 0.3314473330974579, + "learning_rate": 2.3355e-05, + "loss": 0.016, + "step": 7791 + }, + { + "epoch": 6.1354864119732175, + "grad_norm": 0.40685006976127625, + "learning_rate": 2.3358e-05, + "loss": 0.0204, + "step": 7792 + }, + { + "epoch": 6.136274123670736, + "grad_norm": 0.4878276586532593, + "learning_rate": 2.3361e-05, + "loss": 0.0572, + "step": 7793 + }, + { + "epoch": 6.137061835368256, + "grad_norm": 0.3554495573043823, 
+ "learning_rate": 2.3364000000000002e-05, + "loss": 0.0164, + "step": 7794 + }, + { + "epoch": 6.137849547065774, + "grad_norm": 0.41921505331993103, + "learning_rate": 2.3367000000000002e-05, + "loss": 0.0189, + "step": 7795 + }, + { + "epoch": 6.138637258763293, + "grad_norm": 0.3200557231903076, + "learning_rate": 2.337e-05, + "loss": 0.0213, + "step": 7796 + }, + { + "epoch": 6.139424970460811, + "grad_norm": 0.2932818531990051, + "learning_rate": 2.3373e-05, + "loss": 0.0135, + "step": 7797 + }, + { + "epoch": 6.14021268215833, + "grad_norm": 0.30058661103248596, + "learning_rate": 2.3376e-05, + "loss": 0.0145, + "step": 7798 + }, + { + "epoch": 6.141000393855848, + "grad_norm": 0.2759752869606018, + "learning_rate": 2.3379e-05, + "loss": 0.0154, + "step": 7799 + }, + { + "epoch": 6.141788105553368, + "grad_norm": 0.30793410539627075, + "learning_rate": 2.3382e-05, + "loss": 0.0195, + "step": 7800 + }, + { + "epoch": 6.142575817250886, + "grad_norm": 1.660492181777954, + "learning_rate": 2.3385e-05, + "loss": 0.038, + "step": 7801 + }, + { + "epoch": 6.143363528948405, + "grad_norm": 0.4138924479484558, + "learning_rate": 2.3388e-05, + "loss": 0.0157, + "step": 7802 + }, + { + "epoch": 6.1441512406459236, + "grad_norm": 0.23954546451568604, + "learning_rate": 2.3391e-05, + "loss": 0.0168, + "step": 7803 + }, + { + "epoch": 6.144938952343442, + "grad_norm": 0.44883081316947937, + "learning_rate": 2.3394000000000003e-05, + "loss": 0.0302, + "step": 7804 + }, + { + "epoch": 6.145726664040961, + "grad_norm": 0.48411792516708374, + "learning_rate": 2.3397000000000003e-05, + "loss": 0.0242, + "step": 7805 + }, + { + "epoch": 6.14651437573848, + "grad_norm": 0.2309124618768692, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.0116, + "step": 7806 + }, + { + "epoch": 6.147302087435999, + "grad_norm": 0.72086501121521, + "learning_rate": 2.3403e-05, + "loss": 0.0229, + "step": 7807 + }, + { + "epoch": 6.148089799133517, + "grad_norm": 0.6646937131881714, + "learning_rate": 2.3406e-05, + "loss": 0.0175, + "step": 7808 + }, + { + "epoch": 6.148877510831036, + "grad_norm": 1.3113377094268799, + "learning_rate": 2.3409e-05, + "loss": 0.0253, + "step": 7809 + }, + { + "epoch": 6.149665222528554, + "grad_norm": 0.4384828805923462, + "learning_rate": 2.3412e-05, + "loss": 0.0142, + "step": 7810 + }, + { + "epoch": 6.150452934226073, + "grad_norm": 0.5327108502388, + "learning_rate": 2.3415e-05, + "loss": 0.0319, + "step": 7811 + }, + { + "epoch": 6.1512406459235915, + "grad_norm": 0.624022364616394, + "learning_rate": 2.3418e-05, + "loss": 0.0166, + "step": 7812 + }, + { + "epoch": 6.152028357621111, + "grad_norm": 0.6071394681930542, + "learning_rate": 2.3420999999999998e-05, + "loss": 0.0287, + "step": 7813 + }, + { + "epoch": 6.15281606931863, + "grad_norm": 0.3611057698726654, + "learning_rate": 2.3424e-05, + "loss": 0.0274, + "step": 7814 + }, + { + "epoch": 6.153603781016148, + "grad_norm": 0.4016696810722351, + "learning_rate": 2.3427e-05, + "loss": 0.0264, + "step": 7815 + }, + { + "epoch": 6.154391492713667, + "grad_norm": 1.034593105316162, + "learning_rate": 2.343e-05, + "loss": 0.0353, + "step": 7816 + }, + { + "epoch": 6.155179204411185, + "grad_norm": 0.6443043351173401, + "learning_rate": 2.3433e-05, + "loss": 0.0213, + "step": 7817 + }, + { + "epoch": 6.155966916108704, + "grad_norm": 0.5665024518966675, + "learning_rate": 2.3436e-05, + "loss": 0.0296, + "step": 7818 + }, + { + "epoch": 6.156754627806223, + "grad_norm": 0.8967957496643066, + "learning_rate": 2.3439e-05, + 
"loss": 0.0352, + "step": 7819 + }, + { + "epoch": 6.157542339503742, + "grad_norm": 0.5989775061607361, + "learning_rate": 2.3442e-05, + "loss": 0.015, + "step": 7820 + }, + { + "epoch": 6.15833005120126, + "grad_norm": 0.9496285319328308, + "learning_rate": 2.3445e-05, + "loss": 0.2506, + "step": 7821 + }, + { + "epoch": 6.159117762898779, + "grad_norm": 0.9616008400917053, + "learning_rate": 2.3448e-05, + "loss": 0.2009, + "step": 7822 + }, + { + "epoch": 6.159905474596298, + "grad_norm": 0.6637731194496155, + "learning_rate": 2.3451e-05, + "loss": 0.1544, + "step": 7823 + }, + { + "epoch": 6.160693186293816, + "grad_norm": 0.7732980251312256, + "learning_rate": 2.3454000000000003e-05, + "loss": 0.1532, + "step": 7824 + }, + { + "epoch": 6.161480897991336, + "grad_norm": 0.9708369970321655, + "learning_rate": 2.3457000000000003e-05, + "loss": 0.1646, + "step": 7825 + }, + { + "epoch": 6.162268609688854, + "grad_norm": 0.7580649852752686, + "learning_rate": 2.3460000000000002e-05, + "loss": 0.0619, + "step": 7826 + }, + { + "epoch": 6.163056321386373, + "grad_norm": 0.2912876307964325, + "learning_rate": 2.3463000000000002e-05, + "loss": 0.0287, + "step": 7827 + }, + { + "epoch": 6.163844033083891, + "grad_norm": 0.5141652226448059, + "learning_rate": 2.3466000000000002e-05, + "loss": 0.0323, + "step": 7828 + }, + { + "epoch": 6.16463174478141, + "grad_norm": 0.5693254470825195, + "learning_rate": 2.3469000000000002e-05, + "loss": 0.0338, + "step": 7829 + }, + { + "epoch": 6.165419456478928, + "grad_norm": 0.21028007566928864, + "learning_rate": 2.3472e-05, + "loss": 0.0175, + "step": 7830 + }, + { + "epoch": 6.166207168176447, + "grad_norm": 0.40082377195358276, + "learning_rate": 2.3475e-05, + "loss": 0.0304, + "step": 7831 + }, + { + "epoch": 6.166994879873966, + "grad_norm": 0.3941837549209595, + "learning_rate": 2.3477999999999998e-05, + "loss": 0.0297, + "step": 7832 + }, + { + "epoch": 6.167782591571485, + "grad_norm": 0.19772207736968994, + "learning_rate": 2.3480999999999998e-05, + "loss": 0.0146, + "step": 7833 + }, + { + "epoch": 6.168570303269004, + "grad_norm": 0.389922171831131, + "learning_rate": 2.3484e-05, + "loss": 0.0224, + "step": 7834 + }, + { + "epoch": 6.169358014966522, + "grad_norm": 0.2358703911304474, + "learning_rate": 2.3487e-05, + "loss": 0.0245, + "step": 7835 + }, + { + "epoch": 6.170145726664041, + "grad_norm": 0.27693942189216614, + "learning_rate": 2.349e-05, + "loss": 0.0202, + "step": 7836 + }, + { + "epoch": 6.170933438361559, + "grad_norm": 0.29027849435806274, + "learning_rate": 2.3493e-05, + "loss": 0.0184, + "step": 7837 + }, + { + "epoch": 6.171721150059079, + "grad_norm": 0.2532712519168854, + "learning_rate": 2.3496e-05, + "loss": 0.0128, + "step": 7838 + }, + { + "epoch": 6.172508861756597, + "grad_norm": 0.3075482249259949, + "learning_rate": 2.3499e-05, + "loss": 0.0242, + "step": 7839 + }, + { + "epoch": 6.173296573454116, + "grad_norm": 0.399461567401886, + "learning_rate": 2.3502e-05, + "loss": 0.0177, + "step": 7840 + }, + { + "epoch": 6.174084285151634, + "grad_norm": 0.2980434000492096, + "learning_rate": 2.3505e-05, + "loss": 0.0196, + "step": 7841 + }, + { + "epoch": 6.174871996849153, + "grad_norm": 0.6901800036430359, + "learning_rate": 2.3508e-05, + "loss": 0.0237, + "step": 7842 + }, + { + "epoch": 6.175659708546672, + "grad_norm": 0.4998253881931305, + "learning_rate": 2.3511e-05, + "loss": 0.0227, + "step": 7843 + }, + { + "epoch": 6.176447420244191, + "grad_norm": 0.2582685947418213, + "learning_rate": 2.3514000000000002e-05, 
+ "loss": 0.0189, + "step": 7844 + }, + { + "epoch": 6.17723513194171, + "grad_norm": 0.5751510262489319, + "learning_rate": 2.3517000000000002e-05, + "loss": 0.0327, + "step": 7845 + }, + { + "epoch": 6.178022843639228, + "grad_norm": 0.35345643758773804, + "learning_rate": 2.3520000000000002e-05, + "loss": 0.0182, + "step": 7846 + }, + { + "epoch": 6.178810555336747, + "grad_norm": 0.4477609097957611, + "learning_rate": 2.3523e-05, + "loss": 0.0214, + "step": 7847 + }, + { + "epoch": 6.179598267034265, + "grad_norm": 0.5105072259902954, + "learning_rate": 2.3526e-05, + "loss": 0.0221, + "step": 7848 + }, + { + "epoch": 6.180385978731784, + "grad_norm": 0.2282285988330841, + "learning_rate": 2.3529e-05, + "loss": 0.0168, + "step": 7849 + }, + { + "epoch": 6.181173690429302, + "grad_norm": 0.6291734576225281, + "learning_rate": 2.3532e-05, + "loss": 0.0146, + "step": 7850 + }, + { + "epoch": 6.181961402126822, + "grad_norm": 3.3462934494018555, + "learning_rate": 2.3535e-05, + "loss": 0.0371, + "step": 7851 + }, + { + "epoch": 6.1827491138243404, + "grad_norm": 0.7584933042526245, + "learning_rate": 2.3538e-05, + "loss": 0.0275, + "step": 7852 + }, + { + "epoch": 6.183536825521859, + "grad_norm": 0.31475630402565, + "learning_rate": 2.3541e-05, + "loss": 0.0195, + "step": 7853 + }, + { + "epoch": 6.184324537219378, + "grad_norm": 0.2889225482940674, + "learning_rate": 2.3544000000000004e-05, + "loss": 0.019, + "step": 7854 + }, + { + "epoch": 6.185112248916896, + "grad_norm": 1.0634061098098755, + "learning_rate": 2.3547000000000003e-05, + "loss": 0.021, + "step": 7855 + }, + { + "epoch": 6.185899960614415, + "grad_norm": 0.5143982768058777, + "learning_rate": 2.3550000000000003e-05, + "loss": 0.0252, + "step": 7856 + }, + { + "epoch": 6.186687672311934, + "grad_norm": 0.3428564667701721, + "learning_rate": 2.3553e-05, + "loss": 0.0197, + "step": 7857 + }, + { + "epoch": 6.187475384009453, + "grad_norm": 0.5409908294677734, + "learning_rate": 2.3556e-05, + "loss": 0.0286, + "step": 7858 + }, + { + "epoch": 6.188263095706971, + "grad_norm": 0.489341139793396, + "learning_rate": 2.3559e-05, + "loss": 0.0299, + "step": 7859 + }, + { + "epoch": 6.18905080740449, + "grad_norm": 0.34158268570899963, + "learning_rate": 2.3562e-05, + "loss": 0.0235, + "step": 7860 + }, + { + "epoch": 6.189838519102008, + "grad_norm": 0.4007698595523834, + "learning_rate": 2.3565e-05, + "loss": 0.0137, + "step": 7861 + }, + { + "epoch": 6.190626230799527, + "grad_norm": 0.29130324721336365, + "learning_rate": 2.3568e-05, + "loss": 0.0125, + "step": 7862 + }, + { + "epoch": 6.1914139424970465, + "grad_norm": 0.36953601241111755, + "learning_rate": 2.3571e-05, + "loss": 0.0176, + "step": 7863 + }, + { + "epoch": 6.192201654194565, + "grad_norm": 0.6564675569534302, + "learning_rate": 2.3574e-05, + "loss": 0.0375, + "step": 7864 + }, + { + "epoch": 6.192989365892084, + "grad_norm": 0.3386462926864624, + "learning_rate": 2.3577e-05, + "loss": 0.0167, + "step": 7865 + }, + { + "epoch": 6.193777077589602, + "grad_norm": 0.3170674741268158, + "learning_rate": 2.358e-05, + "loss": 0.0175, + "step": 7866 + }, + { + "epoch": 6.194564789287121, + "grad_norm": 0.4268887937068939, + "learning_rate": 2.3583e-05, + "loss": 0.0235, + "step": 7867 + }, + { + "epoch": 6.195352500984639, + "grad_norm": 1.535287618637085, + "learning_rate": 2.3586e-05, + "loss": 0.0258, + "step": 7868 + }, + { + "epoch": 6.196140212682159, + "grad_norm": 0.8956959843635559, + "learning_rate": 2.3589e-05, + "loss": 0.0404, + "step": 7869 + }, + { + 
"epoch": 6.196927924379677, + "grad_norm": 1.4800348281860352, + "learning_rate": 2.3592e-05, + "loss": 0.0606, + "step": 7870 + }, + { + "epoch": 6.197715636077196, + "grad_norm": 0.7692916989326477, + "learning_rate": 2.3595e-05, + "loss": 0.2293, + "step": 7871 + }, + { + "epoch": 6.1985033477747145, + "grad_norm": 1.1320428848266602, + "learning_rate": 2.3598e-05, + "loss": 0.2375, + "step": 7872 + }, + { + "epoch": 6.199291059472233, + "grad_norm": 0.7692564725875854, + "learning_rate": 2.3601e-05, + "loss": 0.1209, + "step": 7873 + }, + { + "epoch": 6.200078771169752, + "grad_norm": 0.5192215442657471, + "learning_rate": 2.3604000000000003e-05, + "loss": 0.1103, + "step": 7874 + }, + { + "epoch": 6.20086648286727, + "grad_norm": 0.6213540434837341, + "learning_rate": 2.3607000000000003e-05, + "loss": 0.1125, + "step": 7875 + }, + { + "epoch": 6.20165419456479, + "grad_norm": 0.7361270189285278, + "learning_rate": 2.3610000000000003e-05, + "loss": 0.0772, + "step": 7876 + }, + { + "epoch": 6.202441906262308, + "grad_norm": 0.45745813846588135, + "learning_rate": 2.3613000000000002e-05, + "loss": 0.0624, + "step": 7877 + }, + { + "epoch": 6.203229617959827, + "grad_norm": 0.4094482362270355, + "learning_rate": 2.3616000000000002e-05, + "loss": 0.0469, + "step": 7878 + }, + { + "epoch": 6.204017329657345, + "grad_norm": 0.33941495418548584, + "learning_rate": 2.3619000000000002e-05, + "loss": 0.0214, + "step": 7879 + }, + { + "epoch": 6.204805041354864, + "grad_norm": 0.3399060368537903, + "learning_rate": 2.3622000000000002e-05, + "loss": 0.0285, + "step": 7880 + }, + { + "epoch": 6.2055927530523824, + "grad_norm": 0.3096819818019867, + "learning_rate": 2.3624999999999998e-05, + "loss": 0.0157, + "step": 7881 + }, + { + "epoch": 6.206380464749902, + "grad_norm": 0.25953054428100586, + "learning_rate": 2.3627999999999998e-05, + "loss": 0.0134, + "step": 7882 + }, + { + "epoch": 6.2071681764474205, + "grad_norm": 0.6848036050796509, + "learning_rate": 2.3630999999999998e-05, + "loss": 0.0194, + "step": 7883 + }, + { + "epoch": 6.207955888144939, + "grad_norm": 0.23266029357910156, + "learning_rate": 2.3633999999999998e-05, + "loss": 0.0092, + "step": 7884 + }, + { + "epoch": 6.208743599842458, + "grad_norm": 0.28187820315361023, + "learning_rate": 2.3637e-05, + "loss": 0.019, + "step": 7885 + }, + { + "epoch": 6.209531311539976, + "grad_norm": 0.3843320906162262, + "learning_rate": 2.364e-05, + "loss": 0.0225, + "step": 7886 + }, + { + "epoch": 6.210319023237495, + "grad_norm": 0.3591806888580322, + "learning_rate": 2.3643e-05, + "loss": 0.0192, + "step": 7887 + }, + { + "epoch": 6.211106734935014, + "grad_norm": 0.33908796310424805, + "learning_rate": 2.3646e-05, + "loss": 0.0173, + "step": 7888 + }, + { + "epoch": 6.211894446632533, + "grad_norm": 0.8522966504096985, + "learning_rate": 2.3649e-05, + "loss": 0.0303, + "step": 7889 + }, + { + "epoch": 6.212682158330051, + "grad_norm": 0.33064329624176025, + "learning_rate": 2.3652e-05, + "loss": 0.0263, + "step": 7890 + }, + { + "epoch": 6.21346987002757, + "grad_norm": 0.45677268505096436, + "learning_rate": 2.3655e-05, + "loss": 0.0277, + "step": 7891 + }, + { + "epoch": 6.2142575817250885, + "grad_norm": 0.7585058808326721, + "learning_rate": 2.3658e-05, + "loss": 0.0265, + "step": 7892 + }, + { + "epoch": 6.215045293422607, + "grad_norm": 0.39134201407432556, + "learning_rate": 2.3661e-05, + "loss": 0.0175, + "step": 7893 + }, + { + "epoch": 6.2158330051201265, + "grad_norm": 0.4485762417316437, + "learning_rate": 2.3664e-05, + 
"loss": 0.0283, + "step": 7894 + }, + { + "epoch": 6.216620716817645, + "grad_norm": 0.43718284368515015, + "learning_rate": 2.3667000000000002e-05, + "loss": 0.0224, + "step": 7895 + }, + { + "epoch": 6.217408428515164, + "grad_norm": 0.37317612767219543, + "learning_rate": 2.3670000000000002e-05, + "loss": 0.0252, + "step": 7896 + }, + { + "epoch": 6.218196140212682, + "grad_norm": 0.6728554964065552, + "learning_rate": 2.3673000000000002e-05, + "loss": 0.0205, + "step": 7897 + }, + { + "epoch": 6.218983851910201, + "grad_norm": 0.28673988580703735, + "learning_rate": 2.3676e-05, + "loss": 0.0142, + "step": 7898 + }, + { + "epoch": 6.219771563607719, + "grad_norm": 0.25218164920806885, + "learning_rate": 2.3679e-05, + "loss": 0.0128, + "step": 7899 + }, + { + "epoch": 6.220559275305238, + "grad_norm": 0.5730993747711182, + "learning_rate": 2.3682e-05, + "loss": 0.0226, + "step": 7900 + }, + { + "epoch": 6.221346987002757, + "grad_norm": 0.2595292627811432, + "learning_rate": 2.3685e-05, + "loss": 0.0098, + "step": 7901 + }, + { + "epoch": 6.222134698700276, + "grad_norm": 0.5277000069618225, + "learning_rate": 2.3688e-05, + "loss": 0.022, + "step": 7902 + }, + { + "epoch": 6.2229224103977945, + "grad_norm": 0.3937973380088806, + "learning_rate": 2.3691e-05, + "loss": 0.0259, + "step": 7903 + }, + { + "epoch": 6.223710122095313, + "grad_norm": 0.5704920291900635, + "learning_rate": 2.3694e-05, + "loss": 0.0275, + "step": 7904 + }, + { + "epoch": 6.224497833792832, + "grad_norm": 1.0195319652557373, + "learning_rate": 2.3697000000000004e-05, + "loss": 0.0276, + "step": 7905 + }, + { + "epoch": 6.22528554549035, + "grad_norm": 0.7016333937644958, + "learning_rate": 2.37e-05, + "loss": 0.0391, + "step": 7906 + }, + { + "epoch": 6.22607325718787, + "grad_norm": 0.374579519033432, + "learning_rate": 2.3703e-05, + "loss": 0.0252, + "step": 7907 + }, + { + "epoch": 6.226860968885388, + "grad_norm": 0.3363591134548187, + "learning_rate": 2.3706e-05, + "loss": 0.0211, + "step": 7908 + }, + { + "epoch": 6.227648680582907, + "grad_norm": 0.357003778219223, + "learning_rate": 2.3709e-05, + "loss": 0.0196, + "step": 7909 + }, + { + "epoch": 6.228436392280425, + "grad_norm": 1.301600694656372, + "learning_rate": 2.3712e-05, + "loss": 0.0362, + "step": 7910 + }, + { + "epoch": 6.229224103977944, + "grad_norm": 0.22894714772701263, + "learning_rate": 2.3715e-05, + "loss": 0.0136, + "step": 7911 + }, + { + "epoch": 6.2300118156754625, + "grad_norm": 0.40669143199920654, + "learning_rate": 2.3718e-05, + "loss": 0.0177, + "step": 7912 + }, + { + "epoch": 6.230799527372982, + "grad_norm": 0.45969676971435547, + "learning_rate": 2.3721e-05, + "loss": 0.0302, + "step": 7913 + }, + { + "epoch": 6.2315872390705005, + "grad_norm": 0.48508453369140625, + "learning_rate": 2.3724e-05, + "loss": 0.03, + "step": 7914 + }, + { + "epoch": 6.232374950768019, + "grad_norm": 0.2846939265727997, + "learning_rate": 2.3727000000000002e-05, + "loss": 0.0129, + "step": 7915 + }, + { + "epoch": 6.233162662465538, + "grad_norm": 0.3314657211303711, + "learning_rate": 2.373e-05, + "loss": 0.0272, + "step": 7916 + }, + { + "epoch": 6.233950374163056, + "grad_norm": 0.46763646602630615, + "learning_rate": 2.3733e-05, + "loss": 0.0286, + "step": 7917 + }, + { + "epoch": 6.234738085860575, + "grad_norm": 0.48464661836624146, + "learning_rate": 2.3736e-05, + "loss": 0.0279, + "step": 7918 + }, + { + "epoch": 6.235525797558093, + "grad_norm": 0.5186179876327515, + "learning_rate": 2.3739e-05, + "loss": 0.0308, + "step": 7919 + }, + { + 
"epoch": 6.236313509255613, + "grad_norm": 0.5050848722457886, + "learning_rate": 2.3742e-05, + "loss": 0.0295, + "step": 7920 + }, + { + "epoch": 6.237101220953131, + "grad_norm": 0.9732735753059387, + "learning_rate": 2.3745e-05, + "loss": 0.2437, + "step": 7921 + }, + { + "epoch": 6.23788893265065, + "grad_norm": 0.6828211545944214, + "learning_rate": 2.3748e-05, + "loss": 0.1869, + "step": 7922 + }, + { + "epoch": 6.2386766443481685, + "grad_norm": 0.8628581762313843, + "learning_rate": 2.3751e-05, + "loss": 0.1635, + "step": 7923 + }, + { + "epoch": 6.239464356045687, + "grad_norm": 0.8584288358688354, + "learning_rate": 2.3754e-05, + "loss": 0.174, + "step": 7924 + }, + { + "epoch": 6.240252067743206, + "grad_norm": 0.7452594637870789, + "learning_rate": 2.3757000000000003e-05, + "loss": 0.124, + "step": 7925 + }, + { + "epoch": 6.241039779440725, + "grad_norm": 0.35344138741493225, + "learning_rate": 2.3760000000000003e-05, + "loss": 0.0496, + "step": 7926 + }, + { + "epoch": 6.241827491138244, + "grad_norm": 0.5608870387077332, + "learning_rate": 2.3763000000000003e-05, + "loss": 0.0466, + "step": 7927 + }, + { + "epoch": 6.242615202835762, + "grad_norm": 0.6786182522773743, + "learning_rate": 2.3766000000000003e-05, + "loss": 0.0405, + "step": 7928 + }, + { + "epoch": 6.243402914533281, + "grad_norm": 0.325124055147171, + "learning_rate": 2.3769000000000002e-05, + "loss": 0.0248, + "step": 7929 + }, + { + "epoch": 6.244190626230799, + "grad_norm": 0.3129310607910156, + "learning_rate": 2.3772e-05, + "loss": 0.025, + "step": 7930 + }, + { + "epoch": 6.244978337928318, + "grad_norm": 0.4692411422729492, + "learning_rate": 2.3775e-05, + "loss": 0.0704, + "step": 7931 + }, + { + "epoch": 6.245766049625837, + "grad_norm": 0.4974396824836731, + "learning_rate": 2.3778e-05, + "loss": 0.027, + "step": 7932 + }, + { + "epoch": 6.246553761323356, + "grad_norm": 0.2990218997001648, + "learning_rate": 2.3780999999999998e-05, + "loss": 0.0186, + "step": 7933 + }, + { + "epoch": 6.2473414730208745, + "grad_norm": 0.4550445079803467, + "learning_rate": 2.3783999999999998e-05, + "loss": 0.0238, + "step": 7934 + }, + { + "epoch": 6.248129184718393, + "grad_norm": 0.47299423813819885, + "learning_rate": 2.3787e-05, + "loss": 0.0155, + "step": 7935 + }, + { + "epoch": 6.248916896415912, + "grad_norm": 0.4098072648048401, + "learning_rate": 2.379e-05, + "loss": 0.0318, + "step": 7936 + }, + { + "epoch": 6.24970460811343, + "grad_norm": 0.756378173828125, + "learning_rate": 2.3793e-05, + "loss": 0.0283, + "step": 7937 + }, + { + "epoch": 6.250492319810949, + "grad_norm": 0.21892796456813812, + "learning_rate": 2.3796e-05, + "loss": 0.0148, + "step": 7938 + }, + { + "epoch": 6.251280031508468, + "grad_norm": 0.25615304708480835, + "learning_rate": 2.3799e-05, + "loss": 0.0194, + "step": 7939 + }, + { + "epoch": 6.252067743205987, + "grad_norm": 0.2656307518482208, + "learning_rate": 2.3802e-05, + "loss": 0.0197, + "step": 7940 + }, + { + "epoch": 6.252855454903505, + "grad_norm": 0.5476672053337097, + "learning_rate": 2.3805e-05, + "loss": 0.0231, + "step": 7941 + }, + { + "epoch": 6.253643166601024, + "grad_norm": 0.4554208219051361, + "learning_rate": 2.3808e-05, + "loss": 0.0244, + "step": 7942 + }, + { + "epoch": 6.2544308782985425, + "grad_norm": 0.5744851231575012, + "learning_rate": 2.3811e-05, + "loss": 0.0352, + "step": 7943 + }, + { + "epoch": 6.255218589996061, + "grad_norm": 0.5325260162353516, + "learning_rate": 2.3814e-05, + "loss": 0.0264, + "step": 7944 + }, + { + "epoch": 
6.2560063016935805, + "grad_norm": 0.4825434386730194, + "learning_rate": 2.3817000000000003e-05, + "loss": 0.0159, + "step": 7945 + }, + { + "epoch": 6.256794013391099, + "grad_norm": 0.36371833086013794, + "learning_rate": 2.3820000000000002e-05, + "loss": 0.0122, + "step": 7946 + }, + { + "epoch": 6.257581725088618, + "grad_norm": 0.3849882185459137, + "learning_rate": 2.3823000000000002e-05, + "loss": 0.0136, + "step": 7947 + }, + { + "epoch": 6.258369436786136, + "grad_norm": 0.36592283844947815, + "learning_rate": 2.3826000000000002e-05, + "loss": 0.0246, + "step": 7948 + }, + { + "epoch": 6.259157148483655, + "grad_norm": 0.43468281626701355, + "learning_rate": 2.3829000000000002e-05, + "loss": 0.025, + "step": 7949 + }, + { + "epoch": 6.259944860181173, + "grad_norm": 1.0109065771102905, + "learning_rate": 2.3832e-05, + "loss": 0.0343, + "step": 7950 + }, + { + "epoch": 6.260732571878693, + "grad_norm": 0.48407232761383057, + "learning_rate": 2.3835e-05, + "loss": 0.0196, + "step": 7951 + }, + { + "epoch": 6.261520283576211, + "grad_norm": 0.309129923582077, + "learning_rate": 2.3838e-05, + "loss": 0.0195, + "step": 7952 + }, + { + "epoch": 6.26230799527373, + "grad_norm": 0.533657431602478, + "learning_rate": 2.3841e-05, + "loss": 0.0242, + "step": 7953 + }, + { + "epoch": 6.2630957069712485, + "grad_norm": 0.4178589880466461, + "learning_rate": 2.3844e-05, + "loss": 0.0198, + "step": 7954 + }, + { + "epoch": 6.263883418668767, + "grad_norm": 0.5979024767875671, + "learning_rate": 2.3847e-05, + "loss": 0.0295, + "step": 7955 + }, + { + "epoch": 6.264671130366286, + "grad_norm": 1.8117016553878784, + "learning_rate": 2.385e-05, + "loss": 0.0214, + "step": 7956 + }, + { + "epoch": 6.265458842063804, + "grad_norm": 0.3839264512062073, + "learning_rate": 2.3853e-05, + "loss": 0.0192, + "step": 7957 + }, + { + "epoch": 6.266246553761324, + "grad_norm": 0.7027137279510498, + "learning_rate": 2.3856e-05, + "loss": 0.031, + "step": 7958 + }, + { + "epoch": 6.267034265458842, + "grad_norm": 0.46380704641342163, + "learning_rate": 2.3859e-05, + "loss": 0.0249, + "step": 7959 + }, + { + "epoch": 6.267821977156361, + "grad_norm": 0.42318636178970337, + "learning_rate": 2.3862e-05, + "loss": 0.0166, + "step": 7960 + }, + { + "epoch": 6.268609688853879, + "grad_norm": 0.2706204056739807, + "learning_rate": 2.3865e-05, + "loss": 0.0179, + "step": 7961 + }, + { + "epoch": 6.269397400551398, + "grad_norm": 0.24123524129390717, + "learning_rate": 2.3868e-05, + "loss": 0.0171, + "step": 7962 + }, + { + "epoch": 6.2701851122489165, + "grad_norm": 0.6370318531990051, + "learning_rate": 2.3871e-05, + "loss": 0.0256, + "step": 7963 + }, + { + "epoch": 6.270972823946436, + "grad_norm": 0.59173983335495, + "learning_rate": 2.3874e-05, + "loss": 0.0296, + "step": 7964 + }, + { + "epoch": 6.2717605356439545, + "grad_norm": 0.48497438430786133, + "learning_rate": 2.3877000000000002e-05, + "loss": 0.0163, + "step": 7965 + }, + { + "epoch": 6.272548247341473, + "grad_norm": 0.7017315030097961, + "learning_rate": 2.3880000000000002e-05, + "loss": 0.0216, + "step": 7966 + }, + { + "epoch": 6.273335959038992, + "grad_norm": 0.5783324241638184, + "learning_rate": 2.3883e-05, + "loss": 0.0276, + "step": 7967 + }, + { + "epoch": 6.27412367073651, + "grad_norm": 0.2868752181529999, + "learning_rate": 2.3886e-05, + "loss": 0.0139, + "step": 7968 + }, + { + "epoch": 6.274911382434029, + "grad_norm": 0.3554132580757141, + "learning_rate": 2.3889e-05, + "loss": 0.0217, + "step": 7969 + }, + { + "epoch": 
6.275699094131548, + "grad_norm": 0.9983634948730469, + "learning_rate": 2.3892e-05, + "loss": 0.0228, + "step": 7970 + }, + { + "epoch": 6.276486805829067, + "grad_norm": 0.9250139594078064, + "learning_rate": 2.3895e-05, + "loss": 0.371, + "step": 7971 + }, + { + "epoch": 6.277274517526585, + "grad_norm": 0.8179193139076233, + "learning_rate": 2.3898e-05, + "loss": 0.2742, + "step": 7972 + }, + { + "epoch": 6.278062229224104, + "grad_norm": 0.5309756994247437, + "learning_rate": 2.3901e-05, + "loss": 0.1269, + "step": 7973 + }, + { + "epoch": 6.2788499409216225, + "grad_norm": 0.5598923563957214, + "learning_rate": 2.3904e-05, + "loss": 0.1391, + "step": 7974 + }, + { + "epoch": 6.279637652619141, + "grad_norm": 0.48660075664520264, + "learning_rate": 2.3907000000000003e-05, + "loss": 0.0614, + "step": 7975 + }, + { + "epoch": 6.28042536431666, + "grad_norm": 0.3325641453266144, + "learning_rate": 2.3910000000000003e-05, + "loss": 0.0454, + "step": 7976 + }, + { + "epoch": 6.281213076014179, + "grad_norm": 0.31254395842552185, + "learning_rate": 2.3913000000000003e-05, + "loss": 0.0394, + "step": 7977 + }, + { + "epoch": 6.282000787711698, + "grad_norm": 0.27337247133255005, + "learning_rate": 2.3916000000000003e-05, + "loss": 0.0214, + "step": 7978 + }, + { + "epoch": 6.282788499409216, + "grad_norm": 0.21620944142341614, + "learning_rate": 2.3919e-05, + "loss": 0.013, + "step": 7979 + }, + { + "epoch": 6.283576211106735, + "grad_norm": 0.4572429955005646, + "learning_rate": 2.3922e-05, + "loss": 0.051, + "step": 7980 + }, + { + "epoch": 6.284363922804253, + "grad_norm": 0.2893286347389221, + "learning_rate": 2.3925e-05, + "loss": 0.0181, + "step": 7981 + }, + { + "epoch": 6.285151634501772, + "grad_norm": 1.5596412420272827, + "learning_rate": 2.3928e-05, + "loss": 0.0207, + "step": 7982 + }, + { + "epoch": 6.285939346199291, + "grad_norm": 0.6134533286094666, + "learning_rate": 2.3931e-05, + "loss": 0.0293, + "step": 7983 + }, + { + "epoch": 6.28672705789681, + "grad_norm": 0.422787070274353, + "learning_rate": 2.3933999999999998e-05, + "loss": 0.0295, + "step": 7984 + }, + { + "epoch": 6.2875147695943285, + "grad_norm": 0.3843701183795929, + "learning_rate": 2.3937e-05, + "loss": 0.0204, + "step": 7985 + }, + { + "epoch": 6.288302481291847, + "grad_norm": 0.4042084217071533, + "learning_rate": 2.394e-05, + "loss": 0.0286, + "step": 7986 + }, + { + "epoch": 6.289090192989366, + "grad_norm": 0.6941956281661987, + "learning_rate": 2.3943e-05, + "loss": 0.0265, + "step": 7987 + }, + { + "epoch": 6.289877904686884, + "grad_norm": 0.3217780292034149, + "learning_rate": 2.3946e-05, + "loss": 0.0202, + "step": 7988 + }, + { + "epoch": 6.290665616384404, + "grad_norm": 0.44144949316978455, + "learning_rate": 2.3949e-05, + "loss": 0.0249, + "step": 7989 + }, + { + "epoch": 6.291453328081922, + "grad_norm": 0.38648155331611633, + "learning_rate": 2.3952e-05, + "loss": 0.0219, + "step": 7990 + }, + { + "epoch": 6.292241039779441, + "grad_norm": 0.2769620716571808, + "learning_rate": 2.3955e-05, + "loss": 0.0185, + "step": 7991 + }, + { + "epoch": 6.293028751476959, + "grad_norm": 0.40685710310935974, + "learning_rate": 2.3958e-05, + "loss": 0.0185, + "step": 7992 + }, + { + "epoch": 6.293816463174478, + "grad_norm": 0.2948800325393677, + "learning_rate": 2.3961e-05, + "loss": 0.0153, + "step": 7993 + }, + { + "epoch": 6.2946041748719965, + "grad_norm": 0.366756796836853, + "learning_rate": 2.3964e-05, + "loss": 0.0156, + "step": 7994 + }, + { + "epoch": 6.295391886569515, + "grad_norm": 
0.3830591142177582, + "learning_rate": 2.3967000000000003e-05, + "loss": 0.0215, + "step": 7995 + }, + { + "epoch": 6.2961795982670345, + "grad_norm": 0.4174736440181732, + "learning_rate": 2.3970000000000003e-05, + "loss": 0.0162, + "step": 7996 + }, + { + "epoch": 6.296967309964553, + "grad_norm": 0.22804446518421173, + "learning_rate": 2.3973000000000002e-05, + "loss": 0.0114, + "step": 7997 + }, + { + "epoch": 6.297755021662072, + "grad_norm": 0.26903748512268066, + "learning_rate": 2.3976000000000002e-05, + "loss": 0.0125, + "step": 7998 + }, + { + "epoch": 6.29854273335959, + "grad_norm": 0.2978566586971283, + "learning_rate": 2.3979000000000002e-05, + "loss": 0.0154, + "step": 7999 + }, + { + "epoch": 6.299330445057109, + "grad_norm": 0.4349205791950226, + "learning_rate": 2.3982000000000002e-05, + "loss": 0.0177, + "step": 8000 + }, + { + "epoch": 6.299330445057109, + "eval_cer": 0.12090105503279155, + "eval_loss": 0.37983569502830505, + "eval_runtime": 16.068, + "eval_samples_per_second": 18.92, + "eval_steps_per_second": 0.622, + "eval_wer": 0.42900997697620874, + "step": 8000 + }, + { + "epoch": 6.300118156754628, + "grad_norm": 0.3365376591682434, + "learning_rate": 2.3985e-05, + "loss": 0.0211, + "step": 8001 + }, + { + "epoch": 6.300905868452147, + "grad_norm": 0.3225742280483246, + "learning_rate": 2.3988e-05, + "loss": 0.0145, + "step": 8002 + }, + { + "epoch": 6.301693580149665, + "grad_norm": 0.4017374515533447, + "learning_rate": 2.3991e-05, + "loss": 0.0254, + "step": 8003 + }, + { + "epoch": 6.302481291847184, + "grad_norm": 0.5514066219329834, + "learning_rate": 2.3993999999999998e-05, + "loss": 0.0176, + "step": 8004 + }, + { + "epoch": 6.3032690035447025, + "grad_norm": 0.36609649658203125, + "learning_rate": 2.3997e-05, + "loss": 0.0112, + "step": 8005 + }, + { + "epoch": 6.304056715242221, + "grad_norm": 0.6693354249000549, + "learning_rate": 2.4e-05, + "loss": 0.0266, + "step": 8006 + }, + { + "epoch": 6.30484442693974, + "grad_norm": 0.31920546293258667, + "learning_rate": 2.4003e-05, + "loss": 0.014, + "step": 8007 + }, + { + "epoch": 6.305632138637259, + "grad_norm": 0.27999672293663025, + "learning_rate": 2.4006e-05, + "loss": 0.0145, + "step": 8008 + }, + { + "epoch": 6.306419850334778, + "grad_norm": 0.44981980323791504, + "learning_rate": 2.4009e-05, + "loss": 0.0291, + "step": 8009 + }, + { + "epoch": 6.307207562032296, + "grad_norm": 0.3794637620449066, + "learning_rate": 2.4012e-05, + "loss": 0.0275, + "step": 8010 + }, + { + "epoch": 6.307995273729815, + "grad_norm": 0.278202623128891, + "learning_rate": 2.4015e-05, + "loss": 0.0229, + "step": 8011 + }, + { + "epoch": 6.308782985427333, + "grad_norm": 0.5013309121131897, + "learning_rate": 2.4018e-05, + "loss": 0.0267, + "step": 8012 + }, + { + "epoch": 6.309570697124852, + "grad_norm": 0.4133973717689514, + "learning_rate": 2.4021e-05, + "loss": 0.0257, + "step": 8013 + }, + { + "epoch": 6.310358408822371, + "grad_norm": 0.612142026424408, + "learning_rate": 2.4024e-05, + "loss": 0.0257, + "step": 8014 + }, + { + "epoch": 6.31114612051989, + "grad_norm": 0.36843156814575195, + "learning_rate": 2.4027e-05, + "loss": 0.0188, + "step": 8015 + }, + { + "epoch": 6.311933832217409, + "grad_norm": 0.5468989014625549, + "learning_rate": 2.4030000000000002e-05, + "loss": 0.0211, + "step": 8016 + }, + { + "epoch": 6.312721543914927, + "grad_norm": 0.6137649416923523, + "learning_rate": 2.4033000000000002e-05, + "loss": 0.0263, + "step": 8017 + }, + { + "epoch": 6.313509255612446, + "grad_norm": 
0.5421299338340759, + "learning_rate": 2.4036e-05, + "loss": 0.0206, + "step": 8018 + }, + { + "epoch": 6.314296967309964, + "grad_norm": 0.4394330382347107, + "learning_rate": 2.4039e-05, + "loss": 0.0229, + "step": 8019 + }, + { + "epoch": 6.315084679007484, + "grad_norm": 0.5677720904350281, + "learning_rate": 2.4042e-05, + "loss": 0.031, + "step": 8020 + }, + { + "epoch": 6.315872390705002, + "grad_norm": 0.9834765791893005, + "learning_rate": 2.4045e-05, + "loss": 0.294, + "step": 8021 + }, + { + "epoch": 6.316660102402521, + "grad_norm": 0.8519297242164612, + "learning_rate": 2.4048e-05, + "loss": 0.212, + "step": 8022 + }, + { + "epoch": 6.317447814100039, + "grad_norm": 0.8104028701782227, + "learning_rate": 2.4051e-05, + "loss": 0.1622, + "step": 8023 + }, + { + "epoch": 6.318235525797558, + "grad_norm": 0.6343229413032532, + "learning_rate": 2.4054e-05, + "loss": 0.1414, + "step": 8024 + }, + { + "epoch": 6.3190232374950766, + "grad_norm": 0.6275216341018677, + "learning_rate": 2.4057e-05, + "loss": 0.1239, + "step": 8025 + }, + { + "epoch": 6.319810949192595, + "grad_norm": 0.313040167093277, + "learning_rate": 2.4060000000000003e-05, + "loss": 0.0527, + "step": 8026 + }, + { + "epoch": 6.320598660890115, + "grad_norm": 0.3901319205760956, + "learning_rate": 2.4063000000000003e-05, + "loss": 0.0401, + "step": 8027 + }, + { + "epoch": 6.321386372587633, + "grad_norm": 0.333215594291687, + "learning_rate": 2.4066000000000003e-05, + "loss": 0.0599, + "step": 8028 + }, + { + "epoch": 6.322174084285152, + "grad_norm": 0.4098881483078003, + "learning_rate": 2.4069e-05, + "loss": 0.0306, + "step": 8029 + }, + { + "epoch": 6.32296179598267, + "grad_norm": 0.3746802508831024, + "learning_rate": 2.4072e-05, + "loss": 0.0397, + "step": 8030 + }, + { + "epoch": 6.323749507680189, + "grad_norm": 0.3601857125759125, + "learning_rate": 2.4075e-05, + "loss": 0.0234, + "step": 8031 + }, + { + "epoch": 6.324537219377707, + "grad_norm": 0.4277665913105011, + "learning_rate": 2.4078e-05, + "loss": 0.0378, + "step": 8032 + }, + { + "epoch": 6.325324931075227, + "grad_norm": 0.49522000551223755, + "learning_rate": 2.4081e-05, + "loss": 0.0224, + "step": 8033 + }, + { + "epoch": 6.326112642772745, + "grad_norm": 0.6680946946144104, + "learning_rate": 2.4084e-05, + "loss": 0.0247, + "step": 8034 + }, + { + "epoch": 6.326900354470264, + "grad_norm": 0.402968168258667, + "learning_rate": 2.4086999999999998e-05, + "loss": 0.0191, + "step": 8035 + }, + { + "epoch": 6.327688066167783, + "grad_norm": 0.2746497690677643, + "learning_rate": 2.409e-05, + "loss": 0.0367, + "step": 8036 + }, + { + "epoch": 6.328475777865301, + "grad_norm": 0.2174566239118576, + "learning_rate": 2.4093e-05, + "loss": 0.0124, + "step": 8037 + }, + { + "epoch": 6.32926348956282, + "grad_norm": 0.30754268169403076, + "learning_rate": 2.4096e-05, + "loss": 0.0164, + "step": 8038 + }, + { + "epoch": 6.330051201260339, + "grad_norm": 0.2396475076675415, + "learning_rate": 2.4099e-05, + "loss": 0.0127, + "step": 8039 + }, + { + "epoch": 6.330838912957858, + "grad_norm": 0.4201110899448395, + "learning_rate": 2.4102e-05, + "loss": 0.0275, + "step": 8040 + }, + { + "epoch": 6.331626624655376, + "grad_norm": 0.36609363555908203, + "learning_rate": 2.4105e-05, + "loss": 0.0177, + "step": 8041 + }, + { + "epoch": 6.332414336352895, + "grad_norm": 0.16397029161453247, + "learning_rate": 2.4108e-05, + "loss": 0.0118, + "step": 8042 + }, + { + "epoch": 6.333202048050413, + "grad_norm": 1.1311626434326172, + "learning_rate": 2.4111e-05, + "loss": 
0.0255, + "step": 8043 + }, + { + "epoch": 6.333989759747932, + "grad_norm": 0.35933321714401245, + "learning_rate": 2.4114e-05, + "loss": 0.0178, + "step": 8044 + }, + { + "epoch": 6.334777471445451, + "grad_norm": 0.40267255902290344, + "learning_rate": 2.4117e-05, + "loss": 0.032, + "step": 8045 + }, + { + "epoch": 6.33556518314297, + "grad_norm": 0.4800295829772949, + "learning_rate": 2.4120000000000003e-05, + "loss": 0.0216, + "step": 8046 + }, + { + "epoch": 6.336352894840489, + "grad_norm": 0.48760417103767395, + "learning_rate": 2.4123000000000003e-05, + "loss": 0.0291, + "step": 8047 + }, + { + "epoch": 6.337140606538007, + "grad_norm": 0.4213753044605255, + "learning_rate": 2.4126000000000002e-05, + "loss": 0.0252, + "step": 8048 + }, + { + "epoch": 6.337928318235526, + "grad_norm": 0.3564102053642273, + "learning_rate": 2.4129000000000002e-05, + "loss": 0.0189, + "step": 8049 + }, + { + "epoch": 6.338716029933044, + "grad_norm": 0.3049647808074951, + "learning_rate": 2.4132000000000002e-05, + "loss": 0.0223, + "step": 8050 + }, + { + "epoch": 6.339503741630563, + "grad_norm": 0.2834116816520691, + "learning_rate": 2.4135000000000002e-05, + "loss": 0.0163, + "step": 8051 + }, + { + "epoch": 6.340291453328082, + "grad_norm": 0.45245659351348877, + "learning_rate": 2.4138e-05, + "loss": 0.0161, + "step": 8052 + }, + { + "epoch": 6.341079165025601, + "grad_norm": 0.4324139952659607, + "learning_rate": 2.4140999999999998e-05, + "loss": 0.0215, + "step": 8053 + }, + { + "epoch": 6.341866876723119, + "grad_norm": 0.24926748871803284, + "learning_rate": 2.4143999999999998e-05, + "loss": 0.0109, + "step": 8054 + }, + { + "epoch": 6.342654588420638, + "grad_norm": 0.23679345846176147, + "learning_rate": 2.4146999999999998e-05, + "loss": 0.0171, + "step": 8055 + }, + { + "epoch": 6.343442300118157, + "grad_norm": 0.3199511766433716, + "learning_rate": 2.415e-05, + "loss": 0.0125, + "step": 8056 + }, + { + "epoch": 6.344230011815675, + "grad_norm": 0.6057621240615845, + "learning_rate": 2.4153e-05, + "loss": 0.0286, + "step": 8057 + }, + { + "epoch": 6.345017723513195, + "grad_norm": 0.39492926001548767, + "learning_rate": 2.4156e-05, + "loss": 0.0274, + "step": 8058 + }, + { + "epoch": 6.345805435210713, + "grad_norm": 0.6183503270149231, + "learning_rate": 2.4159e-05, + "loss": 0.0343, + "step": 8059 + }, + { + "epoch": 6.346593146908232, + "grad_norm": 0.25065702199935913, + "learning_rate": 2.4162e-05, + "loss": 0.0213, + "step": 8060 + }, + { + "epoch": 6.34738085860575, + "grad_norm": 0.3498367667198181, + "learning_rate": 2.4165e-05, + "loss": 0.021, + "step": 8061 + }, + { + "epoch": 6.348168570303269, + "grad_norm": 0.6108373999595642, + "learning_rate": 2.4168e-05, + "loss": 0.0314, + "step": 8062 + }, + { + "epoch": 6.348956282000787, + "grad_norm": 0.6563677787780762, + "learning_rate": 2.4171e-05, + "loss": 0.0351, + "step": 8063 + }, + { + "epoch": 6.349743993698306, + "grad_norm": 0.28185218572616577, + "learning_rate": 2.4174e-05, + "loss": 0.0142, + "step": 8064 + }, + { + "epoch": 6.3505317053958255, + "grad_norm": 0.413782000541687, + "learning_rate": 2.4177e-05, + "loss": 0.032, + "step": 8065 + }, + { + "epoch": 6.351319417093344, + "grad_norm": 0.47566765546798706, + "learning_rate": 2.4180000000000002e-05, + "loss": 0.0271, + "step": 8066 + }, + { + "epoch": 6.352107128790863, + "grad_norm": 0.6027641892433167, + "learning_rate": 2.4183000000000002e-05, + "loss": 0.0228, + "step": 8067 + }, + { + "epoch": 6.352894840488381, + "grad_norm": 0.7327744960784912, + 
"learning_rate": 2.4186000000000002e-05, + "loss": 0.0495, + "step": 8068 + }, + { + "epoch": 6.3536825521859, + "grad_norm": 0.6084136366844177, + "learning_rate": 2.4189e-05, + "loss": 0.0339, + "step": 8069 + }, + { + "epoch": 6.354470263883418, + "grad_norm": 0.6975917816162109, + "learning_rate": 2.4192e-05, + "loss": 0.0394, + "step": 8070 + }, + { + "epoch": 6.355257975580938, + "grad_norm": 0.821324348449707, + "learning_rate": 2.4195e-05, + "loss": 0.2695, + "step": 8071 + }, + { + "epoch": 6.356045687278456, + "grad_norm": 1.091407060623169, + "learning_rate": 2.4198e-05, + "loss": 0.2045, + "step": 8072 + }, + { + "epoch": 6.356833398975975, + "grad_norm": 1.124086618423462, + "learning_rate": 2.4201e-05, + "loss": 0.1793, + "step": 8073 + }, + { + "epoch": 6.3576211106734934, + "grad_norm": 0.8110175132751465, + "learning_rate": 2.4204e-05, + "loss": 0.1787, + "step": 8074 + }, + { + "epoch": 6.358408822371012, + "grad_norm": 0.49150118231773376, + "learning_rate": 2.4207e-05, + "loss": 0.0762, + "step": 8075 + }, + { + "epoch": 6.359196534068531, + "grad_norm": 0.37248337268829346, + "learning_rate": 2.4210000000000004e-05, + "loss": 0.0411, + "step": 8076 + }, + { + "epoch": 6.35998424576605, + "grad_norm": 0.3322901725769043, + "learning_rate": 2.4213000000000003e-05, + "loss": 0.0277, + "step": 8077 + }, + { + "epoch": 6.360771957463569, + "grad_norm": 0.28920501470565796, + "learning_rate": 2.4216e-05, + "loss": 0.0301, + "step": 8078 + }, + { + "epoch": 6.361559669161087, + "grad_norm": 0.33958354592323303, + "learning_rate": 2.4219e-05, + "loss": 0.0361, + "step": 8079 + }, + { + "epoch": 6.362347380858606, + "grad_norm": 0.34891507029533386, + "learning_rate": 2.4222e-05, + "loss": 0.0298, + "step": 8080 + }, + { + "epoch": 6.363135092556124, + "grad_norm": 0.26134437322616577, + "learning_rate": 2.4225e-05, + "loss": 0.0211, + "step": 8081 + }, + { + "epoch": 6.363922804253643, + "grad_norm": 0.8600006103515625, + "learning_rate": 2.4228e-05, + "loss": 0.0442, + "step": 8082 + }, + { + "epoch": 6.364710515951161, + "grad_norm": 0.2709829807281494, + "learning_rate": 2.4231e-05, + "loss": 0.019, + "step": 8083 + }, + { + "epoch": 6.365498227648681, + "grad_norm": 0.24722732603549957, + "learning_rate": 2.4234e-05, + "loss": 0.0124, + "step": 8084 + }, + { + "epoch": 6.3662859393461995, + "grad_norm": 0.3020421862602234, + "learning_rate": 2.4237e-05, + "loss": 0.0208, + "step": 8085 + }, + { + "epoch": 6.367073651043718, + "grad_norm": 0.323044091463089, + "learning_rate": 2.4240000000000002e-05, + "loss": 0.0156, + "step": 8086 + }, + { + "epoch": 6.367861362741237, + "grad_norm": 0.29499906301498413, + "learning_rate": 2.4243e-05, + "loss": 0.0252, + "step": 8087 + }, + { + "epoch": 6.368649074438755, + "grad_norm": 0.17178978025913239, + "learning_rate": 2.4246e-05, + "loss": 0.0109, + "step": 8088 + }, + { + "epoch": 6.369436786136274, + "grad_norm": 0.32260268926620483, + "learning_rate": 2.4249e-05, + "loss": 0.0239, + "step": 8089 + }, + { + "epoch": 6.370224497833793, + "grad_norm": 1.1570062637329102, + "learning_rate": 2.4252e-05, + "loss": 0.0216, + "step": 8090 + }, + { + "epoch": 6.371012209531312, + "grad_norm": 0.3425084948539734, + "learning_rate": 2.4255e-05, + "loss": 0.0375, + "step": 8091 + }, + { + "epoch": 6.37179992122883, + "grad_norm": 0.3976860046386719, + "learning_rate": 2.4258e-05, + "loss": 0.0296, + "step": 8092 + }, + { + "epoch": 6.372587632926349, + "grad_norm": 0.7970375418663025, + "learning_rate": 2.4261e-05, + "loss": 0.0314, + 
"step": 8093 + }, + { + "epoch": 6.3733753446238675, + "grad_norm": 0.3768119513988495, + "learning_rate": 2.4264e-05, + "loss": 0.0193, + "step": 8094 + }, + { + "epoch": 6.374163056321386, + "grad_norm": 0.3069216012954712, + "learning_rate": 2.4267e-05, + "loss": 0.0197, + "step": 8095 + }, + { + "epoch": 6.3749507680189055, + "grad_norm": 0.42404836416244507, + "learning_rate": 2.4270000000000003e-05, + "loss": 0.0405, + "step": 8096 + }, + { + "epoch": 6.375738479716424, + "grad_norm": 0.3319701552391052, + "learning_rate": 2.4273000000000003e-05, + "loss": 0.0145, + "step": 8097 + }, + { + "epoch": 6.376526191413943, + "grad_norm": 1.1667475700378418, + "learning_rate": 2.4276000000000003e-05, + "loss": 0.0523, + "step": 8098 + }, + { + "epoch": 6.377313903111461, + "grad_norm": 0.2147095650434494, + "learning_rate": 2.4279000000000003e-05, + "loss": 0.0136, + "step": 8099 + }, + { + "epoch": 6.37810161480898, + "grad_norm": 0.6228373050689697, + "learning_rate": 2.4282000000000002e-05, + "loss": 0.035, + "step": 8100 + }, + { + "epoch": 6.378889326506498, + "grad_norm": 0.6388373970985413, + "learning_rate": 2.4285000000000002e-05, + "loss": 0.0265, + "step": 8101 + }, + { + "epoch": 6.379677038204017, + "grad_norm": 0.4501490592956543, + "learning_rate": 2.4288e-05, + "loss": 0.0376, + "step": 8102 + }, + { + "epoch": 6.380464749901536, + "grad_norm": 0.2959890067577362, + "learning_rate": 2.4291e-05, + "loss": 0.0162, + "step": 8103 + }, + { + "epoch": 6.381252461599055, + "grad_norm": 0.49298131465911865, + "learning_rate": 2.4293999999999998e-05, + "loss": 0.0234, + "step": 8104 + }, + { + "epoch": 6.3820401732965735, + "grad_norm": 0.663817822933197, + "learning_rate": 2.4296999999999998e-05, + "loss": 0.0202, + "step": 8105 + }, + { + "epoch": 6.382827884994092, + "grad_norm": 0.3254401981830597, + "learning_rate": 2.43e-05, + "loss": 0.02, + "step": 8106 + }, + { + "epoch": 6.383615596691611, + "grad_norm": 0.19804948568344116, + "learning_rate": 2.4303e-05, + "loss": 0.0114, + "step": 8107 + }, + { + "epoch": 6.384403308389129, + "grad_norm": 0.41038912534713745, + "learning_rate": 2.4306e-05, + "loss": 0.0175, + "step": 8108 + }, + { + "epoch": 6.385191020086649, + "grad_norm": 0.3646678030490875, + "learning_rate": 2.4309e-05, + "loss": 0.0266, + "step": 8109 + }, + { + "epoch": 6.385978731784167, + "grad_norm": 0.36121365427970886, + "learning_rate": 2.4312e-05, + "loss": 0.0236, + "step": 8110 + }, + { + "epoch": 6.386766443481686, + "grad_norm": 0.5704746246337891, + "learning_rate": 2.4315e-05, + "loss": 0.0428, + "step": 8111 + }, + { + "epoch": 6.387554155179204, + "grad_norm": 1.2635672092437744, + "learning_rate": 2.4318e-05, + "loss": 0.0364, + "step": 8112 + }, + { + "epoch": 6.388341866876723, + "grad_norm": 0.5196788311004639, + "learning_rate": 2.4321e-05, + "loss": 0.0536, + "step": 8113 + }, + { + "epoch": 6.3891295785742415, + "grad_norm": 0.41412389278411865, + "learning_rate": 2.4324e-05, + "loss": 0.0215, + "step": 8114 + }, + { + "epoch": 6.389917290271761, + "grad_norm": 0.3673284649848938, + "learning_rate": 2.4327e-05, + "loss": 0.019, + "step": 8115 + }, + { + "epoch": 6.3907050019692795, + "grad_norm": 0.3565060794353485, + "learning_rate": 2.4330000000000003e-05, + "loss": 0.0291, + "step": 8116 + }, + { + "epoch": 6.391492713666798, + "grad_norm": 0.4250313639640808, + "learning_rate": 2.4333000000000002e-05, + "loss": 0.0236, + "step": 8117 + }, + { + "epoch": 6.392280425364317, + "grad_norm": 0.30988430976867676, + "learning_rate": 
2.4336000000000002e-05, + "loss": 0.0106, + "step": 8118 + }, + { + "epoch": 6.393068137061835, + "grad_norm": 0.32990652322769165, + "learning_rate": 2.4339000000000002e-05, + "loss": 0.0153, + "step": 8119 + }, + { + "epoch": 6.393855848759354, + "grad_norm": 0.5694789290428162, + "learning_rate": 2.4342000000000002e-05, + "loss": 0.0323, + "step": 8120 + }, + { + "epoch": 6.394643560456872, + "grad_norm": 0.9652872681617737, + "learning_rate": 2.4345e-05, + "loss": 0.297, + "step": 8121 + }, + { + "epoch": 6.395431272154392, + "grad_norm": 0.8342291712760925, + "learning_rate": 2.4348e-05, + "loss": 0.2522, + "step": 8122 + }, + { + "epoch": 6.39621898385191, + "grad_norm": 0.7718333005905151, + "learning_rate": 2.4351e-05, + "loss": 0.2274, + "step": 8123 + }, + { + "epoch": 6.397006695549429, + "grad_norm": 0.9594599008560181, + "learning_rate": 2.4354e-05, + "loss": 0.1765, + "step": 8124 + }, + { + "epoch": 6.3977944072469475, + "grad_norm": 0.7966092228889465, + "learning_rate": 2.4357e-05, + "loss": 0.0943, + "step": 8125 + }, + { + "epoch": 6.398582118944466, + "grad_norm": 0.48805898427963257, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.0743, + "step": 8126 + }, + { + "epoch": 6.3993698306419855, + "grad_norm": 0.24118982255458832, + "learning_rate": 2.4363e-05, + "loss": 0.0271, + "step": 8127 + }, + { + "epoch": 6.400157542339504, + "grad_norm": 0.5320485830307007, + "learning_rate": 2.4366e-05, + "loss": 0.0673, + "step": 8128 + }, + { + "epoch": 6.400945254037023, + "grad_norm": 0.36537545919418335, + "learning_rate": 2.4369e-05, + "loss": 0.0374, + "step": 8129 + }, + { + "epoch": 6.401732965734541, + "grad_norm": 0.34728679060935974, + "learning_rate": 2.4372e-05, + "loss": 0.0267, + "step": 8130 + }, + { + "epoch": 6.40252067743206, + "grad_norm": 0.465708464384079, + "learning_rate": 2.4375e-05, + "loss": 0.0273, + "step": 8131 + }, + { + "epoch": 6.403308389129578, + "grad_norm": 0.5694284439086914, + "learning_rate": 2.4378e-05, + "loss": 0.0325, + "step": 8132 + }, + { + "epoch": 6.404096100827097, + "grad_norm": 0.3107815086841583, + "learning_rate": 2.4381e-05, + "loss": 0.0181, + "step": 8133 + }, + { + "epoch": 6.404883812524616, + "grad_norm": 0.5062411427497864, + "learning_rate": 2.4384e-05, + "loss": 0.0248, + "step": 8134 + }, + { + "epoch": 6.405671524222135, + "grad_norm": 0.3923453688621521, + "learning_rate": 2.4387e-05, + "loss": 0.022, + "step": 8135 + }, + { + "epoch": 6.4064592359196535, + "grad_norm": 0.9319040179252625, + "learning_rate": 2.439e-05, + "loss": 0.0602, + "step": 8136 + }, + { + "epoch": 6.407246947617172, + "grad_norm": 0.6637342572212219, + "learning_rate": 2.4393000000000002e-05, + "loss": 0.0311, + "step": 8137 + }, + { + "epoch": 6.408034659314691, + "grad_norm": 0.4715809226036072, + "learning_rate": 2.4396e-05, + "loss": 0.0213, + "step": 8138 + }, + { + "epoch": 6.408822371012209, + "grad_norm": 0.3803263008594513, + "learning_rate": 2.4399e-05, + "loss": 0.0148, + "step": 8139 + }, + { + "epoch": 6.409610082709729, + "grad_norm": 0.6626906394958496, + "learning_rate": 2.4402e-05, + "loss": 0.0276, + "step": 8140 + }, + { + "epoch": 6.410397794407247, + "grad_norm": 0.32557928562164307, + "learning_rate": 2.4405e-05, + "loss": 0.0161, + "step": 8141 + }, + { + "epoch": 6.411185506104766, + "grad_norm": 0.2417265921831131, + "learning_rate": 2.4408e-05, + "loss": 0.0233, + "step": 8142 + }, + { + "epoch": 6.411973217802284, + "grad_norm": 0.4140770137310028, + "learning_rate": 2.4411e-05, + "loss": 0.0184, + "step": 
8143 + }, + { + "epoch": 6.412760929499803, + "grad_norm": 0.3764866590499878, + "learning_rate": 2.4414e-05, + "loss": 0.0174, + "step": 8144 + }, + { + "epoch": 6.4135486411973215, + "grad_norm": 0.5445845723152161, + "learning_rate": 2.4417e-05, + "loss": 0.0149, + "step": 8145 + }, + { + "epoch": 6.414336352894841, + "grad_norm": 0.3224684000015259, + "learning_rate": 2.442e-05, + "loss": 0.0148, + "step": 8146 + }, + { + "epoch": 6.4151240645923595, + "grad_norm": 0.31678566336631775, + "learning_rate": 2.4423000000000003e-05, + "loss": 0.0182, + "step": 8147 + }, + { + "epoch": 6.415911776289878, + "grad_norm": 0.35785791277885437, + "learning_rate": 2.4426000000000003e-05, + "loss": 0.0168, + "step": 8148 + }, + { + "epoch": 6.416699487987397, + "grad_norm": 0.3549093008041382, + "learning_rate": 2.4429000000000003e-05, + "loss": 0.0212, + "step": 8149 + }, + { + "epoch": 6.417487199684915, + "grad_norm": 0.5690730214118958, + "learning_rate": 2.4432000000000003e-05, + "loss": 0.02, + "step": 8150 + }, + { + "epoch": 6.418274911382434, + "grad_norm": 0.2594885230064392, + "learning_rate": 2.4435e-05, + "loss": 0.0176, + "step": 8151 + }, + { + "epoch": 6.419062623079952, + "grad_norm": 0.34089192748069763, + "learning_rate": 2.4438e-05, + "loss": 0.0174, + "step": 8152 + }, + { + "epoch": 6.419850334777472, + "grad_norm": 0.418494313955307, + "learning_rate": 2.4441e-05, + "loss": 0.0276, + "step": 8153 + }, + { + "epoch": 6.42063804647499, + "grad_norm": 0.36179181933403015, + "learning_rate": 2.4444e-05, + "loss": 0.0273, + "step": 8154 + }, + { + "epoch": 6.421425758172509, + "grad_norm": 0.295573353767395, + "learning_rate": 2.4446999999999998e-05, + "loss": 0.0271, + "step": 8155 + }, + { + "epoch": 6.4222134698700275, + "grad_norm": 0.8461042642593384, + "learning_rate": 2.4449999999999998e-05, + "loss": 0.0341, + "step": 8156 + }, + { + "epoch": 6.423001181567546, + "grad_norm": 0.5643425583839417, + "learning_rate": 2.4453e-05, + "loss": 0.0227, + "step": 8157 + }, + { + "epoch": 6.423788893265065, + "grad_norm": 0.25550344586372375, + "learning_rate": 2.4456e-05, + "loss": 0.0215, + "step": 8158 + }, + { + "epoch": 6.424576604962584, + "grad_norm": 0.38696905970573425, + "learning_rate": 2.4459e-05, + "loss": 0.0189, + "step": 8159 + }, + { + "epoch": 6.425364316660103, + "grad_norm": 0.35486775636672974, + "learning_rate": 2.4462e-05, + "loss": 0.0228, + "step": 8160 + }, + { + "epoch": 6.426152028357621, + "grad_norm": 0.3730497658252716, + "learning_rate": 2.4465e-05, + "loss": 0.0289, + "step": 8161 + }, + { + "epoch": 6.42693974005514, + "grad_norm": 0.6123403310775757, + "learning_rate": 2.4468e-05, + "loss": 0.0309, + "step": 8162 + }, + { + "epoch": 6.427727451752658, + "grad_norm": 0.4076080024242401, + "learning_rate": 2.4471e-05, + "loss": 0.0222, + "step": 8163 + }, + { + "epoch": 6.428515163450177, + "grad_norm": 0.5882677435874939, + "learning_rate": 2.4474e-05, + "loss": 0.0252, + "step": 8164 + }, + { + "epoch": 6.429302875147696, + "grad_norm": 0.4363529086112976, + "learning_rate": 2.4477e-05, + "loss": 0.0158, + "step": 8165 + }, + { + "epoch": 6.430090586845215, + "grad_norm": 0.23067784309387207, + "learning_rate": 2.448e-05, + "loss": 0.0118, + "step": 8166 + }, + { + "epoch": 6.4308782985427335, + "grad_norm": 0.44179767370224, + "learning_rate": 2.4483000000000003e-05, + "loss": 0.0236, + "step": 8167 + }, + { + "epoch": 6.431666010240252, + "grad_norm": 0.44780248403549194, + "learning_rate": 2.4486000000000002e-05, + "loss": 0.0307, + "step": 8168 
+ }, + { + "epoch": 6.432453721937771, + "grad_norm": 0.7623103260993958, + "learning_rate": 2.4489000000000002e-05, + "loss": 0.0271, + "step": 8169 + }, + { + "epoch": 6.433241433635289, + "grad_norm": 0.6179875135421753, + "learning_rate": 2.4492000000000002e-05, + "loss": 0.0226, + "step": 8170 + }, + { + "epoch": 6.434029145332808, + "grad_norm": 1.147560954093933, + "learning_rate": 2.4495000000000002e-05, + "loss": 0.2699, + "step": 8171 + }, + { + "epoch": 6.434816857030327, + "grad_norm": 0.6855278611183167, + "learning_rate": 2.4498e-05, + "loss": 0.2072, + "step": 8172 + }, + { + "epoch": 6.435604568727846, + "grad_norm": 0.6224079728126526, + "learning_rate": 2.4501e-05, + "loss": 0.1418, + "step": 8173 + }, + { + "epoch": 6.436392280425364, + "grad_norm": 0.7845330238342285, + "learning_rate": 2.4504e-05, + "loss": 0.1336, + "step": 8174 + }, + { + "epoch": 6.437179992122883, + "grad_norm": 0.9869411587715149, + "learning_rate": 2.4507e-05, + "loss": 0.1243, + "step": 8175 + }, + { + "epoch": 6.4379677038204015, + "grad_norm": 1.0632925033569336, + "learning_rate": 2.4509999999999997e-05, + "loss": 0.1707, + "step": 8176 + }, + { + "epoch": 6.43875541551792, + "grad_norm": 0.26443612575531006, + "learning_rate": 2.4513e-05, + "loss": 0.0262, + "step": 8177 + }, + { + "epoch": 6.4395431272154395, + "grad_norm": 0.3244243562221527, + "learning_rate": 2.4516e-05, + "loss": 0.0317, + "step": 8178 + }, + { + "epoch": 6.440330838912958, + "grad_norm": 0.22608764469623566, + "learning_rate": 2.4519e-05, + "loss": 0.0178, + "step": 8179 + }, + { + "epoch": 6.441118550610477, + "grad_norm": 0.5812906622886658, + "learning_rate": 2.4522e-05, + "loss": 0.0208, + "step": 8180 + }, + { + "epoch": 6.441906262307995, + "grad_norm": 0.32107046246528625, + "learning_rate": 2.4525e-05, + "loss": 0.0279, + "step": 8181 + }, + { + "epoch": 6.442693974005514, + "grad_norm": 0.32634782791137695, + "learning_rate": 2.4528e-05, + "loss": 0.0179, + "step": 8182 + }, + { + "epoch": 6.443481685703032, + "grad_norm": 0.3665968179702759, + "learning_rate": 2.4531e-05, + "loss": 0.0334, + "step": 8183 + }, + { + "epoch": 6.444269397400552, + "grad_norm": 0.1718834489583969, + "learning_rate": 2.4534e-05, + "loss": 0.0113, + "step": 8184 + }, + { + "epoch": 6.44505710909807, + "grad_norm": 0.3138217628002167, + "learning_rate": 2.4537e-05, + "loss": 0.0125, + "step": 8185 + }, + { + "epoch": 6.445844820795589, + "grad_norm": 0.6625983119010925, + "learning_rate": 2.454e-05, + "loss": 0.0317, + "step": 8186 + }, + { + "epoch": 6.4466325324931075, + "grad_norm": 0.23322686553001404, + "learning_rate": 2.4543000000000002e-05, + "loss": 0.0161, + "step": 8187 + }, + { + "epoch": 6.447420244190626, + "grad_norm": 0.38095787167549133, + "learning_rate": 2.4546000000000002e-05, + "loss": 0.0267, + "step": 8188 + }, + { + "epoch": 6.448207955888145, + "grad_norm": 0.444044291973114, + "learning_rate": 2.4549e-05, + "loss": 0.0238, + "step": 8189 + }, + { + "epoch": 6.448995667585663, + "grad_norm": 0.2415231615304947, + "learning_rate": 2.4552e-05, + "loss": 0.0183, + "step": 8190 + }, + { + "epoch": 6.449783379283183, + "grad_norm": 0.34539830684661865, + "learning_rate": 2.4555e-05, + "loss": 0.0137, + "step": 8191 + }, + { + "epoch": 6.450571090980701, + "grad_norm": 0.32044610381126404, + "learning_rate": 2.4558e-05, + "loss": 0.0123, + "step": 8192 + }, + { + "epoch": 6.45135880267822, + "grad_norm": 0.2542300820350647, + "learning_rate": 2.4561e-05, + "loss": 0.0162, + "step": 8193 + }, + { + "epoch": 
6.452146514375738, + "grad_norm": 0.24830761551856995, + "learning_rate": 2.4564e-05, + "loss": 0.0127, + "step": 8194 + }, + { + "epoch": 6.452934226073257, + "grad_norm": 0.3619571328163147, + "learning_rate": 2.4567e-05, + "loss": 0.0141, + "step": 8195 + }, + { + "epoch": 6.4537219377707755, + "grad_norm": 0.3959226906299591, + "learning_rate": 2.457e-05, + "loss": 0.0168, + "step": 8196 + }, + { + "epoch": 6.454509649468295, + "grad_norm": 0.75862717628479, + "learning_rate": 2.4573000000000003e-05, + "loss": 0.0219, + "step": 8197 + }, + { + "epoch": 6.4552973611658135, + "grad_norm": 1.4419852495193481, + "learning_rate": 2.4576000000000003e-05, + "loss": 0.0507, + "step": 8198 + }, + { + "epoch": 6.456085072863332, + "grad_norm": 0.6223896145820618, + "learning_rate": 2.4579000000000003e-05, + "loss": 0.0215, + "step": 8199 + }, + { + "epoch": 6.456872784560851, + "grad_norm": 0.6641297936439514, + "learning_rate": 2.4582000000000003e-05, + "loss": 0.0357, + "step": 8200 + }, + { + "epoch": 6.457660496258369, + "grad_norm": 0.3162485361099243, + "learning_rate": 2.4585e-05, + "loss": 0.0138, + "step": 8201 + }, + { + "epoch": 6.458448207955888, + "grad_norm": 0.32597216963768005, + "learning_rate": 2.4588e-05, + "loss": 0.0299, + "step": 8202 + }, + { + "epoch": 6.459235919653407, + "grad_norm": 0.31235411763191223, + "learning_rate": 2.4591e-05, + "loss": 0.0126, + "step": 8203 + }, + { + "epoch": 6.460023631350926, + "grad_norm": 0.6656377911567688, + "learning_rate": 2.4594e-05, + "loss": 0.0181, + "step": 8204 + }, + { + "epoch": 6.460811343048444, + "grad_norm": 0.4175049960613251, + "learning_rate": 2.4597e-05, + "loss": 0.0175, + "step": 8205 + }, + { + "epoch": 6.461599054745963, + "grad_norm": 0.33635419607162476, + "learning_rate": 2.4599999999999998e-05, + "loss": 0.0133, + "step": 8206 + }, + { + "epoch": 6.4623867664434815, + "grad_norm": 0.9826563596725464, + "learning_rate": 2.4603e-05, + "loss": 0.033, + "step": 8207 + }, + { + "epoch": 6.463174478141, + "grad_norm": 0.40713584423065186, + "learning_rate": 2.4606e-05, + "loss": 0.0184, + "step": 8208 + }, + { + "epoch": 6.463962189838519, + "grad_norm": 0.3799884021282196, + "learning_rate": 2.4609e-05, + "loss": 0.0158, + "step": 8209 + }, + { + "epoch": 6.464749901536038, + "grad_norm": 0.6829257607460022, + "learning_rate": 2.4612e-05, + "loss": 0.0343, + "step": 8210 + }, + { + "epoch": 6.465537613233557, + "grad_norm": 0.6133707761764526, + "learning_rate": 2.4615e-05, + "loss": 0.0304, + "step": 8211 + }, + { + "epoch": 6.466325324931075, + "grad_norm": 0.3794424831867218, + "learning_rate": 2.4618e-05, + "loss": 0.0107, + "step": 8212 + }, + { + "epoch": 6.467113036628594, + "grad_norm": 0.7242105007171631, + "learning_rate": 2.4621e-05, + "loss": 0.0287, + "step": 8213 + }, + { + "epoch": 6.467900748326112, + "grad_norm": 0.43121474981307983, + "learning_rate": 2.4624e-05, + "loss": 0.0225, + "step": 8214 + }, + { + "epoch": 6.468688460023631, + "grad_norm": 1.7606748342514038, + "learning_rate": 2.4627e-05, + "loss": 0.0316, + "step": 8215 + }, + { + "epoch": 6.46947617172115, + "grad_norm": 0.7029223442077637, + "learning_rate": 2.463e-05, + "loss": 0.0221, + "step": 8216 + }, + { + "epoch": 6.470263883418669, + "grad_norm": 0.8388092517852783, + "learning_rate": 2.4633000000000003e-05, + "loss": 0.057, + "step": 8217 + }, + { + "epoch": 6.4710515951161875, + "grad_norm": 0.4201033413410187, + "learning_rate": 2.4636000000000003e-05, + "loss": 0.0226, + "step": 8218 + }, + { + "epoch": 6.471839306813706, + 
"grad_norm": 0.9798861742019653, + "learning_rate": 2.4639000000000002e-05, + "loss": 0.035, + "step": 8219 + }, + { + "epoch": 6.472627018511225, + "grad_norm": 0.6313253045082092, + "learning_rate": 2.4642000000000002e-05, + "loss": 0.0314, + "step": 8220 + }, + { + "epoch": 6.473414730208743, + "grad_norm": 0.86784827709198, + "learning_rate": 2.4645000000000002e-05, + "loss": 0.2848, + "step": 8221 + }, + { + "epoch": 6.474202441906263, + "grad_norm": 0.8437435626983643, + "learning_rate": 2.4648000000000002e-05, + "loss": 0.2442, + "step": 8222 + }, + { + "epoch": 6.474990153603781, + "grad_norm": 0.5609810948371887, + "learning_rate": 2.4651e-05, + "loss": 0.1531, + "step": 8223 + }, + { + "epoch": 6.4757778653013, + "grad_norm": 0.6989643573760986, + "learning_rate": 2.4654e-05, + "loss": 0.1134, + "step": 8224 + }, + { + "epoch": 6.476565576998818, + "grad_norm": 0.8835873007774353, + "learning_rate": 2.4656999999999998e-05, + "loss": 0.1078, + "step": 8225 + }, + { + "epoch": 6.477353288696337, + "grad_norm": 0.46887290477752686, + "learning_rate": 2.4659999999999998e-05, + "loss": 0.0501, + "step": 8226 + }, + { + "epoch": 6.4781410003938555, + "grad_norm": 0.4289776086807251, + "learning_rate": 2.4663e-05, + "loss": 0.0738, + "step": 8227 + }, + { + "epoch": 6.478928712091374, + "grad_norm": 0.368954598903656, + "learning_rate": 2.4666e-05, + "loss": 0.0298, + "step": 8228 + }, + { + "epoch": 6.479716423788894, + "grad_norm": 0.5876852869987488, + "learning_rate": 2.4669e-05, + "loss": 0.02, + "step": 8229 + }, + { + "epoch": 6.480504135486412, + "grad_norm": 0.378916472196579, + "learning_rate": 2.4672e-05, + "loss": 0.0268, + "step": 8230 + }, + { + "epoch": 6.481291847183931, + "grad_norm": 0.2777075469493866, + "learning_rate": 2.4675e-05, + "loss": 0.0178, + "step": 8231 + }, + { + "epoch": 6.482079558881449, + "grad_norm": 0.37915971875190735, + "learning_rate": 2.4678e-05, + "loss": 0.0174, + "step": 8232 + }, + { + "epoch": 6.482867270578968, + "grad_norm": 0.45654037594795227, + "learning_rate": 2.4681e-05, + "loss": 0.026, + "step": 8233 + }, + { + "epoch": 6.483654982276486, + "grad_norm": 0.4149569571018219, + "learning_rate": 2.4684e-05, + "loss": 0.0266, + "step": 8234 + }, + { + "epoch": 6.484442693974006, + "grad_norm": 0.42631039023399353, + "learning_rate": 2.4687e-05, + "loss": 0.0284, + "step": 8235 + }, + { + "epoch": 6.485230405671524, + "grad_norm": 0.41289022564888, + "learning_rate": 2.469e-05, + "loss": 0.022, + "step": 8236 + }, + { + "epoch": 6.486018117369043, + "grad_norm": 0.4066372513771057, + "learning_rate": 2.4693000000000002e-05, + "loss": 0.0213, + "step": 8237 + }, + { + "epoch": 6.486805829066562, + "grad_norm": 0.2901860475540161, + "learning_rate": 2.4696000000000002e-05, + "loss": 0.027, + "step": 8238 + }, + { + "epoch": 6.48759354076408, + "grad_norm": 0.3661728799343109, + "learning_rate": 2.4699000000000002e-05, + "loss": 0.0251, + "step": 8239 + }, + { + "epoch": 6.488381252461599, + "grad_norm": 0.28310951590538025, + "learning_rate": 2.4702e-05, + "loss": 0.0246, + "step": 8240 + }, + { + "epoch": 6.489168964159118, + "grad_norm": 0.4791073203086853, + "learning_rate": 2.4705e-05, + "loss": 0.0243, + "step": 8241 + }, + { + "epoch": 6.489956675856637, + "grad_norm": 0.1844562441110611, + "learning_rate": 2.4708e-05, + "loss": 0.0071, + "step": 8242 + }, + { + "epoch": 6.490744387554155, + "grad_norm": 0.14850977063179016, + "learning_rate": 2.4711e-05, + "loss": 0.0148, + "step": 8243 + }, + { + "epoch": 6.491532099251674, + 
"grad_norm": 0.3300732672214508, + "learning_rate": 2.4714e-05, + "loss": 0.0128, + "step": 8244 + }, + { + "epoch": 6.492319810949192, + "grad_norm": 0.39375370740890503, + "learning_rate": 2.4717e-05, + "loss": 0.0241, + "step": 8245 + }, + { + "epoch": 6.493107522646711, + "grad_norm": 1.374701738357544, + "learning_rate": 2.472e-05, + "loss": 0.0192, + "step": 8246 + }, + { + "epoch": 6.4938952343442296, + "grad_norm": 0.8974776864051819, + "learning_rate": 2.4723000000000004e-05, + "loss": 0.0377, + "step": 8247 + }, + { + "epoch": 6.494682946041749, + "grad_norm": 0.5663589835166931, + "learning_rate": 2.4726000000000003e-05, + "loss": 0.0212, + "step": 8248 + }, + { + "epoch": 6.495470657739268, + "grad_norm": 0.5840190649032593, + "learning_rate": 2.4729000000000003e-05, + "loss": 0.0218, + "step": 8249 + }, + { + "epoch": 6.496258369436786, + "grad_norm": 0.5125323534011841, + "learning_rate": 2.4732e-05, + "loss": 0.022, + "step": 8250 + }, + { + "epoch": 6.497046081134305, + "grad_norm": 0.39540499448776245, + "learning_rate": 2.4735e-05, + "loss": 0.0206, + "step": 8251 + }, + { + "epoch": 6.497833792831823, + "grad_norm": 0.34888631105422974, + "learning_rate": 2.4738e-05, + "loss": 0.018, + "step": 8252 + }, + { + "epoch": 6.498621504529343, + "grad_norm": 0.8811116218566895, + "learning_rate": 2.4741e-05, + "loss": 0.0258, + "step": 8253 + }, + { + "epoch": 6.499409216226861, + "grad_norm": 0.6220777034759521, + "learning_rate": 2.4744e-05, + "loss": 0.0174, + "step": 8254 + }, + { + "epoch": 6.50019692792438, + "grad_norm": 0.34603607654571533, + "learning_rate": 2.4747e-05, + "loss": 0.0161, + "step": 8255 + }, + { + "epoch": 6.500984639621898, + "grad_norm": 0.2811470627784729, + "learning_rate": 2.475e-05, + "loss": 0.0086, + "step": 8256 + }, + { + "epoch": 6.501772351319417, + "grad_norm": 0.34930577874183655, + "learning_rate": 2.4753e-05, + "loss": 0.0242, + "step": 8257 + }, + { + "epoch": 6.502560063016936, + "grad_norm": 0.48546674847602844, + "learning_rate": 2.4756e-05, + "loss": 0.0237, + "step": 8258 + }, + { + "epoch": 6.503347774714454, + "grad_norm": 0.5764256715774536, + "learning_rate": 2.4759e-05, + "loss": 0.0278, + "step": 8259 + }, + { + "epoch": 6.504135486411974, + "grad_norm": 0.7277646660804749, + "learning_rate": 2.4762e-05, + "loss": 0.0232, + "step": 8260 + }, + { + "epoch": 6.504923198109492, + "grad_norm": 0.6706170439720154, + "learning_rate": 2.4765e-05, + "loss": 0.0351, + "step": 8261 + }, + { + "epoch": 6.505710909807011, + "grad_norm": 0.49057066440582275, + "learning_rate": 2.4768e-05, + "loss": 0.0326, + "step": 8262 + }, + { + "epoch": 6.506498621504529, + "grad_norm": 0.8753707408905029, + "learning_rate": 2.4771e-05, + "loss": 0.0368, + "step": 8263 + }, + { + "epoch": 6.507286333202048, + "grad_norm": 0.5211708545684814, + "learning_rate": 2.4774e-05, + "loss": 0.0205, + "step": 8264 + }, + { + "epoch": 6.508074044899566, + "grad_norm": 0.5428696870803833, + "learning_rate": 2.4777e-05, + "loss": 0.0273, + "step": 8265 + }, + { + "epoch": 6.508861756597085, + "grad_norm": 0.37007609009742737, + "learning_rate": 2.478e-05, + "loss": 0.0241, + "step": 8266 + }, + { + "epoch": 6.5096494682946044, + "grad_norm": 0.6831022500991821, + "learning_rate": 2.4783e-05, + "loss": 0.0424, + "step": 8267 + }, + { + "epoch": 6.510437179992123, + "grad_norm": 0.41658732295036316, + "learning_rate": 2.4786000000000003e-05, + "loss": 0.0368, + "step": 8268 + }, + { + "epoch": 6.511224891689642, + "grad_norm": 0.5788942575454712, + "learning_rate": 
2.4789000000000003e-05, + "loss": 0.0242, + "step": 8269 + }, + { + "epoch": 6.51201260338716, + "grad_norm": 1.0915744304656982, + "learning_rate": 2.4792000000000003e-05, + "loss": 0.0579, + "step": 8270 + }, + { + "epoch": 6.512800315084679, + "grad_norm": 0.6722581386566162, + "learning_rate": 2.4795000000000002e-05, + "loss": 0.255, + "step": 8271 + }, + { + "epoch": 6.513588026782198, + "grad_norm": 0.7623412609100342, + "learning_rate": 2.4798000000000002e-05, + "loss": 0.2006, + "step": 8272 + }, + { + "epoch": 6.514375738479717, + "grad_norm": 0.635394275188446, + "learning_rate": 2.4801000000000002e-05, + "loss": 0.1722, + "step": 8273 + }, + { + "epoch": 6.515163450177235, + "grad_norm": 1.1668505668640137, + "learning_rate": 2.4804e-05, + "loss": 0.1505, + "step": 8274 + }, + { + "epoch": 6.515951161874754, + "grad_norm": 0.46465566754341125, + "learning_rate": 2.4806999999999998e-05, + "loss": 0.1036, + "step": 8275 + }, + { + "epoch": 6.516738873572272, + "grad_norm": 0.5853115320205688, + "learning_rate": 2.4809999999999998e-05, + "loss": 0.099, + "step": 8276 + }, + { + "epoch": 6.517526585269791, + "grad_norm": 0.42251649498939514, + "learning_rate": 2.4812999999999998e-05, + "loss": 0.043, + "step": 8277 + }, + { + "epoch": 6.51831429696731, + "grad_norm": 0.4953143894672394, + "learning_rate": 2.4816e-05, + "loss": 0.0664, + "step": 8278 + }, + { + "epoch": 6.519102008664829, + "grad_norm": 0.5021038055419922, + "learning_rate": 2.4819e-05, + "loss": 0.0364, + "step": 8279 + }, + { + "epoch": 6.519889720362348, + "grad_norm": 0.31517529487609863, + "learning_rate": 2.4822e-05, + "loss": 0.0194, + "step": 8280 + }, + { + "epoch": 6.520677432059866, + "grad_norm": 0.2619416415691376, + "learning_rate": 2.4825e-05, + "loss": 0.0201, + "step": 8281 + }, + { + "epoch": 6.521465143757385, + "grad_norm": 0.44607052206993103, + "learning_rate": 2.4828e-05, + "loss": 0.0297, + "step": 8282 + }, + { + "epoch": 6.522252855454903, + "grad_norm": 0.281742125749588, + "learning_rate": 2.4831e-05, + "loss": 0.0138, + "step": 8283 + }, + { + "epoch": 6.523040567152422, + "grad_norm": 1.1033151149749756, + "learning_rate": 2.4834e-05, + "loss": 0.0221, + "step": 8284 + }, + { + "epoch": 6.52382827884994, + "grad_norm": 0.24157384037971497, + "learning_rate": 2.4837e-05, + "loss": 0.0139, + "step": 8285 + }, + { + "epoch": 6.52461599054746, + "grad_norm": 0.4070669114589691, + "learning_rate": 2.484e-05, + "loss": 0.0366, + "step": 8286 + }, + { + "epoch": 6.5254037022449785, + "grad_norm": 0.40378960967063904, + "learning_rate": 2.4843e-05, + "loss": 0.0317, + "step": 8287 + }, + { + "epoch": 6.526191413942497, + "grad_norm": 0.46572738885879517, + "learning_rate": 2.4846000000000002e-05, + "loss": 0.0209, + "step": 8288 + }, + { + "epoch": 6.526979125640016, + "grad_norm": 0.3142315447330475, + "learning_rate": 2.4849000000000002e-05, + "loss": 0.0135, + "step": 8289 + }, + { + "epoch": 6.527766837337534, + "grad_norm": 0.32479044795036316, + "learning_rate": 2.4852000000000002e-05, + "loss": 0.0148, + "step": 8290 + }, + { + "epoch": 6.528554549035054, + "grad_norm": 0.4259856939315796, + "learning_rate": 2.4855000000000002e-05, + "loss": 0.0216, + "step": 8291 + }, + { + "epoch": 6.529342260732572, + "grad_norm": 0.8411016464233398, + "learning_rate": 2.4858e-05, + "loss": 0.0188, + "step": 8292 + }, + { + "epoch": 6.530129972430091, + "grad_norm": 0.40862566232681274, + "learning_rate": 2.4861e-05, + "loss": 0.0285, + "step": 8293 + }, + { + "epoch": 6.530917684127609, + "grad_norm": 
0.7099668383598328, + "learning_rate": 2.4864e-05, + "loss": 0.03, + "step": 8294 + }, + { + "epoch": 6.531705395825128, + "grad_norm": 0.520905613899231, + "learning_rate": 2.4867e-05, + "loss": 0.0201, + "step": 8295 + }, + { + "epoch": 6.5324931075226464, + "grad_norm": 0.46125924587249756, + "learning_rate": 2.487e-05, + "loss": 0.0186, + "step": 8296 + }, + { + "epoch": 6.533280819220165, + "grad_norm": 0.481790691614151, + "learning_rate": 2.4873e-05, + "loss": 0.0365, + "step": 8297 + }, + { + "epoch": 6.5340685309176845, + "grad_norm": 0.3815932869911194, + "learning_rate": 2.4876000000000004e-05, + "loss": 0.0192, + "step": 8298 + }, + { + "epoch": 6.534856242615203, + "grad_norm": 0.4021844267845154, + "learning_rate": 2.4879e-05, + "loss": 0.0244, + "step": 8299 + }, + { + "epoch": 6.535643954312722, + "grad_norm": 0.556207001209259, + "learning_rate": 2.4882e-05, + "loss": 0.0125, + "step": 8300 + }, + { + "epoch": 6.53643166601024, + "grad_norm": 0.3414762318134308, + "learning_rate": 2.4885e-05, + "loss": 0.012, + "step": 8301 + }, + { + "epoch": 6.537219377707759, + "grad_norm": 0.26647812128067017, + "learning_rate": 2.4888e-05, + "loss": 0.0183, + "step": 8302 + }, + { + "epoch": 6.538007089405277, + "grad_norm": 0.40184757113456726, + "learning_rate": 2.4891e-05, + "loss": 0.0234, + "step": 8303 + }, + { + "epoch": 6.538794801102797, + "grad_norm": 0.3006730079650879, + "learning_rate": 2.4894e-05, + "loss": 0.0133, + "step": 8304 + }, + { + "epoch": 6.539582512800315, + "grad_norm": 0.4281511604785919, + "learning_rate": 2.4897e-05, + "loss": 0.0123, + "step": 8305 + }, + { + "epoch": 6.540370224497834, + "grad_norm": 0.5353279113769531, + "learning_rate": 2.49e-05, + "loss": 0.0338, + "step": 8306 + }, + { + "epoch": 6.5411579361953525, + "grad_norm": 0.17052394151687622, + "learning_rate": 2.4903e-05, + "loss": 0.0121, + "step": 8307 + }, + { + "epoch": 6.541945647892871, + "grad_norm": 0.5463525056838989, + "learning_rate": 2.4906000000000002e-05, + "loss": 0.0283, + "step": 8308 + }, + { + "epoch": 6.54273335959039, + "grad_norm": 0.5096632838249207, + "learning_rate": 2.4909e-05, + "loss": 0.0244, + "step": 8309 + }, + { + "epoch": 6.543521071287909, + "grad_norm": 0.7533449530601501, + "learning_rate": 2.4912e-05, + "loss": 0.0251, + "step": 8310 + }, + { + "epoch": 6.544308782985428, + "grad_norm": 0.19262368977069855, + "learning_rate": 2.4915e-05, + "loss": 0.0127, + "step": 8311 + }, + { + "epoch": 6.545096494682946, + "grad_norm": 0.43112635612487793, + "learning_rate": 2.4918e-05, + "loss": 0.0218, + "step": 8312 + }, + { + "epoch": 6.545884206380465, + "grad_norm": 0.5515446066856384, + "learning_rate": 2.4921e-05, + "loss": 0.0253, + "step": 8313 + }, + { + "epoch": 6.546671918077983, + "grad_norm": 0.8005467653274536, + "learning_rate": 2.4924e-05, + "loss": 0.0217, + "step": 8314 + }, + { + "epoch": 6.547459629775502, + "grad_norm": 0.565753698348999, + "learning_rate": 2.4927e-05, + "loss": 0.0387, + "step": 8315 + }, + { + "epoch": 6.5482473414730205, + "grad_norm": 0.4065304696559906, + "learning_rate": 2.493e-05, + "loss": 0.0195, + "step": 8316 + }, + { + "epoch": 6.54903505317054, + "grad_norm": 0.4855109751224518, + "learning_rate": 2.4933e-05, + "loss": 0.0222, + "step": 8317 + }, + { + "epoch": 6.5498227648680585, + "grad_norm": 0.3157244622707367, + "learning_rate": 2.4936000000000003e-05, + "loss": 0.0192, + "step": 8318 + }, + { + "epoch": 6.550610476565577, + "grad_norm": 0.6583613753318787, + "learning_rate": 2.4939000000000003e-05, + 
"loss": 0.0188, + "step": 8319 + }, + { + "epoch": 6.551398188263096, + "grad_norm": 0.8197461366653442, + "learning_rate": 2.4942000000000003e-05, + "loss": 0.0493, + "step": 8320 + }, + { + "epoch": 6.552185899960614, + "grad_norm": 3.1658244132995605, + "learning_rate": 2.4945000000000003e-05, + "loss": 0.382, + "step": 8321 + }, + { + "epoch": 6.552973611658133, + "grad_norm": 1.3207026720046997, + "learning_rate": 2.4948000000000002e-05, + "loss": 0.2498, + "step": 8322 + }, + { + "epoch": 6.553761323355652, + "grad_norm": 0.8095518946647644, + "learning_rate": 2.4951e-05, + "loss": 0.1758, + "step": 8323 + }, + { + "epoch": 6.554549035053171, + "grad_norm": 0.7672715187072754, + "learning_rate": 2.4954e-05, + "loss": 0.1296, + "step": 8324 + }, + { + "epoch": 6.555336746750689, + "grad_norm": 0.6171823143959045, + "learning_rate": 2.4957e-05, + "loss": 0.1086, + "step": 8325 + }, + { + "epoch": 6.556124458448208, + "grad_norm": 0.44353625178337097, + "learning_rate": 2.4959999999999998e-05, + "loss": 0.0538, + "step": 8326 + }, + { + "epoch": 6.5569121701457265, + "grad_norm": 1.2296768426895142, + "learning_rate": 2.4962999999999998e-05, + "loss": 0.0627, + "step": 8327 + }, + { + "epoch": 6.557699881843245, + "grad_norm": 0.7097201943397522, + "learning_rate": 2.4966e-05, + "loss": 0.0454, + "step": 8328 + }, + { + "epoch": 6.5584875935407645, + "grad_norm": 0.43500205874443054, + "learning_rate": 2.4969e-05, + "loss": 0.0691, + "step": 8329 + }, + { + "epoch": 6.559275305238283, + "grad_norm": 0.4017832577228546, + "learning_rate": 2.4972e-05, + "loss": 0.0252, + "step": 8330 + }, + { + "epoch": 6.560063016935802, + "grad_norm": 0.2549806237220764, + "learning_rate": 2.4975e-05, + "loss": 0.0201, + "step": 8331 + }, + { + "epoch": 6.56085072863332, + "grad_norm": 0.3639694154262543, + "learning_rate": 2.4978e-05, + "loss": 0.0257, + "step": 8332 + }, + { + "epoch": 6.561638440330839, + "grad_norm": 1.0086219310760498, + "learning_rate": 2.4981e-05, + "loss": 0.0204, + "step": 8333 + }, + { + "epoch": 6.562426152028357, + "grad_norm": 0.2566254436969757, + "learning_rate": 2.4984e-05, + "loss": 0.0164, + "step": 8334 + }, + { + "epoch": 6.563213863725876, + "grad_norm": 0.7520695328712463, + "learning_rate": 2.4987e-05, + "loss": 0.0394, + "step": 8335 + }, + { + "epoch": 6.564001575423395, + "grad_norm": 0.4685310125350952, + "learning_rate": 2.499e-05, + "loss": 0.0284, + "step": 8336 + }, + { + "epoch": 6.564789287120914, + "grad_norm": 0.3660299479961395, + "learning_rate": 2.4993e-05, + "loss": 0.0195, + "step": 8337 + }, + { + "epoch": 6.5655769988184325, + "grad_norm": 0.296739786863327, + "learning_rate": 2.4996000000000003e-05, + "loss": 0.0227, + "step": 8338 + }, + { + "epoch": 6.566364710515951, + "grad_norm": 0.5599357485771179, + "learning_rate": 2.4999000000000002e-05, + "loss": 0.0277, + "step": 8339 + }, + { + "epoch": 6.56715242221347, + "grad_norm": 0.45720136165618896, + "learning_rate": 2.5002000000000002e-05, + "loss": 0.017, + "step": 8340 + }, + { + "epoch": 6.567940133910989, + "grad_norm": 0.4402177035808563, + "learning_rate": 2.5005000000000002e-05, + "loss": 0.0242, + "step": 8341 + }, + { + "epoch": 6.568727845608508, + "grad_norm": 0.44630733132362366, + "learning_rate": 2.5008000000000002e-05, + "loss": 0.0154, + "step": 8342 + }, + { + "epoch": 6.569515557306026, + "grad_norm": 0.6076335906982422, + "learning_rate": 2.5011e-05, + "loss": 0.0336, + "step": 8343 + }, + { + "epoch": 6.570303269003545, + "grad_norm": 0.5733380913734436, + 
"learning_rate": 2.5014e-05, + "loss": 0.0269, + "step": 8344 + }, + { + "epoch": 6.571090980701063, + "grad_norm": 0.8657634854316711, + "learning_rate": 2.5017e-05, + "loss": 0.0271, + "step": 8345 + }, + { + "epoch": 6.571878692398582, + "grad_norm": 0.3393764793872833, + "learning_rate": 2.502e-05, + "loss": 0.0199, + "step": 8346 + }, + { + "epoch": 6.5726664040961005, + "grad_norm": 0.29814237356185913, + "learning_rate": 2.5023e-05, + "loss": 0.0199, + "step": 8347 + }, + { + "epoch": 6.57345411579362, + "grad_norm": 0.4222763478755951, + "learning_rate": 2.5026e-05, + "loss": 0.0173, + "step": 8348 + }, + { + "epoch": 6.5742418274911385, + "grad_norm": 0.3204270005226135, + "learning_rate": 2.5029e-05, + "loss": 0.0171, + "step": 8349 + }, + { + "epoch": 6.575029539188657, + "grad_norm": 0.3052057921886444, + "learning_rate": 2.5032e-05, + "loss": 0.0225, + "step": 8350 + }, + { + "epoch": 6.575817250886176, + "grad_norm": 0.4332009553909302, + "learning_rate": 2.5035e-05, + "loss": 0.0261, + "step": 8351 + }, + { + "epoch": 6.576604962583694, + "grad_norm": 0.2187678962945938, + "learning_rate": 2.5038e-05, + "loss": 0.0119, + "step": 8352 + }, + { + "epoch": 6.577392674281213, + "grad_norm": 0.4042944312095642, + "learning_rate": 2.5041e-05, + "loss": 0.0267, + "step": 8353 + }, + { + "epoch": 6.578180385978731, + "grad_norm": 0.4238041937351227, + "learning_rate": 2.5044e-05, + "loss": 0.0208, + "step": 8354 + }, + { + "epoch": 6.578968097676251, + "grad_norm": 0.960939884185791, + "learning_rate": 2.5047e-05, + "loss": 0.0352, + "step": 8355 + }, + { + "epoch": 6.579755809373769, + "grad_norm": 0.4085662066936493, + "learning_rate": 2.505e-05, + "loss": 0.0284, + "step": 8356 + }, + { + "epoch": 6.580543521071288, + "grad_norm": 0.2199387401342392, + "learning_rate": 2.5053e-05, + "loss": 0.0099, + "step": 8357 + }, + { + "epoch": 6.5813312327688065, + "grad_norm": 0.34147509932518005, + "learning_rate": 2.5056000000000002e-05, + "loss": 0.0176, + "step": 8358 + }, + { + "epoch": 6.582118944466325, + "grad_norm": 0.3113712668418884, + "learning_rate": 2.5059000000000002e-05, + "loss": 0.0164, + "step": 8359 + }, + { + "epoch": 6.5829066561638445, + "grad_norm": 0.5545753836631775, + "learning_rate": 2.5062e-05, + "loss": 0.0142, + "step": 8360 + }, + { + "epoch": 6.583694367861363, + "grad_norm": 0.48508524894714355, + "learning_rate": 2.5065e-05, + "loss": 0.0277, + "step": 8361 + }, + { + "epoch": 6.584482079558882, + "grad_norm": 1.2399680614471436, + "learning_rate": 2.5068e-05, + "loss": 0.0252, + "step": 8362 + }, + { + "epoch": 6.5852697912564, + "grad_norm": 0.5665269494056702, + "learning_rate": 2.5071e-05, + "loss": 0.0217, + "step": 8363 + }, + { + "epoch": 6.586057502953919, + "grad_norm": 1.8266396522521973, + "learning_rate": 2.5074e-05, + "loss": 0.0336, + "step": 8364 + }, + { + "epoch": 6.586845214651437, + "grad_norm": 0.37213870882987976, + "learning_rate": 2.5077e-05, + "loss": 0.019, + "step": 8365 + }, + { + "epoch": 6.587632926348956, + "grad_norm": 0.26454636454582214, + "learning_rate": 2.508e-05, + "loss": 0.0144, + "step": 8366 + }, + { + "epoch": 6.588420638046475, + "grad_norm": 0.7698396444320679, + "learning_rate": 2.5083e-05, + "loss": 0.0266, + "step": 8367 + }, + { + "epoch": 6.589208349743994, + "grad_norm": 0.6688774228096008, + "learning_rate": 2.5086000000000003e-05, + "loss": 0.0371, + "step": 8368 + }, + { + "epoch": 6.5899960614415125, + "grad_norm": 0.6594734787940979, + "learning_rate": 2.5089000000000003e-05, + "loss": 0.0211, + 
"step": 8369 + }, + { + "epoch": 6.590783773139031, + "grad_norm": 0.5405305624008179, + "learning_rate": 2.5092000000000003e-05, + "loss": 0.027, + "step": 8370 + }, + { + "epoch": 6.59157148483655, + "grad_norm": 1.0490652322769165, + "learning_rate": 2.5095000000000003e-05, + "loss": 0.32, + "step": 8371 + }, + { + "epoch": 6.592359196534068, + "grad_norm": 0.9519870281219482, + "learning_rate": 2.5098000000000003e-05, + "loss": 0.2749, + "step": 8372 + }, + { + "epoch": 6.593146908231587, + "grad_norm": 0.7809500694274902, + "learning_rate": 2.5101e-05, + "loss": 0.1927, + "step": 8373 + }, + { + "epoch": 6.593934619929106, + "grad_norm": 0.6184051632881165, + "learning_rate": 2.5104e-05, + "loss": 0.1658, + "step": 8374 + }, + { + "epoch": 6.594722331626625, + "grad_norm": 0.5669569373130798, + "learning_rate": 2.5107e-05, + "loss": 0.1223, + "step": 8375 + }, + { + "epoch": 6.595510043324143, + "grad_norm": 0.450693279504776, + "learning_rate": 2.511e-05, + "loss": 0.0759, + "step": 8376 + }, + { + "epoch": 6.596297755021662, + "grad_norm": 0.4017464220523834, + "learning_rate": 2.5112999999999998e-05, + "loss": 0.0344, + "step": 8377 + }, + { + "epoch": 6.5970854667191805, + "grad_norm": 0.5821024179458618, + "learning_rate": 2.5116e-05, + "loss": 0.0381, + "step": 8378 + }, + { + "epoch": 6.5978731784167, + "grad_norm": 0.3902573883533478, + "learning_rate": 2.5119e-05, + "loss": 0.0399, + "step": 8379 + }, + { + "epoch": 6.5986608901142185, + "grad_norm": 0.3696581721305847, + "learning_rate": 2.5122e-05, + "loss": 0.0225, + "step": 8380 + }, + { + "epoch": 6.599448601811737, + "grad_norm": 0.33397209644317627, + "learning_rate": 2.5125e-05, + "loss": 0.0319, + "step": 8381 + }, + { + "epoch": 6.600236313509256, + "grad_norm": 0.3315282464027405, + "learning_rate": 2.5128e-05, + "loss": 0.0146, + "step": 8382 + }, + { + "epoch": 6.601024025206774, + "grad_norm": 0.5732565522193909, + "learning_rate": 2.5131e-05, + "loss": 0.0229, + "step": 8383 + }, + { + "epoch": 6.601811736904293, + "grad_norm": 0.26378771662712097, + "learning_rate": 2.5134e-05, + "loss": 0.0117, + "step": 8384 + }, + { + "epoch": 6.602599448601811, + "grad_norm": 0.9006674289703369, + "learning_rate": 2.5137e-05, + "loss": 0.009, + "step": 8385 + }, + { + "epoch": 6.603387160299331, + "grad_norm": 0.45492109656333923, + "learning_rate": 2.514e-05, + "loss": 0.0273, + "step": 8386 + }, + { + "epoch": 6.604174871996849, + "grad_norm": 0.5090202689170837, + "learning_rate": 2.5143e-05, + "loss": 0.0127, + "step": 8387 + }, + { + "epoch": 6.604962583694368, + "grad_norm": 0.417860209941864, + "learning_rate": 2.5146e-05, + "loss": 0.0247, + "step": 8388 + }, + { + "epoch": 6.6057502953918865, + "grad_norm": 0.4105871617794037, + "learning_rate": 2.5149000000000003e-05, + "loss": 0.0113, + "step": 8389 + }, + { + "epoch": 6.606538007089405, + "grad_norm": 0.5403918027877808, + "learning_rate": 2.5152000000000002e-05, + "loss": 0.036, + "step": 8390 + }, + { + "epoch": 6.607325718786924, + "grad_norm": 0.4179697334766388, + "learning_rate": 2.5155000000000002e-05, + "loss": 0.019, + "step": 8391 + }, + { + "epoch": 6.608113430484442, + "grad_norm": 0.5067605376243591, + "learning_rate": 2.5158000000000002e-05, + "loss": 0.0171, + "step": 8392 + }, + { + "epoch": 6.608901142181962, + "grad_norm": 0.294051855802536, + "learning_rate": 2.5161000000000002e-05, + "loss": 0.0159, + "step": 8393 + }, + { + "epoch": 6.60968885387948, + "grad_norm": 0.517814040184021, + "learning_rate": 2.5164e-05, + "loss": 0.0289, + 
"step": 8394 + }, + { + "epoch": 6.610476565576999, + "grad_norm": 0.23828454315662384, + "learning_rate": 2.5167e-05, + "loss": 0.0151, + "step": 8395 + }, + { + "epoch": 6.611264277274517, + "grad_norm": 0.28406375646591187, + "learning_rate": 2.517e-05, + "loss": 0.0139, + "step": 8396 + }, + { + "epoch": 6.612051988972036, + "grad_norm": 0.4818016290664673, + "learning_rate": 2.5172999999999998e-05, + "loss": 0.0157, + "step": 8397 + }, + { + "epoch": 6.612839700669555, + "grad_norm": 0.5826446413993835, + "learning_rate": 2.5175999999999997e-05, + "loss": 0.026, + "step": 8398 + }, + { + "epoch": 6.613627412367074, + "grad_norm": 0.7850905656814575, + "learning_rate": 2.5179e-05, + "loss": 0.0228, + "step": 8399 + }, + { + "epoch": 6.6144151240645925, + "grad_norm": 0.3790251314640045, + "learning_rate": 2.5182e-05, + "loss": 0.019, + "step": 8400 + }, + { + "epoch": 6.615202835762111, + "grad_norm": 0.49274829030036926, + "learning_rate": 2.5185e-05, + "loss": 0.0165, + "step": 8401 + }, + { + "epoch": 6.61599054745963, + "grad_norm": 0.46635767817497253, + "learning_rate": 2.5188e-05, + "loss": 0.0293, + "step": 8402 + }, + { + "epoch": 6.616778259157148, + "grad_norm": 1.019291639328003, + "learning_rate": 2.5191e-05, + "loss": 0.0302, + "step": 8403 + }, + { + "epoch": 6.617565970854667, + "grad_norm": 0.5232030749320984, + "learning_rate": 2.5194e-05, + "loss": 0.071, + "step": 8404 + }, + { + "epoch": 6.618353682552186, + "grad_norm": 0.2014026790857315, + "learning_rate": 2.5197e-05, + "loss": 0.0143, + "step": 8405 + }, + { + "epoch": 6.619141394249705, + "grad_norm": 0.45068126916885376, + "learning_rate": 2.52e-05, + "loss": 0.0226, + "step": 8406 + }, + { + "epoch": 6.619929105947223, + "grad_norm": 0.46159136295318604, + "learning_rate": 2.5203e-05, + "loss": 0.0249, + "step": 8407 + }, + { + "epoch": 6.620716817644742, + "grad_norm": 0.262361079454422, + "learning_rate": 2.5206e-05, + "loss": 0.0138, + "step": 8408 + }, + { + "epoch": 6.6215045293422605, + "grad_norm": 0.6506526470184326, + "learning_rate": 2.5209000000000002e-05, + "loss": 0.0208, + "step": 8409 + }, + { + "epoch": 6.622292241039779, + "grad_norm": 0.29905202984809875, + "learning_rate": 2.5212000000000002e-05, + "loss": 0.019, + "step": 8410 + }, + { + "epoch": 6.623079952737298, + "grad_norm": 0.5442476272583008, + "learning_rate": 2.5215e-05, + "loss": 0.0264, + "step": 8411 + }, + { + "epoch": 6.623867664434817, + "grad_norm": 0.5666741728782654, + "learning_rate": 2.5218e-05, + "loss": 0.0313, + "step": 8412 + }, + { + "epoch": 6.624655376132336, + "grad_norm": 0.2771000862121582, + "learning_rate": 2.5221e-05, + "loss": 0.0197, + "step": 8413 + }, + { + "epoch": 6.625443087829854, + "grad_norm": 0.560008704662323, + "learning_rate": 2.5224e-05, + "loss": 0.0347, + "step": 8414 + }, + { + "epoch": 6.626230799527373, + "grad_norm": 0.4404643774032593, + "learning_rate": 2.5227e-05, + "loss": 0.0283, + "step": 8415 + }, + { + "epoch": 6.627018511224891, + "grad_norm": 0.7216337323188782, + "learning_rate": 2.523e-05, + "loss": 0.031, + "step": 8416 + }, + { + "epoch": 6.627806222922411, + "grad_norm": 0.43791866302490234, + "learning_rate": 2.5233e-05, + "loss": 0.0232, + "step": 8417 + }, + { + "epoch": 6.628593934619929, + "grad_norm": 0.6402605175971985, + "learning_rate": 2.5236e-05, + "loss": 0.0249, + "step": 8418 + }, + { + "epoch": 6.629381646317448, + "grad_norm": 1.2127920389175415, + "learning_rate": 2.5239000000000003e-05, + "loss": 0.0363, + "step": 8419 + }, + { + "epoch": 
6.6301693580149665, + "grad_norm": 0.5400816798210144, + "learning_rate": 2.5242000000000003e-05, + "loss": 0.0262, + "step": 8420 + }, + { + "epoch": 6.630957069712485, + "grad_norm": 1.649523138999939, + "learning_rate": 2.5245000000000003e-05, + "loss": 0.3244, + "step": 8421 + }, + { + "epoch": 6.631744781410004, + "grad_norm": 0.8853986859321594, + "learning_rate": 2.5248e-05, + "loss": 0.2236, + "step": 8422 + }, + { + "epoch": 6.632532493107522, + "grad_norm": 0.5876579880714417, + "learning_rate": 2.5251e-05, + "loss": 0.1431, + "step": 8423 + }, + { + "epoch": 6.633320204805042, + "grad_norm": 0.7531207799911499, + "learning_rate": 2.5254e-05, + "loss": 0.1665, + "step": 8424 + }, + { + "epoch": 6.63410791650256, + "grad_norm": 0.6789083480834961, + "learning_rate": 2.5257e-05, + "loss": 0.1061, + "step": 8425 + }, + { + "epoch": 6.634895628200079, + "grad_norm": 0.3759911358356476, + "learning_rate": 2.526e-05, + "loss": 0.0459, + "step": 8426 + }, + { + "epoch": 6.635683339897597, + "grad_norm": 0.5670994520187378, + "learning_rate": 2.5263e-05, + "loss": 0.0733, + "step": 8427 + }, + { + "epoch": 6.636471051595116, + "grad_norm": 0.331146240234375, + "learning_rate": 2.5266e-05, + "loss": 0.027, + "step": 8428 + }, + { + "epoch": 6.6372587632926345, + "grad_norm": 0.34643203020095825, + "learning_rate": 2.5269e-05, + "loss": 0.0259, + "step": 8429 + }, + { + "epoch": 6.638046474990154, + "grad_norm": 0.4471551477909088, + "learning_rate": 2.5272e-05, + "loss": 0.027, + "step": 8430 + }, + { + "epoch": 6.638834186687673, + "grad_norm": 0.3205636441707611, + "learning_rate": 2.5275e-05, + "loss": 0.0127, + "step": 8431 + }, + { + "epoch": 6.639621898385191, + "grad_norm": 0.33694809675216675, + "learning_rate": 2.5278e-05, + "loss": 0.0175, + "step": 8432 + }, + { + "epoch": 6.64040961008271, + "grad_norm": 0.3489135801792145, + "learning_rate": 2.5281e-05, + "loss": 0.0223, + "step": 8433 + }, + { + "epoch": 6.641197321780228, + "grad_norm": 0.36783644556999207, + "learning_rate": 2.5284e-05, + "loss": 0.019, + "step": 8434 + }, + { + "epoch": 6.641985033477747, + "grad_norm": 0.34025049209594727, + "learning_rate": 2.5287e-05, + "loss": 0.0148, + "step": 8435 + }, + { + "epoch": 6.642772745175266, + "grad_norm": 0.2991456687450409, + "learning_rate": 2.529e-05, + "loss": 0.0156, + "step": 8436 + }, + { + "epoch": 6.643560456872785, + "grad_norm": 0.8696287870407104, + "learning_rate": 2.5293e-05, + "loss": 0.0271, + "step": 8437 + }, + { + "epoch": 6.644348168570303, + "grad_norm": 0.2504129707813263, + "learning_rate": 2.5296e-05, + "loss": 0.0196, + "step": 8438 + }, + { + "epoch": 6.645135880267822, + "grad_norm": 0.29048001766204834, + "learning_rate": 2.5299000000000003e-05, + "loss": 0.0179, + "step": 8439 + }, + { + "epoch": 6.6459235919653405, + "grad_norm": 0.334194153547287, + "learning_rate": 2.5302000000000003e-05, + "loss": 0.0278, + "step": 8440 + }, + { + "epoch": 6.646711303662859, + "grad_norm": 0.29566138982772827, + "learning_rate": 2.5305000000000003e-05, + "loss": 0.0193, + "step": 8441 + }, + { + "epoch": 6.647499015360378, + "grad_norm": 0.3912281095981598, + "learning_rate": 2.5308000000000002e-05, + "loss": 0.025, + "step": 8442 + }, + { + "epoch": 6.648286727057897, + "grad_norm": 0.563808798789978, + "learning_rate": 2.5311000000000002e-05, + "loss": 0.0144, + "step": 8443 + }, + { + "epoch": 6.649074438755416, + "grad_norm": 0.8781803250312805, + "learning_rate": 2.5314000000000002e-05, + "loss": 0.0288, + "step": 8444 + }, + { + "epoch": 
6.649862150452934, + "grad_norm": 0.5713102221488953, + "learning_rate": 2.5317000000000002e-05, + "loss": 0.0263, + "step": 8445 + }, + { + "epoch": 6.650649862150453, + "grad_norm": 0.3451637923717499, + "learning_rate": 2.5319999999999998e-05, + "loss": 0.0173, + "step": 8446 + }, + { + "epoch": 6.651437573847971, + "grad_norm": 0.9730375409126282, + "learning_rate": 2.5322999999999998e-05, + "loss": 0.033, + "step": 8447 + }, + { + "epoch": 6.65222528554549, + "grad_norm": 0.4042341411113739, + "learning_rate": 2.5325999999999998e-05, + "loss": 0.014, + "step": 8448 + }, + { + "epoch": 6.653012997243009, + "grad_norm": 0.35057932138442993, + "learning_rate": 2.5329e-05, + "loss": 0.0231, + "step": 8449 + }, + { + "epoch": 6.653800708940528, + "grad_norm": 0.4583877623081207, + "learning_rate": 2.5332e-05, + "loss": 0.033, + "step": 8450 + }, + { + "epoch": 6.654588420638047, + "grad_norm": 0.45887088775634766, + "learning_rate": 2.5335e-05, + "loss": 0.017, + "step": 8451 + }, + { + "epoch": 6.655376132335565, + "grad_norm": 0.3424683213233948, + "learning_rate": 2.5338e-05, + "loss": 0.0267, + "step": 8452 + }, + { + "epoch": 6.656163844033084, + "grad_norm": 0.5580188632011414, + "learning_rate": 2.5341e-05, + "loss": 0.0259, + "step": 8453 + }, + { + "epoch": 6.656951555730602, + "grad_norm": 0.29386600852012634, + "learning_rate": 2.5344e-05, + "loss": 0.0279, + "step": 8454 + }, + { + "epoch": 6.657739267428122, + "grad_norm": 0.29248717427253723, + "learning_rate": 2.5347e-05, + "loss": 0.0198, + "step": 8455 + }, + { + "epoch": 6.65852697912564, + "grad_norm": 0.5985337495803833, + "learning_rate": 2.535e-05, + "loss": 0.0189, + "step": 8456 + }, + { + "epoch": 6.659314690823159, + "grad_norm": 0.3096971809864044, + "learning_rate": 2.5353e-05, + "loss": 0.0253, + "step": 8457 + }, + { + "epoch": 6.660102402520677, + "grad_norm": 0.4098826050758362, + "learning_rate": 2.5356e-05, + "loss": 0.0206, + "step": 8458 + }, + { + "epoch": 6.660890114218196, + "grad_norm": 0.5403032302856445, + "learning_rate": 2.5359000000000002e-05, + "loss": 0.0281, + "step": 8459 + }, + { + "epoch": 6.661677825915715, + "grad_norm": 0.5928537249565125, + "learning_rate": 2.5362000000000002e-05, + "loss": 0.025, + "step": 8460 + }, + { + "epoch": 6.662465537613233, + "grad_norm": 0.5711528062820435, + "learning_rate": 2.5365000000000002e-05, + "loss": 0.0248, + "step": 8461 + }, + { + "epoch": 6.663253249310753, + "grad_norm": 0.5284197330474854, + "learning_rate": 2.5368000000000002e-05, + "loss": 0.0201, + "step": 8462 + }, + { + "epoch": 6.664040961008271, + "grad_norm": 0.5520274043083191, + "learning_rate": 2.5371e-05, + "loss": 0.0171, + "step": 8463 + }, + { + "epoch": 6.66482867270579, + "grad_norm": 0.6765311360359192, + "learning_rate": 2.5374e-05, + "loss": 0.05, + "step": 8464 + }, + { + "epoch": 6.665616384403308, + "grad_norm": 0.5400881767272949, + "learning_rate": 2.5377e-05, + "loss": 0.0306, + "step": 8465 + }, + { + "epoch": 6.666404096100827, + "grad_norm": 1.3154621124267578, + "learning_rate": 2.538e-05, + "loss": 0.0589, + "step": 8466 + }, + { + "epoch": 6.667191807798346, + "grad_norm": 0.5126498341560364, + "learning_rate": 2.5383e-05, + "loss": 0.0151, + "step": 8467 + }, + { + "epoch": 6.667979519495865, + "grad_norm": 0.4571470022201538, + "learning_rate": 2.5386e-05, + "loss": 0.0224, + "step": 8468 + }, + { + "epoch": 6.668767231193383, + "grad_norm": 0.6027207374572754, + "learning_rate": 2.5389000000000004e-05, + "loss": 0.0221, + "step": 8469 + }, + { + "epoch": 
6.669554942890902, + "grad_norm": 0.7179552912712097, + "learning_rate": 2.5392000000000004e-05, + "loss": 0.0406, + "step": 8470 + }, + { + "epoch": 6.670342654588421, + "grad_norm": 1.4926642179489136, + "learning_rate": 2.5395e-05, + "loss": 0.2714, + "step": 8471 + }, + { + "epoch": 6.671130366285939, + "grad_norm": 0.9670397043228149, + "learning_rate": 2.5398e-05, + "loss": 0.2418, + "step": 8472 + }, + { + "epoch": 6.671918077983458, + "grad_norm": 0.8487398624420166, + "learning_rate": 2.5401e-05, + "loss": 0.2247, + "step": 8473 + }, + { + "epoch": 6.672705789680977, + "grad_norm": 0.8419133424758911, + "learning_rate": 2.5404e-05, + "loss": 0.1485, + "step": 8474 + }, + { + "epoch": 6.673493501378496, + "grad_norm": 0.4898766577243805, + "learning_rate": 2.5407e-05, + "loss": 0.0824, + "step": 8475 + }, + { + "epoch": 6.674281213076014, + "grad_norm": 0.370620459318161, + "learning_rate": 2.541e-05, + "loss": 0.05, + "step": 8476 + }, + { + "epoch": 6.675068924773533, + "grad_norm": 0.3898319900035858, + "learning_rate": 2.5413e-05, + "loss": 0.0459, + "step": 8477 + }, + { + "epoch": 6.675856636471051, + "grad_norm": 0.628893256187439, + "learning_rate": 2.5416e-05, + "loss": 0.0538, + "step": 8478 + }, + { + "epoch": 6.67664434816857, + "grad_norm": 0.2677238881587982, + "learning_rate": 2.5419000000000002e-05, + "loss": 0.0201, + "step": 8479 + }, + { + "epoch": 6.677432059866089, + "grad_norm": 0.3230205178260803, + "learning_rate": 2.5422e-05, + "loss": 0.0214, + "step": 8480 + }, + { + "epoch": 6.678219771563608, + "grad_norm": 0.3059491813182831, + "learning_rate": 2.5425e-05, + "loss": 0.0222, + "step": 8481 + }, + { + "epoch": 6.679007483261127, + "grad_norm": 0.27511700987815857, + "learning_rate": 2.5428e-05, + "loss": 0.0191, + "step": 8482 + }, + { + "epoch": 6.679795194958645, + "grad_norm": 0.3468576967716217, + "learning_rate": 2.5431e-05, + "loss": 0.0179, + "step": 8483 + }, + { + "epoch": 6.680582906656164, + "grad_norm": 1.3196218013763428, + "learning_rate": 2.5434e-05, + "loss": 0.0609, + "step": 8484 + }, + { + "epoch": 6.681370618353682, + "grad_norm": 0.47090697288513184, + "learning_rate": 2.5437e-05, + "loss": 0.022, + "step": 8485 + }, + { + "epoch": 6.682158330051202, + "grad_norm": 0.3971528708934784, + "learning_rate": 2.544e-05, + "loss": 0.0246, + "step": 8486 + }, + { + "epoch": 6.68294604174872, + "grad_norm": 0.4302924871444702, + "learning_rate": 2.5443e-05, + "loss": 0.0351, + "step": 8487 + }, + { + "epoch": 6.683733753446239, + "grad_norm": 0.5986517071723938, + "learning_rate": 2.5446e-05, + "loss": 0.0255, + "step": 8488 + }, + { + "epoch": 6.6845214651437574, + "grad_norm": 0.3460017144680023, + "learning_rate": 2.5449000000000003e-05, + "loss": 0.0187, + "step": 8489 + }, + { + "epoch": 6.685309176841276, + "grad_norm": 0.84758460521698, + "learning_rate": 2.5452000000000003e-05, + "loss": 0.0394, + "step": 8490 + }, + { + "epoch": 6.686096888538795, + "grad_norm": 0.377433180809021, + "learning_rate": 2.5455000000000003e-05, + "loss": 0.0161, + "step": 8491 + }, + { + "epoch": 6.686884600236313, + "grad_norm": 0.5766827464103699, + "learning_rate": 2.5458000000000003e-05, + "loss": 0.0225, + "step": 8492 + }, + { + "epoch": 6.687672311933833, + "grad_norm": 0.7394462823867798, + "learning_rate": 2.5461000000000002e-05, + "loss": 0.0263, + "step": 8493 + }, + { + "epoch": 6.688460023631351, + "grad_norm": 0.6138095259666443, + "learning_rate": 2.5464000000000002e-05, + "loss": 0.0499, + "step": 8494 + }, + { + "epoch": 6.68924773532887, 
+ "grad_norm": 0.41281628608703613, + "learning_rate": 2.5467e-05, + "loss": 0.0211, + "step": 8495 + }, + { + "epoch": 6.690035447026388, + "grad_norm": 0.24440282583236694, + "learning_rate": 2.547e-05, + "loss": 0.0202, + "step": 8496 + }, + { + "epoch": 6.690823158723907, + "grad_norm": 0.26068586111068726, + "learning_rate": 2.5472999999999998e-05, + "loss": 0.0185, + "step": 8497 + }, + { + "epoch": 6.691610870421425, + "grad_norm": 0.5372610092163086, + "learning_rate": 2.5475999999999998e-05, + "loss": 0.0206, + "step": 8498 + }, + { + "epoch": 6.692398582118944, + "grad_norm": 0.5322094559669495, + "learning_rate": 2.5479e-05, + "loss": 0.0271, + "step": 8499 + }, + { + "epoch": 6.6931862938164635, + "grad_norm": 0.6750620007514954, + "learning_rate": 2.5482e-05, + "loss": 0.0227, + "step": 8500 + }, + { + "epoch": 6.693974005513982, + "grad_norm": 0.541147768497467, + "learning_rate": 2.5485e-05, + "loss": 0.029, + "step": 8501 + }, + { + "epoch": 6.694761717211501, + "grad_norm": 0.2956663966178894, + "learning_rate": 2.5488e-05, + "loss": 0.0173, + "step": 8502 + }, + { + "epoch": 6.695549428909019, + "grad_norm": 0.2843814790248871, + "learning_rate": 2.5491e-05, + "loss": 0.0131, + "step": 8503 + }, + { + "epoch": 6.696337140606538, + "grad_norm": 0.31731483340263367, + "learning_rate": 2.5494e-05, + "loss": 0.0175, + "step": 8504 + }, + { + "epoch": 6.697124852304057, + "grad_norm": 0.1850367933511734, + "learning_rate": 2.5497e-05, + "loss": 0.011, + "step": 8505 + }, + { + "epoch": 6.697912564001576, + "grad_norm": 0.37003934383392334, + "learning_rate": 2.55e-05, + "loss": 0.0109, + "step": 8506 + }, + { + "epoch": 6.698700275699094, + "grad_norm": 0.5180025100708008, + "learning_rate": 2.5503e-05, + "loss": 0.0203, + "step": 8507 + }, + { + "epoch": 6.699487987396613, + "grad_norm": 0.459881991147995, + "learning_rate": 2.5506e-05, + "loss": 0.0171, + "step": 8508 + }, + { + "epoch": 6.7002756990941315, + "grad_norm": 0.3802952468395233, + "learning_rate": 2.5509e-05, + "loss": 0.0178, + "step": 8509 + }, + { + "epoch": 6.70106341079165, + "grad_norm": 0.4518692195415497, + "learning_rate": 2.5512000000000002e-05, + "loss": 0.0177, + "step": 8510 + }, + { + "epoch": 6.701851122489169, + "grad_norm": 0.37887144088745117, + "learning_rate": 2.5515000000000002e-05, + "loss": 0.0199, + "step": 8511 + }, + { + "epoch": 6.702638834186688, + "grad_norm": 0.4604039192199707, + "learning_rate": 2.5518000000000002e-05, + "loss": 0.0244, + "step": 8512 + }, + { + "epoch": 6.703426545884207, + "grad_norm": 0.4336566925048828, + "learning_rate": 2.5521000000000002e-05, + "loss": 0.0234, + "step": 8513 + }, + { + "epoch": 6.704214257581725, + "grad_norm": 0.5924338698387146, + "learning_rate": 2.5524e-05, + "loss": 0.0296, + "step": 8514 + }, + { + "epoch": 6.705001969279244, + "grad_norm": 0.7218816876411438, + "learning_rate": 2.5527e-05, + "loss": 0.0375, + "step": 8515 + }, + { + "epoch": 6.705789680976762, + "grad_norm": 0.4944067597389221, + "learning_rate": 2.553e-05, + "loss": 0.019, + "step": 8516 + }, + { + "epoch": 6.706577392674281, + "grad_norm": 0.45337775349617004, + "learning_rate": 2.5533e-05, + "loss": 0.0195, + "step": 8517 + }, + { + "epoch": 6.7073651043717994, + "grad_norm": 0.9278541207313538, + "learning_rate": 2.5536e-05, + "loss": 0.0328, + "step": 8518 + }, + { + "epoch": 6.708152816069319, + "grad_norm": 1.4845964908599854, + "learning_rate": 2.5539e-05, + "loss": 0.0327, + "step": 8519 + }, + { + "epoch": 6.7089405277668375, + "grad_norm": 
0.3291417360305786, + "learning_rate": 2.5542e-05, + "loss": 0.0202, + "step": 8520 + }, + { + "epoch": 6.709728239464356, + "grad_norm": 0.945690929889679, + "learning_rate": 2.5545e-05, + "loss": 0.301, + "step": 8521 + }, + { + "epoch": 6.710515951161875, + "grad_norm": 1.0391206741333008, + "learning_rate": 2.5548e-05, + "loss": 0.2505, + "step": 8522 + }, + { + "epoch": 6.711303662859393, + "grad_norm": 0.9887515306472778, + "learning_rate": 2.5551e-05, + "loss": 0.1649, + "step": 8523 + }, + { + "epoch": 6.712091374556913, + "grad_norm": 0.6998437643051147, + "learning_rate": 2.5554e-05, + "loss": 0.1859, + "step": 8524 + }, + { + "epoch": 6.712879086254431, + "grad_norm": 0.4428141117095947, + "learning_rate": 2.5557e-05, + "loss": 0.0928, + "step": 8525 + }, + { + "epoch": 6.71366679795195, + "grad_norm": 0.48846930265426636, + "learning_rate": 2.556e-05, + "loss": 0.0531, + "step": 8526 + }, + { + "epoch": 6.714454509649468, + "grad_norm": 0.3772827386856079, + "learning_rate": 2.5563e-05, + "loss": 0.0357, + "step": 8527 + }, + { + "epoch": 6.715242221346987, + "grad_norm": 0.26883772015571594, + "learning_rate": 2.5566e-05, + "loss": 0.023, + "step": 8528 + }, + { + "epoch": 6.7160299330445055, + "grad_norm": 0.6153429746627808, + "learning_rate": 2.5569e-05, + "loss": 0.0648, + "step": 8529 + }, + { + "epoch": 6.716817644742024, + "grad_norm": 0.5394822359085083, + "learning_rate": 2.5572000000000002e-05, + "loss": 0.0391, + "step": 8530 + }, + { + "epoch": 6.7176053564395435, + "grad_norm": 0.3547879457473755, + "learning_rate": 2.5575e-05, + "loss": 0.0142, + "step": 8531 + }, + { + "epoch": 6.718393068137062, + "grad_norm": 0.342109739780426, + "learning_rate": 2.5578e-05, + "loss": 0.0422, + "step": 8532 + }, + { + "epoch": 6.719180779834581, + "grad_norm": 0.41309216618537903, + "learning_rate": 2.5581e-05, + "loss": 0.0201, + "step": 8533 + }, + { + "epoch": 6.719968491532099, + "grad_norm": 0.41401347517967224, + "learning_rate": 2.5584e-05, + "loss": 0.0201, + "step": 8534 + }, + { + "epoch": 6.720756203229618, + "grad_norm": 0.3323851227760315, + "learning_rate": 2.5587e-05, + "loss": 0.0229, + "step": 8535 + }, + { + "epoch": 6.721543914927136, + "grad_norm": 0.1918615996837616, + "learning_rate": 2.559e-05, + "loss": 0.0162, + "step": 8536 + }, + { + "epoch": 6.722331626624655, + "grad_norm": 0.4371715486049652, + "learning_rate": 2.5593e-05, + "loss": 0.023, + "step": 8537 + }, + { + "epoch": 6.723119338322174, + "grad_norm": 0.297057569026947, + "learning_rate": 2.5596e-05, + "loss": 0.0218, + "step": 8538 + }, + { + "epoch": 6.723907050019693, + "grad_norm": 0.5925679206848145, + "learning_rate": 2.5599e-05, + "loss": 0.0173, + "step": 8539 + }, + { + "epoch": 6.7246947617172115, + "grad_norm": 0.2832736074924469, + "learning_rate": 2.5602000000000003e-05, + "loss": 0.0112, + "step": 8540 + }, + { + "epoch": 6.72548247341473, + "grad_norm": 0.3102399706840515, + "learning_rate": 2.5605000000000003e-05, + "loss": 0.0145, + "step": 8541 + }, + { + "epoch": 6.726270185112249, + "grad_norm": 0.14887356758117676, + "learning_rate": 2.5608000000000003e-05, + "loss": 0.0076, + "step": 8542 + }, + { + "epoch": 6.727057896809768, + "grad_norm": 0.41671836376190186, + "learning_rate": 2.5611000000000003e-05, + "loss": 0.0297, + "step": 8543 + }, + { + "epoch": 6.727845608507287, + "grad_norm": 0.4317210614681244, + "learning_rate": 2.5614000000000002e-05, + "loss": 0.0149, + "step": 8544 + }, + { + "epoch": 6.728633320204805, + "grad_norm": 0.297189861536026, + 
"learning_rate": 2.5617e-05, + "loss": 0.0221, + "step": 8545 + }, + { + "epoch": 6.729421031902324, + "grad_norm": 0.5505155324935913, + "learning_rate": 2.562e-05, + "loss": 0.0246, + "step": 8546 + }, + { + "epoch": 6.730208743599842, + "grad_norm": 0.31254422664642334, + "learning_rate": 2.5623e-05, + "loss": 0.0216, + "step": 8547 + }, + { + "epoch": 6.730996455297361, + "grad_norm": 0.5439722537994385, + "learning_rate": 2.5625999999999998e-05, + "loss": 0.0303, + "step": 8548 + }, + { + "epoch": 6.7317841669948795, + "grad_norm": 0.5331518650054932, + "learning_rate": 2.5628999999999998e-05, + "loss": 0.0206, + "step": 8549 + }, + { + "epoch": 6.732571878692399, + "grad_norm": 0.41231870651245117, + "learning_rate": 2.5632e-05, + "loss": 0.018, + "step": 8550 + }, + { + "epoch": 6.7333595903899175, + "grad_norm": 0.4185292422771454, + "learning_rate": 2.5635e-05, + "loss": 0.0221, + "step": 8551 + }, + { + "epoch": 6.734147302087436, + "grad_norm": 0.421338826417923, + "learning_rate": 2.5638e-05, + "loss": 0.0164, + "step": 8552 + }, + { + "epoch": 6.734935013784955, + "grad_norm": 0.23962026834487915, + "learning_rate": 2.5641e-05, + "loss": 0.0149, + "step": 8553 + }, + { + "epoch": 6.735722725482473, + "grad_norm": 0.36129480600357056, + "learning_rate": 2.5644e-05, + "loss": 0.0198, + "step": 8554 + }, + { + "epoch": 6.736510437179992, + "grad_norm": 0.24691352248191833, + "learning_rate": 2.5647e-05, + "loss": 0.0152, + "step": 8555 + }, + { + "epoch": 6.737298148877511, + "grad_norm": 0.5785663723945618, + "learning_rate": 2.565e-05, + "loss": 0.033, + "step": 8556 + }, + { + "epoch": 6.73808586057503, + "grad_norm": 0.5498188138008118, + "learning_rate": 2.5653e-05, + "loss": 0.0173, + "step": 8557 + }, + { + "epoch": 6.738873572272548, + "grad_norm": 0.9495429992675781, + "learning_rate": 2.5656e-05, + "loss": 0.018, + "step": 8558 + }, + { + "epoch": 6.739661283970067, + "grad_norm": 0.6276541352272034, + "learning_rate": 2.5659e-05, + "loss": 0.0163, + "step": 8559 + }, + { + "epoch": 6.7404489956675855, + "grad_norm": 0.5157200694084167, + "learning_rate": 2.5662000000000003e-05, + "loss": 0.0361, + "step": 8560 + }, + { + "epoch": 6.741236707365104, + "grad_norm": 0.4804075360298157, + "learning_rate": 2.5665000000000002e-05, + "loss": 0.0146, + "step": 8561 + }, + { + "epoch": 6.7420244190626235, + "grad_norm": 0.3086695075035095, + "learning_rate": 2.5668000000000002e-05, + "loss": 0.0228, + "step": 8562 + }, + { + "epoch": 6.742812130760142, + "grad_norm": 0.3367989957332611, + "learning_rate": 2.5671000000000002e-05, + "loss": 0.0204, + "step": 8563 + }, + { + "epoch": 6.743599842457661, + "grad_norm": 0.4934751093387604, + "learning_rate": 2.5674000000000002e-05, + "loss": 0.0249, + "step": 8564 + }, + { + "epoch": 6.744387554155179, + "grad_norm": 0.34603601694107056, + "learning_rate": 2.5677e-05, + "loss": 0.0148, + "step": 8565 + }, + { + "epoch": 6.745175265852698, + "grad_norm": 0.6157822608947754, + "learning_rate": 2.568e-05, + "loss": 0.0184, + "step": 8566 + }, + { + "epoch": 6.745962977550216, + "grad_norm": 0.37675055861473083, + "learning_rate": 2.5683e-05, + "loss": 0.0135, + "step": 8567 + }, + { + "epoch": 6.746750689247735, + "grad_norm": 0.8231995701789856, + "learning_rate": 2.5686e-05, + "loss": 0.0295, + "step": 8568 + }, + { + "epoch": 6.747538400945254, + "grad_norm": 0.613232433795929, + "learning_rate": 2.5688999999999997e-05, + "loss": 0.0322, + "step": 8569 + }, + { + "epoch": 6.748326112642773, + "grad_norm": 0.6319842338562012, + 
"learning_rate": 2.5692e-05, + "loss": 0.0311, + "step": 8570 + }, + { + "epoch": 6.7491138243402915, + "grad_norm": 0.8564237952232361, + "learning_rate": 2.5695e-05, + "loss": 0.277, + "step": 8571 + }, + { + "epoch": 6.74990153603781, + "grad_norm": 0.8525859117507935, + "learning_rate": 2.5698e-05, + "loss": 0.2109, + "step": 8572 + }, + { + "epoch": 6.750689247735329, + "grad_norm": 0.6462075710296631, + "learning_rate": 2.5701e-05, + "loss": 0.1374, + "step": 8573 + }, + { + "epoch": 6.751476959432847, + "grad_norm": 0.9382593631744385, + "learning_rate": 2.5704e-05, + "loss": 0.2116, + "step": 8574 + }, + { + "epoch": 6.752264671130367, + "grad_norm": 0.6488710641860962, + "learning_rate": 2.5707e-05, + "loss": 0.0546, + "step": 8575 + }, + { + "epoch": 6.753052382827885, + "grad_norm": 0.6893622875213623, + "learning_rate": 2.571e-05, + "loss": 0.054, + "step": 8576 + }, + { + "epoch": 6.753840094525404, + "grad_norm": 0.5056110620498657, + "learning_rate": 2.5713e-05, + "loss": 0.0516, + "step": 8577 + }, + { + "epoch": 6.754627806222922, + "grad_norm": 0.6273099780082703, + "learning_rate": 2.5716e-05, + "loss": 0.0644, + "step": 8578 + }, + { + "epoch": 6.755415517920441, + "grad_norm": 0.7501806020736694, + "learning_rate": 2.5719e-05, + "loss": 0.0452, + "step": 8579 + }, + { + "epoch": 6.7562032296179595, + "grad_norm": 0.4035813510417938, + "learning_rate": 2.5722000000000002e-05, + "loss": 0.0309, + "step": 8580 + }, + { + "epoch": 6.756990941315479, + "grad_norm": 0.3696702718734741, + "learning_rate": 2.5725000000000002e-05, + "loss": 0.0211, + "step": 8581 + }, + { + "epoch": 6.7577786530129975, + "grad_norm": 0.5430330634117126, + "learning_rate": 2.5728e-05, + "loss": 0.032, + "step": 8582 + }, + { + "epoch": 6.758566364710516, + "grad_norm": 0.3842170834541321, + "learning_rate": 2.5731e-05, + "loss": 0.0198, + "step": 8583 + }, + { + "epoch": 6.759354076408035, + "grad_norm": 0.2795916199684143, + "learning_rate": 2.5734e-05, + "loss": 0.019, + "step": 8584 + }, + { + "epoch": 6.760141788105553, + "grad_norm": 0.3296435475349426, + "learning_rate": 2.5737e-05, + "loss": 0.0219, + "step": 8585 + }, + { + "epoch": 6.760929499803072, + "grad_norm": 0.41674524545669556, + "learning_rate": 2.574e-05, + "loss": 0.0376, + "step": 8586 + }, + { + "epoch": 6.76171721150059, + "grad_norm": 0.5358374714851379, + "learning_rate": 2.5743e-05, + "loss": 0.0309, + "step": 8587 + }, + { + "epoch": 6.76250492319811, + "grad_norm": 0.3391485810279846, + "learning_rate": 2.5746e-05, + "loss": 0.0229, + "step": 8588 + }, + { + "epoch": 6.763292634895628, + "grad_norm": 0.2438211292028427, + "learning_rate": 2.5749e-05, + "loss": 0.0187, + "step": 8589 + }, + { + "epoch": 6.764080346593147, + "grad_norm": 0.29582399129867554, + "learning_rate": 2.5752000000000003e-05, + "loss": 0.0122, + "step": 8590 + }, + { + "epoch": 6.7648680582906655, + "grad_norm": 0.353027880191803, + "learning_rate": 2.5755000000000003e-05, + "loss": 0.0173, + "step": 8591 + }, + { + "epoch": 6.765655769988184, + "grad_norm": 0.3724329471588135, + "learning_rate": 2.5758000000000003e-05, + "loss": 0.0254, + "step": 8592 + }, + { + "epoch": 6.7664434816857035, + "grad_norm": 0.6196630597114563, + "learning_rate": 2.5761000000000003e-05, + "loss": 0.0283, + "step": 8593 + }, + { + "epoch": 6.767231193383222, + "grad_norm": 0.4799487888813019, + "learning_rate": 2.5764e-05, + "loss": 0.0265, + "step": 8594 + }, + { + "epoch": 6.768018905080741, + "grad_norm": 0.6259503960609436, + "learning_rate": 2.5767e-05, + 
"loss": 0.0697, + "step": 8595 + }, + { + "epoch": 6.768806616778259, + "grad_norm": 0.27398237586021423, + "learning_rate": 2.577e-05, + "loss": 0.0201, + "step": 8596 + }, + { + "epoch": 6.769594328475778, + "grad_norm": 0.5070346593856812, + "learning_rate": 2.5773e-05, + "loss": 0.0223, + "step": 8597 + }, + { + "epoch": 6.770382040173296, + "grad_norm": 0.186662495136261, + "learning_rate": 2.5776e-05, + "loss": 0.0119, + "step": 8598 + }, + { + "epoch": 6.771169751870815, + "grad_norm": 0.4541865885257721, + "learning_rate": 2.5779e-05, + "loss": 0.0261, + "step": 8599 + }, + { + "epoch": 6.771957463568334, + "grad_norm": 0.5288323163986206, + "learning_rate": 2.5782e-05, + "loss": 0.0323, + "step": 8600 + }, + { + "epoch": 6.772745175265853, + "grad_norm": 0.9081085920333862, + "learning_rate": 2.5785e-05, + "loss": 0.028, + "step": 8601 + }, + { + "epoch": 6.7735328869633715, + "grad_norm": 0.630748987197876, + "learning_rate": 2.5788e-05, + "loss": 0.0213, + "step": 8602 + }, + { + "epoch": 6.77432059866089, + "grad_norm": 0.9060018062591553, + "learning_rate": 2.5791e-05, + "loss": 0.0209, + "step": 8603 + }, + { + "epoch": 6.775108310358409, + "grad_norm": 0.4530661106109619, + "learning_rate": 2.5794e-05, + "loss": 0.0254, + "step": 8604 + }, + { + "epoch": 6.775896022055927, + "grad_norm": 0.6860009431838989, + "learning_rate": 2.5797e-05, + "loss": 0.0349, + "step": 8605 + }, + { + "epoch": 6.776683733753446, + "grad_norm": 0.4204105734825134, + "learning_rate": 2.58e-05, + "loss": 0.0232, + "step": 8606 + }, + { + "epoch": 6.777471445450965, + "grad_norm": 0.3228384554386139, + "learning_rate": 2.5803e-05, + "loss": 0.0138, + "step": 8607 + }, + { + "epoch": 6.778259157148484, + "grad_norm": 0.4206915497779846, + "learning_rate": 2.5806e-05, + "loss": 0.0223, + "step": 8608 + }, + { + "epoch": 6.779046868846002, + "grad_norm": 0.42782407999038696, + "learning_rate": 2.5809e-05, + "loss": 0.0251, + "step": 8609 + }, + { + "epoch": 6.779834580543521, + "grad_norm": 0.39186254143714905, + "learning_rate": 2.5812000000000003e-05, + "loss": 0.019, + "step": 8610 + }, + { + "epoch": 6.7806222922410395, + "grad_norm": 0.7530924081802368, + "learning_rate": 2.5815000000000003e-05, + "loss": 0.0274, + "step": 8611 + }, + { + "epoch": 6.781410003938559, + "grad_norm": 0.1995660364627838, + "learning_rate": 2.5818000000000003e-05, + "loss": 0.013, + "step": 8612 + }, + { + "epoch": 6.7821977156360775, + "grad_norm": 0.3778226971626282, + "learning_rate": 2.5821000000000002e-05, + "loss": 0.0159, + "step": 8613 + }, + { + "epoch": 6.782985427333596, + "grad_norm": 0.37939587235450745, + "learning_rate": 2.5824000000000002e-05, + "loss": 0.0283, + "step": 8614 + }, + { + "epoch": 6.783773139031115, + "grad_norm": 0.31677180528640747, + "learning_rate": 2.5827000000000002e-05, + "loss": 0.0192, + "step": 8615 + }, + { + "epoch": 6.784560850728633, + "grad_norm": 0.342180460691452, + "learning_rate": 2.5830000000000002e-05, + "loss": 0.0265, + "step": 8616 + }, + { + "epoch": 6.785348562426152, + "grad_norm": 0.2754807472229004, + "learning_rate": 2.5833e-05, + "loss": 0.011, + "step": 8617 + }, + { + "epoch": 6.78613627412367, + "grad_norm": 0.6594019532203674, + "learning_rate": 2.5835999999999998e-05, + "loss": 0.0432, + "step": 8618 + }, + { + "epoch": 6.78692398582119, + "grad_norm": 0.4751303493976593, + "learning_rate": 2.5838999999999998e-05, + "loss": 0.0183, + "step": 8619 + }, + { + "epoch": 6.787711697518708, + "grad_norm": 0.8364858031272888, + "learning_rate": 2.5842e-05, + 
"loss": 0.0286, + "step": 8620 + }, + { + "epoch": 6.788499409216227, + "grad_norm": 1.2321431636810303, + "learning_rate": 2.5845e-05, + "loss": 0.3234, + "step": 8621 + }, + { + "epoch": 6.7892871209137455, + "grad_norm": 0.7481720447540283, + "learning_rate": 2.5848e-05, + "loss": 0.1756, + "step": 8622 + }, + { + "epoch": 6.790074832611264, + "grad_norm": 0.6591187119483948, + "learning_rate": 2.5851e-05, + "loss": 0.1554, + "step": 8623 + }, + { + "epoch": 6.790862544308783, + "grad_norm": 0.6217700242996216, + "learning_rate": 2.5854e-05, + "loss": 0.1068, + "step": 8624 + }, + { + "epoch": 6.791650256006301, + "grad_norm": 0.6671390533447266, + "learning_rate": 2.5857e-05, + "loss": 0.1143, + "step": 8625 + }, + { + "epoch": 6.792437967703821, + "grad_norm": 0.9620509743690491, + "learning_rate": 2.586e-05, + "loss": 0.0702, + "step": 8626 + }, + { + "epoch": 6.793225679401339, + "grad_norm": 0.40763217210769653, + "learning_rate": 2.5863e-05, + "loss": 0.0385, + "step": 8627 + }, + { + "epoch": 6.794013391098858, + "grad_norm": 0.5585855841636658, + "learning_rate": 2.5866e-05, + "loss": 0.0509, + "step": 8628 + }, + { + "epoch": 6.794801102796376, + "grad_norm": 0.5022106170654297, + "learning_rate": 2.5869e-05, + "loss": 0.039, + "step": 8629 + }, + { + "epoch": 6.795588814493895, + "grad_norm": 0.5659006237983704, + "learning_rate": 2.5872000000000002e-05, + "loss": 0.0346, + "step": 8630 + }, + { + "epoch": 6.796376526191414, + "grad_norm": 0.6594074964523315, + "learning_rate": 2.5875000000000002e-05, + "loss": 0.0416, + "step": 8631 + }, + { + "epoch": 6.797164237888933, + "grad_norm": 0.34042689204216003, + "learning_rate": 2.5878000000000002e-05, + "loss": 0.0323, + "step": 8632 + }, + { + "epoch": 6.7979519495864515, + "grad_norm": 0.37891885638237, + "learning_rate": 2.5881000000000002e-05, + "loss": 0.0215, + "step": 8633 + }, + { + "epoch": 6.79873966128397, + "grad_norm": 0.44058758020401, + "learning_rate": 2.5884e-05, + "loss": 0.0193, + "step": 8634 + }, + { + "epoch": 6.799527372981489, + "grad_norm": 0.41800981760025024, + "learning_rate": 2.5887e-05, + "loss": 0.0299, + "step": 8635 + }, + { + "epoch": 6.800315084679007, + "grad_norm": 0.334959477186203, + "learning_rate": 2.589e-05, + "loss": 0.0278, + "step": 8636 + }, + { + "epoch": 6.801102796376526, + "grad_norm": 0.5256922245025635, + "learning_rate": 2.5893e-05, + "loss": 0.0146, + "step": 8637 + }, + { + "epoch": 6.801890508074045, + "grad_norm": 0.2521483898162842, + "learning_rate": 2.5896e-05, + "loss": 0.0122, + "step": 8638 + }, + { + "epoch": 6.802678219771564, + "grad_norm": 0.7929751873016357, + "learning_rate": 2.5899e-05, + "loss": 0.0397, + "step": 8639 + }, + { + "epoch": 6.803465931469082, + "grad_norm": 0.3838658332824707, + "learning_rate": 2.5902e-05, + "loss": 0.0194, + "step": 8640 + }, + { + "epoch": 6.804253643166601, + "grad_norm": 0.450953871011734, + "learning_rate": 2.5905000000000004e-05, + "loss": 0.0276, + "step": 8641 + }, + { + "epoch": 6.8050413548641195, + "grad_norm": 0.22305771708488464, + "learning_rate": 2.5908000000000003e-05, + "loss": 0.024, + "step": 8642 + }, + { + "epoch": 6.805829066561638, + "grad_norm": 0.2883995771408081, + "learning_rate": 2.5911e-05, + "loss": 0.0291, + "step": 8643 + }, + { + "epoch": 6.806616778259157, + "grad_norm": 0.7829564809799194, + "learning_rate": 2.5914e-05, + "loss": 0.021, + "step": 8644 + }, + { + "epoch": 6.807404489956676, + "grad_norm": 0.2938883900642395, + "learning_rate": 2.5917e-05, + "loss": 0.0142, + "step": 8645 + }, + 
{ + "epoch": 6.808192201654195, + "grad_norm": 0.5260411500930786, + "learning_rate": 2.592e-05, + "loss": 0.0175, + "step": 8646 + }, + { + "epoch": 6.808979913351713, + "grad_norm": 0.2261204719543457, + "learning_rate": 2.5923e-05, + "loss": 0.0144, + "step": 8647 + }, + { + "epoch": 6.809767625049232, + "grad_norm": 0.46947672963142395, + "learning_rate": 2.5926e-05, + "loss": 0.0249, + "step": 8648 + }, + { + "epoch": 6.81055533674675, + "grad_norm": 0.4316644072532654, + "learning_rate": 2.5929e-05, + "loss": 0.0197, + "step": 8649 + }, + { + "epoch": 6.81134304844427, + "grad_norm": 0.40948987007141113, + "learning_rate": 2.5932e-05, + "loss": 0.0255, + "step": 8650 + }, + { + "epoch": 6.812130760141788, + "grad_norm": 0.3005601167678833, + "learning_rate": 2.5935e-05, + "loss": 0.0225, + "step": 8651 + }, + { + "epoch": 6.812918471839307, + "grad_norm": 0.35649338364601135, + "learning_rate": 2.5938e-05, + "loss": 0.0191, + "step": 8652 + }, + { + "epoch": 6.8137061835368256, + "grad_norm": 0.6679087281227112, + "learning_rate": 2.5941e-05, + "loss": 0.0262, + "step": 8653 + }, + { + "epoch": 6.814493895234344, + "grad_norm": 0.2579329311847687, + "learning_rate": 2.5944e-05, + "loss": 0.009, + "step": 8654 + }, + { + "epoch": 6.815281606931863, + "grad_norm": 0.37319931387901306, + "learning_rate": 2.5947e-05, + "loss": 0.0224, + "step": 8655 + }, + { + "epoch": 6.816069318629381, + "grad_norm": 0.5229111909866333, + "learning_rate": 2.595e-05, + "loss": 0.0118, + "step": 8656 + }, + { + "epoch": 6.816857030326901, + "grad_norm": 0.48505645990371704, + "learning_rate": 2.5953e-05, + "loss": 0.0168, + "step": 8657 + }, + { + "epoch": 6.817644742024419, + "grad_norm": 0.2716978192329407, + "learning_rate": 2.5956e-05, + "loss": 0.0212, + "step": 8658 + }, + { + "epoch": 6.818432453721938, + "grad_norm": 0.5390247106552124, + "learning_rate": 2.5959e-05, + "loss": 0.026, + "step": 8659 + }, + { + "epoch": 6.819220165419456, + "grad_norm": 0.38578036427497864, + "learning_rate": 2.5962e-05, + "loss": 0.0193, + "step": 8660 + }, + { + "epoch": 6.820007877116975, + "grad_norm": 0.5586953163146973, + "learning_rate": 2.5965000000000003e-05, + "loss": 0.0406, + "step": 8661 + }, + { + "epoch": 6.8207955888144935, + "grad_norm": 0.49259480834007263, + "learning_rate": 2.5968000000000003e-05, + "loss": 0.0297, + "step": 8662 + }, + { + "epoch": 6.821583300512012, + "grad_norm": 0.40396684408187866, + "learning_rate": 2.5971000000000003e-05, + "loss": 0.0168, + "step": 8663 + }, + { + "epoch": 6.822371012209532, + "grad_norm": 0.5168192982673645, + "learning_rate": 2.5974000000000002e-05, + "loss": 0.0253, + "step": 8664 + }, + { + "epoch": 6.82315872390705, + "grad_norm": 0.820440948009491, + "learning_rate": 2.5977000000000002e-05, + "loss": 0.0234, + "step": 8665 + }, + { + "epoch": 6.823946435604569, + "grad_norm": 0.3804510533809662, + "learning_rate": 2.5980000000000002e-05, + "loss": 0.022, + "step": 8666 + }, + { + "epoch": 6.824734147302087, + "grad_norm": 0.31648215651512146, + "learning_rate": 2.5983000000000002e-05, + "loss": 0.0149, + "step": 8667 + }, + { + "epoch": 6.825521858999606, + "grad_norm": 0.299342542886734, + "learning_rate": 2.5985999999999998e-05, + "loss": 0.0135, + "step": 8668 + }, + { + "epoch": 6.826309570697125, + "grad_norm": 0.6398104429244995, + "learning_rate": 2.5988999999999998e-05, + "loss": 0.0236, + "step": 8669 + }, + { + "epoch": 6.827097282394644, + "grad_norm": 0.8903769850730896, + "learning_rate": 2.5991999999999998e-05, + "loss": 0.0586, + 
"step": 8670 + }, + { + "epoch": 6.827884994092162, + "grad_norm": 0.6814420819282532, + "learning_rate": 2.5995e-05, + "loss": 0.2854, + "step": 8671 + }, + { + "epoch": 6.828672705789681, + "grad_norm": 0.7076085805892944, + "learning_rate": 2.5998e-05, + "loss": 0.2789, + "step": 8672 + }, + { + "epoch": 6.8294604174872, + "grad_norm": 0.5271515846252441, + "learning_rate": 2.6001e-05, + "loss": 0.1245, + "step": 8673 + }, + { + "epoch": 6.830248129184718, + "grad_norm": 0.5377849340438843, + "learning_rate": 2.6004e-05, + "loss": 0.1114, + "step": 8674 + }, + { + "epoch": 6.831035840882237, + "grad_norm": 0.40871158242225647, + "learning_rate": 2.6007e-05, + "loss": 0.0692, + "step": 8675 + }, + { + "epoch": 6.831823552579756, + "grad_norm": 0.40565717220306396, + "learning_rate": 2.601e-05, + "loss": 0.0642, + "step": 8676 + }, + { + "epoch": 6.832611264277275, + "grad_norm": 0.30157023668289185, + "learning_rate": 2.6013e-05, + "loss": 0.029, + "step": 8677 + }, + { + "epoch": 6.833398975974793, + "grad_norm": 0.4144296646118164, + "learning_rate": 2.6016e-05, + "loss": 0.0881, + "step": 8678 + }, + { + "epoch": 6.834186687672312, + "grad_norm": 0.26170286536216736, + "learning_rate": 2.6019e-05, + "loss": 0.025, + "step": 8679 + }, + { + "epoch": 6.83497439936983, + "grad_norm": 0.24707594513893127, + "learning_rate": 2.6022e-05, + "loss": 0.0151, + "step": 8680 + }, + { + "epoch": 6.835762111067349, + "grad_norm": 0.24147683382034302, + "learning_rate": 2.6025000000000002e-05, + "loss": 0.0226, + "step": 8681 + }, + { + "epoch": 6.8365498227648684, + "grad_norm": 0.2501821517944336, + "learning_rate": 2.6028000000000002e-05, + "loss": 0.0159, + "step": 8682 + }, + { + "epoch": 6.837337534462387, + "grad_norm": 0.47284355759620667, + "learning_rate": 2.6031000000000002e-05, + "loss": 0.0461, + "step": 8683 + }, + { + "epoch": 6.838125246159906, + "grad_norm": 0.1804259568452835, + "learning_rate": 2.6034000000000002e-05, + "loss": 0.0091, + "step": 8684 + }, + { + "epoch": 6.838912957857424, + "grad_norm": 0.3014083206653595, + "learning_rate": 2.6037e-05, + "loss": 0.0144, + "step": 8685 + }, + { + "epoch": 6.839700669554943, + "grad_norm": 0.4493468105792999, + "learning_rate": 2.604e-05, + "loss": 0.0286, + "step": 8686 + }, + { + "epoch": 6.840488381252461, + "grad_norm": 0.38948410749435425, + "learning_rate": 2.6043e-05, + "loss": 0.0214, + "step": 8687 + }, + { + "epoch": 6.841276092949981, + "grad_norm": 0.3335849344730377, + "learning_rate": 2.6046e-05, + "loss": 0.014, + "step": 8688 + }, + { + "epoch": 6.842063804647499, + "grad_norm": 0.46737971901893616, + "learning_rate": 2.6049e-05, + "loss": 0.0241, + "step": 8689 + }, + { + "epoch": 6.842851516345018, + "grad_norm": 0.472695529460907, + "learning_rate": 2.6052e-05, + "loss": 0.0463, + "step": 8690 + }, + { + "epoch": 6.843639228042536, + "grad_norm": 0.5998688340187073, + "learning_rate": 2.6055000000000004e-05, + "loss": 0.0235, + "step": 8691 + }, + { + "epoch": 6.844426939740055, + "grad_norm": 0.22467249631881714, + "learning_rate": 2.6058e-05, + "loss": 0.0141, + "step": 8692 + }, + { + "epoch": 6.845214651437574, + "grad_norm": 0.5394048094749451, + "learning_rate": 2.6061e-05, + "loss": 0.0298, + "step": 8693 + }, + { + "epoch": 6.846002363135092, + "grad_norm": 0.40547510981559753, + "learning_rate": 2.6064e-05, + "loss": 0.0221, + "step": 8694 + }, + { + "epoch": 6.846790074832612, + "grad_norm": 0.49767670035362244, + "learning_rate": 2.6067e-05, + "loss": 0.0688, + "step": 8695 + }, + { + "epoch": 
6.84757778653013, + "grad_norm": 0.4056391417980194, + "learning_rate": 2.607e-05, + "loss": 0.0217, + "step": 8696 + }, + { + "epoch": 6.848365498227649, + "grad_norm": 0.36737602949142456, + "learning_rate": 2.6073e-05, + "loss": 0.0217, + "step": 8697 + }, + { + "epoch": 6.849153209925167, + "grad_norm": 0.6857969760894775, + "learning_rate": 2.6076e-05, + "loss": 0.0192, + "step": 8698 + }, + { + "epoch": 6.849940921622686, + "grad_norm": 0.5823538303375244, + "learning_rate": 2.6079e-05, + "loss": 0.0159, + "step": 8699 + }, + { + "epoch": 6.850728633320204, + "grad_norm": 0.8295487761497498, + "learning_rate": 2.6082e-05, + "loss": 0.0273, + "step": 8700 + }, + { + "epoch": 6.851516345017724, + "grad_norm": 0.3770376741886139, + "learning_rate": 2.6085000000000002e-05, + "loss": 0.0289, + "step": 8701 + }, + { + "epoch": 6.8523040567152425, + "grad_norm": 0.6854429244995117, + "learning_rate": 2.6088e-05, + "loss": 0.0181, + "step": 8702 + }, + { + "epoch": 6.853091768412761, + "grad_norm": 0.30213284492492676, + "learning_rate": 2.6091e-05, + "loss": 0.0194, + "step": 8703 + }, + { + "epoch": 6.85387948011028, + "grad_norm": 0.3373921513557434, + "learning_rate": 2.6094e-05, + "loss": 0.0213, + "step": 8704 + }, + { + "epoch": 6.854667191807798, + "grad_norm": 0.33435049653053284, + "learning_rate": 2.6097e-05, + "loss": 0.0153, + "step": 8705 + }, + { + "epoch": 6.855454903505317, + "grad_norm": 0.3336784541606903, + "learning_rate": 2.61e-05, + "loss": 0.0171, + "step": 8706 + }, + { + "epoch": 6.856242615202836, + "grad_norm": 0.3965630531311035, + "learning_rate": 2.6103e-05, + "loss": 0.0275, + "step": 8707 + }, + { + "epoch": 6.857030326900355, + "grad_norm": 0.4361603558063507, + "learning_rate": 2.6106e-05, + "loss": 0.0243, + "step": 8708 + }, + { + "epoch": 6.857818038597873, + "grad_norm": 0.3272637128829956, + "learning_rate": 2.6109e-05, + "loss": 0.0179, + "step": 8709 + }, + { + "epoch": 6.858605750295392, + "grad_norm": 0.33052098751068115, + "learning_rate": 2.6112e-05, + "loss": 0.0269, + "step": 8710 + }, + { + "epoch": 6.8593934619929104, + "grad_norm": 0.4679984152317047, + "learning_rate": 2.6115000000000003e-05, + "loss": 0.0352, + "step": 8711 + }, + { + "epoch": 6.860181173690429, + "grad_norm": 0.5360848903656006, + "learning_rate": 2.6118000000000003e-05, + "loss": 0.0286, + "step": 8712 + }, + { + "epoch": 6.860968885387948, + "grad_norm": 0.3495211899280548, + "learning_rate": 2.6121000000000003e-05, + "loss": 0.0162, + "step": 8713 + }, + { + "epoch": 6.861756597085467, + "grad_norm": 0.6099099516868591, + "learning_rate": 2.6124000000000003e-05, + "loss": 0.0383, + "step": 8714 + }, + { + "epoch": 6.862544308782986, + "grad_norm": 0.5674789547920227, + "learning_rate": 2.6127000000000002e-05, + "loss": 0.0298, + "step": 8715 + }, + { + "epoch": 6.863332020480504, + "grad_norm": 0.5369548797607422, + "learning_rate": 2.6130000000000002e-05, + "loss": 0.0296, + "step": 8716 + }, + { + "epoch": 6.864119732178023, + "grad_norm": 0.6708800196647644, + "learning_rate": 2.6133e-05, + "loss": 0.0363, + "step": 8717 + }, + { + "epoch": 6.864907443875541, + "grad_norm": 0.4211864173412323, + "learning_rate": 2.6136e-05, + "loss": 0.0187, + "step": 8718 + }, + { + "epoch": 6.865695155573061, + "grad_norm": 0.702405571937561, + "learning_rate": 2.6138999999999998e-05, + "loss": 0.0309, + "step": 8719 + }, + { + "epoch": 6.866482867270579, + "grad_norm": 0.3512851595878601, + "learning_rate": 2.6141999999999998e-05, + "loss": 0.0221, + "step": 8720 + }, + { + 
"epoch": 6.867270578968098, + "grad_norm": 1.3755265474319458, + "learning_rate": 2.6145e-05, + "loss": 0.3215, + "step": 8721 + }, + { + "epoch": 6.8680582906656165, + "grad_norm": 0.6197632551193237, + "learning_rate": 2.6148e-05, + "loss": 0.2326, + "step": 8722 + }, + { + "epoch": 6.868846002363135, + "grad_norm": 0.5988715887069702, + "learning_rate": 2.6151e-05, + "loss": 0.1176, + "step": 8723 + }, + { + "epoch": 6.869633714060654, + "grad_norm": 0.5544222593307495, + "learning_rate": 2.6154e-05, + "loss": 0.0826, + "step": 8724 + }, + { + "epoch": 6.870421425758172, + "grad_norm": 0.7580695152282715, + "learning_rate": 2.6157e-05, + "loss": 0.0913, + "step": 8725 + }, + { + "epoch": 6.871209137455692, + "grad_norm": 2.463040590286255, + "learning_rate": 2.616e-05, + "loss": 0.0912, + "step": 8726 + }, + { + "epoch": 6.87199684915321, + "grad_norm": 0.4525309205055237, + "learning_rate": 2.6163e-05, + "loss": 0.0347, + "step": 8727 + }, + { + "epoch": 6.872784560850729, + "grad_norm": 0.5068159103393555, + "learning_rate": 2.6166e-05, + "loss": 0.0684, + "step": 8728 + }, + { + "epoch": 6.873572272548247, + "grad_norm": 0.20590372383594513, + "learning_rate": 2.6169e-05, + "loss": 0.0148, + "step": 8729 + }, + { + "epoch": 6.874359984245766, + "grad_norm": 0.2526188790798187, + "learning_rate": 2.6172e-05, + "loss": 0.0197, + "step": 8730 + }, + { + "epoch": 6.8751476959432845, + "grad_norm": 0.19082830846309662, + "learning_rate": 2.6175000000000003e-05, + "loss": 0.0135, + "step": 8731 + }, + { + "epoch": 6.875935407640803, + "grad_norm": 0.3808480203151703, + "learning_rate": 2.6178000000000002e-05, + "loss": 0.0287, + "step": 8732 + }, + { + "epoch": 6.8767231193383225, + "grad_norm": 0.4938771724700928, + "learning_rate": 2.6181000000000002e-05, + "loss": 0.0324, + "step": 8733 + }, + { + "epoch": 6.877510831035841, + "grad_norm": 0.3342337906360626, + "learning_rate": 2.6184000000000002e-05, + "loss": 0.0157, + "step": 8734 + }, + { + "epoch": 6.87829854273336, + "grad_norm": 0.2596215605735779, + "learning_rate": 2.6187000000000002e-05, + "loss": 0.014, + "step": 8735 + }, + { + "epoch": 6.879086254430878, + "grad_norm": 0.3873337507247925, + "learning_rate": 2.619e-05, + "loss": 0.0283, + "step": 8736 + }, + { + "epoch": 6.879873966128397, + "grad_norm": 0.32350486516952515, + "learning_rate": 2.6193e-05, + "loss": 0.0287, + "step": 8737 + }, + { + "epoch": 6.880661677825916, + "grad_norm": 0.23269681632518768, + "learning_rate": 2.6196e-05, + "loss": 0.013, + "step": 8738 + }, + { + "epoch": 6.881449389523435, + "grad_norm": 0.241468146443367, + "learning_rate": 2.6199e-05, + "loss": 0.0174, + "step": 8739 + }, + { + "epoch": 6.882237101220953, + "grad_norm": 0.26241177320480347, + "learning_rate": 2.6202e-05, + "loss": 0.0296, + "step": 8740 + }, + { + "epoch": 6.883024812918472, + "grad_norm": 0.7748896479606628, + "learning_rate": 2.6205e-05, + "loss": 0.0241, + "step": 8741 + }, + { + "epoch": 6.8838125246159905, + "grad_norm": 0.2871212363243103, + "learning_rate": 2.6208e-05, + "loss": 0.014, + "step": 8742 + }, + { + "epoch": 6.884600236313509, + "grad_norm": 0.3560711443424225, + "learning_rate": 2.6211e-05, + "loss": 0.0175, + "step": 8743 + }, + { + "epoch": 6.885387948011028, + "grad_norm": 0.3779078722000122, + "learning_rate": 2.6214e-05, + "loss": 0.0225, + "step": 8744 + }, + { + "epoch": 6.886175659708547, + "grad_norm": 0.2988138794898987, + "learning_rate": 2.6217e-05, + "loss": 0.0218, + "step": 8745 + }, + { + "epoch": 6.886963371406066, + "grad_norm": 
0.20766353607177734, + "learning_rate": 2.622e-05, + "loss": 0.0143, + "step": 8746 + }, + { + "epoch": 6.887751083103584, + "grad_norm": 0.436580628156662, + "learning_rate": 2.6223e-05, + "loss": 0.0135, + "step": 8747 + }, + { + "epoch": 6.888538794801103, + "grad_norm": 0.22509999573230743, + "learning_rate": 2.6226e-05, + "loss": 0.0133, + "step": 8748 + }, + { + "epoch": 6.889326506498621, + "grad_norm": 0.3433630168437958, + "learning_rate": 2.6229e-05, + "loss": 0.0211, + "step": 8749 + }, + { + "epoch": 6.89011421819614, + "grad_norm": 0.5164903998374939, + "learning_rate": 2.6232e-05, + "loss": 0.0304, + "step": 8750 + }, + { + "epoch": 6.8909019298936585, + "grad_norm": 1.0884712934494019, + "learning_rate": 2.6235000000000002e-05, + "loss": 0.029, + "step": 8751 + }, + { + "epoch": 6.891689641591178, + "grad_norm": 0.23380860686302185, + "learning_rate": 2.6238000000000002e-05, + "loss": 0.01, + "step": 8752 + }, + { + "epoch": 6.8924773532886965, + "grad_norm": 1.2075018882751465, + "learning_rate": 2.6241e-05, + "loss": 0.0289, + "step": 8753 + }, + { + "epoch": 6.893265064986215, + "grad_norm": 0.7143729329109192, + "learning_rate": 2.6244e-05, + "loss": 0.0207, + "step": 8754 + }, + { + "epoch": 6.894052776683734, + "grad_norm": 1.086371898651123, + "learning_rate": 2.6247e-05, + "loss": 0.0253, + "step": 8755 + }, + { + "epoch": 6.894840488381252, + "grad_norm": 0.3840600252151489, + "learning_rate": 2.625e-05, + "loss": 0.0159, + "step": 8756 + }, + { + "epoch": 6.895628200078772, + "grad_norm": 0.5224735140800476, + "learning_rate": 2.6253e-05, + "loss": 0.0221, + "step": 8757 + }, + { + "epoch": 6.89641591177629, + "grad_norm": 1.3923192024230957, + "learning_rate": 2.6256e-05, + "loss": 0.0537, + "step": 8758 + }, + { + "epoch": 6.897203623473809, + "grad_norm": 0.3728491961956024, + "learning_rate": 2.6259e-05, + "loss": 0.0243, + "step": 8759 + }, + { + "epoch": 6.897991335171327, + "grad_norm": 0.46536219120025635, + "learning_rate": 2.6262e-05, + "loss": 0.0295, + "step": 8760 + }, + { + "epoch": 6.898779046868846, + "grad_norm": 1.2512086629867554, + "learning_rate": 2.6265e-05, + "loss": 0.0226, + "step": 8761 + }, + { + "epoch": 6.8995667585663645, + "grad_norm": 0.49757063388824463, + "learning_rate": 2.6268000000000003e-05, + "loss": 0.0351, + "step": 8762 + }, + { + "epoch": 6.900354470263883, + "grad_norm": 0.5271503925323486, + "learning_rate": 2.6271000000000003e-05, + "loss": 0.0201, + "step": 8763 + }, + { + "epoch": 6.9011421819614025, + "grad_norm": 0.4722538888454437, + "learning_rate": 2.6274000000000003e-05, + "loss": 0.0355, + "step": 8764 + }, + { + "epoch": 6.901929893658921, + "grad_norm": 0.3685853183269501, + "learning_rate": 2.6277000000000003e-05, + "loss": 0.0182, + "step": 8765 + }, + { + "epoch": 6.90271760535644, + "grad_norm": 0.7186920046806335, + "learning_rate": 2.628e-05, + "loss": 0.0226, + "step": 8766 + }, + { + "epoch": 6.903505317053958, + "grad_norm": 0.479366272687912, + "learning_rate": 2.6283e-05, + "loss": 0.0294, + "step": 8767 + }, + { + "epoch": 6.904293028751477, + "grad_norm": 0.5418209433555603, + "learning_rate": 2.6286e-05, + "loss": 0.031, + "step": 8768 + }, + { + "epoch": 6.905080740448995, + "grad_norm": 0.8411550521850586, + "learning_rate": 2.6289e-05, + "loss": 0.0357, + "step": 8769 + }, + { + "epoch": 6.905868452146514, + "grad_norm": 0.5672304630279541, + "learning_rate": 2.6292e-05, + "loss": 0.0277, + "step": 8770 + }, + { + "epoch": 6.906656163844033, + "grad_norm": 0.9773092269897461, + 
"learning_rate": 2.6294999999999998e-05, + "loss": 0.3192, + "step": 8771 + }, + { + "epoch": 6.907443875541552, + "grad_norm": 0.5743794441223145, + "learning_rate": 2.6298e-05, + "loss": 0.1392, + "step": 8772 + }, + { + "epoch": 6.9082315872390705, + "grad_norm": 0.8234010338783264, + "learning_rate": 2.6301e-05, + "loss": 0.216, + "step": 8773 + }, + { + "epoch": 6.909019298936589, + "grad_norm": 0.5944494009017944, + "learning_rate": 2.6304e-05, + "loss": 0.1278, + "step": 8774 + }, + { + "epoch": 6.909807010634108, + "grad_norm": 0.5891231298446655, + "learning_rate": 2.6307e-05, + "loss": 0.1021, + "step": 8775 + }, + { + "epoch": 6.910594722331627, + "grad_norm": 0.7408989667892456, + "learning_rate": 2.631e-05, + "loss": 0.1702, + "step": 8776 + }, + { + "epoch": 6.911382434029146, + "grad_norm": 0.4822353422641754, + "learning_rate": 2.6313e-05, + "loss": 0.0585, + "step": 8777 + }, + { + "epoch": 6.912170145726664, + "grad_norm": 0.28489673137664795, + "learning_rate": 2.6316e-05, + "loss": 0.019, + "step": 8778 + }, + { + "epoch": 6.912957857424183, + "grad_norm": 0.30988746881484985, + "learning_rate": 2.6319e-05, + "loss": 0.0232, + "step": 8779 + }, + { + "epoch": 6.913745569121701, + "grad_norm": 0.29270192980766296, + "learning_rate": 2.6322e-05, + "loss": 0.0237, + "step": 8780 + }, + { + "epoch": 6.91453328081922, + "grad_norm": 0.5038040280342102, + "learning_rate": 2.6325e-05, + "loss": 0.0261, + "step": 8781 + }, + { + "epoch": 6.9153209925167385, + "grad_norm": 0.40586623549461365, + "learning_rate": 2.6328000000000003e-05, + "loss": 0.0231, + "step": 8782 + }, + { + "epoch": 6.916108704214258, + "grad_norm": 0.6632503867149353, + "learning_rate": 2.6331000000000003e-05, + "loss": 0.0303, + "step": 8783 + }, + { + "epoch": 6.9168964159117765, + "grad_norm": 0.24141602218151093, + "learning_rate": 2.6334000000000002e-05, + "loss": 0.02, + "step": 8784 + }, + { + "epoch": 6.917684127609295, + "grad_norm": 0.3972438871860504, + "learning_rate": 2.6337000000000002e-05, + "loss": 0.0231, + "step": 8785 + }, + { + "epoch": 6.918471839306814, + "grad_norm": 0.4748702049255371, + "learning_rate": 2.6340000000000002e-05, + "loss": 0.0201, + "step": 8786 + }, + { + "epoch": 6.919259551004332, + "grad_norm": 0.4439651370048523, + "learning_rate": 2.6343000000000002e-05, + "loss": 0.0315, + "step": 8787 + }, + { + "epoch": 6.920047262701851, + "grad_norm": 0.7873032689094543, + "learning_rate": 2.6346e-05, + "loss": 0.0217, + "step": 8788 + }, + { + "epoch": 6.920834974399369, + "grad_norm": 0.5218839645385742, + "learning_rate": 2.6349e-05, + "loss": 0.0229, + "step": 8789 + }, + { + "epoch": 6.921622686096889, + "grad_norm": 0.25690698623657227, + "learning_rate": 2.6351999999999998e-05, + "loss": 0.0144, + "step": 8790 + }, + { + "epoch": 6.922410397794407, + "grad_norm": 0.29999732971191406, + "learning_rate": 2.6354999999999998e-05, + "loss": 0.0197, + "step": 8791 + }, + { + "epoch": 6.923198109491926, + "grad_norm": 0.4967188835144043, + "learning_rate": 2.6358e-05, + "loss": 0.0265, + "step": 8792 + }, + { + "epoch": 6.9239858211894445, + "grad_norm": 0.47188258171081543, + "learning_rate": 2.6361e-05, + "loss": 0.0155, + "step": 8793 + }, + { + "epoch": 6.924773532886963, + "grad_norm": 0.33098384737968445, + "learning_rate": 2.6364e-05, + "loss": 0.0208, + "step": 8794 + }, + { + "epoch": 6.9255612445844825, + "grad_norm": 0.41832366585731506, + "learning_rate": 2.6367e-05, + "loss": 0.0341, + "step": 8795 + }, + { + "epoch": 6.926348956282001, + "grad_norm": 
0.6475971341133118, + "learning_rate": 2.637e-05, + "loss": 0.0304, + "step": 8796 + }, + { + "epoch": 6.92713666797952, + "grad_norm": 0.3733854591846466, + "learning_rate": 2.6373e-05, + "loss": 0.0188, + "step": 8797 + }, + { + "epoch": 6.927924379677038, + "grad_norm": 0.4526413381099701, + "learning_rate": 2.6376e-05, + "loss": 0.0353, + "step": 8798 + }, + { + "epoch": 6.928712091374557, + "grad_norm": 0.7982474565505981, + "learning_rate": 2.6379e-05, + "loss": 0.0246, + "step": 8799 + }, + { + "epoch": 6.929499803072075, + "grad_norm": 0.21594476699829102, + "learning_rate": 2.6382e-05, + "loss": 0.0117, + "step": 8800 + }, + { + "epoch": 6.930287514769594, + "grad_norm": 0.4392867684364319, + "learning_rate": 2.6385e-05, + "loss": 0.0234, + "step": 8801 + }, + { + "epoch": 6.931075226467113, + "grad_norm": 0.6865858435630798, + "learning_rate": 2.6388000000000002e-05, + "loss": 0.0392, + "step": 8802 + }, + { + "epoch": 6.931862938164632, + "grad_norm": 0.5799501538276672, + "learning_rate": 2.6391000000000002e-05, + "loss": 0.0368, + "step": 8803 + }, + { + "epoch": 6.9326506498621505, + "grad_norm": 0.8295812606811523, + "learning_rate": 2.6394000000000002e-05, + "loss": 0.0181, + "step": 8804 + }, + { + "epoch": 6.933438361559669, + "grad_norm": 0.4357761740684509, + "learning_rate": 2.6397e-05, + "loss": 0.0311, + "step": 8805 + }, + { + "epoch": 6.934226073257188, + "grad_norm": 0.1834196299314499, + "learning_rate": 2.64e-05, + "loss": 0.015, + "step": 8806 + }, + { + "epoch": 6.935013784954706, + "grad_norm": 0.38843846321105957, + "learning_rate": 2.6403e-05, + "loss": 0.0233, + "step": 8807 + }, + { + "epoch": 6.935801496652226, + "grad_norm": 0.6505658030509949, + "learning_rate": 2.6406e-05, + "loss": 0.033, + "step": 8808 + }, + { + "epoch": 6.936589208349744, + "grad_norm": 0.514893114566803, + "learning_rate": 2.6409e-05, + "loss": 0.0245, + "step": 8809 + }, + { + "epoch": 6.937376920047263, + "grad_norm": 0.2828955054283142, + "learning_rate": 2.6412e-05, + "loss": 0.0138, + "step": 8810 + }, + { + "epoch": 6.938164631744781, + "grad_norm": 0.44026610255241394, + "learning_rate": 2.6415e-05, + "loss": 0.0346, + "step": 8811 + }, + { + "epoch": 6.9389523434423, + "grad_norm": 0.5152646899223328, + "learning_rate": 2.6418000000000004e-05, + "loss": 0.0286, + "step": 8812 + }, + { + "epoch": 6.9397400551398185, + "grad_norm": 0.45417657494544983, + "learning_rate": 2.6421000000000003e-05, + "loss": 0.0183, + "step": 8813 + }, + { + "epoch": 6.940527766837338, + "grad_norm": 0.5378008484840393, + "learning_rate": 2.6424000000000003e-05, + "loss": 0.0227, + "step": 8814 + }, + { + "epoch": 6.9413154785348565, + "grad_norm": 0.48766395449638367, + "learning_rate": 2.6427e-05, + "loss": 0.0251, + "step": 8815 + }, + { + "epoch": 6.942103190232375, + "grad_norm": 0.6467370986938477, + "learning_rate": 2.643e-05, + "loss": 0.0236, + "step": 8816 + }, + { + "epoch": 6.942890901929894, + "grad_norm": 0.43661171197891235, + "learning_rate": 2.6433e-05, + "loss": 0.0237, + "step": 8817 + }, + { + "epoch": 6.943678613627412, + "grad_norm": 0.41530266404151917, + "learning_rate": 2.6436e-05, + "loss": 0.0199, + "step": 8818 + }, + { + "epoch": 6.944466325324931, + "grad_norm": 0.6528154015541077, + "learning_rate": 2.6439e-05, + "loss": 0.0449, + "step": 8819 + }, + { + "epoch": 6.945254037022449, + "grad_norm": 0.4626886546611786, + "learning_rate": 2.6442e-05, + "loss": 0.0299, + "step": 8820 + }, + { + "epoch": 6.946041748719969, + "grad_norm": 1.108961820602417, + 
"learning_rate": 2.6445e-05, + "loss": 0.3106, + "step": 8821 + }, + { + "epoch": 6.946829460417487, + "grad_norm": 0.7407185435295105, + "learning_rate": 2.6448e-05, + "loss": 0.1849, + "step": 8822 + }, + { + "epoch": 6.947617172115006, + "grad_norm": 0.706218957901001, + "learning_rate": 2.6451e-05, + "loss": 0.2181, + "step": 8823 + }, + { + "epoch": 6.9484048838125245, + "grad_norm": 0.8490694165229797, + "learning_rate": 2.6454e-05, + "loss": 0.151, + "step": 8824 + }, + { + "epoch": 6.949192595510043, + "grad_norm": 1.4479329586029053, + "learning_rate": 2.6457e-05, + "loss": 0.1953, + "step": 8825 + }, + { + "epoch": 6.949980307207562, + "grad_norm": 0.6106417775154114, + "learning_rate": 2.646e-05, + "loss": 0.1121, + "step": 8826 + }, + { + "epoch": 6.950768018905081, + "grad_norm": 0.5572705864906311, + "learning_rate": 2.6463e-05, + "loss": 0.0633, + "step": 8827 + }, + { + "epoch": 6.9515557306026, + "grad_norm": 0.325728178024292, + "learning_rate": 2.6466e-05, + "loss": 0.0349, + "step": 8828 + }, + { + "epoch": 6.952343442300118, + "grad_norm": 0.30949804186820984, + "learning_rate": 2.6469e-05, + "loss": 0.0239, + "step": 8829 + }, + { + "epoch": 6.953131153997637, + "grad_norm": 0.3003718852996826, + "learning_rate": 2.6472e-05, + "loss": 0.0226, + "step": 8830 + }, + { + "epoch": 6.953918865695155, + "grad_norm": 0.4035646617412567, + "learning_rate": 2.6475e-05, + "loss": 0.0355, + "step": 8831 + }, + { + "epoch": 6.954706577392674, + "grad_norm": 0.28152576088905334, + "learning_rate": 2.6478000000000003e-05, + "loss": 0.0275, + "step": 8832 + }, + { + "epoch": 6.955494289090193, + "grad_norm": 0.34612470865249634, + "learning_rate": 2.6481000000000003e-05, + "loss": 0.017, + "step": 8833 + }, + { + "epoch": 6.956282000787712, + "grad_norm": 0.22787141799926758, + "learning_rate": 2.6484000000000003e-05, + "loss": 0.0162, + "step": 8834 + }, + { + "epoch": 6.9570697124852305, + "grad_norm": 0.43813782930374146, + "learning_rate": 2.6487000000000002e-05, + "loss": 0.0155, + "step": 8835 + }, + { + "epoch": 6.957857424182749, + "grad_norm": 0.39769604802131653, + "learning_rate": 2.6490000000000002e-05, + "loss": 0.02, + "step": 8836 + }, + { + "epoch": 6.958645135880268, + "grad_norm": 0.22946923971176147, + "learning_rate": 2.6493000000000002e-05, + "loss": 0.0167, + "step": 8837 + }, + { + "epoch": 6.959432847577786, + "grad_norm": 0.36745089292526245, + "learning_rate": 2.6496000000000002e-05, + "loss": 0.0176, + "step": 8838 + }, + { + "epoch": 6.960220559275305, + "grad_norm": 0.2670697569847107, + "learning_rate": 2.6499e-05, + "loss": 0.0134, + "step": 8839 + }, + { + "epoch": 6.961008270972824, + "grad_norm": 0.315774142742157, + "learning_rate": 2.6501999999999998e-05, + "loss": 0.0126, + "step": 8840 + }, + { + "epoch": 6.961795982670343, + "grad_norm": 0.4539404809474945, + "learning_rate": 2.6504999999999998e-05, + "loss": 0.0243, + "step": 8841 + }, + { + "epoch": 6.962583694367861, + "grad_norm": 0.22565358877182007, + "learning_rate": 2.6508e-05, + "loss": 0.0144, + "step": 8842 + }, + { + "epoch": 6.96337140606538, + "grad_norm": 0.5222300887107849, + "learning_rate": 2.6511e-05, + "loss": 0.0306, + "step": 8843 + }, + { + "epoch": 6.9641591177628985, + "grad_norm": 0.5166059732437134, + "learning_rate": 2.6514e-05, + "loss": 0.03, + "step": 8844 + }, + { + "epoch": 6.964946829460417, + "grad_norm": 0.5290549397468567, + "learning_rate": 2.6517e-05, + "loss": 0.0266, + "step": 8845 + }, + { + "epoch": 6.9657345411579366, + "grad_norm": 
0.6949226260185242, + "learning_rate": 2.652e-05, + "loss": 0.0255, + "step": 8846 + }, + { + "epoch": 6.966522252855455, + "grad_norm": 0.6777223348617554, + "learning_rate": 2.6523e-05, + "loss": 0.037, + "step": 8847 + }, + { + "epoch": 6.967309964552974, + "grad_norm": 0.4338792562484741, + "learning_rate": 2.6526e-05, + "loss": 0.0275, + "step": 8848 + }, + { + "epoch": 6.968097676250492, + "grad_norm": 0.24579089879989624, + "learning_rate": 2.6529e-05, + "loss": 0.0183, + "step": 8849 + }, + { + "epoch": 6.968885387948011, + "grad_norm": 0.4929283559322357, + "learning_rate": 2.6532e-05, + "loss": 0.0291, + "step": 8850 + }, + { + "epoch": 6.969673099645529, + "grad_norm": 0.29070648550987244, + "learning_rate": 2.6535e-05, + "loss": 0.0207, + "step": 8851 + }, + { + "epoch": 6.970460811343049, + "grad_norm": 0.33716732263565063, + "learning_rate": 2.6538000000000002e-05, + "loss": 0.0151, + "step": 8852 + }, + { + "epoch": 6.971248523040567, + "grad_norm": 0.49038779735565186, + "learning_rate": 2.6541000000000002e-05, + "loss": 0.025, + "step": 8853 + }, + { + "epoch": 6.972036234738086, + "grad_norm": 0.35841110348701477, + "learning_rate": 2.6544000000000002e-05, + "loss": 0.0278, + "step": 8854 + }, + { + "epoch": 6.9728239464356045, + "grad_norm": 0.4321022033691406, + "learning_rate": 2.6547000000000002e-05, + "loss": 0.0358, + "step": 8855 + }, + { + "epoch": 6.973611658133123, + "grad_norm": 0.8093224763870239, + "learning_rate": 2.655e-05, + "loss": 0.0262, + "step": 8856 + }, + { + "epoch": 6.974399369830642, + "grad_norm": 0.6968702077865601, + "learning_rate": 2.6553e-05, + "loss": 0.027, + "step": 8857 + }, + { + "epoch": 6.97518708152816, + "grad_norm": 0.37501779198646545, + "learning_rate": 2.6556e-05, + "loss": 0.0255, + "step": 8858 + }, + { + "epoch": 6.97597479322568, + "grad_norm": 0.3646599352359772, + "learning_rate": 2.6559e-05, + "loss": 0.0231, + "step": 8859 + }, + { + "epoch": 6.976762504923198, + "grad_norm": 0.44488877058029175, + "learning_rate": 2.6562e-05, + "loss": 0.0275, + "step": 8860 + }, + { + "epoch": 6.977550216620717, + "grad_norm": 0.39402228593826294, + "learning_rate": 2.6565e-05, + "loss": 0.0195, + "step": 8861 + }, + { + "epoch": 6.978337928318235, + "grad_norm": 0.6341093182563782, + "learning_rate": 2.6568000000000004e-05, + "loss": 0.0301, + "step": 8862 + }, + { + "epoch": 6.979125640015754, + "grad_norm": 0.4298066198825836, + "learning_rate": 2.6571000000000004e-05, + "loss": 0.0325, + "step": 8863 + }, + { + "epoch": 6.979913351713273, + "grad_norm": 0.38136163353919983, + "learning_rate": 2.6574e-05, + "loss": 0.029, + "step": 8864 + }, + { + "epoch": 6.980701063410792, + "grad_norm": 0.48309126496315, + "learning_rate": 2.6577e-05, + "loss": 0.0297, + "step": 8865 + }, + { + "epoch": 6.981488775108311, + "grad_norm": 0.524287223815918, + "learning_rate": 2.658e-05, + "loss": 0.0239, + "step": 8866 + }, + { + "epoch": 6.982276486805829, + "grad_norm": 0.4756283760070801, + "learning_rate": 2.6583e-05, + "loss": 0.0267, + "step": 8867 + }, + { + "epoch": 6.983064198503348, + "grad_norm": 1.001167893409729, + "learning_rate": 2.6586e-05, + "loss": 0.0331, + "step": 8868 + }, + { + "epoch": 6.983851910200866, + "grad_norm": 0.30260613560676575, + "learning_rate": 2.6589e-05, + "loss": 0.0216, + "step": 8869 + }, + { + "epoch": 6.984639621898385, + "grad_norm": 0.8797411918640137, + "learning_rate": 2.6592e-05, + "loss": 0.034, + "step": 8870 + }, + { + "epoch": 6.985427333595904, + "grad_norm": 0.9055960178375244, + 
"learning_rate": 2.6595e-05, + "loss": 0.2196, + "step": 8871 + }, + { + "epoch": 6.986215045293423, + "grad_norm": 0.6149003505706787, + "learning_rate": 2.6598000000000002e-05, + "loss": 0.126, + "step": 8872 + }, + { + "epoch": 6.987002756990941, + "grad_norm": 0.28517234325408936, + "learning_rate": 2.6601e-05, + "loss": 0.0199, + "step": 8873 + }, + { + "epoch": 6.98779046868846, + "grad_norm": 0.3384609818458557, + "learning_rate": 2.6604e-05, + "loss": 0.0272, + "step": 8874 + }, + { + "epoch": 6.9885781803859786, + "grad_norm": 0.28729432821273804, + "learning_rate": 2.6607e-05, + "loss": 0.02, + "step": 8875 + }, + { + "epoch": 6.989365892083497, + "grad_norm": 0.4007951021194458, + "learning_rate": 2.661e-05, + "loss": 0.013, + "step": 8876 + }, + { + "epoch": 6.990153603781016, + "grad_norm": 0.5396923422813416, + "learning_rate": 2.6613e-05, + "loss": 0.0188, + "step": 8877 + }, + { + "epoch": 6.990941315478535, + "grad_norm": 0.4116380214691162, + "learning_rate": 2.6616e-05, + "loss": 0.027, + "step": 8878 + }, + { + "epoch": 6.991729027176054, + "grad_norm": 0.5387341380119324, + "learning_rate": 2.6619e-05, + "loss": 0.0342, + "step": 8879 + }, + { + "epoch": 6.992516738873572, + "grad_norm": 0.3570578396320343, + "learning_rate": 2.6622e-05, + "loss": 0.0285, + "step": 8880 + }, + { + "epoch": 6.993304450571091, + "grad_norm": 0.9136254787445068, + "learning_rate": 2.6625e-05, + "loss": 0.021, + "step": 8881 + }, + { + "epoch": 6.994092162268609, + "grad_norm": 0.6713568568229675, + "learning_rate": 2.6628e-05, + "loss": 0.0279, + "step": 8882 + }, + { + "epoch": 6.994879873966129, + "grad_norm": 1.0676121711730957, + "learning_rate": 2.6631000000000003e-05, + "loss": 0.0213, + "step": 8883 + }, + { + "epoch": 6.995667585663647, + "grad_norm": 0.6425487399101257, + "learning_rate": 2.6634000000000003e-05, + "loss": 0.0285, + "step": 8884 + }, + { + "epoch": 6.996455297361166, + "grad_norm": 0.3131065368652344, + "learning_rate": 2.6637000000000003e-05, + "loss": 0.0207, + "step": 8885 + }, + { + "epoch": 6.997243009058685, + "grad_norm": 0.43675678968429565, + "learning_rate": 2.6640000000000002e-05, + "loss": 0.0156, + "step": 8886 + }, + { + "epoch": 6.998030720756203, + "grad_norm": 0.5161388516426086, + "learning_rate": 2.6643000000000002e-05, + "loss": 0.0225, + "step": 8887 + }, + { + "epoch": 6.998818432453722, + "grad_norm": 0.4142833352088928, + "learning_rate": 2.6646000000000002e-05, + "loss": 0.0142, + "step": 8888 + }, + { + "epoch": 6.99960614415124, + "grad_norm": 0.4293050765991211, + "learning_rate": 2.6649e-05, + "loss": 0.0327, + "step": 8889 + }, + { + "epoch": 7.0, + "grad_norm": 0.3406226634979248, + "learning_rate": 2.6651999999999998e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 7.000787711697519, + "grad_norm": 1.908300757408142, + "learning_rate": 2.6654999999999998e-05, + "loss": 0.3051, + "step": 8891 + }, + { + "epoch": 7.001575423395037, + "grad_norm": 0.6797512769699097, + "learning_rate": 2.6657999999999998e-05, + "loss": 0.1492, + "step": 8892 + }, + { + "epoch": 7.002363135092556, + "grad_norm": 1.1685974597930908, + "learning_rate": 2.6661e-05, + "loss": 0.1757, + "step": 8893 + }, + { + "epoch": 7.003150846790075, + "grad_norm": 0.7020420432090759, + "learning_rate": 2.6664e-05, + "loss": 0.1101, + "step": 8894 + }, + { + "epoch": 7.003938558487594, + "grad_norm": 0.9637179374694824, + "learning_rate": 2.6667e-05, + "loss": 0.1066, + "step": 8895 + }, + { + "epoch": 7.004726270185112, + "grad_norm": 0.6759446859359741, + 
"learning_rate": 2.667e-05, + "loss": 0.0413, + "step": 8896 + }, + { + "epoch": 7.005513981882631, + "grad_norm": 0.28682807087898254, + "learning_rate": 2.6673e-05, + "loss": 0.0357, + "step": 8897 + }, + { + "epoch": 7.006301693580149, + "grad_norm": 0.2597202956676483, + "learning_rate": 2.6676e-05, + "loss": 0.0223, + "step": 8898 + }, + { + "epoch": 7.007089405277668, + "grad_norm": 0.34866827726364136, + "learning_rate": 2.6679e-05, + "loss": 0.0201, + "step": 8899 + }, + { + "epoch": 7.0078771169751874, + "grad_norm": 0.270492821931839, + "learning_rate": 2.6682e-05, + "loss": 0.0266, + "step": 8900 + }, + { + "epoch": 7.008664828672706, + "grad_norm": 0.40362340211868286, + "learning_rate": 2.6685e-05, + "loss": 0.0233, + "step": 8901 + }, + { + "epoch": 7.009452540370225, + "grad_norm": 0.5170448422431946, + "learning_rate": 2.6688e-05, + "loss": 0.0224, + "step": 8902 + }, + { + "epoch": 7.010240252067743, + "grad_norm": 0.32831308245658875, + "learning_rate": 2.6691000000000002e-05, + "loss": 0.0224, + "step": 8903 + }, + { + "epoch": 7.011027963765262, + "grad_norm": 0.2539427578449249, + "learning_rate": 2.6694000000000002e-05, + "loss": 0.0153, + "step": 8904 + }, + { + "epoch": 7.01181567546278, + "grad_norm": 0.3339586555957794, + "learning_rate": 2.6697000000000002e-05, + "loss": 0.0158, + "step": 8905 + }, + { + "epoch": 7.0126033871603, + "grad_norm": 0.2578681409358978, + "learning_rate": 2.6700000000000002e-05, + "loss": 0.0307, + "step": 8906 + }, + { + "epoch": 7.013391098857818, + "grad_norm": 0.19901172816753387, + "learning_rate": 2.6703e-05, + "loss": 0.0141, + "step": 8907 + }, + { + "epoch": 7.014178810555337, + "grad_norm": 0.3542712330818176, + "learning_rate": 2.6706e-05, + "loss": 0.0181, + "step": 8908 + }, + { + "epoch": 7.014966522252855, + "grad_norm": 0.2969875931739807, + "learning_rate": 2.6709e-05, + "loss": 0.0204, + "step": 8909 + }, + { + "epoch": 7.015754233950374, + "grad_norm": 0.31509119272232056, + "learning_rate": 2.6712e-05, + "loss": 0.0129, + "step": 8910 + }, + { + "epoch": 7.016541945647893, + "grad_norm": 0.3205234706401825, + "learning_rate": 2.6715e-05, + "loss": 0.0171, + "step": 8911 + }, + { + "epoch": 7.017329657345411, + "grad_norm": 0.21532565355300903, + "learning_rate": 2.6718e-05, + "loss": 0.0088, + "step": 8912 + }, + { + "epoch": 7.018117369042931, + "grad_norm": 0.27600350975990295, + "learning_rate": 2.6721e-05, + "loss": 0.0179, + "step": 8913 + }, + { + "epoch": 7.018905080740449, + "grad_norm": 0.41006237268447876, + "learning_rate": 2.6724e-05, + "loss": 0.0145, + "step": 8914 + }, + { + "epoch": 7.019692792437968, + "grad_norm": 0.3845527768135071, + "learning_rate": 2.6727e-05, + "loss": 0.0184, + "step": 8915 + }, + { + "epoch": 7.020480504135486, + "grad_norm": 0.4072458744049072, + "learning_rate": 2.673e-05, + "loss": 0.0191, + "step": 8916 + }, + { + "epoch": 7.021268215833005, + "grad_norm": 0.3527422845363617, + "learning_rate": 2.6733e-05, + "loss": 0.0214, + "step": 8917 + }, + { + "epoch": 7.022055927530523, + "grad_norm": 0.27080997824668884, + "learning_rate": 2.6736e-05, + "loss": 0.0132, + "step": 8918 + }, + { + "epoch": 7.022843639228043, + "grad_norm": 0.4550851285457611, + "learning_rate": 2.6739e-05, + "loss": 0.0127, + "step": 8919 + }, + { + "epoch": 7.0236313509255615, + "grad_norm": 0.6714853048324585, + "learning_rate": 2.6742e-05, + "loss": 0.032, + "step": 8920 + }, + { + "epoch": 7.02441906262308, + "grad_norm": 0.31453296542167664, + "learning_rate": 2.6745e-05, + "loss": 0.0151, + 
"step": 8921 + }, + { + "epoch": 7.025206774320599, + "grad_norm": 0.5811120271682739, + "learning_rate": 2.6748e-05, + "loss": 0.0256, + "step": 8922 + }, + { + "epoch": 7.025994486018117, + "grad_norm": 0.4438822567462921, + "learning_rate": 2.6751000000000002e-05, + "loss": 0.0172, + "step": 8923 + }, + { + "epoch": 7.026782197715636, + "grad_norm": 0.5507035255432129, + "learning_rate": 2.6754e-05, + "loss": 0.0226, + "step": 8924 + }, + { + "epoch": 7.027569909413155, + "grad_norm": 0.24456430971622467, + "learning_rate": 2.6757e-05, + "loss": 0.0106, + "step": 8925 + }, + { + "epoch": 7.028357621110674, + "grad_norm": 0.47915172576904297, + "learning_rate": 2.676e-05, + "loss": 0.0181, + "step": 8926 + }, + { + "epoch": 7.029145332808192, + "grad_norm": 0.9841799139976501, + "learning_rate": 2.6763e-05, + "loss": 0.0262, + "step": 8927 + }, + { + "epoch": 7.029933044505711, + "grad_norm": 0.4866551160812378, + "learning_rate": 2.6766e-05, + "loss": 0.018, + "step": 8928 + }, + { + "epoch": 7.0307207562032294, + "grad_norm": 0.5223598480224609, + "learning_rate": 2.6769e-05, + "loss": 0.0179, + "step": 8929 + }, + { + "epoch": 7.031508467900748, + "grad_norm": 0.24352088570594788, + "learning_rate": 2.6772e-05, + "loss": 0.0125, + "step": 8930 + }, + { + "epoch": 7.0322961795982675, + "grad_norm": 0.352948933839798, + "learning_rate": 2.6775e-05, + "loss": 0.0155, + "step": 8931 + }, + { + "epoch": 7.033083891295786, + "grad_norm": 0.7929678559303284, + "learning_rate": 2.6778e-05, + "loss": 0.0298, + "step": 8932 + }, + { + "epoch": 7.033871602993305, + "grad_norm": 0.5788719654083252, + "learning_rate": 2.6781000000000003e-05, + "loss": 0.0259, + "step": 8933 + }, + { + "epoch": 7.034659314690823, + "grad_norm": 0.3140774667263031, + "learning_rate": 2.6784000000000003e-05, + "loss": 0.0179, + "step": 8934 + }, + { + "epoch": 7.035447026388342, + "grad_norm": 0.3777981996536255, + "learning_rate": 2.6787000000000003e-05, + "loss": 0.0193, + "step": 8935 + }, + { + "epoch": 7.03623473808586, + "grad_norm": 0.29450440406799316, + "learning_rate": 2.6790000000000003e-05, + "loss": 0.0201, + "step": 8936 + }, + { + "epoch": 7.037022449783379, + "grad_norm": 0.26669347286224365, + "learning_rate": 2.6793000000000002e-05, + "loss": 0.0138, + "step": 8937 + }, + { + "epoch": 7.037810161480898, + "grad_norm": 0.30888354778289795, + "learning_rate": 2.6796e-05, + "loss": 0.0193, + "step": 8938 + }, + { + "epoch": 7.038597873178417, + "grad_norm": 0.7685030102729797, + "learning_rate": 2.6799e-05, + "loss": 0.0324, + "step": 8939 + }, + { + "epoch": 7.0393855848759355, + "grad_norm": 0.8858140110969543, + "learning_rate": 2.6802e-05, + "loss": 0.0171, + "step": 8940 + }, + { + "epoch": 7.040173296573454, + "grad_norm": 2.597757577896118, + "learning_rate": 2.6805e-05, + "loss": 0.3712, + "step": 8941 + }, + { + "epoch": 7.040961008270973, + "grad_norm": 1.0037493705749512, + "learning_rate": 2.6807999999999998e-05, + "loss": 0.2195, + "step": 8942 + }, + { + "epoch": 7.041748719968491, + "grad_norm": 0.5737964510917664, + "learning_rate": 2.6811e-05, + "loss": 0.1563, + "step": 8943 + }, + { + "epoch": 7.042536431666011, + "grad_norm": 0.6193711757659912, + "learning_rate": 2.6814e-05, + "loss": 0.1053, + "step": 8944 + }, + { + "epoch": 7.043324143363529, + "grad_norm": 0.4090132415294647, + "learning_rate": 2.6817e-05, + "loss": 0.0778, + "step": 8945 + }, + { + "epoch": 7.044111855061048, + "grad_norm": 0.5546074509620667, + "learning_rate": 2.682e-05, + "loss": 0.0675, + "step": 8946 + }, 
+ { + "epoch": 7.044899566758566, + "grad_norm": 0.3286011517047882, + "learning_rate": 2.6823e-05, + "loss": 0.0349, + "step": 8947 + }, + { + "epoch": 7.045687278456085, + "grad_norm": 0.2607794404029846, + "learning_rate": 2.6826e-05, + "loss": 0.0223, + "step": 8948 + }, + { + "epoch": 7.0464749901536035, + "grad_norm": 1.3292820453643799, + "learning_rate": 2.6829e-05, + "loss": 0.0782, + "step": 8949 + }, + { + "epoch": 7.047262701851123, + "grad_norm": 0.38891392946243286, + "learning_rate": 2.6832e-05, + "loss": 0.0305, + "step": 8950 + }, + { + "epoch": 7.0480504135486415, + "grad_norm": 0.4917014539241791, + "learning_rate": 2.6835e-05, + "loss": 0.0203, + "step": 8951 + }, + { + "epoch": 7.04883812524616, + "grad_norm": 0.37963685393333435, + "learning_rate": 2.6838e-05, + "loss": 0.0307, + "step": 8952 + }, + { + "epoch": 7.049625836943679, + "grad_norm": 0.4282228648662567, + "learning_rate": 2.6841000000000003e-05, + "loss": 0.0174, + "step": 8953 + }, + { + "epoch": 7.050413548641197, + "grad_norm": 2.598198890686035, + "learning_rate": 2.6844000000000003e-05, + "loss": 0.0216, + "step": 8954 + }, + { + "epoch": 7.051201260338716, + "grad_norm": 0.7419038414955139, + "learning_rate": 2.6847000000000002e-05, + "loss": 0.0162, + "step": 8955 + }, + { + "epoch": 7.051988972036234, + "grad_norm": 0.6321263313293457, + "learning_rate": 2.6850000000000002e-05, + "loss": 0.0247, + "step": 8956 + }, + { + "epoch": 7.052776683733754, + "grad_norm": 0.5065214037895203, + "learning_rate": 2.6853000000000002e-05, + "loss": 0.0188, + "step": 8957 + }, + { + "epoch": 7.053564395431272, + "grad_norm": 0.34397855401039124, + "learning_rate": 2.6856000000000002e-05, + "loss": 0.0202, + "step": 8958 + }, + { + "epoch": 7.054352107128791, + "grad_norm": 0.3619663715362549, + "learning_rate": 2.6859e-05, + "loss": 0.0168, + "step": 8959 + }, + { + "epoch": 7.0551398188263095, + "grad_norm": 0.4526852071285248, + "learning_rate": 2.6862e-05, + "loss": 0.0298, + "step": 8960 + }, + { + "epoch": 7.055927530523828, + "grad_norm": 0.19209441542625427, + "learning_rate": 2.6865e-05, + "loss": 0.0129, + "step": 8961 + }, + { + "epoch": 7.056715242221347, + "grad_norm": 0.38955003023147583, + "learning_rate": 2.6867999999999998e-05, + "loss": 0.0117, + "step": 8962 + }, + { + "epoch": 7.057502953918866, + "grad_norm": 0.5086455941200256, + "learning_rate": 2.6871e-05, + "loss": 0.0304, + "step": 8963 + }, + { + "epoch": 7.058290665616385, + "grad_norm": 0.3429199755191803, + "learning_rate": 2.6874e-05, + "loss": 0.0162, + "step": 8964 + }, + { + "epoch": 7.059078377313903, + "grad_norm": 0.4707707464694977, + "learning_rate": 2.6877e-05, + "loss": 0.0183, + "step": 8965 + }, + { + "epoch": 7.059866089011422, + "grad_norm": 0.4384336769580841, + "learning_rate": 2.688e-05, + "loss": 0.0213, + "step": 8966 + }, + { + "epoch": 7.06065380070894, + "grad_norm": 0.6369316577911377, + "learning_rate": 2.6883e-05, + "loss": 0.0762, + "step": 8967 + }, + { + "epoch": 7.061441512406459, + "grad_norm": 0.333003431558609, + "learning_rate": 2.6886e-05, + "loss": 0.0196, + "step": 8968 + }, + { + "epoch": 7.062229224103978, + "grad_norm": 0.5213468670845032, + "learning_rate": 2.6889e-05, + "loss": 0.0255, + "step": 8969 + }, + { + "epoch": 7.063016935801497, + "grad_norm": 0.396854430437088, + "learning_rate": 2.6892e-05, + "loss": 0.0244, + "step": 8970 + }, + { + "epoch": 7.0638046474990155, + "grad_norm": 0.7079941630363464, + "learning_rate": 2.6895e-05, + "loss": 0.0247, + "step": 8971 + }, + { + "epoch": 
7.064592359196534, + "grad_norm": 0.36327096819877625, + "learning_rate": 2.6898e-05, + "loss": 0.0191, + "step": 8972 + }, + { + "epoch": 7.065380070894053, + "grad_norm": 1.3171874284744263, + "learning_rate": 2.6901000000000002e-05, + "loss": 0.0313, + "step": 8973 + }, + { + "epoch": 7.066167782591571, + "grad_norm": 0.29430899024009705, + "learning_rate": 2.6904000000000002e-05, + "loss": 0.0124, + "step": 8974 + }, + { + "epoch": 7.06695549428909, + "grad_norm": 0.8375784754753113, + "learning_rate": 2.6907000000000002e-05, + "loss": 0.0491, + "step": 8975 + }, + { + "epoch": 7.067743205986609, + "grad_norm": 0.20022207498550415, + "learning_rate": 2.691e-05, + "loss": 0.0138, + "step": 8976 + }, + { + "epoch": 7.068530917684128, + "grad_norm": 0.409588098526001, + "learning_rate": 2.6913e-05, + "loss": 0.0154, + "step": 8977 + }, + { + "epoch": 7.069318629381646, + "grad_norm": 0.23299196362495422, + "learning_rate": 2.6916e-05, + "loss": 0.0145, + "step": 8978 + }, + { + "epoch": 7.070106341079165, + "grad_norm": 1.0583503246307373, + "learning_rate": 2.6919e-05, + "loss": 0.0197, + "step": 8979 + }, + { + "epoch": 7.0708940527766835, + "grad_norm": 0.3124052584171295, + "learning_rate": 2.6922e-05, + "loss": 0.0146, + "step": 8980 + }, + { + "epoch": 7.071681764474202, + "grad_norm": 0.7110427021980286, + "learning_rate": 2.6925e-05, + "loss": 0.0311, + "step": 8981 + }, + { + "epoch": 7.0724694761717215, + "grad_norm": 0.4395937919616699, + "learning_rate": 2.6928e-05, + "loss": 0.0164, + "step": 8982 + }, + { + "epoch": 7.07325718786924, + "grad_norm": 0.2654779255390167, + "learning_rate": 2.6931000000000004e-05, + "loss": 0.0146, + "step": 8983 + }, + { + "epoch": 7.074044899566759, + "grad_norm": 0.1848507970571518, + "learning_rate": 2.6934000000000003e-05, + "loss": 0.014, + "step": 8984 + }, + { + "epoch": 7.074832611264277, + "grad_norm": 0.3013020157814026, + "learning_rate": 2.6937000000000003e-05, + "loss": 0.0154, + "step": 8985 + }, + { + "epoch": 7.075620322961796, + "grad_norm": 0.42713114619255066, + "learning_rate": 2.6940000000000003e-05, + "loss": 0.0208, + "step": 8986 + }, + { + "epoch": 7.076408034659314, + "grad_norm": 1.7046151161193848, + "learning_rate": 2.6943e-05, + "loss": 0.043, + "step": 8987 + }, + { + "epoch": 7.077195746356834, + "grad_norm": 0.5735821723937988, + "learning_rate": 2.6946e-05, + "loss": 0.0314, + "step": 8988 + }, + { + "epoch": 7.077983458054352, + "grad_norm": 1.1204107999801636, + "learning_rate": 2.6949e-05, + "loss": 0.0306, + "step": 8989 + }, + { + "epoch": 7.078771169751871, + "grad_norm": 0.7887332439422607, + "learning_rate": 2.6952e-05, + "loss": 0.0451, + "step": 8990 + }, + { + "epoch": 7.0795588814493895, + "grad_norm": 0.9740268588066101, + "learning_rate": 2.6955e-05, + "loss": 0.2771, + "step": 8991 + }, + { + "epoch": 7.080346593146908, + "grad_norm": 0.5494599938392639, + "learning_rate": 2.6958e-05, + "loss": 0.1482, + "step": 8992 + }, + { + "epoch": 7.081134304844427, + "grad_norm": 0.7567278742790222, + "learning_rate": 2.6961e-05, + "loss": 0.1526, + "step": 8993 + }, + { + "epoch": 7.081922016541945, + "grad_norm": 0.5683825612068176, + "learning_rate": 2.6964e-05, + "loss": 0.1367, + "step": 8994 + }, + { + "epoch": 7.082709728239465, + "grad_norm": 0.3655519485473633, + "learning_rate": 2.6967e-05, + "loss": 0.0488, + "step": 8995 + }, + { + "epoch": 7.083497439936983, + "grad_norm": 0.7503821849822998, + "learning_rate": 2.697e-05, + "loss": 0.1066, + "step": 8996 + }, + { + "epoch": 7.084285151634502, 
+ "grad_norm": 0.44751378893852234, + "learning_rate": 2.6973e-05, + "loss": 0.0369, + "step": 8997 + }, + { + "epoch": 7.08507286333202, + "grad_norm": 0.41411519050598145, + "learning_rate": 2.6976e-05, + "loss": 0.0313, + "step": 8998 + }, + { + "epoch": 7.085860575029539, + "grad_norm": 0.5262284874916077, + "learning_rate": 2.6979e-05, + "loss": 0.027, + "step": 8999 + }, + { + "epoch": 7.0866482867270575, + "grad_norm": 0.367987722158432, + "learning_rate": 2.6982e-05, + "loss": 0.0198, + "step": 9000 + }, + { + "epoch": 7.0866482867270575, + "eval_cer": 0.11714233869922493, + "eval_loss": 0.3329727351665497, + "eval_runtime": 16.8987, + "eval_samples_per_second": 17.99, + "eval_steps_per_second": 0.592, + "eval_wer": 0.40272448196469685, + "step": 9000 + }, + { + "epoch": 7.087435998424577, + "grad_norm": 0.30792367458343506, + "learning_rate": 2.6985e-05, + "loss": 0.0133, + "step": 9001 + }, + { + "epoch": 7.0882237101220955, + "grad_norm": 0.32261523604393005, + "learning_rate": 2.6988e-05, + "loss": 0.0205, + "step": 9002 + }, + { + "epoch": 7.089011421819614, + "grad_norm": 0.5831213593482971, + "learning_rate": 2.6991000000000003e-05, + "loss": 0.0215, + "step": 9003 + }, + { + "epoch": 7.089799133517133, + "grad_norm": 0.27516767382621765, + "learning_rate": 2.6994000000000003e-05, + "loss": 0.0139, + "step": 9004 + }, + { + "epoch": 7.090586845214651, + "grad_norm": 0.2603820860385895, + "learning_rate": 2.6997000000000003e-05, + "loss": 0.026, + "step": 9005 + }, + { + "epoch": 7.09137455691217, + "grad_norm": 0.4315789043903351, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0328, + "step": 9006 + }, + { + "epoch": 7.092162268609689, + "grad_norm": 0.35043519735336304, + "learning_rate": 2.7003000000000002e-05, + "loss": 0.0206, + "step": 9007 + }, + { + "epoch": 7.092949980307208, + "grad_norm": 0.21343573927879333, + "learning_rate": 2.7006000000000002e-05, + "loss": 0.0103, + "step": 9008 + }, + { + "epoch": 7.093737692004726, + "grad_norm": 0.42430612444877625, + "learning_rate": 2.7009000000000002e-05, + "loss": 0.0253, + "step": 9009 + }, + { + "epoch": 7.094525403702245, + "grad_norm": 0.2514789402484894, + "learning_rate": 2.7012e-05, + "loss": 0.0223, + "step": 9010 + }, + { + "epoch": 7.0953131153997635, + "grad_norm": 0.14899970591068268, + "learning_rate": 2.7015e-05, + "loss": 0.0114, + "step": 9011 + }, + { + "epoch": 7.096100827097282, + "grad_norm": 0.9406946301460266, + "learning_rate": 2.7017999999999998e-05, + "loss": 0.0131, + "step": 9012 + }, + { + "epoch": 7.0968885387948015, + "grad_norm": 0.611084520816803, + "learning_rate": 2.7020999999999998e-05, + "loss": 0.0297, + "step": 9013 + }, + { + "epoch": 7.09767625049232, + "grad_norm": 0.3709670603275299, + "learning_rate": 2.7024e-05, + "loss": 0.0206, + "step": 9014 + }, + { + "epoch": 7.098463962189839, + "grad_norm": 0.7399778962135315, + "learning_rate": 2.7027e-05, + "loss": 0.0225, + "step": 9015 + }, + { + "epoch": 7.099251673887357, + "grad_norm": 0.3419361114501953, + "learning_rate": 2.703e-05, + "loss": 0.0211, + "step": 9016 + }, + { + "epoch": 7.100039385584876, + "grad_norm": 0.6119322776794434, + "learning_rate": 2.7033e-05, + "loss": 0.0213, + "step": 9017 + }, + { + "epoch": 7.100827097282394, + "grad_norm": 0.44279706478118896, + "learning_rate": 2.7036e-05, + "loss": 0.0235, + "step": 9018 + }, + { + "epoch": 7.101614808979913, + "grad_norm": 0.3136747181415558, + "learning_rate": 2.7039e-05, + "loss": 0.0126, + "step": 9019 + }, + { + "epoch": 7.102402520677432, + 
"grad_norm": 0.636989414691925, + "learning_rate": 2.7042e-05, + "loss": 0.0228, + "step": 9020 + }, + { + "epoch": 7.103190232374951, + "grad_norm": 0.4336548149585724, + "learning_rate": 2.7045e-05, + "loss": 0.0201, + "step": 9021 + }, + { + "epoch": 7.1039779440724695, + "grad_norm": 0.24692393839359283, + "learning_rate": 2.7048e-05, + "loss": 0.0158, + "step": 9022 + }, + { + "epoch": 7.104765655769988, + "grad_norm": 0.5024134516716003, + "learning_rate": 2.7051e-05, + "loss": 0.0191, + "step": 9023 + }, + { + "epoch": 7.105553367467507, + "grad_norm": 0.287318617105484, + "learning_rate": 2.7054000000000002e-05, + "loss": 0.0235, + "step": 9024 + }, + { + "epoch": 7.106341079165025, + "grad_norm": 0.33745962381362915, + "learning_rate": 2.7057000000000002e-05, + "loss": 0.0254, + "step": 9025 + }, + { + "epoch": 7.107128790862545, + "grad_norm": 0.4260483980178833, + "learning_rate": 2.7060000000000002e-05, + "loss": 0.0224, + "step": 9026 + }, + { + "epoch": 7.107916502560063, + "grad_norm": 0.47993484139442444, + "learning_rate": 2.7063e-05, + "loss": 0.0183, + "step": 9027 + }, + { + "epoch": 7.108704214257582, + "grad_norm": 0.3929196000099182, + "learning_rate": 2.7066e-05, + "loss": 0.0169, + "step": 9028 + }, + { + "epoch": 7.1094919259551, + "grad_norm": 0.49407875537872314, + "learning_rate": 2.7069e-05, + "loss": 0.0278, + "step": 9029 + }, + { + "epoch": 7.110279637652619, + "grad_norm": 0.601297914981842, + "learning_rate": 2.7072e-05, + "loss": 0.0316, + "step": 9030 + }, + { + "epoch": 7.1110673493501375, + "grad_norm": 0.5110617280006409, + "learning_rate": 2.7075e-05, + "loss": 0.0278, + "step": 9031 + }, + { + "epoch": 7.111855061047657, + "grad_norm": 0.5160835981369019, + "learning_rate": 2.7078e-05, + "loss": 0.0275, + "step": 9032 + }, + { + "epoch": 7.1126427727451755, + "grad_norm": 0.5550220012664795, + "learning_rate": 2.7081e-05, + "loss": 0.0235, + "step": 9033 + }, + { + "epoch": 7.113430484442694, + "grad_norm": 0.5936610698699951, + "learning_rate": 2.7084000000000004e-05, + "loss": 0.0224, + "step": 9034 + }, + { + "epoch": 7.114218196140213, + "grad_norm": 0.26590025424957275, + "learning_rate": 2.7087000000000003e-05, + "loss": 0.012, + "step": 9035 + }, + { + "epoch": 7.115005907837731, + "grad_norm": 0.22903236746788025, + "learning_rate": 2.709e-05, + "loss": 0.0173, + "step": 9036 + }, + { + "epoch": 7.11579361953525, + "grad_norm": 0.9958125948905945, + "learning_rate": 2.7093e-05, + "loss": 0.0288, + "step": 9037 + }, + { + "epoch": 7.116581331232768, + "grad_norm": 1.0204472541809082, + "learning_rate": 2.7096e-05, + "loss": 0.0232, + "step": 9038 + }, + { + "epoch": 7.117369042930288, + "grad_norm": 0.6766762137413025, + "learning_rate": 2.7099e-05, + "loss": 0.0195, + "step": 9039 + }, + { + "epoch": 7.118156754627806, + "grad_norm": 0.7845105528831482, + "learning_rate": 2.7102e-05, + "loss": 0.0617, + "step": 9040 + }, + { + "epoch": 7.118944466325325, + "grad_norm": 1.0249459743499756, + "learning_rate": 2.7105e-05, + "loss": 0.2661, + "step": 9041 + }, + { + "epoch": 7.1197321780228435, + "grad_norm": 0.7758061289787292, + "learning_rate": 2.7108e-05, + "loss": 0.1717, + "step": 9042 + }, + { + "epoch": 7.120519889720362, + "grad_norm": 0.5554949045181274, + "learning_rate": 2.7111e-05, + "loss": 0.164, + "step": 9043 + }, + { + "epoch": 7.121307601417881, + "grad_norm": 0.666215717792511, + "learning_rate": 2.7114e-05, + "loss": 0.1056, + "step": 9044 + }, + { + "epoch": 7.1220953131154, + "grad_norm": 0.6241529583930969, + 
"learning_rate": 2.7117e-05, + "loss": 0.1026, + "step": 9045 + }, + { + "epoch": 7.122883024812919, + "grad_norm": 0.40377962589263916, + "learning_rate": 2.712e-05, + "loss": 0.0654, + "step": 9046 + }, + { + "epoch": 7.123670736510437, + "grad_norm": 0.48641839623451233, + "learning_rate": 2.7123e-05, + "loss": 0.0311, + "step": 9047 + }, + { + "epoch": 7.124458448207956, + "grad_norm": 0.6594228744506836, + "learning_rate": 2.7126e-05, + "loss": 0.0555, + "step": 9048 + }, + { + "epoch": 7.125246159905474, + "grad_norm": 0.35207876563072205, + "learning_rate": 2.7129e-05, + "loss": 0.0223, + "step": 9049 + }, + { + "epoch": 7.126033871602993, + "grad_norm": 0.36542052030563354, + "learning_rate": 2.7132e-05, + "loss": 0.0274, + "step": 9050 + }, + { + "epoch": 7.126821583300512, + "grad_norm": 0.3714241683483124, + "learning_rate": 2.7135e-05, + "loss": 0.0239, + "step": 9051 + }, + { + "epoch": 7.127609294998031, + "grad_norm": 0.3872576653957367, + "learning_rate": 2.7138e-05, + "loss": 0.0289, + "step": 9052 + }, + { + "epoch": 7.1283970066955495, + "grad_norm": 0.17802825570106506, + "learning_rate": 2.7141e-05, + "loss": 0.0114, + "step": 9053 + }, + { + "epoch": 7.129184718393068, + "grad_norm": 0.3175375759601593, + "learning_rate": 2.7144000000000003e-05, + "loss": 0.0107, + "step": 9054 + }, + { + "epoch": 7.129972430090587, + "grad_norm": 0.3724464774131775, + "learning_rate": 2.7147000000000003e-05, + "loss": 0.0134, + "step": 9055 + }, + { + "epoch": 7.130760141788105, + "grad_norm": 0.35398367047309875, + "learning_rate": 2.7150000000000003e-05, + "loss": 0.0172, + "step": 9056 + }, + { + "epoch": 7.131547853485625, + "grad_norm": 0.32621079683303833, + "learning_rate": 2.7153000000000002e-05, + "loss": 0.0148, + "step": 9057 + }, + { + "epoch": 7.132335565183143, + "grad_norm": 0.21539750695228577, + "learning_rate": 2.7156000000000002e-05, + "loss": 0.0137, + "step": 9058 + }, + { + "epoch": 7.133123276880662, + "grad_norm": 0.2540847659111023, + "learning_rate": 2.7159000000000002e-05, + "loss": 0.0152, + "step": 9059 + }, + { + "epoch": 7.13391098857818, + "grad_norm": 0.21509069204330444, + "learning_rate": 2.7162000000000002e-05, + "loss": 0.0095, + "step": 9060 + }, + { + "epoch": 7.134698700275699, + "grad_norm": 0.4009965658187866, + "learning_rate": 2.7164999999999998e-05, + "loss": 0.0204, + "step": 9061 + }, + { + "epoch": 7.1354864119732175, + "grad_norm": 0.4892454147338867, + "learning_rate": 2.7167999999999998e-05, + "loss": 0.0117, + "step": 9062 + }, + { + "epoch": 7.136274123670736, + "grad_norm": 0.2967832088470459, + "learning_rate": 2.7170999999999998e-05, + "loss": 0.0126, + "step": 9063 + }, + { + "epoch": 7.137061835368256, + "grad_norm": 0.4238257110118866, + "learning_rate": 2.7174e-05, + "loss": 0.0139, + "step": 9064 + }, + { + "epoch": 7.137849547065774, + "grad_norm": 0.26307135820388794, + "learning_rate": 2.7177e-05, + "loss": 0.0094, + "step": 9065 + }, + { + "epoch": 7.138637258763293, + "grad_norm": 0.502079963684082, + "learning_rate": 2.718e-05, + "loss": 0.0228, + "step": 9066 + }, + { + "epoch": 7.139424970460811, + "grad_norm": 0.42731326818466187, + "learning_rate": 2.7183e-05, + "loss": 0.0211, + "step": 9067 + }, + { + "epoch": 7.14021268215833, + "grad_norm": 0.19645851850509644, + "learning_rate": 2.7186e-05, + "loss": 0.0126, + "step": 9068 + }, + { + "epoch": 7.141000393855848, + "grad_norm": 0.5561561584472656, + "learning_rate": 2.7189e-05, + "loss": 0.0182, + "step": 9069 + }, + { + "epoch": 7.141788105553368, + 
"grad_norm": 0.2016177475452423, + "learning_rate": 2.7192e-05, + "loss": 0.0081, + "step": 9070 + }, + { + "epoch": 7.142575817250886, + "grad_norm": 0.27193790674209595, + "learning_rate": 2.7195e-05, + "loss": 0.0099, + "step": 9071 + }, + { + "epoch": 7.143363528948405, + "grad_norm": 0.5496746897697449, + "learning_rate": 2.7198e-05, + "loss": 0.0285, + "step": 9072 + }, + { + "epoch": 7.1441512406459236, + "grad_norm": 0.8668250441551208, + "learning_rate": 2.7201e-05, + "loss": 0.021, + "step": 9073 + }, + { + "epoch": 7.144938952343442, + "grad_norm": 0.283514142036438, + "learning_rate": 2.7204000000000002e-05, + "loss": 0.0149, + "step": 9074 + }, + { + "epoch": 7.145726664040961, + "grad_norm": 0.3723856806755066, + "learning_rate": 2.7207000000000002e-05, + "loss": 0.0098, + "step": 9075 + }, + { + "epoch": 7.14651437573848, + "grad_norm": 0.38468655943870544, + "learning_rate": 2.7210000000000002e-05, + "loss": 0.0168, + "step": 9076 + }, + { + "epoch": 7.147302087435999, + "grad_norm": 0.2759813368320465, + "learning_rate": 2.7213000000000002e-05, + "loss": 0.0154, + "step": 9077 + }, + { + "epoch": 7.148089799133517, + "grad_norm": 1.1173993349075317, + "learning_rate": 2.7216e-05, + "loss": 0.0259, + "step": 9078 + }, + { + "epoch": 7.148877510831036, + "grad_norm": 0.43293657898902893, + "learning_rate": 2.7219e-05, + "loss": 0.0258, + "step": 9079 + }, + { + "epoch": 7.149665222528554, + "grad_norm": 0.4926551878452301, + "learning_rate": 2.7222e-05, + "loss": 0.017, + "step": 9080 + }, + { + "epoch": 7.150452934226073, + "grad_norm": 0.47769787907600403, + "learning_rate": 2.7225e-05, + "loss": 0.0155, + "step": 9081 + }, + { + "epoch": 7.1512406459235915, + "grad_norm": 0.4609324336051941, + "learning_rate": 2.7228e-05, + "loss": 0.0139, + "step": 9082 + }, + { + "epoch": 7.152028357621111, + "grad_norm": 0.6440995335578918, + "learning_rate": 2.7231e-05, + "loss": 0.0228, + "step": 9083 + }, + { + "epoch": 7.15281606931863, + "grad_norm": 0.43589040637016296, + "learning_rate": 2.7234000000000004e-05, + "loss": 0.0299, + "step": 9084 + }, + { + "epoch": 7.153603781016148, + "grad_norm": 0.27381375432014465, + "learning_rate": 2.7237e-05, + "loss": 0.0173, + "step": 9085 + }, + { + "epoch": 7.154391492713667, + "grad_norm": 0.38328343629837036, + "learning_rate": 2.724e-05, + "loss": 0.016, + "step": 9086 + }, + { + "epoch": 7.155179204411185, + "grad_norm": 0.7139855027198792, + "learning_rate": 2.7243e-05, + "loss": 0.022, + "step": 9087 + }, + { + "epoch": 7.155966916108704, + "grad_norm": 0.6797417402267456, + "learning_rate": 2.7246e-05, + "loss": 0.0268, + "step": 9088 + }, + { + "epoch": 7.156754627806223, + "grad_norm": 0.5319332480430603, + "learning_rate": 2.7249e-05, + "loss": 0.0199, + "step": 9089 + }, + { + "epoch": 7.157542339503742, + "grad_norm": 0.28710973262786865, + "learning_rate": 2.7252e-05, + "loss": 0.0136, + "step": 9090 + }, + { + "epoch": 7.15833005120126, + "grad_norm": 1.5841633081436157, + "learning_rate": 2.7255e-05, + "loss": 0.2886, + "step": 9091 + }, + { + "epoch": 7.159117762898779, + "grad_norm": 0.8386966586112976, + "learning_rate": 2.7258e-05, + "loss": 0.1839, + "step": 9092 + }, + { + "epoch": 7.159905474596298, + "grad_norm": 0.7574689984321594, + "learning_rate": 2.7261e-05, + "loss": 0.1963, + "step": 9093 + }, + { + "epoch": 7.160693186293816, + "grad_norm": 0.5738582611083984, + "learning_rate": 2.7264000000000002e-05, + "loss": 0.1176, + "step": 9094 + }, + { + "epoch": 7.161480897991336, + "grad_norm": 
0.6204169392585754, + "learning_rate": 2.7267e-05, + "loss": 0.1067, + "step": 9095 + }, + { + "epoch": 7.162268609688854, + "grad_norm": 0.34917840361595154, + "learning_rate": 2.727e-05, + "loss": 0.0396, + "step": 9096 + }, + { + "epoch": 7.163056321386373, + "grad_norm": 0.3428286612033844, + "learning_rate": 2.7273e-05, + "loss": 0.0366, + "step": 9097 + }, + { + "epoch": 7.163844033083891, + "grad_norm": 0.2472074180841446, + "learning_rate": 2.7276e-05, + "loss": 0.0336, + "step": 9098 + }, + { + "epoch": 7.16463174478141, + "grad_norm": 0.18547141551971436, + "learning_rate": 2.7279e-05, + "loss": 0.0204, + "step": 9099 + }, + { + "epoch": 7.165419456478928, + "grad_norm": 0.36061009764671326, + "learning_rate": 2.7282e-05, + "loss": 0.0323, + "step": 9100 + }, + { + "epoch": 7.166207168176447, + "grad_norm": 0.24869684875011444, + "learning_rate": 2.7285e-05, + "loss": 0.0208, + "step": 9101 + }, + { + "epoch": 7.166994879873966, + "grad_norm": 0.43006035685539246, + "learning_rate": 2.7288e-05, + "loss": 0.0272, + "step": 9102 + }, + { + "epoch": 7.167782591571485, + "grad_norm": 0.30245962738990784, + "learning_rate": 2.7291e-05, + "loss": 0.0242, + "step": 9103 + }, + { + "epoch": 7.168570303269004, + "grad_norm": 0.29146042466163635, + "learning_rate": 2.7294000000000003e-05, + "loss": 0.0149, + "step": 9104 + }, + { + "epoch": 7.169358014966522, + "grad_norm": 0.2454070746898651, + "learning_rate": 2.7297000000000003e-05, + "loss": 0.0133, + "step": 9105 + }, + { + "epoch": 7.170145726664041, + "grad_norm": 0.3445015549659729, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.014, + "step": 9106 + }, + { + "epoch": 7.170933438361559, + "grad_norm": 0.20572657883167267, + "learning_rate": 2.7303000000000003e-05, + "loss": 0.0195, + "step": 9107 + }, + { + "epoch": 7.171721150059079, + "grad_norm": 0.2485155463218689, + "learning_rate": 2.7306000000000002e-05, + "loss": 0.0167, + "step": 9108 + }, + { + "epoch": 7.172508861756597, + "grad_norm": 0.2147819697856903, + "learning_rate": 2.7309000000000002e-05, + "loss": 0.0184, + "step": 9109 + }, + { + "epoch": 7.173296573454116, + "grad_norm": 0.21236979961395264, + "learning_rate": 2.7312e-05, + "loss": 0.0136, + "step": 9110 + }, + { + "epoch": 7.174084285151634, + "grad_norm": 0.2194463312625885, + "learning_rate": 2.7315e-05, + "loss": 0.0163, + "step": 9111 + }, + { + "epoch": 7.174871996849153, + "grad_norm": 0.3879321217536926, + "learning_rate": 2.7318e-05, + "loss": 0.0115, + "step": 9112 + }, + { + "epoch": 7.175659708546672, + "grad_norm": 0.2895151972770691, + "learning_rate": 2.7320999999999998e-05, + "loss": 0.0214, + "step": 9113 + }, + { + "epoch": 7.176447420244191, + "grad_norm": 0.5081645846366882, + "learning_rate": 2.7324e-05, + "loss": 0.0179, + "step": 9114 + }, + { + "epoch": 7.17723513194171, + "grad_norm": 0.5013679265975952, + "learning_rate": 2.7327e-05, + "loss": 0.0119, + "step": 9115 + }, + { + "epoch": 7.178022843639228, + "grad_norm": 0.54798823595047, + "learning_rate": 2.733e-05, + "loss": 0.0201, + "step": 9116 + }, + { + "epoch": 7.178810555336747, + "grad_norm": 0.4025883078575134, + "learning_rate": 2.7333e-05, + "loss": 0.0199, + "step": 9117 + }, + { + "epoch": 7.179598267034265, + "grad_norm": 0.6397970914840698, + "learning_rate": 2.7336e-05, + "loss": 0.0191, + "step": 9118 + }, + { + "epoch": 7.180385978731784, + "grad_norm": 0.25536900758743286, + "learning_rate": 2.7339e-05, + "loss": 0.0123, + "step": 9119 + }, + { + "epoch": 7.181173690429302, + "grad_norm": 
1.219786524772644, + "learning_rate": 2.7342e-05, + "loss": 0.0228, + "step": 9120 + }, + { + "epoch": 7.181961402126822, + "grad_norm": 0.2685104310512543, + "learning_rate": 2.7345e-05, + "loss": 0.0186, + "step": 9121 + }, + { + "epoch": 7.1827491138243404, + "grad_norm": 0.30405253171920776, + "learning_rate": 2.7348e-05, + "loss": 0.0171, + "step": 9122 + }, + { + "epoch": 7.183536825521859, + "grad_norm": 0.4253886342048645, + "learning_rate": 2.7351e-05, + "loss": 0.0129, + "step": 9123 + }, + { + "epoch": 7.184324537219378, + "grad_norm": 0.2804122865200043, + "learning_rate": 2.7354000000000003e-05, + "loss": 0.0127, + "step": 9124 + }, + { + "epoch": 7.185112248916896, + "grad_norm": 0.5023788809776306, + "learning_rate": 2.7357000000000003e-05, + "loss": 0.0164, + "step": 9125 + }, + { + "epoch": 7.185899960614415, + "grad_norm": 0.5137375593185425, + "learning_rate": 2.7360000000000002e-05, + "loss": 0.0198, + "step": 9126 + }, + { + "epoch": 7.186687672311934, + "grad_norm": 0.5297588109970093, + "learning_rate": 2.7363000000000002e-05, + "loss": 0.0466, + "step": 9127 + }, + { + "epoch": 7.187475384009453, + "grad_norm": 0.4652569890022278, + "learning_rate": 2.7366000000000002e-05, + "loss": 0.0165, + "step": 9128 + }, + { + "epoch": 7.188263095706971, + "grad_norm": 0.6067317724227905, + "learning_rate": 2.7369000000000002e-05, + "loss": 0.0198, + "step": 9129 + }, + { + "epoch": 7.18905080740449, + "grad_norm": 0.3654748201370239, + "learning_rate": 2.7372e-05, + "loss": 0.0167, + "step": 9130 + }, + { + "epoch": 7.189838519102008, + "grad_norm": 1.0697020292282104, + "learning_rate": 2.7375e-05, + "loss": 0.0473, + "step": 9131 + }, + { + "epoch": 7.190626230799527, + "grad_norm": 0.29045745730400085, + "learning_rate": 2.7378e-05, + "loss": 0.0132, + "step": 9132 + }, + { + "epoch": 7.1914139424970465, + "grad_norm": 0.692283034324646, + "learning_rate": 2.7381e-05, + "loss": 0.0375, + "step": 9133 + }, + { + "epoch": 7.192201654194565, + "grad_norm": 0.44622185826301575, + "learning_rate": 2.7383999999999997e-05, + "loss": 0.0265, + "step": 9134 + }, + { + "epoch": 7.192989365892084, + "grad_norm": 0.26532021164894104, + "learning_rate": 2.7387e-05, + "loss": 0.0203, + "step": 9135 + }, + { + "epoch": 7.193777077589602, + "grad_norm": 0.32941725850105286, + "learning_rate": 2.739e-05, + "loss": 0.0163, + "step": 9136 + }, + { + "epoch": 7.194564789287121, + "grad_norm": 0.5223256945610046, + "learning_rate": 2.7393e-05, + "loss": 0.0263, + "step": 9137 + }, + { + "epoch": 7.195352500984639, + "grad_norm": 0.4591910243034363, + "learning_rate": 2.7396e-05, + "loss": 0.0156, + "step": 9138 + }, + { + "epoch": 7.196140212682159, + "grad_norm": 0.611272394657135, + "learning_rate": 2.7399e-05, + "loss": 0.0267, + "step": 9139 + }, + { + "epoch": 7.196927924379677, + "grad_norm": 0.5281968116760254, + "learning_rate": 2.7402e-05, + "loss": 0.0266, + "step": 9140 + }, + { + "epoch": 7.197715636077196, + "grad_norm": 1.5122628211975098, + "learning_rate": 2.7405e-05, + "loss": 0.3399, + "step": 9141 + }, + { + "epoch": 7.1985033477747145, + "grad_norm": 0.6228889226913452, + "learning_rate": 2.7408e-05, + "loss": 0.1748, + "step": 9142 + }, + { + "epoch": 7.199291059472233, + "grad_norm": 0.6489930152893066, + "learning_rate": 2.7411e-05, + "loss": 0.1849, + "step": 9143 + }, + { + "epoch": 7.200078771169752, + "grad_norm": 0.5103357434272766, + "learning_rate": 2.7414e-05, + "loss": 0.1152, + "step": 9144 + }, + { + "epoch": 7.20086648286727, + "grad_norm": 
0.6251024603843689, + "learning_rate": 2.7417000000000002e-05, + "loss": 0.1166, + "step": 9145 + }, + { + "epoch": 7.20165419456479, + "grad_norm": 0.6009951829910278, + "learning_rate": 2.7420000000000002e-05, + "loss": 0.052, + "step": 9146 + }, + { + "epoch": 7.202441906262308, + "grad_norm": 0.3161567449569702, + "learning_rate": 2.7423e-05, + "loss": 0.0568, + "step": 9147 + }, + { + "epoch": 7.203229617959827, + "grad_norm": 0.4689023792743683, + "learning_rate": 2.7426e-05, + "loss": 0.0392, + "step": 9148 + }, + { + "epoch": 7.204017329657345, + "grad_norm": 0.31421375274658203, + "learning_rate": 2.7429e-05, + "loss": 0.0187, + "step": 9149 + }, + { + "epoch": 7.204805041354864, + "grad_norm": 0.746929407119751, + "learning_rate": 2.7432e-05, + "loss": 0.0181, + "step": 9150 + }, + { + "epoch": 7.2055927530523824, + "grad_norm": 0.30621466040611267, + "learning_rate": 2.7435e-05, + "loss": 0.0428, + "step": 9151 + }, + { + "epoch": 7.206380464749902, + "grad_norm": 0.38389402627944946, + "learning_rate": 2.7438e-05, + "loss": 0.0148, + "step": 9152 + }, + { + "epoch": 7.2071681764474205, + "grad_norm": 0.30619585514068604, + "learning_rate": 2.7441e-05, + "loss": 0.0148, + "step": 9153 + }, + { + "epoch": 7.207955888144939, + "grad_norm": 0.13325944542884827, + "learning_rate": 2.7444e-05, + "loss": 0.0073, + "step": 9154 + }, + { + "epoch": 7.208743599842458, + "grad_norm": 0.2805495262145996, + "learning_rate": 2.7447000000000003e-05, + "loss": 0.0273, + "step": 9155 + }, + { + "epoch": 7.209531311539976, + "grad_norm": 0.44939130544662476, + "learning_rate": 2.7450000000000003e-05, + "loss": 0.0476, + "step": 9156 + }, + { + "epoch": 7.210319023237495, + "grad_norm": 0.39325404167175293, + "learning_rate": 2.7453000000000003e-05, + "loss": 0.0197, + "step": 9157 + }, + { + "epoch": 7.211106734935014, + "grad_norm": 0.6889463663101196, + "learning_rate": 2.7456000000000003e-05, + "loss": 0.0278, + "step": 9158 + }, + { + "epoch": 7.211894446632533, + "grad_norm": 0.25332263112068176, + "learning_rate": 2.7459e-05, + "loss": 0.0086, + "step": 9159 + }, + { + "epoch": 7.212682158330051, + "grad_norm": 0.2537365257740021, + "learning_rate": 2.7462e-05, + "loss": 0.009, + "step": 9160 + }, + { + "epoch": 7.21346987002757, + "grad_norm": 0.30562421679496765, + "learning_rate": 2.7465e-05, + "loss": 0.0127, + "step": 9161 + }, + { + "epoch": 7.2142575817250885, + "grad_norm": 0.36509597301483154, + "learning_rate": 2.7468e-05, + "loss": 0.0144, + "step": 9162 + }, + { + "epoch": 7.215045293422607, + "grad_norm": 0.23374004662036896, + "learning_rate": 2.7471e-05, + "loss": 0.0144, + "step": 9163 + }, + { + "epoch": 7.2158330051201265, + "grad_norm": 0.4962430000305176, + "learning_rate": 2.7473999999999998e-05, + "loss": 0.0101, + "step": 9164 + }, + { + "epoch": 7.216620716817645, + "grad_norm": 0.6158142685890198, + "learning_rate": 2.7477e-05, + "loss": 0.0202, + "step": 9165 + }, + { + "epoch": 7.217408428515164, + "grad_norm": 0.3892047703266144, + "learning_rate": 2.748e-05, + "loss": 0.0195, + "step": 9166 + }, + { + "epoch": 7.218196140212682, + "grad_norm": 0.2289707064628601, + "learning_rate": 2.7483e-05, + "loss": 0.0145, + "step": 9167 + }, + { + "epoch": 7.218983851910201, + "grad_norm": 0.2367682158946991, + "learning_rate": 2.7486e-05, + "loss": 0.0122, + "step": 9168 + }, + { + "epoch": 7.219771563607719, + "grad_norm": 0.8267424702644348, + "learning_rate": 2.7489e-05, + "loss": 0.0173, + "step": 9169 + }, + { + "epoch": 7.220559275305238, + "grad_norm": 
0.33913910388946533, + "learning_rate": 2.7492e-05, + "loss": 0.0173, + "step": 9170 + }, + { + "epoch": 7.221346987002757, + "grad_norm": 0.4054557979106903, + "learning_rate": 2.7495e-05, + "loss": 0.0268, + "step": 9171 + }, + { + "epoch": 7.222134698700276, + "grad_norm": 0.20327647030353546, + "learning_rate": 2.7498e-05, + "loss": 0.0104, + "step": 9172 + }, + { + "epoch": 7.2229224103977945, + "grad_norm": 0.3914652466773987, + "learning_rate": 2.7501e-05, + "loss": 0.022, + "step": 9173 + }, + { + "epoch": 7.223710122095313, + "grad_norm": 0.4115380644798279, + "learning_rate": 2.7504e-05, + "loss": 0.0329, + "step": 9174 + }, + { + "epoch": 7.224497833792832, + "grad_norm": 0.37951064109802246, + "learning_rate": 2.7507000000000003e-05, + "loss": 0.0196, + "step": 9175 + }, + { + "epoch": 7.22528554549035, + "grad_norm": 0.9857122898101807, + "learning_rate": 2.7510000000000003e-05, + "loss": 0.0142, + "step": 9176 + }, + { + "epoch": 7.22607325718787, + "grad_norm": 0.2753978967666626, + "learning_rate": 2.7513000000000002e-05, + "loss": 0.018, + "step": 9177 + }, + { + "epoch": 7.226860968885388, + "grad_norm": 0.30935394763946533, + "learning_rate": 2.7516000000000002e-05, + "loss": 0.0207, + "step": 9178 + }, + { + "epoch": 7.227648680582907, + "grad_norm": 0.34313085675239563, + "learning_rate": 2.7519000000000002e-05, + "loss": 0.0216, + "step": 9179 + }, + { + "epoch": 7.228436392280425, + "grad_norm": 0.4871402382850647, + "learning_rate": 2.7522000000000002e-05, + "loss": 0.0263, + "step": 9180 + }, + { + "epoch": 7.229224103977944, + "grad_norm": 0.62056964635849, + "learning_rate": 2.7525e-05, + "loss": 0.0294, + "step": 9181 + }, + { + "epoch": 7.2300118156754625, + "grad_norm": 0.2804268002510071, + "learning_rate": 2.7528e-05, + "loss": 0.0162, + "step": 9182 + }, + { + "epoch": 7.230799527372982, + "grad_norm": 0.24326841533184052, + "learning_rate": 2.7531e-05, + "loss": 0.013, + "step": 9183 + }, + { + "epoch": 7.2315872390705005, + "grad_norm": 0.470967173576355, + "learning_rate": 2.7533999999999998e-05, + "loss": 0.0243, + "step": 9184 + }, + { + "epoch": 7.232374950768019, + "grad_norm": 0.3273567855358124, + "learning_rate": 2.7537e-05, + "loss": 0.0193, + "step": 9185 + }, + { + "epoch": 7.233162662465538, + "grad_norm": 0.6006308197975159, + "learning_rate": 2.754e-05, + "loss": 0.0271, + "step": 9186 + }, + { + "epoch": 7.233950374163056, + "grad_norm": 0.7866542339324951, + "learning_rate": 2.7543e-05, + "loss": 0.0246, + "step": 9187 + }, + { + "epoch": 7.234738085860575, + "grad_norm": 0.7033583521842957, + "learning_rate": 2.7546e-05, + "loss": 0.0303, + "step": 9188 + }, + { + "epoch": 7.235525797558093, + "grad_norm": 0.34877654910087585, + "learning_rate": 2.7549e-05, + "loss": 0.0238, + "step": 9189 + }, + { + "epoch": 7.236313509255613, + "grad_norm": 1.0451304912567139, + "learning_rate": 2.7552e-05, + "loss": 0.0343, + "step": 9190 + }, + { + "epoch": 7.237101220953131, + "grad_norm": 1.0180989503860474, + "learning_rate": 2.7555e-05, + "loss": 0.2221, + "step": 9191 + }, + { + "epoch": 7.23788893265065, + "grad_norm": 0.723220944404602, + "learning_rate": 2.7558e-05, + "loss": 0.1668, + "step": 9192 + }, + { + "epoch": 7.2386766443481685, + "grad_norm": 0.772855818271637, + "learning_rate": 2.7561e-05, + "loss": 0.1694, + "step": 9193 + }, + { + "epoch": 7.239464356045687, + "grad_norm": 0.7730955481529236, + "learning_rate": 2.7564e-05, + "loss": 0.145, + "step": 9194 + }, + { + "epoch": 7.240252067743206, + "grad_norm": 0.5808364748954773, + 
"learning_rate": 2.7567000000000002e-05, + "loss": 0.087, + "step": 9195 + }, + { + "epoch": 7.241039779440725, + "grad_norm": 0.4474388062953949, + "learning_rate": 2.7570000000000002e-05, + "loss": 0.0827, + "step": 9196 + }, + { + "epoch": 7.241827491138244, + "grad_norm": 0.43083885312080383, + "learning_rate": 2.7573000000000002e-05, + "loss": 0.0327, + "step": 9197 + }, + { + "epoch": 7.242615202835762, + "grad_norm": 0.475965291261673, + "learning_rate": 2.7576e-05, + "loss": 0.0378, + "step": 9198 + }, + { + "epoch": 7.243402914533281, + "grad_norm": 0.2963903546333313, + "learning_rate": 2.7579e-05, + "loss": 0.0292, + "step": 9199 + }, + { + "epoch": 7.244190626230799, + "grad_norm": 0.4132997989654541, + "learning_rate": 2.7582e-05, + "loss": 0.025, + "step": 9200 + }, + { + "epoch": 7.244978337928318, + "grad_norm": 0.5726561546325684, + "learning_rate": 2.7585e-05, + "loss": 0.0216, + "step": 9201 + }, + { + "epoch": 7.245766049625837, + "grad_norm": 0.36463847756385803, + "learning_rate": 2.7588e-05, + "loss": 0.0232, + "step": 9202 + }, + { + "epoch": 7.246553761323356, + "grad_norm": 0.3755975067615509, + "learning_rate": 2.7591e-05, + "loss": 0.0276, + "step": 9203 + }, + { + "epoch": 7.2473414730208745, + "grad_norm": 0.25439751148223877, + "learning_rate": 2.7594e-05, + "loss": 0.0101, + "step": 9204 + }, + { + "epoch": 7.248129184718393, + "grad_norm": 0.34777572751045227, + "learning_rate": 2.7597000000000004e-05, + "loss": 0.0156, + "step": 9205 + }, + { + "epoch": 7.248916896415912, + "grad_norm": 0.3450126647949219, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.0097, + "step": 9206 + }, + { + "epoch": 7.24970460811343, + "grad_norm": 0.8190920948982239, + "learning_rate": 2.7603000000000003e-05, + "loss": 0.0268, + "step": 9207 + }, + { + "epoch": 7.250492319810949, + "grad_norm": 0.27395403385162354, + "learning_rate": 2.7606e-05, + "loss": 0.0168, + "step": 9208 + }, + { + "epoch": 7.251280031508468, + "grad_norm": 0.2340250015258789, + "learning_rate": 2.7609e-05, + "loss": 0.013, + "step": 9209 + }, + { + "epoch": 7.252067743205987, + "grad_norm": 0.8285961747169495, + "learning_rate": 2.7612e-05, + "loss": 0.0129, + "step": 9210 + }, + { + "epoch": 7.252855454903505, + "grad_norm": 0.6153205633163452, + "learning_rate": 2.7615e-05, + "loss": 0.0176, + "step": 9211 + }, + { + "epoch": 7.253643166601024, + "grad_norm": 0.3095056116580963, + "learning_rate": 2.7618e-05, + "loss": 0.0152, + "step": 9212 + }, + { + "epoch": 7.2544308782985425, + "grad_norm": 0.35702988505363464, + "learning_rate": 2.7621e-05, + "loss": 0.0184, + "step": 9213 + }, + { + "epoch": 7.255218589996061, + "grad_norm": 0.49266740679740906, + "learning_rate": 2.7624e-05, + "loss": 0.0191, + "step": 9214 + }, + { + "epoch": 7.2560063016935805, + "grad_norm": 0.5508809089660645, + "learning_rate": 2.7627e-05, + "loss": 0.0663, + "step": 9215 + }, + { + "epoch": 7.256794013391099, + "grad_norm": 0.3112180829048157, + "learning_rate": 2.763e-05, + "loss": 0.0141, + "step": 9216 + }, + { + "epoch": 7.257581725088618, + "grad_norm": 0.3703359067440033, + "learning_rate": 2.7633e-05, + "loss": 0.0282, + "step": 9217 + }, + { + "epoch": 7.258369436786136, + "grad_norm": 0.6656762957572937, + "learning_rate": 2.7636e-05, + "loss": 0.0246, + "step": 9218 + }, + { + "epoch": 7.259157148483655, + "grad_norm": 0.30384504795074463, + "learning_rate": 2.7639e-05, + "loss": 0.0168, + "step": 9219 + }, + { + "epoch": 7.259944860181173, + "grad_norm": 0.4591778516769409, + "learning_rate": 
2.7642e-05, + "loss": 0.0271, + "step": 9220 + }, + { + "epoch": 7.260732571878693, + "grad_norm": 0.40700581669807434, + "learning_rate": 2.7645e-05, + "loss": 0.0191, + "step": 9221 + }, + { + "epoch": 7.261520283576211, + "grad_norm": 0.3409707844257355, + "learning_rate": 2.7648e-05, + "loss": 0.0169, + "step": 9222 + }, + { + "epoch": 7.26230799527373, + "grad_norm": 0.4884011149406433, + "learning_rate": 2.7651e-05, + "loss": 0.0436, + "step": 9223 + }, + { + "epoch": 7.2630957069712485, + "grad_norm": 0.4240584969520569, + "learning_rate": 2.7654e-05, + "loss": 0.0225, + "step": 9224 + }, + { + "epoch": 7.263883418668767, + "grad_norm": 0.478811115026474, + "learning_rate": 2.7657000000000003e-05, + "loss": 0.0235, + "step": 9225 + }, + { + "epoch": 7.264671130366286, + "grad_norm": 0.35337743163108826, + "learning_rate": 2.7660000000000003e-05, + "loss": 0.0198, + "step": 9226 + }, + { + "epoch": 7.265458842063804, + "grad_norm": 0.29771602153778076, + "learning_rate": 2.7663000000000003e-05, + "loss": 0.0155, + "step": 9227 + }, + { + "epoch": 7.266246553761324, + "grad_norm": 0.19005046784877777, + "learning_rate": 2.7666000000000002e-05, + "loss": 0.0186, + "step": 9228 + }, + { + "epoch": 7.267034265458842, + "grad_norm": 0.3449680209159851, + "learning_rate": 2.7669000000000002e-05, + "loss": 0.0226, + "step": 9229 + }, + { + "epoch": 7.267821977156361, + "grad_norm": 0.4970756471157074, + "learning_rate": 2.7672000000000002e-05, + "loss": 0.0275, + "step": 9230 + }, + { + "epoch": 7.268609688853879, + "grad_norm": 0.3903261721134186, + "learning_rate": 2.7675000000000002e-05, + "loss": 0.017, + "step": 9231 + }, + { + "epoch": 7.269397400551398, + "grad_norm": 0.4260317087173462, + "learning_rate": 2.7678e-05, + "loss": 0.0267, + "step": 9232 + }, + { + "epoch": 7.2701851122489165, + "grad_norm": 0.5069664716720581, + "learning_rate": 2.7680999999999998e-05, + "loss": 0.0265, + "step": 9233 + }, + { + "epoch": 7.270972823946436, + "grad_norm": 0.9576385021209717, + "learning_rate": 2.7683999999999998e-05, + "loss": 0.0208, + "step": 9234 + }, + { + "epoch": 7.2717605356439545, + "grad_norm": 0.2143680900335312, + "learning_rate": 2.7687e-05, + "loss": 0.0144, + "step": 9235 + }, + { + "epoch": 7.272548247341473, + "grad_norm": 0.4659644663333893, + "learning_rate": 2.769e-05, + "loss": 0.0352, + "step": 9236 + }, + { + "epoch": 7.273335959038992, + "grad_norm": 0.5496437549591064, + "learning_rate": 2.7693e-05, + "loss": 0.0232, + "step": 9237 + }, + { + "epoch": 7.27412367073651, + "grad_norm": 0.8122437596321106, + "learning_rate": 2.7696e-05, + "loss": 0.0317, + "step": 9238 + }, + { + "epoch": 7.274911382434029, + "grad_norm": 0.6027169823646545, + "learning_rate": 2.7699e-05, + "loss": 0.0229, + "step": 9239 + }, + { + "epoch": 7.275699094131548, + "grad_norm": 0.4543313980102539, + "learning_rate": 2.7702e-05, + "loss": 0.0185, + "step": 9240 + }, + { + "epoch": 7.276486805829067, + "grad_norm": 0.5591281652450562, + "learning_rate": 2.7705e-05, + "loss": 0.1963, + "step": 9241 + }, + { + "epoch": 7.277274517526585, + "grad_norm": 0.5287315249443054, + "learning_rate": 2.7708e-05, + "loss": 0.1846, + "step": 9242 + }, + { + "epoch": 7.278062229224104, + "grad_norm": 0.7607174515724182, + "learning_rate": 2.7711e-05, + "loss": 0.1372, + "step": 9243 + }, + { + "epoch": 7.2788499409216225, + "grad_norm": 0.6310598254203796, + "learning_rate": 2.7714e-05, + "loss": 0.1365, + "step": 9244 + }, + { + "epoch": 7.279637652619141, + "grad_norm": 1.1358559131622314, + 
"learning_rate": 2.7717000000000002e-05, + "loss": 0.116, + "step": 9245 + }, + { + "epoch": 7.28042536431666, + "grad_norm": 0.5854106545448303, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.0799, + "step": 9246 + }, + { + "epoch": 7.281213076014179, + "grad_norm": 0.41762855648994446, + "learning_rate": 2.7723000000000002e-05, + "loss": 0.0482, + "step": 9247 + }, + { + "epoch": 7.282000787711698, + "grad_norm": 0.3920493423938751, + "learning_rate": 2.7726000000000002e-05, + "loss": 0.0189, + "step": 9248 + }, + { + "epoch": 7.282788499409216, + "grad_norm": 0.2843414843082428, + "learning_rate": 2.7729e-05, + "loss": 0.0303, + "step": 9249 + }, + { + "epoch": 7.283576211106735, + "grad_norm": 0.37770721316337585, + "learning_rate": 2.7732e-05, + "loss": 0.0163, + "step": 9250 + }, + { + "epoch": 7.284363922804253, + "grad_norm": 0.29621422290802, + "learning_rate": 2.7735e-05, + "loss": 0.0227, + "step": 9251 + }, + { + "epoch": 7.285151634501772, + "grad_norm": 0.30757543444633484, + "learning_rate": 2.7738e-05, + "loss": 0.0138, + "step": 9252 + }, + { + "epoch": 7.285939346199291, + "grad_norm": 0.29088544845581055, + "learning_rate": 2.7741e-05, + "loss": 0.0239, + "step": 9253 + }, + { + "epoch": 7.28672705789681, + "grad_norm": 0.23161806166172028, + "learning_rate": 2.7744e-05, + "loss": 0.0105, + "step": 9254 + }, + { + "epoch": 7.2875147695943285, + "grad_norm": 0.33439382910728455, + "learning_rate": 2.7747000000000004e-05, + "loss": 0.0202, + "step": 9255 + }, + { + "epoch": 7.288302481291847, + "grad_norm": 0.24127501249313354, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.0108, + "step": 9256 + }, + { + "epoch": 7.289090192989366, + "grad_norm": 0.28086423873901367, + "learning_rate": 2.7753e-05, + "loss": 0.0125, + "step": 9257 + }, + { + "epoch": 7.289877904686884, + "grad_norm": 0.3595923185348511, + "learning_rate": 2.7756e-05, + "loss": 0.0214, + "step": 9258 + }, + { + "epoch": 7.290665616384404, + "grad_norm": 0.19111856818199158, + "learning_rate": 2.7759e-05, + "loss": 0.0094, + "step": 9259 + }, + { + "epoch": 7.291453328081922, + "grad_norm": 0.30792608857154846, + "learning_rate": 2.7762e-05, + "loss": 0.0131, + "step": 9260 + }, + { + "epoch": 7.292241039779441, + "grad_norm": 0.18002764880657196, + "learning_rate": 2.7765e-05, + "loss": 0.0102, + "step": 9261 + }, + { + "epoch": 7.293028751476959, + "grad_norm": 0.22355034947395325, + "learning_rate": 2.7768e-05, + "loss": 0.0096, + "step": 9262 + }, + { + "epoch": 7.293816463174478, + "grad_norm": 0.3675677180290222, + "learning_rate": 2.7771e-05, + "loss": 0.0158, + "step": 9263 + }, + { + "epoch": 7.2946041748719965, + "grad_norm": 0.608842134475708, + "learning_rate": 2.7774e-05, + "loss": 0.0408, + "step": 9264 + }, + { + "epoch": 7.295391886569515, + "grad_norm": 0.3864253759384155, + "learning_rate": 2.7777e-05, + "loss": 0.0245, + "step": 9265 + }, + { + "epoch": 7.2961795982670345, + "grad_norm": 0.3703145682811737, + "learning_rate": 2.778e-05, + "loss": 0.0204, + "step": 9266 + }, + { + "epoch": 7.296967309964553, + "grad_norm": 0.5820485353469849, + "learning_rate": 2.7783e-05, + "loss": 0.0407, + "step": 9267 + }, + { + "epoch": 7.297755021662072, + "grad_norm": 0.21601712703704834, + "learning_rate": 2.7786e-05, + "loss": 0.0092, + "step": 9268 + }, + { + "epoch": 7.29854273335959, + "grad_norm": 0.3430825471878052, + "learning_rate": 2.7789e-05, + "loss": 0.0151, + "step": 9269 + }, + { + "epoch": 7.299330445057109, + "grad_norm": 0.29204830527305603, + "learning_rate": 
2.7792e-05, + "loss": 0.015, + "step": 9270 + }, + { + "epoch": 7.300118156754628, + "grad_norm": 0.5287074446678162, + "learning_rate": 2.7795e-05, + "loss": 0.025, + "step": 9271 + }, + { + "epoch": 7.300905868452147, + "grad_norm": 0.5552311539649963, + "learning_rate": 2.7798e-05, + "loss": 0.0295, + "step": 9272 + }, + { + "epoch": 7.301693580149665, + "grad_norm": 0.5263144969940186, + "learning_rate": 2.7801e-05, + "loss": 0.0145, + "step": 9273 + }, + { + "epoch": 7.302481291847184, + "grad_norm": 0.30280637741088867, + "learning_rate": 2.7804e-05, + "loss": 0.0171, + "step": 9274 + }, + { + "epoch": 7.3032690035447025, + "grad_norm": 0.13048502802848816, + "learning_rate": 2.7807e-05, + "loss": 0.009, + "step": 9275 + }, + { + "epoch": 7.304056715242221, + "grad_norm": 0.39893054962158203, + "learning_rate": 2.7810000000000003e-05, + "loss": 0.0163, + "step": 9276 + }, + { + "epoch": 7.30484442693974, + "grad_norm": 0.38910749554634094, + "learning_rate": 2.7813000000000003e-05, + "loss": 0.0127, + "step": 9277 + }, + { + "epoch": 7.305632138637259, + "grad_norm": 0.43751397728919983, + "learning_rate": 2.7816000000000003e-05, + "loss": 0.0219, + "step": 9278 + }, + { + "epoch": 7.306419850334778, + "grad_norm": 0.42284077405929565, + "learning_rate": 2.7819000000000002e-05, + "loss": 0.0216, + "step": 9279 + }, + { + "epoch": 7.307207562032296, + "grad_norm": 0.5357441902160645, + "learning_rate": 2.7822000000000002e-05, + "loss": 0.0268, + "step": 9280 + }, + { + "epoch": 7.307995273729815, + "grad_norm": 0.28835225105285645, + "learning_rate": 2.7825000000000002e-05, + "loss": 0.0087, + "step": 9281 + }, + { + "epoch": 7.308782985427333, + "grad_norm": 0.5197528004646301, + "learning_rate": 2.7828e-05, + "loss": 0.0308, + "step": 9282 + }, + { + "epoch": 7.309570697124852, + "grad_norm": 0.4005749225616455, + "learning_rate": 2.7831e-05, + "loss": 0.0185, + "step": 9283 + }, + { + "epoch": 7.310358408822371, + "grad_norm": 0.7696473598480225, + "learning_rate": 2.7833999999999998e-05, + "loss": 0.0223, + "step": 9284 + }, + { + "epoch": 7.31114612051989, + "grad_norm": 0.39477506279945374, + "learning_rate": 2.7836999999999998e-05, + "loss": 0.0268, + "step": 9285 + }, + { + "epoch": 7.311933832217409, + "grad_norm": 0.4025716483592987, + "learning_rate": 2.784e-05, + "loss": 0.0195, + "step": 9286 + }, + { + "epoch": 7.312721543914927, + "grad_norm": 0.15294893085956573, + "learning_rate": 2.7843e-05, + "loss": 0.0078, + "step": 9287 + }, + { + "epoch": 7.313509255612446, + "grad_norm": 0.4515872895717621, + "learning_rate": 2.7846e-05, + "loss": 0.0176, + "step": 9288 + }, + { + "epoch": 7.314296967309964, + "grad_norm": 0.23908396065235138, + "learning_rate": 2.7849e-05, + "loss": 0.0134, + "step": 9289 + }, + { + "epoch": 7.315084679007484, + "grad_norm": 0.6547776460647583, + "learning_rate": 2.7852e-05, + "loss": 0.0301, + "step": 9290 + }, + { + "epoch": 7.315872390705002, + "grad_norm": 1.3066625595092773, + "learning_rate": 2.7855e-05, + "loss": 0.3394, + "step": 9291 + }, + { + "epoch": 7.316660102402521, + "grad_norm": 0.7458978891372681, + "learning_rate": 2.7858e-05, + "loss": 0.2177, + "step": 9292 + }, + { + "epoch": 7.317447814100039, + "grad_norm": 0.6083602905273438, + "learning_rate": 2.7861e-05, + "loss": 0.1712, + "step": 9293 + }, + { + "epoch": 7.318235525797558, + "grad_norm": 0.8602408170700073, + "learning_rate": 2.7864e-05, + "loss": 0.1238, + "step": 9294 + }, + { + "epoch": 7.3190232374950766, + "grad_norm": 0.6151702404022217, + "learning_rate": 
2.7867e-05, + "loss": 0.1135, + "step": 9295 + }, + { + "epoch": 7.319810949192595, + "grad_norm": 0.550321102142334, + "learning_rate": 2.7870000000000003e-05, + "loss": 0.0592, + "step": 9296 + }, + { + "epoch": 7.320598660890115, + "grad_norm": 0.4755474627017975, + "learning_rate": 2.7873000000000002e-05, + "loss": 0.0386, + "step": 9297 + }, + { + "epoch": 7.321386372587633, + "grad_norm": 0.5159398913383484, + "learning_rate": 2.7876000000000002e-05, + "loss": 0.0875, + "step": 9298 + }, + { + "epoch": 7.322174084285152, + "grad_norm": 0.21518558263778687, + "learning_rate": 2.7879000000000002e-05, + "loss": 0.021, + "step": 9299 + }, + { + "epoch": 7.32296179598267, + "grad_norm": 0.3402646780014038, + "learning_rate": 2.7882000000000002e-05, + "loss": 0.018, + "step": 9300 + }, + { + "epoch": 7.323749507680189, + "grad_norm": 0.3489951491355896, + "learning_rate": 2.7885e-05, + "loss": 0.0218, + "step": 9301 + }, + { + "epoch": 7.324537219377707, + "grad_norm": 0.26396670937538147, + "learning_rate": 2.7888e-05, + "loss": 0.014, + "step": 9302 + }, + { + "epoch": 7.325324931075227, + "grad_norm": 0.2666645348072052, + "learning_rate": 2.7891e-05, + "loss": 0.0249, + "step": 9303 + }, + { + "epoch": 7.326112642772745, + "grad_norm": 0.282879114151001, + "learning_rate": 2.7894e-05, + "loss": 0.0143, + "step": 9304 + }, + { + "epoch": 7.326900354470264, + "grad_norm": 0.19479720294475555, + "learning_rate": 2.7897e-05, + "loss": 0.0205, + "step": 9305 + }, + { + "epoch": 7.327688066167783, + "grad_norm": 0.3954017758369446, + "learning_rate": 2.79e-05, + "loss": 0.0187, + "step": 9306 + }, + { + "epoch": 7.328475777865301, + "grad_norm": 0.27025678753852844, + "learning_rate": 2.7903e-05, + "loss": 0.0253, + "step": 9307 + }, + { + "epoch": 7.32926348956282, + "grad_norm": 0.3097599744796753, + "learning_rate": 2.7906e-05, + "loss": 0.0254, + "step": 9308 + }, + { + "epoch": 7.330051201260339, + "grad_norm": 0.25445231795310974, + "learning_rate": 2.7909e-05, + "loss": 0.0133, + "step": 9309 + }, + { + "epoch": 7.330838912957858, + "grad_norm": 0.19243615865707397, + "learning_rate": 2.7912e-05, + "loss": 0.0107, + "step": 9310 + }, + { + "epoch": 7.331626624655376, + "grad_norm": 0.23861084878444672, + "learning_rate": 2.7915e-05, + "loss": 0.0131, + "step": 9311 + }, + { + "epoch": 7.332414336352895, + "grad_norm": 0.3514357805252075, + "learning_rate": 2.7918e-05, + "loss": 0.0188, + "step": 9312 + }, + { + "epoch": 7.333202048050413, + "grad_norm": 0.2171882838010788, + "learning_rate": 2.7921e-05, + "loss": 0.0137, + "step": 9313 + }, + { + "epoch": 7.333989759747932, + "grad_norm": 0.6931078433990479, + "learning_rate": 2.7924e-05, + "loss": 0.0272, + "step": 9314 + }, + { + "epoch": 7.334777471445451, + "grad_norm": 0.33109214901924133, + "learning_rate": 2.7927e-05, + "loss": 0.0215, + "step": 9315 + }, + { + "epoch": 7.33556518314297, + "grad_norm": 0.4407903850078583, + "learning_rate": 2.7930000000000002e-05, + "loss": 0.0209, + "step": 9316 + }, + { + "epoch": 7.336352894840489, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.7933000000000002e-05, + "loss": 0.0123, + "step": 9317 + }, + { + "epoch": 7.337140606538007, + "grad_norm": 0.21687224507331848, + "learning_rate": 2.7936e-05, + "loss": 0.0104, + "step": 9318 + }, + { + "epoch": 7.337928318235526, + "grad_norm": 0.24788117408752441, + "learning_rate": 2.7939e-05, + "loss": 0.0141, + "step": 9319 + }, + { + "epoch": 7.338716029933044, + "grad_norm": 0.26750078797340393, + "learning_rate": 2.7942e-05, + 
"loss": 0.0125, + "step": 9320 + }, + { + "epoch": 7.339503741630563, + "grad_norm": 0.55827397108078, + "learning_rate": 2.7945e-05, + "loss": 0.0175, + "step": 9321 + }, + { + "epoch": 7.340291453328082, + "grad_norm": 0.266769677400589, + "learning_rate": 2.7948e-05, + "loss": 0.0167, + "step": 9322 + }, + { + "epoch": 7.341079165025601, + "grad_norm": 0.340726375579834, + "learning_rate": 2.7951e-05, + "loss": 0.0137, + "step": 9323 + }, + { + "epoch": 7.341866876723119, + "grad_norm": 0.3215126693248749, + "learning_rate": 2.7954e-05, + "loss": 0.0146, + "step": 9324 + }, + { + "epoch": 7.342654588420638, + "grad_norm": 0.3376474380493164, + "learning_rate": 2.7957e-05, + "loss": 0.0149, + "step": 9325 + }, + { + "epoch": 7.343442300118157, + "grad_norm": 0.2986065745353699, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.0131, + "step": 9326 + }, + { + "epoch": 7.344230011815675, + "grad_norm": 0.2372676581144333, + "learning_rate": 2.7963000000000003e-05, + "loss": 0.0107, + "step": 9327 + }, + { + "epoch": 7.345017723513195, + "grad_norm": 0.2627512216567993, + "learning_rate": 2.7966000000000003e-05, + "loss": 0.0177, + "step": 9328 + }, + { + "epoch": 7.345805435210713, + "grad_norm": 0.4311186671257019, + "learning_rate": 2.7969000000000003e-05, + "loss": 0.0245, + "step": 9329 + }, + { + "epoch": 7.346593146908232, + "grad_norm": 0.33416762948036194, + "learning_rate": 2.7972000000000003e-05, + "loss": 0.0173, + "step": 9330 + }, + { + "epoch": 7.34738085860575, + "grad_norm": 0.32324278354644775, + "learning_rate": 2.7975e-05, + "loss": 0.0189, + "step": 9331 + }, + { + "epoch": 7.348168570303269, + "grad_norm": 0.3726102411746979, + "learning_rate": 2.7978e-05, + "loss": 0.0246, + "step": 9332 + }, + { + "epoch": 7.348956282000787, + "grad_norm": 0.5515580177307129, + "learning_rate": 2.7981e-05, + "loss": 0.0281, + "step": 9333 + }, + { + "epoch": 7.349743993698306, + "grad_norm": 0.37780752778053284, + "learning_rate": 2.7984e-05, + "loss": 0.0221, + "step": 9334 + }, + { + "epoch": 7.3505317053958255, + "grad_norm": 0.3200411796569824, + "learning_rate": 2.7986999999999998e-05, + "loss": 0.0216, + "step": 9335 + }, + { + "epoch": 7.351319417093344, + "grad_norm": 0.47836005687713623, + "learning_rate": 2.799e-05, + "loss": 0.0228, + "step": 9336 + }, + { + "epoch": 7.352107128790863, + "grad_norm": 0.45179927349090576, + "learning_rate": 2.7993e-05, + "loss": 0.0132, + "step": 9337 + }, + { + "epoch": 7.352894840488381, + "grad_norm": 0.5167964100837708, + "learning_rate": 2.7996e-05, + "loss": 0.0259, + "step": 9338 + }, + { + "epoch": 7.3536825521859, + "grad_norm": 0.5136348605155945, + "learning_rate": 2.7999e-05, + "loss": 0.0396, + "step": 9339 + }, + { + "epoch": 7.354470263883418, + "grad_norm": 0.484779417514801, + "learning_rate": 2.8002e-05, + "loss": 0.0168, + "step": 9340 + }, + { + "epoch": 7.355257975580938, + "grad_norm": 0.8301293253898621, + "learning_rate": 2.8005e-05, + "loss": 0.2634, + "step": 9341 + }, + { + "epoch": 7.356045687278456, + "grad_norm": 0.7148861885070801, + "learning_rate": 2.8008e-05, + "loss": 0.1704, + "step": 9342 + }, + { + "epoch": 7.356833398975975, + "grad_norm": 0.5370573997497559, + "learning_rate": 2.8011e-05, + "loss": 0.1275, + "step": 9343 + }, + { + "epoch": 7.3576211106734934, + "grad_norm": 0.6712014079093933, + "learning_rate": 2.8014e-05, + "loss": 0.1495, + "step": 9344 + }, + { + "epoch": 7.358408822371012, + "grad_norm": 1.2368762493133545, + "learning_rate": 2.8017e-05, + "loss": 0.0878, + "step": 9345 + 
}, + { + "epoch": 7.359196534068531, + "grad_norm": 0.4464990496635437, + "learning_rate": 2.8020000000000003e-05, + "loss": 0.0413, + "step": 9346 + }, + { + "epoch": 7.35998424576605, + "grad_norm": 0.6591498255729675, + "learning_rate": 2.8023000000000003e-05, + "loss": 0.1584, + "step": 9347 + }, + { + "epoch": 7.360771957463569, + "grad_norm": 0.48187294602394104, + "learning_rate": 2.8026000000000002e-05, + "loss": 0.041, + "step": 9348 + }, + { + "epoch": 7.361559669161087, + "grad_norm": 0.25465697050094604, + "learning_rate": 2.8029000000000002e-05, + "loss": 0.0382, + "step": 9349 + }, + { + "epoch": 7.362347380858606, + "grad_norm": 0.32972589135169983, + "learning_rate": 2.8032000000000002e-05, + "loss": 0.0304, + "step": 9350 + }, + { + "epoch": 7.363135092556124, + "grad_norm": 0.4525609314441681, + "learning_rate": 2.8035000000000002e-05, + "loss": 0.0359, + "step": 9351 + }, + { + "epoch": 7.363922804253643, + "grad_norm": 0.177524134516716, + "learning_rate": 2.8038e-05, + "loss": 0.0212, + "step": 9352 + }, + { + "epoch": 7.364710515951161, + "grad_norm": 0.33817383646965027, + "learning_rate": 2.8041e-05, + "loss": 0.0233, + "step": 9353 + }, + { + "epoch": 7.365498227648681, + "grad_norm": 0.28903356194496155, + "learning_rate": 2.8044e-05, + "loss": 0.021, + "step": 9354 + }, + { + "epoch": 7.3662859393461995, + "grad_norm": 0.24099516868591309, + "learning_rate": 2.8047e-05, + "loss": 0.0117, + "step": 9355 + }, + { + "epoch": 7.367073651043718, + "grad_norm": 0.2946109473705292, + "learning_rate": 2.805e-05, + "loss": 0.0236, + "step": 9356 + }, + { + "epoch": 7.367861362741237, + "grad_norm": 0.29897037148475647, + "learning_rate": 2.8053e-05, + "loss": 0.0197, + "step": 9357 + }, + { + "epoch": 7.368649074438755, + "grad_norm": 0.4790729284286499, + "learning_rate": 2.8056e-05, + "loss": 0.0223, + "step": 9358 + }, + { + "epoch": 7.369436786136274, + "grad_norm": 0.17554645240306854, + "learning_rate": 2.8059e-05, + "loss": 0.0151, + "step": 9359 + }, + { + "epoch": 7.370224497833793, + "grad_norm": 0.6024491190910339, + "learning_rate": 2.8062e-05, + "loss": 0.0216, + "step": 9360 + }, + { + "epoch": 7.371012209531312, + "grad_norm": 0.2564999461174011, + "learning_rate": 2.8065e-05, + "loss": 0.0081, + "step": 9361 + }, + { + "epoch": 7.37179992122883, + "grad_norm": 0.2987314760684967, + "learning_rate": 2.8068e-05, + "loss": 0.0103, + "step": 9362 + }, + { + "epoch": 7.372587632926349, + "grad_norm": 0.4380175471305847, + "learning_rate": 2.8071e-05, + "loss": 0.0289, + "step": 9363 + }, + { + "epoch": 7.3733753446238675, + "grad_norm": 0.6236618757247925, + "learning_rate": 2.8074e-05, + "loss": 0.0321, + "step": 9364 + }, + { + "epoch": 7.374163056321386, + "grad_norm": 0.8248868584632874, + "learning_rate": 2.8077e-05, + "loss": 0.0259, + "step": 9365 + }, + { + "epoch": 7.3749507680189055, + "grad_norm": 0.3288436233997345, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.0143, + "step": 9366 + }, + { + "epoch": 7.375738479716424, + "grad_norm": 0.2734087407588959, + "learning_rate": 2.8083000000000002e-05, + "loss": 0.0135, + "step": 9367 + }, + { + "epoch": 7.376526191413943, + "grad_norm": 0.15342862904071808, + "learning_rate": 2.8086000000000002e-05, + "loss": 0.0118, + "step": 9368 + }, + { + "epoch": 7.377313903111461, + "grad_norm": 0.2698177099227905, + "learning_rate": 2.8089e-05, + "loss": 0.0116, + "step": 9369 + }, + { + "epoch": 7.37810161480898, + "grad_norm": 0.4030674993991852, + "learning_rate": 2.8092e-05, + "loss": 0.0176, + 
"step": 9370 + }, + { + "epoch": 7.378889326506498, + "grad_norm": 0.33537933230400085, + "learning_rate": 2.8095e-05, + "loss": 0.0192, + "step": 9371 + }, + { + "epoch": 7.379677038204017, + "grad_norm": 0.33578482270240784, + "learning_rate": 2.8098e-05, + "loss": 0.0204, + "step": 9372 + }, + { + "epoch": 7.380464749901536, + "grad_norm": 0.41498687863349915, + "learning_rate": 2.8101e-05, + "loss": 0.0111, + "step": 9373 + }, + { + "epoch": 7.381252461599055, + "grad_norm": 0.2608725130558014, + "learning_rate": 2.8104e-05, + "loss": 0.0156, + "step": 9374 + }, + { + "epoch": 7.3820401732965735, + "grad_norm": 0.43041500449180603, + "learning_rate": 2.8107e-05, + "loss": 0.0124, + "step": 9375 + }, + { + "epoch": 7.382827884994092, + "grad_norm": 0.3556455373764038, + "learning_rate": 2.8110000000000004e-05, + "loss": 0.0244, + "step": 9376 + }, + { + "epoch": 7.383615596691611, + "grad_norm": 0.4082838296890259, + "learning_rate": 2.8113000000000003e-05, + "loss": 0.0139, + "step": 9377 + }, + { + "epoch": 7.384403308389129, + "grad_norm": 0.46036598086357117, + "learning_rate": 2.8116000000000003e-05, + "loss": 0.0164, + "step": 9378 + }, + { + "epoch": 7.385191020086649, + "grad_norm": 0.256242036819458, + "learning_rate": 2.8119000000000003e-05, + "loss": 0.019, + "step": 9379 + }, + { + "epoch": 7.385978731784167, + "grad_norm": 0.5138771533966064, + "learning_rate": 2.8122e-05, + "loss": 0.0207, + "step": 9380 + }, + { + "epoch": 7.386766443481686, + "grad_norm": 0.4629580080509186, + "learning_rate": 2.8125e-05, + "loss": 0.0129, + "step": 9381 + }, + { + "epoch": 7.387554155179204, + "grad_norm": 0.8706965446472168, + "learning_rate": 2.8128e-05, + "loss": 0.019, + "step": 9382 + }, + { + "epoch": 7.388341866876723, + "grad_norm": 0.6621288061141968, + "learning_rate": 2.8131e-05, + "loss": 0.0364, + "step": 9383 + }, + { + "epoch": 7.3891295785742415, + "grad_norm": 0.4190557301044464, + "learning_rate": 2.8134e-05, + "loss": 0.0205, + "step": 9384 + }, + { + "epoch": 7.389917290271761, + "grad_norm": 0.3908953070640564, + "learning_rate": 2.8137e-05, + "loss": 0.0157, + "step": 9385 + }, + { + "epoch": 7.3907050019692795, + "grad_norm": 0.310821533203125, + "learning_rate": 2.8139999999999998e-05, + "loss": 0.0156, + "step": 9386 + }, + { + "epoch": 7.391492713666798, + "grad_norm": 0.33522945642471313, + "learning_rate": 2.8143e-05, + "loss": 0.0163, + "step": 9387 + }, + { + "epoch": 7.392280425364317, + "grad_norm": 0.35316556692123413, + "learning_rate": 2.8146e-05, + "loss": 0.0147, + "step": 9388 + }, + { + "epoch": 7.393068137061835, + "grad_norm": 0.3221622705459595, + "learning_rate": 2.8149e-05, + "loss": 0.0156, + "step": 9389 + }, + { + "epoch": 7.393855848759354, + "grad_norm": 0.462213397026062, + "learning_rate": 2.8152e-05, + "loss": 0.0245, + "step": 9390 + }, + { + "epoch": 7.394643560456872, + "grad_norm": 1.1448854207992554, + "learning_rate": 2.8155e-05, + "loss": 0.3461, + "step": 9391 + }, + { + "epoch": 7.395431272154392, + "grad_norm": 0.6655204892158508, + "learning_rate": 2.8158e-05, + "loss": 0.174, + "step": 9392 + }, + { + "epoch": 7.39621898385191, + "grad_norm": 0.6266448497772217, + "learning_rate": 2.8161e-05, + "loss": 0.1313, + "step": 9393 + }, + { + "epoch": 7.397006695549429, + "grad_norm": 0.8602591156959534, + "learning_rate": 2.8164e-05, + "loss": 0.1617, + "step": 9394 + }, + { + "epoch": 7.3977944072469475, + "grad_norm": 0.7497865557670593, + "learning_rate": 2.8167e-05, + "loss": 0.1394, + "step": 9395 + }, + { + "epoch": 
7.398582118944466, + "grad_norm": 0.7767435312271118, + "learning_rate": 2.817e-05, + "loss": 0.0737, + "step": 9396 + }, + { + "epoch": 7.3993698306419855, + "grad_norm": 0.2581069767475128, + "learning_rate": 2.8173000000000003e-05, + "loss": 0.0281, + "step": 9397 + }, + { + "epoch": 7.400157542339504, + "grad_norm": 0.24348384141921997, + "learning_rate": 2.8176000000000003e-05, + "loss": 0.0268, + "step": 9398 + }, + { + "epoch": 7.400945254037023, + "grad_norm": 0.2570728659629822, + "learning_rate": 2.8179000000000002e-05, + "loss": 0.0253, + "step": 9399 + }, + { + "epoch": 7.401732965734541, + "grad_norm": 0.38176366686820984, + "learning_rate": 2.8182000000000002e-05, + "loss": 0.0248, + "step": 9400 + }, + { + "epoch": 7.40252067743206, + "grad_norm": 0.9674753546714783, + "learning_rate": 2.8185000000000002e-05, + "loss": 0.0264, + "step": 9401 + }, + { + "epoch": 7.403308389129578, + "grad_norm": 0.39834919571876526, + "learning_rate": 2.8188000000000002e-05, + "loss": 0.0482, + "step": 9402 + }, + { + "epoch": 7.404096100827097, + "grad_norm": 0.32011401653289795, + "learning_rate": 2.8191e-05, + "loss": 0.0166, + "step": 9403 + }, + { + "epoch": 7.404883812524616, + "grad_norm": 0.17788919806480408, + "learning_rate": 2.8194e-05, + "loss": 0.0125, + "step": 9404 + }, + { + "epoch": 7.405671524222135, + "grad_norm": 0.28945425152778625, + "learning_rate": 2.8196999999999998e-05, + "loss": 0.0095, + "step": 9405 + }, + { + "epoch": 7.4064592359196535, + "grad_norm": 0.17767083644866943, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.0114, + "step": 9406 + }, + { + "epoch": 7.407246947617172, + "grad_norm": 0.5823059678077698, + "learning_rate": 2.8203e-05, + "loss": 0.0253, + "step": 9407 + }, + { + "epoch": 7.408034659314691, + "grad_norm": 0.28853362798690796, + "learning_rate": 2.8206e-05, + "loss": 0.0211, + "step": 9408 + }, + { + "epoch": 7.408822371012209, + "grad_norm": 0.27188652753829956, + "learning_rate": 2.8209e-05, + "loss": 0.0148, + "step": 9409 + }, + { + "epoch": 7.409610082709729, + "grad_norm": 0.4392208456993103, + "learning_rate": 2.8212e-05, + "loss": 0.0286, + "step": 9410 + }, + { + "epoch": 7.410397794407247, + "grad_norm": 0.3882302939891815, + "learning_rate": 2.8215e-05, + "loss": 0.0203, + "step": 9411 + }, + { + "epoch": 7.411185506104766, + "grad_norm": 0.5152142643928528, + "learning_rate": 2.8218e-05, + "loss": 0.0335, + "step": 9412 + }, + { + "epoch": 7.411973217802284, + "grad_norm": 0.21383124589920044, + "learning_rate": 2.8221e-05, + "loss": 0.0212, + "step": 9413 + }, + { + "epoch": 7.412760929499803, + "grad_norm": 0.25707724690437317, + "learning_rate": 2.8224e-05, + "loss": 0.0144, + "step": 9414 + }, + { + "epoch": 7.4135486411973215, + "grad_norm": 0.3259631395339966, + "learning_rate": 2.8227e-05, + "loss": 0.021, + "step": 9415 + }, + { + "epoch": 7.414336352894841, + "grad_norm": 0.30531346797943115, + "learning_rate": 2.823e-05, + "loss": 0.0261, + "step": 9416 + }, + { + "epoch": 7.4151240645923595, + "grad_norm": 0.37082967162132263, + "learning_rate": 2.8233000000000002e-05, + "loss": 0.0211, + "step": 9417 + }, + { + "epoch": 7.415911776289878, + "grad_norm": 0.22755169868469238, + "learning_rate": 2.8236000000000002e-05, + "loss": 0.0119, + "step": 9418 + }, + { + "epoch": 7.416699487987397, + "grad_norm": 0.6896255612373352, + "learning_rate": 2.8239000000000002e-05, + "loss": 0.0148, + "step": 9419 + }, + { + "epoch": 7.417487199684915, + "grad_norm": 0.44193315505981445, + "learning_rate": 2.8242e-05, + "loss": 
0.0186, + "step": 9420 + }, + { + "epoch": 7.418274911382434, + "grad_norm": 0.31813937425613403, + "learning_rate": 2.8245e-05, + "loss": 0.023, + "step": 9421 + }, + { + "epoch": 7.419062623079952, + "grad_norm": 0.23495973646640778, + "learning_rate": 2.8248e-05, + "loss": 0.011, + "step": 9422 + }, + { + "epoch": 7.419850334777472, + "grad_norm": 0.3708813488483429, + "learning_rate": 2.8251e-05, + "loss": 0.0243, + "step": 9423 + }, + { + "epoch": 7.42063804647499, + "grad_norm": 0.25900304317474365, + "learning_rate": 2.8254e-05, + "loss": 0.0183, + "step": 9424 + }, + { + "epoch": 7.421425758172509, + "grad_norm": 0.4064837396144867, + "learning_rate": 2.8257e-05, + "loss": 0.0262, + "step": 9425 + }, + { + "epoch": 7.4222134698700275, + "grad_norm": 0.29481756687164307, + "learning_rate": 2.826e-05, + "loss": 0.0156, + "step": 9426 + }, + { + "epoch": 7.423001181567546, + "grad_norm": 0.5105202794075012, + "learning_rate": 2.8263000000000004e-05, + "loss": 0.025, + "step": 9427 + }, + { + "epoch": 7.423788893265065, + "grad_norm": 0.44582250714302063, + "learning_rate": 2.8266000000000003e-05, + "loss": 0.0117, + "step": 9428 + }, + { + "epoch": 7.424576604962584, + "grad_norm": 0.2187117487192154, + "learning_rate": 2.8269e-05, + "loss": 0.0115, + "step": 9429 + }, + { + "epoch": 7.425364316660103, + "grad_norm": 0.2593291103839874, + "learning_rate": 2.8272e-05, + "loss": 0.0175, + "step": 9430 + }, + { + "epoch": 7.426152028357621, + "grad_norm": 0.8141688108444214, + "learning_rate": 2.8275e-05, + "loss": 0.0154, + "step": 9431 + }, + { + "epoch": 7.42693974005514, + "grad_norm": 0.2598518431186676, + "learning_rate": 2.8278e-05, + "loss": 0.0158, + "step": 9432 + }, + { + "epoch": 7.427727451752658, + "grad_norm": 0.595628023147583, + "learning_rate": 2.8281e-05, + "loss": 0.0227, + "step": 9433 + }, + { + "epoch": 7.428515163450177, + "grad_norm": 0.40648940205574036, + "learning_rate": 2.8284e-05, + "loss": 0.0203, + "step": 9434 + }, + { + "epoch": 7.429302875147696, + "grad_norm": 0.9301758408546448, + "learning_rate": 2.8287e-05, + "loss": 0.0338, + "step": 9435 + }, + { + "epoch": 7.430090586845215, + "grad_norm": 0.44239503145217896, + "learning_rate": 2.829e-05, + "loss": 0.0088, + "step": 9436 + }, + { + "epoch": 7.4308782985427335, + "grad_norm": 0.4384671747684479, + "learning_rate": 2.8293e-05, + "loss": 0.0233, + "step": 9437 + }, + { + "epoch": 7.431666010240252, + "grad_norm": 0.3667430877685547, + "learning_rate": 2.8296e-05, + "loss": 0.0097, + "step": 9438 + }, + { + "epoch": 7.432453721937771, + "grad_norm": 0.27133893966674805, + "learning_rate": 2.8299e-05, + "loss": 0.0175, + "step": 9439 + }, + { + "epoch": 7.433241433635289, + "grad_norm": 0.35329669713974, + "learning_rate": 2.8302e-05, + "loss": 0.0171, + "step": 9440 + }, + { + "epoch": 7.434029145332808, + "grad_norm": 1.0551636219024658, + "learning_rate": 2.8305e-05, + "loss": 0.3471, + "step": 9441 + }, + { + "epoch": 7.434816857030327, + "grad_norm": 0.8742051720619202, + "learning_rate": 2.8308e-05, + "loss": 0.2197, + "step": 9442 + }, + { + "epoch": 7.435604568727846, + "grad_norm": 0.6437816619873047, + "learning_rate": 2.8311e-05, + "loss": 0.1239, + "step": 9443 + }, + { + "epoch": 7.436392280425364, + "grad_norm": 0.616770327091217, + "learning_rate": 2.8314e-05, + "loss": 0.1151, + "step": 9444 + }, + { + "epoch": 7.437179992122883, + "grad_norm": 1.0190868377685547, + "learning_rate": 2.8317e-05, + "loss": 0.1247, + "step": 9445 + }, + { + "epoch": 7.4379677038204015, + "grad_norm": 
0.365347295999527, + "learning_rate": 2.832e-05, + "loss": 0.0691, + "step": 9446 + }, + { + "epoch": 7.43875541551792, + "grad_norm": 0.4068818688392639, + "learning_rate": 2.8323000000000003e-05, + "loss": 0.0649, + "step": 9447 + }, + { + "epoch": 7.4395431272154395, + "grad_norm": 0.658702552318573, + "learning_rate": 2.8326000000000003e-05, + "loss": 0.0316, + "step": 9448 + }, + { + "epoch": 7.440330838912958, + "grad_norm": 0.2840753197669983, + "learning_rate": 2.8329000000000003e-05, + "loss": 0.0488, + "step": 9449 + }, + { + "epoch": 7.441118550610477, + "grad_norm": 0.3141888380050659, + "learning_rate": 2.8332000000000002e-05, + "loss": 0.0252, + "step": 9450 + }, + { + "epoch": 7.441906262307995, + "grad_norm": 0.4778541922569275, + "learning_rate": 2.8335000000000002e-05, + "loss": 0.0294, + "step": 9451 + }, + { + "epoch": 7.442693974005514, + "grad_norm": 0.28454726934432983, + "learning_rate": 2.8338000000000002e-05, + "loss": 0.0111, + "step": 9452 + }, + { + "epoch": 7.443481685703032, + "grad_norm": 0.2797602713108063, + "learning_rate": 2.8341000000000002e-05, + "loss": 0.0168, + "step": 9453 + }, + { + "epoch": 7.444269397400552, + "grad_norm": 0.3410591781139374, + "learning_rate": 2.8344e-05, + "loss": 0.0179, + "step": 9454 + }, + { + "epoch": 7.44505710909807, + "grad_norm": 0.3537704050540924, + "learning_rate": 2.8346999999999998e-05, + "loss": 0.0263, + "step": 9455 + }, + { + "epoch": 7.445844820795589, + "grad_norm": 0.47040778398513794, + "learning_rate": 2.8349999999999998e-05, + "loss": 0.0204, + "step": 9456 + }, + { + "epoch": 7.4466325324931075, + "grad_norm": 0.24038736522197723, + "learning_rate": 2.8353e-05, + "loss": 0.0177, + "step": 9457 + }, + { + "epoch": 7.447420244190626, + "grad_norm": 0.32774847745895386, + "learning_rate": 2.8356e-05, + "loss": 0.0144, + "step": 9458 + }, + { + "epoch": 7.448207955888145, + "grad_norm": 0.2853170335292816, + "learning_rate": 2.8359e-05, + "loss": 0.0167, + "step": 9459 + }, + { + "epoch": 7.448995667585663, + "grad_norm": 0.36303234100341797, + "learning_rate": 2.8362e-05, + "loss": 0.0139, + "step": 9460 + }, + { + "epoch": 7.449783379283183, + "grad_norm": 0.28721413016319275, + "learning_rate": 2.8365e-05, + "loss": 0.0165, + "step": 9461 + }, + { + "epoch": 7.450571090980701, + "grad_norm": 0.2607871890068054, + "learning_rate": 2.8368e-05, + "loss": 0.0165, + "step": 9462 + }, + { + "epoch": 7.45135880267822, + "grad_norm": 0.35141459107398987, + "learning_rate": 2.8371e-05, + "loss": 0.0209, + "step": 9463 + }, + { + "epoch": 7.452146514375738, + "grad_norm": 0.21856538951396942, + "learning_rate": 2.8374e-05, + "loss": 0.015, + "step": 9464 + }, + { + "epoch": 7.452934226073257, + "grad_norm": 0.2130337506532669, + "learning_rate": 2.8377e-05, + "loss": 0.0184, + "step": 9465 + }, + { + "epoch": 7.4537219377707755, + "grad_norm": 0.2569637596607208, + "learning_rate": 2.838e-05, + "loss": 0.0144, + "step": 9466 + }, + { + "epoch": 7.454509649468295, + "grad_norm": 0.3688300549983978, + "learning_rate": 2.8383000000000003e-05, + "loss": 0.0173, + "step": 9467 + }, + { + "epoch": 7.4552973611658135, + "grad_norm": 0.3468276858329773, + "learning_rate": 2.8386000000000002e-05, + "loss": 0.0198, + "step": 9468 + }, + { + "epoch": 7.456085072863332, + "grad_norm": 0.3548327088356018, + "learning_rate": 2.8389000000000002e-05, + "loss": 0.0138, + "step": 9469 + }, + { + "epoch": 7.456872784560851, + "grad_norm": 0.3620729446411133, + "learning_rate": 2.8392000000000002e-05, + "loss": 0.0169, + "step": 
9470 + }, + { + "epoch": 7.457660496258369, + "grad_norm": 0.2894379794597626, + "learning_rate": 2.8395000000000002e-05, + "loss": 0.0182, + "step": 9471 + }, + { + "epoch": 7.458448207955888, + "grad_norm": 0.3786224126815796, + "learning_rate": 2.8398e-05, + "loss": 0.0156, + "step": 9472 + }, + { + "epoch": 7.459235919653407, + "grad_norm": 0.4209785759449005, + "learning_rate": 2.8401e-05, + "loss": 0.0179, + "step": 9473 + }, + { + "epoch": 7.460023631350926, + "grad_norm": 0.49586600065231323, + "learning_rate": 2.8404e-05, + "loss": 0.0163, + "step": 9474 + }, + { + "epoch": 7.460811343048444, + "grad_norm": 0.4403609037399292, + "learning_rate": 2.8407e-05, + "loss": 0.0218, + "step": 9475 + }, + { + "epoch": 7.461599054745963, + "grad_norm": 0.18158234655857086, + "learning_rate": 2.841e-05, + "loss": 0.0128, + "step": 9476 + }, + { + "epoch": 7.4623867664434815, + "grad_norm": 0.3250170350074768, + "learning_rate": 2.8413000000000004e-05, + "loss": 0.0161, + "step": 9477 + }, + { + "epoch": 7.463174478141, + "grad_norm": 0.24795092642307281, + "learning_rate": 2.8416e-05, + "loss": 0.0272, + "step": 9478 + }, + { + "epoch": 7.463962189838519, + "grad_norm": 0.46583735942840576, + "learning_rate": 2.8419e-05, + "loss": 0.019, + "step": 9479 + }, + { + "epoch": 7.464749901536038, + "grad_norm": 0.4604480564594269, + "learning_rate": 2.8422e-05, + "loss": 0.0127, + "step": 9480 + }, + { + "epoch": 7.465537613233557, + "grad_norm": 1.1207308769226074, + "learning_rate": 2.8425e-05, + "loss": 0.023, + "step": 9481 + }, + { + "epoch": 7.466325324931075, + "grad_norm": 0.2920511066913605, + "learning_rate": 2.8428e-05, + "loss": 0.0184, + "step": 9482 + }, + { + "epoch": 7.467113036628594, + "grad_norm": 1.5468792915344238, + "learning_rate": 2.8431e-05, + "loss": 0.0296, + "step": 9483 + }, + { + "epoch": 7.467900748326112, + "grad_norm": 0.633175253868103, + "learning_rate": 2.8434e-05, + "loss": 0.0275, + "step": 9484 + }, + { + "epoch": 7.468688460023631, + "grad_norm": 0.31534698605537415, + "learning_rate": 2.8437e-05, + "loss": 0.0323, + "step": 9485 + }, + { + "epoch": 7.46947617172115, + "grad_norm": 0.4594838619232178, + "learning_rate": 2.844e-05, + "loss": 0.018, + "step": 9486 + }, + { + "epoch": 7.470263883418669, + "grad_norm": 0.4011078476905823, + "learning_rate": 2.8443000000000002e-05, + "loss": 0.0153, + "step": 9487 + }, + { + "epoch": 7.4710515951161875, + "grad_norm": 0.26359716057777405, + "learning_rate": 2.8446000000000002e-05, + "loss": 0.015, + "step": 9488 + }, + { + "epoch": 7.471839306813706, + "grad_norm": 0.5127514004707336, + "learning_rate": 2.8449e-05, + "loss": 0.0313, + "step": 9489 + }, + { + "epoch": 7.472627018511225, + "grad_norm": 0.6136807203292847, + "learning_rate": 2.8452e-05, + "loss": 0.0411, + "step": 9490 + }, + { + "epoch": 7.473414730208743, + "grad_norm": 1.0213322639465332, + "learning_rate": 2.8455e-05, + "loss": 0.2211, + "step": 9491 + }, + { + "epoch": 7.474202441906263, + "grad_norm": 1.13585364818573, + "learning_rate": 2.8458e-05, + "loss": 0.2246, + "step": 9492 + }, + { + "epoch": 7.474990153603781, + "grad_norm": 0.7383990287780762, + "learning_rate": 2.8461e-05, + "loss": 0.1302, + "step": 9493 + }, + { + "epoch": 7.4757778653013, + "grad_norm": 0.6228314638137817, + "learning_rate": 2.8464e-05, + "loss": 0.1218, + "step": 9494 + }, + { + "epoch": 7.476565576998818, + "grad_norm": 0.49344775080680847, + "learning_rate": 2.8467e-05, + "loss": 0.065, + "step": 9495 + }, + { + "epoch": 7.477353288696337, + "grad_norm": 
0.3844696581363678, + "learning_rate": 2.847e-05, + "loss": 0.0531, + "step": 9496 + }, + { + "epoch": 7.4781410003938555, + "grad_norm": 0.2926921844482422, + "learning_rate": 2.8473000000000003e-05, + "loss": 0.0317, + "step": 9497 + }, + { + "epoch": 7.478928712091374, + "grad_norm": 0.2986154854297638, + "learning_rate": 2.8476000000000003e-05, + "loss": 0.0437, + "step": 9498 + }, + { + "epoch": 7.479716423788894, + "grad_norm": 0.21163517236709595, + "learning_rate": 2.8479000000000003e-05, + "loss": 0.0201, + "step": 9499 + }, + { + "epoch": 7.480504135486412, + "grad_norm": 0.5161780714988708, + "learning_rate": 2.8482000000000003e-05, + "loss": 0.0363, + "step": 9500 + }, + { + "epoch": 7.481291847183931, + "grad_norm": 0.3427787721157074, + "learning_rate": 2.8485000000000003e-05, + "loss": 0.0239, + "step": 9501 + }, + { + "epoch": 7.482079558881449, + "grad_norm": 0.45306071639060974, + "learning_rate": 2.8488000000000002e-05, + "loss": 0.0142, + "step": 9502 + }, + { + "epoch": 7.482867270578968, + "grad_norm": 0.27234840393066406, + "learning_rate": 2.8491e-05, + "loss": 0.0202, + "step": 9503 + }, + { + "epoch": 7.483654982276486, + "grad_norm": 0.2947995066642761, + "learning_rate": 2.8494e-05, + "loss": 0.0159, + "step": 9504 + }, + { + "epoch": 7.484442693974006, + "grad_norm": 0.2674362063407898, + "learning_rate": 2.8497e-05, + "loss": 0.0219, + "step": 9505 + }, + { + "epoch": 7.485230405671524, + "grad_norm": 0.2694786489009857, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.0264, + "step": 9506 + }, + { + "epoch": 7.486018117369043, + "grad_norm": 0.2813625931739807, + "learning_rate": 2.8502999999999998e-05, + "loss": 0.0149, + "step": 9507 + }, + { + "epoch": 7.486805829066562, + "grad_norm": 1.6011557579040527, + "learning_rate": 2.8506e-05, + "loss": 0.0159, + "step": 9508 + }, + { + "epoch": 7.48759354076408, + "grad_norm": 0.37383896112442017, + "learning_rate": 2.8509e-05, + "loss": 0.0128, + "step": 9509 + }, + { + "epoch": 7.488381252461599, + "grad_norm": 0.2933398485183716, + "learning_rate": 2.8512e-05, + "loss": 0.0208, + "step": 9510 + }, + { + "epoch": 7.489168964159118, + "grad_norm": 0.5358808636665344, + "learning_rate": 2.8515e-05, + "loss": 0.0545, + "step": 9511 + }, + { + "epoch": 7.489956675856637, + "grad_norm": 0.3708990216255188, + "learning_rate": 2.8518e-05, + "loss": 0.0239, + "step": 9512 + }, + { + "epoch": 7.490744387554155, + "grad_norm": 0.18181723356246948, + "learning_rate": 2.8521e-05, + "loss": 0.012, + "step": 9513 + }, + { + "epoch": 7.491532099251674, + "grad_norm": 0.30967801809310913, + "learning_rate": 2.8524e-05, + "loss": 0.0153, + "step": 9514 + }, + { + "epoch": 7.492319810949192, + "grad_norm": 0.34300723671913147, + "learning_rate": 2.8527e-05, + "loss": 0.0194, + "step": 9515 + }, + { + "epoch": 7.493107522646711, + "grad_norm": 0.6487427949905396, + "learning_rate": 2.853e-05, + "loss": 0.0219, + "step": 9516 + }, + { + "epoch": 7.4938952343442296, + "grad_norm": 0.46490195393562317, + "learning_rate": 2.8533e-05, + "loss": 0.0225, + "step": 9517 + }, + { + "epoch": 7.494682946041749, + "grad_norm": 0.23683254420757294, + "learning_rate": 2.8536000000000003e-05, + "loss": 0.0172, + "step": 9518 + }, + { + "epoch": 7.495470657739268, + "grad_norm": 0.32592350244522095, + "learning_rate": 2.8539000000000002e-05, + "loss": 0.0226, + "step": 9519 + }, + { + "epoch": 7.496258369436786, + "grad_norm": 0.24995455145835876, + "learning_rate": 2.8542000000000002e-05, + "loss": 0.0161, + "step": 9520 + }, + { + 
"epoch": 7.497046081134305, + "grad_norm": 0.4401668310165405, + "learning_rate": 2.8545000000000002e-05, + "loss": 0.0185, + "step": 9521 + }, + { + "epoch": 7.497833792831823, + "grad_norm": 0.3569294810295105, + "learning_rate": 2.8548000000000002e-05, + "loss": 0.0133, + "step": 9522 + }, + { + "epoch": 7.498621504529343, + "grad_norm": 0.43540090322494507, + "learning_rate": 2.8551e-05, + "loss": 0.032, + "step": 9523 + }, + { + "epoch": 7.499409216226861, + "grad_norm": 0.4189586043357849, + "learning_rate": 2.8554e-05, + "loss": 0.0237, + "step": 9524 + }, + { + "epoch": 7.50019692792438, + "grad_norm": 0.6270734071731567, + "learning_rate": 2.8557e-05, + "loss": 0.0256, + "step": 9525 + }, + { + "epoch": 7.500984639621898, + "grad_norm": 0.3651023209095001, + "learning_rate": 2.856e-05, + "loss": 0.0198, + "step": 9526 + }, + { + "epoch": 7.501772351319417, + "grad_norm": 0.4390255808830261, + "learning_rate": 2.8563e-05, + "loss": 0.0241, + "step": 9527 + }, + { + "epoch": 7.502560063016936, + "grad_norm": 0.6536065340042114, + "learning_rate": 2.8566e-05, + "loss": 0.0134, + "step": 9528 + }, + { + "epoch": 7.503347774714454, + "grad_norm": 0.19978971779346466, + "learning_rate": 2.8569e-05, + "loss": 0.0084, + "step": 9529 + }, + { + "epoch": 7.504135486411974, + "grad_norm": 0.3205675780773163, + "learning_rate": 2.8572e-05, + "loss": 0.0161, + "step": 9530 + }, + { + "epoch": 7.504923198109492, + "grad_norm": 0.3792251646518707, + "learning_rate": 2.8575e-05, + "loss": 0.0278, + "step": 9531 + }, + { + "epoch": 7.505710909807011, + "grad_norm": 0.7396852374076843, + "learning_rate": 2.8578e-05, + "loss": 0.0532, + "step": 9532 + }, + { + "epoch": 7.506498621504529, + "grad_norm": 0.3749678432941437, + "learning_rate": 2.8581e-05, + "loss": 0.0144, + "step": 9533 + }, + { + "epoch": 7.507286333202048, + "grad_norm": 0.48719024658203125, + "learning_rate": 2.8584e-05, + "loss": 0.0417, + "step": 9534 + }, + { + "epoch": 7.508074044899566, + "grad_norm": 0.3702293336391449, + "learning_rate": 2.8587e-05, + "loss": 0.0233, + "step": 9535 + }, + { + "epoch": 7.508861756597085, + "grad_norm": 0.42110446095466614, + "learning_rate": 2.859e-05, + "loss": 0.0199, + "step": 9536 + }, + { + "epoch": 7.5096494682946044, + "grad_norm": 0.6212843656539917, + "learning_rate": 2.8593e-05, + "loss": 0.0243, + "step": 9537 + }, + { + "epoch": 7.510437179992123, + "grad_norm": 0.17003938555717468, + "learning_rate": 2.8596000000000002e-05, + "loss": 0.0148, + "step": 9538 + }, + { + "epoch": 7.511224891689642, + "grad_norm": 0.4239954948425293, + "learning_rate": 2.8599000000000002e-05, + "loss": 0.0234, + "step": 9539 + }, + { + "epoch": 7.51201260338716, + "grad_norm": 0.4800662100315094, + "learning_rate": 2.8602e-05, + "loss": 0.0266, + "step": 9540 + }, + { + "epoch": 7.512800315084679, + "grad_norm": 0.8628897070884705, + "learning_rate": 2.8605e-05, + "loss": 0.3102, + "step": 9541 + }, + { + "epoch": 7.513588026782198, + "grad_norm": 0.7337709069252014, + "learning_rate": 2.8608e-05, + "loss": 0.1969, + "step": 9542 + }, + { + "epoch": 7.514375738479717, + "grad_norm": 0.6387009024620056, + "learning_rate": 2.8611e-05, + "loss": 0.1519, + "step": 9543 + }, + { + "epoch": 7.515163450177235, + "grad_norm": 0.39766213297843933, + "learning_rate": 2.8614e-05, + "loss": 0.0969, + "step": 9544 + }, + { + "epoch": 7.515951161874754, + "grad_norm": 0.38161730766296387, + "learning_rate": 2.8617e-05, + "loss": 0.0634, + "step": 9545 + }, + { + "epoch": 7.516738873572272, + "grad_norm": 
0.26424890756607056, + "learning_rate": 2.862e-05, + "loss": 0.045, + "step": 9546 + }, + { + "epoch": 7.517526585269791, + "grad_norm": 0.4924035370349884, + "learning_rate": 2.8623e-05, + "loss": 0.0355, + "step": 9547 + }, + { + "epoch": 7.51831429696731, + "grad_norm": 0.4172055721282959, + "learning_rate": 2.8626000000000003e-05, + "loss": 0.0188, + "step": 9548 + }, + { + "epoch": 7.519102008664829, + "grad_norm": 0.535997748374939, + "learning_rate": 2.8629000000000003e-05, + "loss": 0.0212, + "step": 9549 + }, + { + "epoch": 7.519889720362348, + "grad_norm": 0.5128630995750427, + "learning_rate": 2.8632000000000003e-05, + "loss": 0.0263, + "step": 9550 + }, + { + "epoch": 7.520677432059866, + "grad_norm": 0.2722286581993103, + "learning_rate": 2.8635000000000003e-05, + "loss": 0.0152, + "step": 9551 + }, + { + "epoch": 7.521465143757385, + "grad_norm": 0.3591596186161041, + "learning_rate": 2.8638e-05, + "loss": 0.036, + "step": 9552 + }, + { + "epoch": 7.522252855454903, + "grad_norm": 0.4938894808292389, + "learning_rate": 2.8641e-05, + "loss": 0.0375, + "step": 9553 + }, + { + "epoch": 7.523040567152422, + "grad_norm": 0.5046077966690063, + "learning_rate": 2.8644e-05, + "loss": 0.0239, + "step": 9554 + }, + { + "epoch": 7.52382827884994, + "grad_norm": 0.533744752407074, + "learning_rate": 2.8647e-05, + "loss": 0.0153, + "step": 9555 + }, + { + "epoch": 7.52461599054746, + "grad_norm": 0.7815026044845581, + "learning_rate": 2.865e-05, + "loss": 0.0185, + "step": 9556 + }, + { + "epoch": 7.5254037022449785, + "grad_norm": 0.23155082762241364, + "learning_rate": 2.8652999999999998e-05, + "loss": 0.0128, + "step": 9557 + }, + { + "epoch": 7.526191413942497, + "grad_norm": 0.3388988673686981, + "learning_rate": 2.8656e-05, + "loss": 0.0203, + "step": 9558 + }, + { + "epoch": 7.526979125640016, + "grad_norm": 0.3397889733314514, + "learning_rate": 2.8659e-05, + "loss": 0.0249, + "step": 9559 + }, + { + "epoch": 7.527766837337534, + "grad_norm": 0.32225510478019714, + "learning_rate": 2.8662e-05, + "loss": 0.016, + "step": 9560 + }, + { + "epoch": 7.528554549035054, + "grad_norm": 0.16767466068267822, + "learning_rate": 2.8665e-05, + "loss": 0.0127, + "step": 9561 + }, + { + "epoch": 7.529342260732572, + "grad_norm": 0.27418217062950134, + "learning_rate": 2.8668e-05, + "loss": 0.0184, + "step": 9562 + }, + { + "epoch": 7.530129972430091, + "grad_norm": 0.5084366798400879, + "learning_rate": 2.8671e-05, + "loss": 0.0645, + "step": 9563 + }, + { + "epoch": 7.530917684127609, + "grad_norm": 0.31041622161865234, + "learning_rate": 2.8674e-05, + "loss": 0.0286, + "step": 9564 + }, + { + "epoch": 7.531705395825128, + "grad_norm": 0.28838029503822327, + "learning_rate": 2.8677e-05, + "loss": 0.0183, + "step": 9565 + }, + { + "epoch": 7.5324931075226464, + "grad_norm": 0.7821999192237854, + "learning_rate": 2.868e-05, + "loss": 0.0157, + "step": 9566 + }, + { + "epoch": 7.533280819220165, + "grad_norm": 0.3544462323188782, + "learning_rate": 2.8683e-05, + "loss": 0.0238, + "step": 9567 + }, + { + "epoch": 7.5340685309176845, + "grad_norm": 0.415083646774292, + "learning_rate": 2.8686000000000003e-05, + "loss": 0.0203, + "step": 9568 + }, + { + "epoch": 7.534856242615203, + "grad_norm": 0.36948442459106445, + "learning_rate": 2.8689000000000003e-05, + "loss": 0.0236, + "step": 9569 + }, + { + "epoch": 7.535643954312722, + "grad_norm": 0.40171805024147034, + "learning_rate": 2.8692000000000002e-05, + "loss": 0.0156, + "step": 9570 + }, + { + "epoch": 7.53643166601024, + "grad_norm": 
0.2949947416782379, + "learning_rate": 2.8695000000000002e-05, + "loss": 0.0211, + "step": 9571 + }, + { + "epoch": 7.537219377707759, + "grad_norm": 0.2445915937423706, + "learning_rate": 2.8698000000000002e-05, + "loss": 0.0166, + "step": 9572 + }, + { + "epoch": 7.538007089405277, + "grad_norm": 0.3337986171245575, + "learning_rate": 2.8701000000000002e-05, + "loss": 0.0291, + "step": 9573 + }, + { + "epoch": 7.538794801102797, + "grad_norm": 0.368010014295578, + "learning_rate": 2.8704e-05, + "loss": 0.0382, + "step": 9574 + }, + { + "epoch": 7.539582512800315, + "grad_norm": 0.1799900084733963, + "learning_rate": 2.8707e-05, + "loss": 0.0129, + "step": 9575 + }, + { + "epoch": 7.540370224497834, + "grad_norm": 0.5950950384140015, + "learning_rate": 2.871e-05, + "loss": 0.021, + "step": 9576 + }, + { + "epoch": 7.5411579361953525, + "grad_norm": 0.42837315797805786, + "learning_rate": 2.8712999999999998e-05, + "loss": 0.0169, + "step": 9577 + }, + { + "epoch": 7.541945647892871, + "grad_norm": 0.21233956515789032, + "learning_rate": 2.8716e-05, + "loss": 0.0162, + "step": 9578 + }, + { + "epoch": 7.54273335959039, + "grad_norm": 0.6875776052474976, + "learning_rate": 2.8719e-05, + "loss": 0.0237, + "step": 9579 + }, + { + "epoch": 7.543521071287909, + "grad_norm": 0.2700407803058624, + "learning_rate": 2.8722e-05, + "loss": 0.0166, + "step": 9580 + }, + { + "epoch": 7.544308782985428, + "grad_norm": 0.4830089807510376, + "learning_rate": 2.8725e-05, + "loss": 0.0178, + "step": 9581 + }, + { + "epoch": 7.545096494682946, + "grad_norm": 0.3071545362472534, + "learning_rate": 2.8728e-05, + "loss": 0.0136, + "step": 9582 + }, + { + "epoch": 7.545884206380465, + "grad_norm": 0.2836977243423462, + "learning_rate": 2.8731e-05, + "loss": 0.0153, + "step": 9583 + }, + { + "epoch": 7.546671918077983, + "grad_norm": 0.4289986491203308, + "learning_rate": 2.8734e-05, + "loss": 0.0234, + "step": 9584 + }, + { + "epoch": 7.547459629775502, + "grad_norm": 0.2515062689781189, + "learning_rate": 2.8737e-05, + "loss": 0.011, + "step": 9585 + }, + { + "epoch": 7.5482473414730205, + "grad_norm": 0.4549204707145691, + "learning_rate": 2.874e-05, + "loss": 0.0262, + "step": 9586 + }, + { + "epoch": 7.54903505317054, + "grad_norm": 0.34890016913414, + "learning_rate": 2.8743e-05, + "loss": 0.0098, + "step": 9587 + }, + { + "epoch": 7.5498227648680585, + "grad_norm": 0.5123046040534973, + "learning_rate": 2.8746000000000002e-05, + "loss": 0.0241, + "step": 9588 + }, + { + "epoch": 7.550610476565577, + "grad_norm": 0.31074729561805725, + "learning_rate": 2.8749000000000002e-05, + "loss": 0.012, + "step": 9589 + }, + { + "epoch": 7.551398188263096, + "grad_norm": 0.6151841282844543, + "learning_rate": 2.8752000000000002e-05, + "loss": 0.027, + "step": 9590 + }, + { + "epoch": 7.552185899960614, + "grad_norm": 1.1363176107406616, + "learning_rate": 2.8755e-05, + "loss": 0.3041, + "step": 9591 + }, + { + "epoch": 7.552973611658133, + "grad_norm": 0.806398868560791, + "learning_rate": 2.8758e-05, + "loss": 0.1941, + "step": 9592 + }, + { + "epoch": 7.553761323355652, + "grad_norm": 0.7902489900588989, + "learning_rate": 2.8761e-05, + "loss": 0.1244, + "step": 9593 + }, + { + "epoch": 7.554549035053171, + "grad_norm": 0.8128259778022766, + "learning_rate": 2.8764e-05, + "loss": 0.1462, + "step": 9594 + }, + { + "epoch": 7.555336746750689, + "grad_norm": 0.889928936958313, + "learning_rate": 2.8767e-05, + "loss": 0.133, + "step": 9595 + }, + { + "epoch": 7.556124458448208, + "grad_norm": 0.5058755278587341, + 
"learning_rate": 2.877e-05, + "loss": 0.086, + "step": 9596 + }, + { + "epoch": 7.5569121701457265, + "grad_norm": 1.2744758129119873, + "learning_rate": 2.8773e-05, + "loss": 0.0418, + "step": 9597 + }, + { + "epoch": 7.557699881843245, + "grad_norm": 0.20168446004390717, + "learning_rate": 2.8776000000000004e-05, + "loss": 0.0184, + "step": 9598 + }, + { + "epoch": 7.5584875935407645, + "grad_norm": 0.4915613532066345, + "learning_rate": 2.8779000000000003e-05, + "loss": 0.0225, + "step": 9599 + }, + { + "epoch": 7.559275305238283, + "grad_norm": 0.21234330534934998, + "learning_rate": 2.8782000000000003e-05, + "loss": 0.0194, + "step": 9600 + }, + { + "epoch": 7.560063016935802, + "grad_norm": 0.22556288540363312, + "learning_rate": 2.8785e-05, + "loss": 0.0132, + "step": 9601 + }, + { + "epoch": 7.56085072863332, + "grad_norm": 0.3720574378967285, + "learning_rate": 2.8788e-05, + "loss": 0.0296, + "step": 9602 + }, + { + "epoch": 7.561638440330839, + "grad_norm": 0.4795766770839691, + "learning_rate": 2.8791e-05, + "loss": 0.0618, + "step": 9603 + }, + { + "epoch": 7.562426152028357, + "grad_norm": 0.2748098373413086, + "learning_rate": 2.8794e-05, + "loss": 0.0512, + "step": 9604 + }, + { + "epoch": 7.563213863725876, + "grad_norm": 0.2210908681154251, + "learning_rate": 2.8797e-05, + "loss": 0.0134, + "step": 9605 + }, + { + "epoch": 7.564001575423395, + "grad_norm": 0.1816251128911972, + "learning_rate": 2.88e-05, + "loss": 0.0126, + "step": 9606 + }, + { + "epoch": 7.564789287120914, + "grad_norm": 0.39714154601097107, + "learning_rate": 2.8803e-05, + "loss": 0.0308, + "step": 9607 + }, + { + "epoch": 7.5655769988184325, + "grad_norm": 0.3943287432193756, + "learning_rate": 2.8806e-05, + "loss": 0.0236, + "step": 9608 + }, + { + "epoch": 7.566364710515951, + "grad_norm": 0.45946958661079407, + "learning_rate": 2.8809e-05, + "loss": 0.0227, + "step": 9609 + }, + { + "epoch": 7.56715242221347, + "grad_norm": 0.8898782730102539, + "learning_rate": 2.8812e-05, + "loss": 0.0229, + "step": 9610 + }, + { + "epoch": 7.567940133910989, + "grad_norm": 0.519927978515625, + "learning_rate": 2.8815e-05, + "loss": 0.0198, + "step": 9611 + }, + { + "epoch": 7.568727845608508, + "grad_norm": 0.48438942432403564, + "learning_rate": 2.8818e-05, + "loss": 0.0219, + "step": 9612 + }, + { + "epoch": 7.569515557306026, + "grad_norm": 0.35413962602615356, + "learning_rate": 2.8821e-05, + "loss": 0.0233, + "step": 9613 + }, + { + "epoch": 7.570303269003545, + "grad_norm": 1.1025956869125366, + "learning_rate": 2.8824e-05, + "loss": 0.0221, + "step": 9614 + }, + { + "epoch": 7.571090980701063, + "grad_norm": 0.3340371251106262, + "learning_rate": 2.8827e-05, + "loss": 0.0217, + "step": 9615 + }, + { + "epoch": 7.571878692398582, + "grad_norm": 0.4485151469707489, + "learning_rate": 2.883e-05, + "loss": 0.0091, + "step": 9616 + }, + { + "epoch": 7.5726664040961005, + "grad_norm": 0.20464059710502625, + "learning_rate": 2.8833e-05, + "loss": 0.0147, + "step": 9617 + }, + { + "epoch": 7.57345411579362, + "grad_norm": 0.43677496910095215, + "learning_rate": 2.8836000000000003e-05, + "loss": 0.0304, + "step": 9618 + }, + { + "epoch": 7.5742418274911385, + "grad_norm": 0.33427974581718445, + "learning_rate": 2.8839000000000003e-05, + "loss": 0.0175, + "step": 9619 + }, + { + "epoch": 7.575029539188657, + "grad_norm": 0.352961927652359, + "learning_rate": 2.8842000000000003e-05, + "loss": 0.0126, + "step": 9620 + }, + { + "epoch": 7.575817250886176, + "grad_norm": 0.3347214460372925, + "learning_rate": 
2.8845000000000003e-05, + "loss": 0.0196, + "step": 9621 + }, + { + "epoch": 7.576604962583694, + "grad_norm": 0.4533666968345642, + "learning_rate": 2.8848000000000002e-05, + "loss": 0.0207, + "step": 9622 + }, + { + "epoch": 7.577392674281213, + "grad_norm": 0.556059718132019, + "learning_rate": 2.8851000000000002e-05, + "loss": 0.0177, + "step": 9623 + }, + { + "epoch": 7.578180385978731, + "grad_norm": 0.3831595480442047, + "learning_rate": 2.8854000000000002e-05, + "loss": 0.022, + "step": 9624 + }, + { + "epoch": 7.578968097676251, + "grad_norm": 0.6382478475570679, + "learning_rate": 2.8857000000000002e-05, + "loss": 0.0205, + "step": 9625 + }, + { + "epoch": 7.579755809373769, + "grad_norm": 0.3533719778060913, + "learning_rate": 2.8859999999999998e-05, + "loss": 0.0153, + "step": 9626 + }, + { + "epoch": 7.580543521071288, + "grad_norm": 0.3493441641330719, + "learning_rate": 2.8862999999999998e-05, + "loss": 0.0188, + "step": 9627 + }, + { + "epoch": 7.5813312327688065, + "grad_norm": 2.682837724685669, + "learning_rate": 2.8866e-05, + "loss": 0.0187, + "step": 9628 + }, + { + "epoch": 7.582118944466325, + "grad_norm": 0.39747828245162964, + "learning_rate": 2.8869e-05, + "loss": 0.0129, + "step": 9629 + }, + { + "epoch": 7.5829066561638445, + "grad_norm": 0.30434808135032654, + "learning_rate": 2.8872e-05, + "loss": 0.0255, + "step": 9630 + }, + { + "epoch": 7.583694367861363, + "grad_norm": 0.3036974370479584, + "learning_rate": 2.8875e-05, + "loss": 0.024, + "step": 9631 + }, + { + "epoch": 7.584482079558882, + "grad_norm": 0.29212531447410583, + "learning_rate": 2.8878e-05, + "loss": 0.0128, + "step": 9632 + }, + { + "epoch": 7.5852697912564, + "grad_norm": 0.18039415776729584, + "learning_rate": 2.8881e-05, + "loss": 0.0102, + "step": 9633 + }, + { + "epoch": 7.586057502953919, + "grad_norm": 0.536525547504425, + "learning_rate": 2.8884e-05, + "loss": 0.0315, + "step": 9634 + }, + { + "epoch": 7.586845214651437, + "grad_norm": 0.7220152020454407, + "learning_rate": 2.8887e-05, + "loss": 0.0369, + "step": 9635 + }, + { + "epoch": 7.587632926348956, + "grad_norm": 0.30802199244499207, + "learning_rate": 2.889e-05, + "loss": 0.0168, + "step": 9636 + }, + { + "epoch": 7.588420638046475, + "grad_norm": 0.3092663884162903, + "learning_rate": 2.8893e-05, + "loss": 0.0143, + "step": 9637 + }, + { + "epoch": 7.589208349743994, + "grad_norm": 0.4241114854812622, + "learning_rate": 2.8896e-05, + "loss": 0.0129, + "step": 9638 + }, + { + "epoch": 7.5899960614415125, + "grad_norm": 0.6041476726531982, + "learning_rate": 2.8899000000000002e-05, + "loss": 0.025, + "step": 9639 + }, + { + "epoch": 7.590783773139031, + "grad_norm": 1.7348552942276, + "learning_rate": 2.8902000000000002e-05, + "loss": 0.0402, + "step": 9640 + }, + { + "epoch": 7.59157148483655, + "grad_norm": 0.7205200791358948, + "learning_rate": 2.8905000000000002e-05, + "loss": 0.2366, + "step": 9641 + }, + { + "epoch": 7.592359196534068, + "grad_norm": 0.8140365481376648, + "learning_rate": 2.8908000000000002e-05, + "loss": 0.2728, + "step": 9642 + }, + { + "epoch": 7.593146908231587, + "grad_norm": 0.7281056046485901, + "learning_rate": 2.8911e-05, + "loss": 0.1664, + "step": 9643 + }, + { + "epoch": 7.593934619929106, + "grad_norm": 0.6317964792251587, + "learning_rate": 2.8914e-05, + "loss": 0.0966, + "step": 9644 + }, + { + "epoch": 7.594722331626625, + "grad_norm": 0.6540931463241577, + "learning_rate": 2.8917e-05, + "loss": 0.0788, + "step": 9645 + }, + { + "epoch": 7.595510043324143, + "grad_norm": 
0.6021493077278137, + "learning_rate": 2.892e-05, + "loss": 0.0341, + "step": 9646 + }, + { + "epoch": 7.596297755021662, + "grad_norm": 0.2749694287776947, + "learning_rate": 2.8923e-05, + "loss": 0.0312, + "step": 9647 + }, + { + "epoch": 7.5970854667191805, + "grad_norm": 0.28776195645332336, + "learning_rate": 2.8926e-05, + "loss": 0.0428, + "step": 9648 + }, + { + "epoch": 7.5978731784167, + "grad_norm": 0.4607768952846527, + "learning_rate": 2.8929000000000004e-05, + "loss": 0.0422, + "step": 9649 + }, + { + "epoch": 7.5986608901142185, + "grad_norm": 0.2764870226383209, + "learning_rate": 2.8932e-05, + "loss": 0.0225, + "step": 9650 + }, + { + "epoch": 7.599448601811737, + "grad_norm": 0.3116981089115143, + "learning_rate": 2.8935e-05, + "loss": 0.0151, + "step": 9651 + }, + { + "epoch": 7.600236313509256, + "grad_norm": 0.6980766654014587, + "learning_rate": 2.8938e-05, + "loss": 0.0411, + "step": 9652 + }, + { + "epoch": 7.601024025206774, + "grad_norm": 0.5314425826072693, + "learning_rate": 2.8941e-05, + "loss": 0.0294, + "step": 9653 + }, + { + "epoch": 7.601811736904293, + "grad_norm": 0.2454226016998291, + "learning_rate": 2.8944e-05, + "loss": 0.0173, + "step": 9654 + }, + { + "epoch": 7.602599448601811, + "grad_norm": 0.2663485109806061, + "learning_rate": 2.8947e-05, + "loss": 0.0192, + "step": 9655 + }, + { + "epoch": 7.603387160299331, + "grad_norm": 0.2172231525182724, + "learning_rate": 2.895e-05, + "loss": 0.0151, + "step": 9656 + }, + { + "epoch": 7.604174871996849, + "grad_norm": 0.3011157810688019, + "learning_rate": 2.8953e-05, + "loss": 0.019, + "step": 9657 + }, + { + "epoch": 7.604962583694368, + "grad_norm": 0.18559393286705017, + "learning_rate": 2.8956e-05, + "loss": 0.0111, + "step": 9658 + }, + { + "epoch": 7.6057502953918865, + "grad_norm": 0.4999474287033081, + "learning_rate": 2.8959000000000002e-05, + "loss": 0.031, + "step": 9659 + }, + { + "epoch": 7.606538007089405, + "grad_norm": 0.35611477494239807, + "learning_rate": 2.8962e-05, + "loss": 0.0192, + "step": 9660 + }, + { + "epoch": 7.607325718786924, + "grad_norm": 0.48935195803642273, + "learning_rate": 2.8965e-05, + "loss": 0.0267, + "step": 9661 + }, + { + "epoch": 7.608113430484442, + "grad_norm": 0.28037941455841064, + "learning_rate": 2.8968e-05, + "loss": 0.0212, + "step": 9662 + }, + { + "epoch": 7.608901142181962, + "grad_norm": 0.14604216814041138, + "learning_rate": 2.8971e-05, + "loss": 0.0098, + "step": 9663 + }, + { + "epoch": 7.60968885387948, + "grad_norm": 0.4383922517299652, + "learning_rate": 2.8974e-05, + "loss": 0.0159, + "step": 9664 + }, + { + "epoch": 7.610476565576999, + "grad_norm": 0.32968297600746155, + "learning_rate": 2.8977e-05, + "loss": 0.0183, + "step": 9665 + }, + { + "epoch": 7.611264277274517, + "grad_norm": 0.42090025544166565, + "learning_rate": 2.898e-05, + "loss": 0.0546, + "step": 9666 + }, + { + "epoch": 7.612051988972036, + "grad_norm": 0.6887231469154358, + "learning_rate": 2.8983e-05, + "loss": 0.0224, + "step": 9667 + }, + { + "epoch": 7.612839700669555, + "grad_norm": 0.3129349648952484, + "learning_rate": 2.8986e-05, + "loss": 0.022, + "step": 9668 + }, + { + "epoch": 7.613627412367074, + "grad_norm": 0.5965191721916199, + "learning_rate": 2.8989000000000003e-05, + "loss": 0.0236, + "step": 9669 + }, + { + "epoch": 7.6144151240645925, + "grad_norm": 0.2942030131816864, + "learning_rate": 2.8992000000000003e-05, + "loss": 0.0181, + "step": 9670 + }, + { + "epoch": 7.615202835762111, + "grad_norm": 0.4124559760093689, + "learning_rate": 
2.8995000000000003e-05, + "loss": 0.031, + "step": 9671 + }, + { + "epoch": 7.61599054745963, + "grad_norm": 0.31595122814178467, + "learning_rate": 2.8998000000000003e-05, + "loss": 0.019, + "step": 9672 + }, + { + "epoch": 7.616778259157148, + "grad_norm": 0.36955395340919495, + "learning_rate": 2.9001000000000002e-05, + "loss": 0.0166, + "step": 9673 + }, + { + "epoch": 7.617565970854667, + "grad_norm": 0.3815973699092865, + "learning_rate": 2.9004000000000002e-05, + "loss": 0.0188, + "step": 9674 + }, + { + "epoch": 7.618353682552186, + "grad_norm": 0.3135644197463989, + "learning_rate": 2.9007e-05, + "loss": 0.0109, + "step": 9675 + }, + { + "epoch": 7.619141394249705, + "grad_norm": 0.8388550877571106, + "learning_rate": 2.901e-05, + "loss": 0.0248, + "step": 9676 + }, + { + "epoch": 7.619929105947223, + "grad_norm": 0.23506979644298553, + "learning_rate": 2.9012999999999998e-05, + "loss": 0.0148, + "step": 9677 + }, + { + "epoch": 7.620716817644742, + "grad_norm": 0.2618098855018616, + "learning_rate": 2.9015999999999998e-05, + "loss": 0.0217, + "step": 9678 + }, + { + "epoch": 7.6215045293422605, + "grad_norm": 0.2581983208656311, + "learning_rate": 2.9019e-05, + "loss": 0.0165, + "step": 9679 + }, + { + "epoch": 7.622292241039779, + "grad_norm": 0.3858345150947571, + "learning_rate": 2.9022e-05, + "loss": 0.0238, + "step": 9680 + }, + { + "epoch": 7.623079952737298, + "grad_norm": 0.4625793993473053, + "learning_rate": 2.9025e-05, + "loss": 0.0169, + "step": 9681 + }, + { + "epoch": 7.623867664434817, + "grad_norm": 0.6696707606315613, + "learning_rate": 2.9028e-05, + "loss": 0.021, + "step": 9682 + }, + { + "epoch": 7.624655376132336, + "grad_norm": 0.7681811451911926, + "learning_rate": 2.9031e-05, + "loss": 0.0235, + "step": 9683 + }, + { + "epoch": 7.625443087829854, + "grad_norm": 0.4431666135787964, + "learning_rate": 2.9034e-05, + "loss": 0.0395, + "step": 9684 + }, + { + "epoch": 7.626230799527373, + "grad_norm": 0.4848262667655945, + "learning_rate": 2.9037e-05, + "loss": 0.0331, + "step": 9685 + }, + { + "epoch": 7.627018511224891, + "grad_norm": 0.43949463963508606, + "learning_rate": 2.904e-05, + "loss": 0.019, + "step": 9686 + }, + { + "epoch": 7.627806222922411, + "grad_norm": 0.5391579270362854, + "learning_rate": 2.9043e-05, + "loss": 0.0271, + "step": 9687 + }, + { + "epoch": 7.628593934619929, + "grad_norm": 0.5303795337677002, + "learning_rate": 2.9046e-05, + "loss": 0.0339, + "step": 9688 + }, + { + "epoch": 7.629381646317448, + "grad_norm": 1.7508821487426758, + "learning_rate": 2.9049000000000003e-05, + "loss": 0.043, + "step": 9689 + }, + { + "epoch": 7.6301693580149665, + "grad_norm": 0.5885848999023438, + "learning_rate": 2.9052000000000002e-05, + "loss": 0.0313, + "step": 9690 + }, + { + "epoch": 7.630957069712485, + "grad_norm": 0.7263497114181519, + "learning_rate": 2.9055000000000002e-05, + "loss": 0.2866, + "step": 9691 + }, + { + "epoch": 7.631744781410004, + "grad_norm": 0.6145999431610107, + "learning_rate": 2.9058000000000002e-05, + "loss": 0.2132, + "step": 9692 + }, + { + "epoch": 7.632532493107522, + "grad_norm": 0.8531537652015686, + "learning_rate": 2.9061000000000002e-05, + "loss": 0.2665, + "step": 9693 + }, + { + "epoch": 7.633320204805042, + "grad_norm": 1.0360902547836304, + "learning_rate": 2.9064e-05, + "loss": 0.1727, + "step": 9694 + }, + { + "epoch": 7.63410791650256, + "grad_norm": 0.9552410244941711, + "learning_rate": 2.9067e-05, + "loss": 0.0915, + "step": 9695 + }, + { + "epoch": 7.634895628200079, + "grad_norm": 
0.4637034833431244, + "learning_rate": 2.907e-05, + "loss": 0.0723, + "step": 9696 + }, + { + "epoch": 7.635683339897597, + "grad_norm": 0.2521613538265228, + "learning_rate": 2.9073e-05, + "loss": 0.0285, + "step": 9697 + }, + { + "epoch": 7.636471051595116, + "grad_norm": 0.5126423239707947, + "learning_rate": 2.9076e-05, + "loss": 0.0439, + "step": 9698 + }, + { + "epoch": 7.6372587632926345, + "grad_norm": 0.47113877534866333, + "learning_rate": 2.9079e-05, + "loss": 0.0544, + "step": 9699 + }, + { + "epoch": 7.638046474990154, + "grad_norm": 0.4243835508823395, + "learning_rate": 2.9082e-05, + "loss": 0.0271, + "step": 9700 + }, + { + "epoch": 7.638834186687673, + "grad_norm": 0.2313486784696579, + "learning_rate": 2.9085e-05, + "loss": 0.0282, + "step": 9701 + }, + { + "epoch": 7.639621898385191, + "grad_norm": 0.4304080605506897, + "learning_rate": 2.9088e-05, + "loss": 0.0225, + "step": 9702 + }, + { + "epoch": 7.64040961008271, + "grad_norm": 0.2859155237674713, + "learning_rate": 2.9091e-05, + "loss": 0.0213, + "step": 9703 + }, + { + "epoch": 7.641197321780228, + "grad_norm": 0.3095599412918091, + "learning_rate": 2.9094e-05, + "loss": 0.0192, + "step": 9704 + }, + { + "epoch": 7.641985033477747, + "grad_norm": 0.2604365348815918, + "learning_rate": 2.9097e-05, + "loss": 0.0171, + "step": 9705 + }, + { + "epoch": 7.642772745175266, + "grad_norm": 0.43902313709259033, + "learning_rate": 2.91e-05, + "loss": 0.014, + "step": 9706 + }, + { + "epoch": 7.643560456872785, + "grad_norm": 0.3416177034378052, + "learning_rate": 2.9103e-05, + "loss": 0.0148, + "step": 9707 + }, + { + "epoch": 7.644348168570303, + "grad_norm": 0.24253857135772705, + "learning_rate": 2.9106e-05, + "loss": 0.0138, + "step": 9708 + }, + { + "epoch": 7.645135880267822, + "grad_norm": 0.5534826517105103, + "learning_rate": 2.9109000000000002e-05, + "loss": 0.0176, + "step": 9709 + }, + { + "epoch": 7.6459235919653405, + "grad_norm": 0.22599777579307556, + "learning_rate": 2.9112000000000002e-05, + "loss": 0.0137, + "step": 9710 + }, + { + "epoch": 7.646711303662859, + "grad_norm": 0.606767475605011, + "learning_rate": 2.9115e-05, + "loss": 0.0256, + "step": 9711 + }, + { + "epoch": 7.647499015360378, + "grad_norm": 0.82635098695755, + "learning_rate": 2.9118e-05, + "loss": 0.0343, + "step": 9712 + }, + { + "epoch": 7.648286727057897, + "grad_norm": 0.17732077836990356, + "learning_rate": 2.9121e-05, + "loss": 0.0131, + "step": 9713 + }, + { + "epoch": 7.649074438755416, + "grad_norm": 0.21558015048503876, + "learning_rate": 2.9124e-05, + "loss": 0.0063, + "step": 9714 + }, + { + "epoch": 7.649862150452934, + "grad_norm": 0.24400250613689423, + "learning_rate": 2.9127e-05, + "loss": 0.0086, + "step": 9715 + }, + { + "epoch": 7.650649862150453, + "grad_norm": 0.6211650967597961, + "learning_rate": 2.913e-05, + "loss": 0.0242, + "step": 9716 + }, + { + "epoch": 7.651437573847971, + "grad_norm": 0.49437326192855835, + "learning_rate": 2.9133e-05, + "loss": 0.0243, + "step": 9717 + }, + { + "epoch": 7.65222528554549, + "grad_norm": 0.26646238565444946, + "learning_rate": 2.9136e-05, + "loss": 0.0218, + "step": 9718 + }, + { + "epoch": 7.653012997243009, + "grad_norm": 0.3477407693862915, + "learning_rate": 2.9139000000000003e-05, + "loss": 0.0256, + "step": 9719 + }, + { + "epoch": 7.653800708940528, + "grad_norm": 0.23765744268894196, + "learning_rate": 2.9142000000000003e-05, + "loss": 0.0183, + "step": 9720 + }, + { + "epoch": 7.654588420638047, + "grad_norm": 0.47259753942489624, + "learning_rate": 
2.9145000000000003e-05, + "loss": 0.0177, + "step": 9721 + }, + { + "epoch": 7.655376132335565, + "grad_norm": 0.3482713997364044, + "learning_rate": 2.9148000000000003e-05, + "loss": 0.024, + "step": 9722 + }, + { + "epoch": 7.656163844033084, + "grad_norm": 0.2558017671108246, + "learning_rate": 2.9151000000000003e-05, + "loss": 0.0105, + "step": 9723 + }, + { + "epoch": 7.656951555730602, + "grad_norm": 0.38481321930885315, + "learning_rate": 2.9154e-05, + "loss": 0.014, + "step": 9724 + }, + { + "epoch": 7.657739267428122, + "grad_norm": 0.522557258605957, + "learning_rate": 2.9157e-05, + "loss": 0.0229, + "step": 9725 + }, + { + "epoch": 7.65852697912564, + "grad_norm": 3.1902787685394287, + "learning_rate": 2.916e-05, + "loss": 0.0256, + "step": 9726 + }, + { + "epoch": 7.659314690823159, + "grad_norm": 0.2474842369556427, + "learning_rate": 2.9163e-05, + "loss": 0.0238, + "step": 9727 + }, + { + "epoch": 7.660102402520677, + "grad_norm": 0.36471569538116455, + "learning_rate": 2.9165999999999998e-05, + "loss": 0.0251, + "step": 9728 + }, + { + "epoch": 7.660890114218196, + "grad_norm": 0.535297691822052, + "learning_rate": 2.9169e-05, + "loss": 0.0285, + "step": 9729 + }, + { + "epoch": 7.661677825915715, + "grad_norm": 0.5341439247131348, + "learning_rate": 2.9172e-05, + "loss": 0.0214, + "step": 9730 + }, + { + "epoch": 7.662465537613233, + "grad_norm": 0.611795961856842, + "learning_rate": 2.9175e-05, + "loss": 0.0255, + "step": 9731 + }, + { + "epoch": 7.663253249310753, + "grad_norm": 0.22677123546600342, + "learning_rate": 2.9178e-05, + "loss": 0.0106, + "step": 9732 + }, + { + "epoch": 7.664040961008271, + "grad_norm": 0.7533841133117676, + "learning_rate": 2.9181e-05, + "loss": 0.0456, + "step": 9733 + }, + { + "epoch": 7.66482867270579, + "grad_norm": 0.17524930834770203, + "learning_rate": 2.9184e-05, + "loss": 0.013, + "step": 9734 + }, + { + "epoch": 7.665616384403308, + "grad_norm": 0.4391390085220337, + "learning_rate": 2.9187e-05, + "loss": 0.0191, + "step": 9735 + }, + { + "epoch": 7.666404096100827, + "grad_norm": 0.45796042680740356, + "learning_rate": 2.919e-05, + "loss": 0.0285, + "step": 9736 + }, + { + "epoch": 7.667191807798346, + "grad_norm": 0.46300753951072693, + "learning_rate": 2.9193e-05, + "loss": 0.0271, + "step": 9737 + }, + { + "epoch": 7.667979519495865, + "grad_norm": 0.44276124238967896, + "learning_rate": 2.9196e-05, + "loss": 0.0414, + "step": 9738 + }, + { + "epoch": 7.668767231193383, + "grad_norm": 0.9261094331741333, + "learning_rate": 2.9199000000000003e-05, + "loss": 0.0383, + "step": 9739 + }, + { + "epoch": 7.669554942890902, + "grad_norm": 0.48221322894096375, + "learning_rate": 2.9202000000000003e-05, + "loss": 0.0223, + "step": 9740 + }, + { + "epoch": 7.670342654588421, + "grad_norm": 1.234261155128479, + "learning_rate": 2.9205000000000002e-05, + "loss": 0.2538, + "step": 9741 + }, + { + "epoch": 7.671130366285939, + "grad_norm": 0.8024499416351318, + "learning_rate": 2.9208000000000002e-05, + "loss": 0.234, + "step": 9742 + }, + { + "epoch": 7.671918077983458, + "grad_norm": 0.6390267014503479, + "learning_rate": 2.9211000000000002e-05, + "loss": 0.1326, + "step": 9743 + }, + { + "epoch": 7.672705789680977, + "grad_norm": 0.8365198969841003, + "learning_rate": 2.9214000000000002e-05, + "loss": 0.1632, + "step": 9744 + }, + { + "epoch": 7.673493501378496, + "grad_norm": 0.555756688117981, + "learning_rate": 2.9217e-05, + "loss": 0.0985, + "step": 9745 + }, + { + "epoch": 7.674281213076014, + "grad_norm": 0.5765618681907654, + 
"learning_rate": 2.922e-05, + "loss": 0.0889, + "step": 9746 + }, + { + "epoch": 7.675068924773533, + "grad_norm": 0.37500709295272827, + "learning_rate": 2.9223e-05, + "loss": 0.0359, + "step": 9747 + }, + { + "epoch": 7.675856636471051, + "grad_norm": 0.26152363419532776, + "learning_rate": 2.9226e-05, + "loss": 0.0393, + "step": 9748 + }, + { + "epoch": 7.67664434816857, + "grad_norm": 0.24034690856933594, + "learning_rate": 2.9229e-05, + "loss": 0.0199, + "step": 9749 + }, + { + "epoch": 7.677432059866089, + "grad_norm": 0.27973228693008423, + "learning_rate": 2.9232e-05, + "loss": 0.0197, + "step": 9750 + }, + { + "epoch": 7.678219771563608, + "grad_norm": 0.35995662212371826, + "learning_rate": 2.9235e-05, + "loss": 0.0186, + "step": 9751 + }, + { + "epoch": 7.679007483261127, + "grad_norm": 0.2876112163066864, + "learning_rate": 2.9238e-05, + "loss": 0.0198, + "step": 9752 + }, + { + "epoch": 7.679795194958645, + "grad_norm": 0.43074408173561096, + "learning_rate": 2.9241e-05, + "loss": 0.0289, + "step": 9753 + }, + { + "epoch": 7.680582906656164, + "grad_norm": 0.5393575429916382, + "learning_rate": 2.9244e-05, + "loss": 0.0154, + "step": 9754 + }, + { + "epoch": 7.681370618353682, + "grad_norm": 0.1616872102022171, + "learning_rate": 2.9247e-05, + "loss": 0.0093, + "step": 9755 + }, + { + "epoch": 7.682158330051202, + "grad_norm": 0.37812328338623047, + "learning_rate": 2.925e-05, + "loss": 0.0287, + "step": 9756 + }, + { + "epoch": 7.68294604174872, + "grad_norm": 0.6250300407409668, + "learning_rate": 2.9253e-05, + "loss": 0.0248, + "step": 9757 + }, + { + "epoch": 7.683733753446239, + "grad_norm": 0.35114285349845886, + "learning_rate": 2.9256e-05, + "loss": 0.0191, + "step": 9758 + }, + { + "epoch": 7.6845214651437574, + "grad_norm": 0.24905598163604736, + "learning_rate": 2.9259e-05, + "loss": 0.0215, + "step": 9759 + }, + { + "epoch": 7.685309176841276, + "grad_norm": 0.25767529010772705, + "learning_rate": 2.9262000000000002e-05, + "loss": 0.0107, + "step": 9760 + }, + { + "epoch": 7.686096888538795, + "grad_norm": 0.25665056705474854, + "learning_rate": 2.9265000000000002e-05, + "loss": 0.0145, + "step": 9761 + }, + { + "epoch": 7.686884600236313, + "grad_norm": 0.1703498810529709, + "learning_rate": 2.9268e-05, + "loss": 0.0116, + "step": 9762 + }, + { + "epoch": 7.687672311933833, + "grad_norm": 0.24872416257858276, + "learning_rate": 2.9271e-05, + "loss": 0.0127, + "step": 9763 + }, + { + "epoch": 7.688460023631351, + "grad_norm": 0.19726930558681488, + "learning_rate": 2.9274e-05, + "loss": 0.0088, + "step": 9764 + }, + { + "epoch": 7.68924773532887, + "grad_norm": 0.3140941858291626, + "learning_rate": 2.9277e-05, + "loss": 0.0162, + "step": 9765 + }, + { + "epoch": 7.690035447026388, + "grad_norm": 0.49086275696754456, + "learning_rate": 2.928e-05, + "loss": 0.0302, + "step": 9766 + }, + { + "epoch": 7.690823158723907, + "grad_norm": 0.36068618297576904, + "learning_rate": 2.9283e-05, + "loss": 0.0173, + "step": 9767 + }, + { + "epoch": 7.691610870421425, + "grad_norm": 0.47785767912864685, + "learning_rate": 2.9286e-05, + "loss": 0.0153, + "step": 9768 + }, + { + "epoch": 7.692398582118944, + "grad_norm": 0.7807820439338684, + "learning_rate": 2.9289e-05, + "loss": 0.0231, + "step": 9769 + }, + { + "epoch": 7.6931862938164635, + "grad_norm": 0.258829802274704, + "learning_rate": 2.9292000000000003e-05, + "loss": 0.0129, + "step": 9770 + }, + { + "epoch": 7.693974005513982, + "grad_norm": 0.5972117781639099, + "learning_rate": 2.9295000000000003e-05, + "loss": 
0.0349, + "step": 9771 + }, + { + "epoch": 7.694761717211501, + "grad_norm": 0.31325089931488037, + "learning_rate": 2.9298000000000003e-05, + "loss": 0.0143, + "step": 9772 + }, + { + "epoch": 7.695549428909019, + "grad_norm": 0.24331089854240417, + "learning_rate": 2.9301e-05, + "loss": 0.0156, + "step": 9773 + }, + { + "epoch": 7.696337140606538, + "grad_norm": 0.22711758315563202, + "learning_rate": 2.9304e-05, + "loss": 0.0121, + "step": 9774 + }, + { + "epoch": 7.697124852304057, + "grad_norm": 0.29225876927375793, + "learning_rate": 2.9307e-05, + "loss": 0.0182, + "step": 9775 + }, + { + "epoch": 7.697912564001576, + "grad_norm": 0.4324364960193634, + "learning_rate": 2.931e-05, + "loss": 0.0239, + "step": 9776 + }, + { + "epoch": 7.698700275699094, + "grad_norm": 1.181913137435913, + "learning_rate": 2.9313e-05, + "loss": 0.0236, + "step": 9777 + }, + { + "epoch": 7.699487987396613, + "grad_norm": 0.38881051540374756, + "learning_rate": 2.9316e-05, + "loss": 0.0174, + "step": 9778 + }, + { + "epoch": 7.7002756990941315, + "grad_norm": 0.2702881395816803, + "learning_rate": 2.9318999999999998e-05, + "loss": 0.0152, + "step": 9779 + }, + { + "epoch": 7.70106341079165, + "grad_norm": 0.6799097061157227, + "learning_rate": 2.9322e-05, + "loss": 0.032, + "step": 9780 + }, + { + "epoch": 7.701851122489169, + "grad_norm": 0.4291258752346039, + "learning_rate": 2.9325e-05, + "loss": 0.0192, + "step": 9781 + }, + { + "epoch": 7.702638834186688, + "grad_norm": 0.5262780785560608, + "learning_rate": 2.9328e-05, + "loss": 0.013, + "step": 9782 + }, + { + "epoch": 7.703426545884207, + "grad_norm": 0.44398045539855957, + "learning_rate": 2.9331e-05, + "loss": 0.0272, + "step": 9783 + }, + { + "epoch": 7.704214257581725, + "grad_norm": 0.7650591135025024, + "learning_rate": 2.9334e-05, + "loss": 0.0278, + "step": 9784 + }, + { + "epoch": 7.705001969279244, + "grad_norm": 0.4761908948421478, + "learning_rate": 2.9337e-05, + "loss": 0.0337, + "step": 9785 + }, + { + "epoch": 7.705789680976762, + "grad_norm": 0.8317835330963135, + "learning_rate": 2.934e-05, + "loss": 0.0352, + "step": 9786 + }, + { + "epoch": 7.706577392674281, + "grad_norm": 0.25027158856391907, + "learning_rate": 2.9343e-05, + "loss": 0.0146, + "step": 9787 + }, + { + "epoch": 7.7073651043717994, + "grad_norm": 0.6573723554611206, + "learning_rate": 2.9346e-05, + "loss": 0.0356, + "step": 9788 + }, + { + "epoch": 7.708152816069319, + "grad_norm": 0.38286885619163513, + "learning_rate": 2.9349e-05, + "loss": 0.0214, + "step": 9789 + }, + { + "epoch": 7.7089405277668375, + "grad_norm": 0.40845057368278503, + "learning_rate": 2.9352000000000003e-05, + "loss": 0.0222, + "step": 9790 + }, + { + "epoch": 7.709728239464356, + "grad_norm": 0.7108209729194641, + "learning_rate": 2.9355000000000003e-05, + "loss": 0.2229, + "step": 9791 + }, + { + "epoch": 7.710515951161875, + "grad_norm": 0.855822741985321, + "learning_rate": 2.9358000000000003e-05, + "loss": 0.2637, + "step": 9792 + }, + { + "epoch": 7.711303662859393, + "grad_norm": 0.5774127244949341, + "learning_rate": 2.9361000000000002e-05, + "loss": 0.1236, + "step": 9793 + }, + { + "epoch": 7.712091374556913, + "grad_norm": 0.5652918219566345, + "learning_rate": 2.9364000000000002e-05, + "loss": 0.14, + "step": 9794 + }, + { + "epoch": 7.712879086254431, + "grad_norm": 0.5863784551620483, + "learning_rate": 2.9367000000000002e-05, + "loss": 0.0903, + "step": 9795 + }, + { + "epoch": 7.71366679795195, + "grad_norm": 0.41725456714630127, + "learning_rate": 2.9370000000000002e-05, + 
"loss": 0.0498, + "step": 9796 + }, + { + "epoch": 7.714454509649468, + "grad_norm": 0.5668901801109314, + "learning_rate": 2.9373e-05, + "loss": 0.0605, + "step": 9797 + }, + { + "epoch": 7.715242221346987, + "grad_norm": 0.27467986941337585, + "learning_rate": 2.9375999999999998e-05, + "loss": 0.019, + "step": 9798 + }, + { + "epoch": 7.7160299330445055, + "grad_norm": 0.5765169262886047, + "learning_rate": 2.9378999999999998e-05, + "loss": 0.0283, + "step": 9799 + }, + { + "epoch": 7.716817644742024, + "grad_norm": 0.2891973555088043, + "learning_rate": 2.9382e-05, + "loss": 0.0349, + "step": 9800 + }, + { + "epoch": 7.7176053564395435, + "grad_norm": 0.3022392690181732, + "learning_rate": 2.9385e-05, + "loss": 0.0174, + "step": 9801 + }, + { + "epoch": 7.718393068137062, + "grad_norm": 0.5220121145248413, + "learning_rate": 2.9388e-05, + "loss": 0.0303, + "step": 9802 + }, + { + "epoch": 7.719180779834581, + "grad_norm": 0.21791484951972961, + "learning_rate": 2.9391e-05, + "loss": 0.013, + "step": 9803 + }, + { + "epoch": 7.719968491532099, + "grad_norm": 0.3880901634693146, + "learning_rate": 2.9394e-05, + "loss": 0.0184, + "step": 9804 + }, + { + "epoch": 7.720756203229618, + "grad_norm": 0.5183120965957642, + "learning_rate": 2.9397e-05, + "loss": 0.0187, + "step": 9805 + }, + { + "epoch": 7.721543914927136, + "grad_norm": 0.35826125741004944, + "learning_rate": 2.94e-05, + "loss": 0.0296, + "step": 9806 + }, + { + "epoch": 7.722331626624655, + "grad_norm": 0.6256877183914185, + "learning_rate": 2.9403e-05, + "loss": 0.0245, + "step": 9807 + }, + { + "epoch": 7.723119338322174, + "grad_norm": 0.26618313789367676, + "learning_rate": 2.9406e-05, + "loss": 0.0172, + "step": 9808 + }, + { + "epoch": 7.723907050019693, + "grad_norm": 0.31402456760406494, + "learning_rate": 2.9409e-05, + "loss": 0.013, + "step": 9809 + }, + { + "epoch": 7.7246947617172115, + "grad_norm": 0.3021177649497986, + "learning_rate": 2.9412000000000002e-05, + "loss": 0.0204, + "step": 9810 + }, + { + "epoch": 7.72548247341473, + "grad_norm": 0.38086941838264465, + "learning_rate": 2.9415000000000002e-05, + "loss": 0.0164, + "step": 9811 + }, + { + "epoch": 7.726270185112249, + "grad_norm": 0.2370072603225708, + "learning_rate": 2.9418000000000002e-05, + "loss": 0.0165, + "step": 9812 + }, + { + "epoch": 7.727057896809768, + "grad_norm": 0.42069461941719055, + "learning_rate": 2.9421000000000002e-05, + "loss": 0.0201, + "step": 9813 + }, + { + "epoch": 7.727845608507287, + "grad_norm": 0.4981940984725952, + "learning_rate": 2.9424e-05, + "loss": 0.0227, + "step": 9814 + }, + { + "epoch": 7.728633320204805, + "grad_norm": 0.7841001152992249, + "learning_rate": 2.9427e-05, + "loss": 0.0201, + "step": 9815 + }, + { + "epoch": 7.729421031902324, + "grad_norm": 0.26545968651771545, + "learning_rate": 2.943e-05, + "loss": 0.0199, + "step": 9816 + }, + { + "epoch": 7.730208743599842, + "grad_norm": 0.34294381737709045, + "learning_rate": 2.9433e-05, + "loss": 0.0121, + "step": 9817 + }, + { + "epoch": 7.730996455297361, + "grad_norm": 0.6534582376480103, + "learning_rate": 2.9436e-05, + "loss": 0.0424, + "step": 9818 + }, + { + "epoch": 7.7317841669948795, + "grad_norm": 1.2655810117721558, + "learning_rate": 2.9439e-05, + "loss": 0.0292, + "step": 9819 + }, + { + "epoch": 7.732571878692399, + "grad_norm": 0.3337821364402771, + "learning_rate": 2.9442000000000004e-05, + "loss": 0.0184, + "step": 9820 + }, + { + "epoch": 7.7333595903899175, + "grad_norm": 0.2699521481990814, + "learning_rate": 2.9445000000000004e-05, + 
"loss": 0.0168, + "step": 9821 + }, + { + "epoch": 7.734147302087436, + "grad_norm": 0.3260045349597931, + "learning_rate": 2.9448e-05, + "loss": 0.0193, + "step": 9822 + }, + { + "epoch": 7.734935013784955, + "grad_norm": 0.3702912926673889, + "learning_rate": 2.9451e-05, + "loss": 0.0163, + "step": 9823 + }, + { + "epoch": 7.735722725482473, + "grad_norm": 0.5002459287643433, + "learning_rate": 2.9454e-05, + "loss": 0.0243, + "step": 9824 + }, + { + "epoch": 7.736510437179992, + "grad_norm": 0.4154539108276367, + "learning_rate": 2.9457e-05, + "loss": 0.0269, + "step": 9825 + }, + { + "epoch": 7.737298148877511, + "grad_norm": 0.4111331105232239, + "learning_rate": 2.946e-05, + "loss": 0.0188, + "step": 9826 + }, + { + "epoch": 7.73808586057503, + "grad_norm": 0.22966289520263672, + "learning_rate": 2.9463e-05, + "loss": 0.0155, + "step": 9827 + }, + { + "epoch": 7.738873572272548, + "grad_norm": 0.5000009536743164, + "learning_rate": 2.9466e-05, + "loss": 0.0276, + "step": 9828 + }, + { + "epoch": 7.739661283970067, + "grad_norm": 0.19423611462116241, + "learning_rate": 2.9469e-05, + "loss": 0.0174, + "step": 9829 + }, + { + "epoch": 7.7404489956675855, + "grad_norm": 0.4732537567615509, + "learning_rate": 2.9472000000000002e-05, + "loss": 0.0223, + "step": 9830 + }, + { + "epoch": 7.741236707365104, + "grad_norm": 0.25495627522468567, + "learning_rate": 2.9475e-05, + "loss": 0.0134, + "step": 9831 + }, + { + "epoch": 7.7420244190626235, + "grad_norm": 0.8322594165802002, + "learning_rate": 2.9478e-05, + "loss": 0.0207, + "step": 9832 + }, + { + "epoch": 7.742812130760142, + "grad_norm": 0.3191538155078888, + "learning_rate": 2.9481e-05, + "loss": 0.0115, + "step": 9833 + }, + { + "epoch": 7.743599842457661, + "grad_norm": 0.6944106817245483, + "learning_rate": 2.9484e-05, + "loss": 0.0414, + "step": 9834 + }, + { + "epoch": 7.744387554155179, + "grad_norm": 0.20236264169216156, + "learning_rate": 2.9487e-05, + "loss": 0.0098, + "step": 9835 + }, + { + "epoch": 7.745175265852698, + "grad_norm": 0.501240074634552, + "learning_rate": 2.949e-05, + "loss": 0.0158, + "step": 9836 + }, + { + "epoch": 7.745962977550216, + "grad_norm": 0.4306703805923462, + "learning_rate": 2.9493e-05, + "loss": 0.0451, + "step": 9837 + }, + { + "epoch": 7.746750689247735, + "grad_norm": 0.5489104986190796, + "learning_rate": 2.9496e-05, + "loss": 0.0163, + "step": 9838 + }, + { + "epoch": 7.747538400945254, + "grad_norm": 0.6313586831092834, + "learning_rate": 2.9499e-05, + "loss": 0.0321, + "step": 9839 + }, + { + "epoch": 7.748326112642773, + "grad_norm": 0.40944400429725647, + "learning_rate": 2.9502000000000003e-05, + "loss": 0.0243, + "step": 9840 + }, + { + "epoch": 7.7491138243402915, + "grad_norm": 1.1408917903900146, + "learning_rate": 2.9505000000000003e-05, + "loss": 0.2487, + "step": 9841 + }, + { + "epoch": 7.74990153603781, + "grad_norm": 0.6684057116508484, + "learning_rate": 2.9508000000000003e-05, + "loss": 0.1716, + "step": 9842 + }, + { + "epoch": 7.750689247735329, + "grad_norm": 0.8672237992286682, + "learning_rate": 2.9511000000000003e-05, + "loss": 0.1402, + "step": 9843 + }, + { + "epoch": 7.751476959432847, + "grad_norm": 0.6693621277809143, + "learning_rate": 2.9514000000000002e-05, + "loss": 0.146, + "step": 9844 + }, + { + "epoch": 7.752264671130367, + "grad_norm": 0.7447203397750854, + "learning_rate": 2.9517000000000002e-05, + "loss": 0.1006, + "step": 9845 + }, + { + "epoch": 7.753052382827885, + "grad_norm": 0.5182852745056152, + "learning_rate": 2.9520000000000002e-05, + 
"loss": 0.07, + "step": 9846 + }, + { + "epoch": 7.753840094525404, + "grad_norm": 0.2839941680431366, + "learning_rate": 2.9523e-05, + "loss": 0.0238, + "step": 9847 + }, + { + "epoch": 7.754627806222922, + "grad_norm": 0.3223516047000885, + "learning_rate": 2.9525999999999998e-05, + "loss": 0.0436, + "step": 9848 + }, + { + "epoch": 7.755415517920441, + "grad_norm": 0.4472682476043701, + "learning_rate": 2.9528999999999998e-05, + "loss": 0.035, + "step": 9849 + }, + { + "epoch": 7.7562032296179595, + "grad_norm": 0.39001137018203735, + "learning_rate": 2.9532e-05, + "loss": 0.0234, + "step": 9850 + }, + { + "epoch": 7.756990941315479, + "grad_norm": 0.33054256439208984, + "learning_rate": 2.9535e-05, + "loss": 0.0206, + "step": 9851 + }, + { + "epoch": 7.7577786530129975, + "grad_norm": 0.3839034140110016, + "learning_rate": 2.9538e-05, + "loss": 0.0244, + "step": 9852 + }, + { + "epoch": 7.758566364710516, + "grad_norm": 0.47751760482788086, + "learning_rate": 2.9541e-05, + "loss": 0.0276, + "step": 9853 + }, + { + "epoch": 7.759354076408035, + "grad_norm": 0.3731856644153595, + "learning_rate": 2.9544e-05, + "loss": 0.0168, + "step": 9854 + }, + { + "epoch": 7.760141788105553, + "grad_norm": 0.2487650364637375, + "learning_rate": 2.9547e-05, + "loss": 0.0084, + "step": 9855 + }, + { + "epoch": 7.760929499803072, + "grad_norm": 0.2651713788509369, + "learning_rate": 2.955e-05, + "loss": 0.0201, + "step": 9856 + }, + { + "epoch": 7.76171721150059, + "grad_norm": 0.3739633858203888, + "learning_rate": 2.9553e-05, + "loss": 0.0226, + "step": 9857 + }, + { + "epoch": 7.76250492319811, + "grad_norm": 0.4891897439956665, + "learning_rate": 2.9556e-05, + "loss": 0.021, + "step": 9858 + }, + { + "epoch": 7.763292634895628, + "grad_norm": 0.20360048115253448, + "learning_rate": 2.9559e-05, + "loss": 0.0133, + "step": 9859 + }, + { + "epoch": 7.764080346593147, + "grad_norm": 0.352435827255249, + "learning_rate": 2.9562000000000003e-05, + "loss": 0.0498, + "step": 9860 + }, + { + "epoch": 7.7648680582906655, + "grad_norm": 0.29218587279319763, + "learning_rate": 2.9565000000000002e-05, + "loss": 0.0245, + "step": 9861 + }, + { + "epoch": 7.765655769988184, + "grad_norm": 0.17267701029777527, + "learning_rate": 2.9568000000000002e-05, + "loss": 0.0134, + "step": 9862 + }, + { + "epoch": 7.7664434816857035, + "grad_norm": 0.3003751337528229, + "learning_rate": 2.9571000000000002e-05, + "loss": 0.0178, + "step": 9863 + }, + { + "epoch": 7.767231193383222, + "grad_norm": 0.5073226690292358, + "learning_rate": 2.9574000000000002e-05, + "loss": 0.0176, + "step": 9864 + }, + { + "epoch": 7.768018905080741, + "grad_norm": 0.36926764249801636, + "learning_rate": 2.9577e-05, + "loss": 0.0318, + "step": 9865 + }, + { + "epoch": 7.768806616778259, + "grad_norm": 0.37791746854782104, + "learning_rate": 2.958e-05, + "loss": 0.0219, + "step": 9866 + }, + { + "epoch": 7.769594328475778, + "grad_norm": 0.3071479797363281, + "learning_rate": 2.9583e-05, + "loss": 0.0121, + "step": 9867 + }, + { + "epoch": 7.770382040173296, + "grad_norm": 0.41455739736557007, + "learning_rate": 2.9586e-05, + "loss": 0.018, + "step": 9868 + }, + { + "epoch": 7.771169751870815, + "grad_norm": 0.5118886828422546, + "learning_rate": 2.9589e-05, + "loss": 0.0237, + "step": 9869 + }, + { + "epoch": 7.771957463568334, + "grad_norm": 1.5318753719329834, + "learning_rate": 2.9592000000000004e-05, + "loss": 0.0268, + "step": 9870 + }, + { + "epoch": 7.772745175265853, + "grad_norm": 0.6017165780067444, + "learning_rate": 2.9595e-05, + 
"loss": 0.0151, + "step": 9871 + }, + { + "epoch": 7.7735328869633715, + "grad_norm": 0.6573225259780884, + "learning_rate": 2.9598e-05, + "loss": 0.0209, + "step": 9872 + }, + { + "epoch": 7.77432059866089, + "grad_norm": 0.4317187964916229, + "learning_rate": 2.9601e-05, + "loss": 0.0247, + "step": 9873 + }, + { + "epoch": 7.775108310358409, + "grad_norm": 0.42995500564575195, + "learning_rate": 2.9604e-05, + "loss": 0.0193, + "step": 9874 + }, + { + "epoch": 7.775896022055927, + "grad_norm": 0.24768735468387604, + "learning_rate": 2.9607e-05, + "loss": 0.0151, + "step": 9875 + }, + { + "epoch": 7.776683733753446, + "grad_norm": 0.24564291536808014, + "learning_rate": 2.961e-05, + "loss": 0.0088, + "step": 9876 + }, + { + "epoch": 7.777471445450965, + "grad_norm": 0.6435468792915344, + "learning_rate": 2.9613e-05, + "loss": 0.0275, + "step": 9877 + }, + { + "epoch": 7.778259157148484, + "grad_norm": 0.4359194338321686, + "learning_rate": 2.9616e-05, + "loss": 0.0216, + "step": 9878 + }, + { + "epoch": 7.779046868846002, + "grad_norm": 0.5787261724472046, + "learning_rate": 2.9619e-05, + "loss": 0.0297, + "step": 9879 + }, + { + "epoch": 7.779834580543521, + "grad_norm": 0.7609696388244629, + "learning_rate": 2.9622000000000002e-05, + "loss": 0.0291, + "step": 9880 + }, + { + "epoch": 7.7806222922410395, + "grad_norm": 0.2749735414981842, + "learning_rate": 2.9625000000000002e-05, + "loss": 0.0286, + "step": 9881 + }, + { + "epoch": 7.781410003938559, + "grad_norm": 0.31091034412384033, + "learning_rate": 2.9628e-05, + "loss": 0.0157, + "step": 9882 + }, + { + "epoch": 7.7821977156360775, + "grad_norm": 0.23239362239837646, + "learning_rate": 2.9631e-05, + "loss": 0.0123, + "step": 9883 + }, + { + "epoch": 7.782985427333596, + "grad_norm": 0.35535377264022827, + "learning_rate": 2.9634e-05, + "loss": 0.0196, + "step": 9884 + }, + { + "epoch": 7.783773139031115, + "grad_norm": 0.5588963627815247, + "learning_rate": 2.9637e-05, + "loss": 0.0247, + "step": 9885 + }, + { + "epoch": 7.784560850728633, + "grad_norm": 0.5566390156745911, + "learning_rate": 2.964e-05, + "loss": 0.0235, + "step": 9886 + }, + { + "epoch": 7.785348562426152, + "grad_norm": 0.5282109379768372, + "learning_rate": 2.9643e-05, + "loss": 0.0288, + "step": 9887 + }, + { + "epoch": 7.78613627412367, + "grad_norm": 0.22028258442878723, + "learning_rate": 2.9646e-05, + "loss": 0.0152, + "step": 9888 + }, + { + "epoch": 7.78692398582119, + "grad_norm": 0.2026507407426834, + "learning_rate": 2.9649e-05, + "loss": 0.0109, + "step": 9889 + }, + { + "epoch": 7.787711697518708, + "grad_norm": 1.21559476852417, + "learning_rate": 2.9652e-05, + "loss": 0.04, + "step": 9890 + }, + { + "epoch": 7.788499409216227, + "grad_norm": 1.0171853303909302, + "learning_rate": 2.9655000000000003e-05, + "loss": 0.3109, + "step": 9891 + }, + { + "epoch": 7.7892871209137455, + "grad_norm": 0.863106906414032, + "learning_rate": 2.9658000000000003e-05, + "loss": 0.2132, + "step": 9892 + }, + { + "epoch": 7.790074832611264, + "grad_norm": 0.805728554725647, + "learning_rate": 2.9661000000000003e-05, + "loss": 0.1315, + "step": 9893 + }, + { + "epoch": 7.790862544308783, + "grad_norm": 0.7934402823448181, + "learning_rate": 2.9664000000000003e-05, + "loss": 0.1005, + "step": 9894 + }, + { + "epoch": 7.791650256006301, + "grad_norm": 0.6578500866889954, + "learning_rate": 2.9667000000000002e-05, + "loss": 0.1102, + "step": 9895 + }, + { + "epoch": 7.792437967703821, + "grad_norm": 0.5552372932434082, + "learning_rate": 2.967e-05, + "loss": 0.0999, + 
"step": 9896 + }, + { + "epoch": 7.793225679401339, + "grad_norm": 0.30095672607421875, + "learning_rate": 2.9673e-05, + "loss": 0.0201, + "step": 9897 + }, + { + "epoch": 7.794013391098858, + "grad_norm": 0.5017735362052917, + "learning_rate": 2.9676e-05, + "loss": 0.0401, + "step": 9898 + }, + { + "epoch": 7.794801102796376, + "grad_norm": 0.3315179944038391, + "learning_rate": 2.9678999999999998e-05, + "loss": 0.0238, + "step": 9899 + }, + { + "epoch": 7.795588814493895, + "grad_norm": 0.3105565011501312, + "learning_rate": 2.9681999999999998e-05, + "loss": 0.0216, + "step": 9900 + }, + { + "epoch": 7.796376526191414, + "grad_norm": 0.5193770527839661, + "learning_rate": 2.9685e-05, + "loss": 0.0601, + "step": 9901 + }, + { + "epoch": 7.797164237888933, + "grad_norm": 0.28750160336494446, + "learning_rate": 2.9688e-05, + "loss": 0.023, + "step": 9902 + }, + { + "epoch": 7.7979519495864515, + "grad_norm": 0.4324815571308136, + "learning_rate": 2.9691e-05, + "loss": 0.0257, + "step": 9903 + }, + { + "epoch": 7.79873966128397, + "grad_norm": 0.14644372463226318, + "learning_rate": 2.9694e-05, + "loss": 0.0097, + "step": 9904 + }, + { + "epoch": 7.799527372981489, + "grad_norm": 0.345511257648468, + "learning_rate": 2.9697e-05, + "loss": 0.0281, + "step": 9905 + }, + { + "epoch": 7.800315084679007, + "grad_norm": 0.4325602948665619, + "learning_rate": 2.97e-05, + "loss": 0.0203, + "step": 9906 + }, + { + "epoch": 7.801102796376526, + "grad_norm": 0.5256118178367615, + "learning_rate": 2.9703e-05, + "loss": 0.047, + "step": 9907 + }, + { + "epoch": 7.801890508074045, + "grad_norm": 0.21044227480888367, + "learning_rate": 2.9706e-05, + "loss": 0.0122, + "step": 9908 + }, + { + "epoch": 7.802678219771564, + "grad_norm": 0.7910927534103394, + "learning_rate": 2.9709e-05, + "loss": 0.0294, + "step": 9909 + }, + { + "epoch": 7.803465931469082, + "grad_norm": 0.24096724390983582, + "learning_rate": 2.9712e-05, + "loss": 0.0106, + "step": 9910 + }, + { + "epoch": 7.804253643166601, + "grad_norm": 0.43781596422195435, + "learning_rate": 2.9715000000000003e-05, + "loss": 0.0285, + "step": 9911 + }, + { + "epoch": 7.8050413548641195, + "grad_norm": 0.35256192088127136, + "learning_rate": 2.9718000000000002e-05, + "loss": 0.0193, + "step": 9912 + }, + { + "epoch": 7.805829066561638, + "grad_norm": 0.6157747507095337, + "learning_rate": 2.9721000000000002e-05, + "loss": 0.0256, + "step": 9913 + }, + { + "epoch": 7.806616778259157, + "grad_norm": 0.29482823610305786, + "learning_rate": 2.9724000000000002e-05, + "loss": 0.0128, + "step": 9914 + }, + { + "epoch": 7.807404489956676, + "grad_norm": 0.29574987292289734, + "learning_rate": 2.9727000000000002e-05, + "loss": 0.0119, + "step": 9915 + }, + { + "epoch": 7.808192201654195, + "grad_norm": 0.21602991223335266, + "learning_rate": 2.973e-05, + "loss": 0.0127, + "step": 9916 + }, + { + "epoch": 7.808979913351713, + "grad_norm": 0.40624669194221497, + "learning_rate": 2.9733e-05, + "loss": 0.0217, + "step": 9917 + }, + { + "epoch": 7.809767625049232, + "grad_norm": 0.30695897340774536, + "learning_rate": 2.9736e-05, + "loss": 0.014, + "step": 9918 + }, + { + "epoch": 7.81055533674675, + "grad_norm": 0.7722525596618652, + "learning_rate": 2.9739e-05, + "loss": 0.0299, + "step": 9919 + }, + { + "epoch": 7.81134304844427, + "grad_norm": 0.4642868638038635, + "learning_rate": 2.9742e-05, + "loss": 0.0138, + "step": 9920 + }, + { + "epoch": 7.812130760141788, + "grad_norm": 0.22540651261806488, + "learning_rate": 2.9745e-05, + "loss": 0.0108, + "step": 9921 + 
}, + { + "epoch": 7.812918471839307, + "grad_norm": 0.1681995540857315, + "learning_rate": 2.9748e-05, + "loss": 0.0096, + "step": 9922 + }, + { + "epoch": 7.8137061835368256, + "grad_norm": 0.3472670912742615, + "learning_rate": 2.9751e-05, + "loss": 0.0176, + "step": 9923 + }, + { + "epoch": 7.814493895234344, + "grad_norm": 0.23416924476623535, + "learning_rate": 2.9754e-05, + "loss": 0.0114, + "step": 9924 + }, + { + "epoch": 7.815281606931863, + "grad_norm": 0.5800943374633789, + "learning_rate": 2.9757e-05, + "loss": 0.0342, + "step": 9925 + }, + { + "epoch": 7.816069318629381, + "grad_norm": 0.34639057517051697, + "learning_rate": 2.976e-05, + "loss": 0.0251, + "step": 9926 + }, + { + "epoch": 7.816857030326901, + "grad_norm": 0.6471945643424988, + "learning_rate": 2.9763e-05, + "loss": 0.0341, + "step": 9927 + }, + { + "epoch": 7.817644742024419, + "grad_norm": 0.3317613899707794, + "learning_rate": 2.9766e-05, + "loss": 0.0244, + "step": 9928 + }, + { + "epoch": 7.818432453721938, + "grad_norm": 0.5234882235527039, + "learning_rate": 2.9769e-05, + "loss": 0.0368, + "step": 9929 + }, + { + "epoch": 7.819220165419456, + "grad_norm": 0.38392046093940735, + "learning_rate": 2.9772e-05, + "loss": 0.0282, + "step": 9930 + }, + { + "epoch": 7.820007877116975, + "grad_norm": 0.2944355607032776, + "learning_rate": 2.9775000000000002e-05, + "loss": 0.0221, + "step": 9931 + }, + { + "epoch": 7.8207955888144935, + "grad_norm": 0.3849775493144989, + "learning_rate": 2.9778000000000002e-05, + "loss": 0.0172, + "step": 9932 + }, + { + "epoch": 7.821583300512012, + "grad_norm": 0.22761230170726776, + "learning_rate": 2.9781e-05, + "loss": 0.017, + "step": 9933 + }, + { + "epoch": 7.822371012209532, + "grad_norm": 0.26249960064888, + "learning_rate": 2.9784e-05, + "loss": 0.0155, + "step": 9934 + }, + { + "epoch": 7.82315872390705, + "grad_norm": 0.4003177881240845, + "learning_rate": 2.9787e-05, + "loss": 0.0176, + "step": 9935 + }, + { + "epoch": 7.823946435604569, + "grad_norm": 0.3016769587993622, + "learning_rate": 2.979e-05, + "loss": 0.0156, + "step": 9936 + }, + { + "epoch": 7.824734147302087, + "grad_norm": 0.2518315017223358, + "learning_rate": 2.9793e-05, + "loss": 0.0188, + "step": 9937 + }, + { + "epoch": 7.825521858999606, + "grad_norm": 0.40752649307250977, + "learning_rate": 2.9796e-05, + "loss": 0.0228, + "step": 9938 + }, + { + "epoch": 7.826309570697125, + "grad_norm": 0.34875643253326416, + "learning_rate": 2.9799e-05, + "loss": 0.026, + "step": 9939 + }, + { + "epoch": 7.827097282394644, + "grad_norm": 0.4446348547935486, + "learning_rate": 2.9802e-05, + "loss": 0.0203, + "step": 9940 + }, + { + "epoch": 7.827884994092162, + "grad_norm": 0.7712163925170898, + "learning_rate": 2.9805000000000003e-05, + "loss": 0.304, + "step": 9941 + }, + { + "epoch": 7.828672705789681, + "grad_norm": 0.7638985514640808, + "learning_rate": 2.9808000000000003e-05, + "loss": 0.195, + "step": 9942 + }, + { + "epoch": 7.8294604174872, + "grad_norm": 0.5173488259315491, + "learning_rate": 2.9811000000000003e-05, + "loss": 0.2007, + "step": 9943 + }, + { + "epoch": 7.830248129184718, + "grad_norm": 0.491717666387558, + "learning_rate": 2.9814000000000003e-05, + "loss": 0.1172, + "step": 9944 + }, + { + "epoch": 7.831035840882237, + "grad_norm": 0.6436645984649658, + "learning_rate": 2.9817e-05, + "loss": 0.0797, + "step": 9945 + }, + { + "epoch": 7.831823552579756, + "grad_norm": 0.3663143217563629, + "learning_rate": 2.982e-05, + "loss": 0.0466, + "step": 9946 + }, + { + "epoch": 7.832611264277275, 
+ "grad_norm": 0.5060704350471497, + "learning_rate": 2.9823e-05, + "loss": 0.0254, + "step": 9947 + }, + { + "epoch": 7.833398975974793, + "grad_norm": 0.4428746998310089, + "learning_rate": 2.9826e-05, + "loss": 0.0592, + "step": 9948 + }, + { + "epoch": 7.834186687672312, + "grad_norm": 0.38627874851226807, + "learning_rate": 2.9829e-05, + "loss": 0.0281, + "step": 9949 + }, + { + "epoch": 7.83497439936983, + "grad_norm": 0.38482770323753357, + "learning_rate": 2.9831999999999998e-05, + "loss": 0.0191, + "step": 9950 + }, + { + "epoch": 7.835762111067349, + "grad_norm": 0.34392309188842773, + "learning_rate": 2.9835e-05, + "loss": 0.0206, + "step": 9951 + }, + { + "epoch": 7.8365498227648684, + "grad_norm": 0.2636268138885498, + "learning_rate": 2.9838e-05, + "loss": 0.0217, + "step": 9952 + }, + { + "epoch": 7.837337534462387, + "grad_norm": 0.3283672630786896, + "learning_rate": 2.9841e-05, + "loss": 0.0172, + "step": 9953 + }, + { + "epoch": 7.838125246159906, + "grad_norm": 0.32193800806999207, + "learning_rate": 2.9844e-05, + "loss": 0.0253, + "step": 9954 + }, + { + "epoch": 7.838912957857424, + "grad_norm": 0.36959636211395264, + "learning_rate": 2.9847e-05, + "loss": 0.0211, + "step": 9955 + }, + { + "epoch": 7.839700669554943, + "grad_norm": 0.24686190485954285, + "learning_rate": 2.985e-05, + "loss": 0.0084, + "step": 9956 + }, + { + "epoch": 7.840488381252461, + "grad_norm": 0.3686894476413727, + "learning_rate": 2.9853e-05, + "loss": 0.0303, + "step": 9957 + }, + { + "epoch": 7.841276092949981, + "grad_norm": 0.42638516426086426, + "learning_rate": 2.9856e-05, + "loss": 0.0206, + "step": 9958 + }, + { + "epoch": 7.842063804647499, + "grad_norm": 0.3660891056060791, + "learning_rate": 2.9859e-05, + "loss": 0.0194, + "step": 9959 + }, + { + "epoch": 7.842851516345018, + "grad_norm": 0.4268742501735687, + "learning_rate": 2.9862e-05, + "loss": 0.0369, + "step": 9960 + }, + { + "epoch": 7.843639228042536, + "grad_norm": 0.400086909532547, + "learning_rate": 2.9865000000000003e-05, + "loss": 0.0181, + "step": 9961 + }, + { + "epoch": 7.844426939740055, + "grad_norm": 0.6228218674659729, + "learning_rate": 2.9868000000000003e-05, + "loss": 0.0163, + "step": 9962 + }, + { + "epoch": 7.845214651437574, + "grad_norm": 1.1654390096664429, + "learning_rate": 2.9871000000000003e-05, + "loss": 0.0157, + "step": 9963 + }, + { + "epoch": 7.846002363135092, + "grad_norm": 0.3323245346546173, + "learning_rate": 2.9874000000000002e-05, + "loss": 0.0192, + "step": 9964 + }, + { + "epoch": 7.846790074832612, + "grad_norm": 0.39727914333343506, + "learning_rate": 2.9877000000000002e-05, + "loss": 0.0266, + "step": 9965 + }, + { + "epoch": 7.84757778653013, + "grad_norm": 0.23647907376289368, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.0127, + "step": 9966 + }, + { + "epoch": 7.848365498227649, + "grad_norm": 0.28414371609687805, + "learning_rate": 2.9883000000000002e-05, + "loss": 0.019, + "step": 9967 + }, + { + "epoch": 7.849153209925167, + "grad_norm": 0.6148589253425598, + "learning_rate": 2.9886e-05, + "loss": 0.026, + "step": 9968 + }, + { + "epoch": 7.849940921622686, + "grad_norm": 0.1922149956226349, + "learning_rate": 2.9889e-05, + "loss": 0.0134, + "step": 9969 + }, + { + "epoch": 7.850728633320204, + "grad_norm": 0.5715631246566772, + "learning_rate": 2.9891999999999998e-05, + "loss": 0.0261, + "step": 9970 + }, + { + "epoch": 7.851516345017724, + "grad_norm": 0.43132534623146057, + "learning_rate": 2.9895e-05, + "loss": 0.0234, + "step": 9971 + }, + { + "epoch": 
7.8523040567152425, + "grad_norm": 0.39577430486679077, + "learning_rate": 2.9898e-05, + "loss": 0.0176, + "step": 9972 + }, + { + "epoch": 7.853091768412761, + "grad_norm": 0.4561624228954315, + "learning_rate": 2.9901e-05, + "loss": 0.0162, + "step": 9973 + }, + { + "epoch": 7.85387948011028, + "grad_norm": 0.4762882888317108, + "learning_rate": 2.9904e-05, + "loss": 0.0206, + "step": 9974 + }, + { + "epoch": 7.854667191807798, + "grad_norm": 0.34452852606773376, + "learning_rate": 2.9907e-05, + "loss": 0.0201, + "step": 9975 + }, + { + "epoch": 7.855454903505317, + "grad_norm": 0.38414081931114197, + "learning_rate": 2.991e-05, + "loss": 0.0206, + "step": 9976 + }, + { + "epoch": 7.856242615202836, + "grad_norm": 0.22081682085990906, + "learning_rate": 2.9913e-05, + "loss": 0.0109, + "step": 9977 + }, + { + "epoch": 7.857030326900355, + "grad_norm": 0.3255661129951477, + "learning_rate": 2.9916e-05, + "loss": 0.0148, + "step": 9978 + }, + { + "epoch": 7.857818038597873, + "grad_norm": 1.1911675930023193, + "learning_rate": 2.9919e-05, + "loss": 0.02, + "step": 9979 + }, + { + "epoch": 7.858605750295392, + "grad_norm": 0.8609846830368042, + "learning_rate": 2.9922e-05, + "loss": 0.0294, + "step": 9980 + }, + { + "epoch": 7.8593934619929104, + "grad_norm": 0.43614447116851807, + "learning_rate": 2.9925000000000002e-05, + "loss": 0.0375, + "step": 9981 + }, + { + "epoch": 7.860181173690429, + "grad_norm": 0.9894643425941467, + "learning_rate": 2.9928000000000002e-05, + "loss": 0.0286, + "step": 9982 + }, + { + "epoch": 7.860968885387948, + "grad_norm": 0.8458738327026367, + "learning_rate": 2.9931000000000002e-05, + "loss": 0.0231, + "step": 9983 + }, + { + "epoch": 7.861756597085467, + "grad_norm": 0.4954654574394226, + "learning_rate": 2.9934000000000002e-05, + "loss": 0.0317, + "step": 9984 + }, + { + "epoch": 7.862544308782986, + "grad_norm": 1.259466528892517, + "learning_rate": 2.9937e-05, + "loss": 0.037, + "step": 9985 + }, + { + "epoch": 7.863332020480504, + "grad_norm": 0.5498787760734558, + "learning_rate": 2.994e-05, + "loss": 0.0182, + "step": 9986 + }, + { + "epoch": 7.864119732178023, + "grad_norm": 0.3800681531429291, + "learning_rate": 2.9943e-05, + "loss": 0.0247, + "step": 9987 + }, + { + "epoch": 7.864907443875541, + "grad_norm": 0.3687657415866852, + "learning_rate": 2.9946e-05, + "loss": 0.0238, + "step": 9988 + }, + { + "epoch": 7.865695155573061, + "grad_norm": 0.6847066283226013, + "learning_rate": 2.9949e-05, + "loss": 0.0211, + "step": 9989 + }, + { + "epoch": 7.866482867270579, + "grad_norm": 0.5230737328529358, + "learning_rate": 2.9952e-05, + "loss": 0.0381, + "step": 9990 + }, + { + "epoch": 7.867270578968098, + "grad_norm": 0.7326624989509583, + "learning_rate": 2.9955000000000004e-05, + "loss": 0.2243, + "step": 9991 + }, + { + "epoch": 7.8680582906656165, + "grad_norm": 0.7568221688270569, + "learning_rate": 2.9958000000000004e-05, + "loss": 0.186, + "step": 9992 + }, + { + "epoch": 7.868846002363135, + "grad_norm": 0.6449829936027527, + "learning_rate": 2.9961000000000003e-05, + "loss": 0.1527, + "step": 9993 + }, + { + "epoch": 7.869633714060654, + "grad_norm": 0.7005048990249634, + "learning_rate": 2.9964e-05, + "loss": 0.1151, + "step": 9994 + }, + { + "epoch": 7.870421425758172, + "grad_norm": 0.6204913258552551, + "learning_rate": 2.9967e-05, + "loss": 0.1138, + "step": 9995 + }, + { + "epoch": 7.871209137455692, + "grad_norm": 0.7048729658126831, + "learning_rate": 2.997e-05, + "loss": 0.0676, + "step": 9996 + }, + { + "epoch": 7.87199684915321, + 
"grad_norm": 0.60378098487854, + "learning_rate": 2.9973e-05, + "loss": 0.0947, + "step": 9997 + }, + { + "epoch": 7.872784560850729, + "grad_norm": 0.4461608827114105, + "learning_rate": 2.9976e-05, + "loss": 0.0332, + "step": 9998 + }, + { + "epoch": 7.873572272548247, + "grad_norm": 0.3558432161808014, + "learning_rate": 2.9979e-05, + "loss": 0.0281, + "step": 9999 + }, + { + "epoch": 7.874359984245766, + "grad_norm": 0.24705870449543, + "learning_rate": 2.9982e-05, + "loss": 0.0141, + "step": 10000 + }, + { + "epoch": 7.874359984245766, + "eval_cer": 0.12128988775695362, + "eval_loss": 0.33333268761634827, + "eval_runtime": 16.1178, + "eval_samples_per_second": 18.861, + "eval_steps_per_second": 0.62, + "eval_wer": 0.4307367613200307, + "step": 10000 + }, + { + "epoch": 7.8751476959432845, + "grad_norm": 0.3558761775493622, + "learning_rate": 2.9985000000000002e-05, + "loss": 0.0383, + "step": 10001 + }, + { + "epoch": 7.875935407640803, + "grad_norm": 0.2802606523036957, + "learning_rate": 2.9988e-05, + "loss": 0.0148, + "step": 10002 + }, + { + "epoch": 7.8767231193383225, + "grad_norm": 0.2704249918460846, + "learning_rate": 2.9991e-05, + "loss": 0.0165, + "step": 10003 + }, + { + "epoch": 7.877510831035841, + "grad_norm": 0.2933802604675293, + "learning_rate": 2.9994e-05, + "loss": 0.025, + "step": 10004 + }, + { + "epoch": 7.87829854273336, + "grad_norm": 0.47034138441085815, + "learning_rate": 2.9997e-05, + "loss": 0.0295, + "step": 10005 + }, + { + "epoch": 7.879086254430878, + "grad_norm": 0.27868786454200745, + "learning_rate": 3e-05, + "loss": 0.0171, + "step": 10006 + }, + { + "epoch": 7.879873966128397, + "grad_norm": 0.31668829917907715, + "learning_rate": 2.999966666666667e-05, + "loss": 0.0301, + "step": 10007 + }, + { + "epoch": 7.880661677825916, + "grad_norm": 0.48655378818511963, + "learning_rate": 2.9999333333333333e-05, + "loss": 0.0282, + "step": 10008 + }, + { + "epoch": 7.881449389523435, + "grad_norm": 0.3982943594455719, + "learning_rate": 2.9999000000000002e-05, + "loss": 0.0161, + "step": 10009 + }, + { + "epoch": 7.882237101220953, + "grad_norm": 0.7788484692573547, + "learning_rate": 2.9998666666666668e-05, + "loss": 0.0265, + "step": 10010 + }, + { + "epoch": 7.883024812918472, + "grad_norm": 0.19180168211460114, + "learning_rate": 2.9998333333333334e-05, + "loss": 0.0192, + "step": 10011 + }, + { + "epoch": 7.8838125246159905, + "grad_norm": 0.29853397607803345, + "learning_rate": 2.9998e-05, + "loss": 0.0193, + "step": 10012 + }, + { + "epoch": 7.884600236313509, + "grad_norm": 0.31834739446640015, + "learning_rate": 2.999766666666667e-05, + "loss": 0.0094, + "step": 10013 + }, + { + "epoch": 7.885387948011028, + "grad_norm": 0.4058375060558319, + "learning_rate": 2.999733333333333e-05, + "loss": 0.0217, + "step": 10014 + }, + { + "epoch": 7.886175659708547, + "grad_norm": 0.26482200622558594, + "learning_rate": 2.9997e-05, + "loss": 0.012, + "step": 10015 + }, + { + "epoch": 7.886963371406066, + "grad_norm": 0.3419625759124756, + "learning_rate": 2.999666666666667e-05, + "loss": 0.0312, + "step": 10016 + }, + { + "epoch": 7.887751083103584, + "grad_norm": 0.426855206489563, + "learning_rate": 2.9996333333333333e-05, + "loss": 0.0179, + "step": 10017 + }, + { + "epoch": 7.888538794801103, + "grad_norm": 0.8280730247497559, + "learning_rate": 2.9996000000000002e-05, + "loss": 0.0361, + "step": 10018 + }, + { + "epoch": 7.889326506498621, + "grad_norm": 0.2911975383758545, + "learning_rate": 2.9995666666666668e-05, + "loss": 0.0147, + "step": 10019 + }, + 
{ + "epoch": 7.89011421819614, + "grad_norm": 0.22657740116119385, + "learning_rate": 2.9995333333333334e-05, + "loss": 0.0112, + "step": 10020 + }, + { + "epoch": 7.8909019298936585, + "grad_norm": 0.4716150462627411, + "learning_rate": 2.9995e-05, + "loss": 0.0287, + "step": 10021 + }, + { + "epoch": 7.891689641591178, + "grad_norm": 0.24315035343170166, + "learning_rate": 2.9994666666666666e-05, + "loss": 0.0119, + "step": 10022 + }, + { + "epoch": 7.8924773532886965, + "grad_norm": 0.49310025572776794, + "learning_rate": 2.9994333333333335e-05, + "loss": 0.0357, + "step": 10023 + }, + { + "epoch": 7.893265064986215, + "grad_norm": 0.2953670918941498, + "learning_rate": 2.9994e-05, + "loss": 0.0215, + "step": 10024 + }, + { + "epoch": 7.894052776683734, + "grad_norm": 0.8455106019973755, + "learning_rate": 2.9993666666666667e-05, + "loss": 0.0169, + "step": 10025 + }, + { + "epoch": 7.894840488381252, + "grad_norm": 0.38549405336380005, + "learning_rate": 2.9993333333333333e-05, + "loss": 0.0148, + "step": 10026 + }, + { + "epoch": 7.895628200078772, + "grad_norm": 0.378886342048645, + "learning_rate": 2.9993000000000002e-05, + "loss": 0.0163, + "step": 10027 + }, + { + "epoch": 7.89641591177629, + "grad_norm": 0.42616546154022217, + "learning_rate": 2.9992666666666665e-05, + "loss": 0.0293, + "step": 10028 + }, + { + "epoch": 7.897203623473809, + "grad_norm": 0.2699037194252014, + "learning_rate": 2.9992333333333334e-05, + "loss": 0.018, + "step": 10029 + }, + { + "epoch": 7.897991335171327, + "grad_norm": 0.48952633142471313, + "learning_rate": 2.9992e-05, + "loss": 0.0217, + "step": 10030 + }, + { + "epoch": 7.898779046868846, + "grad_norm": 0.889363706111908, + "learning_rate": 2.9991666666666666e-05, + "loss": 0.0393, + "step": 10031 + }, + { + "epoch": 7.8995667585663645, + "grad_norm": 0.3041420578956604, + "learning_rate": 2.9991333333333335e-05, + "loss": 0.015, + "step": 10032 + }, + { + "epoch": 7.900354470263883, + "grad_norm": 0.7082728147506714, + "learning_rate": 2.9991e-05, + "loss": 0.0261, + "step": 10033 + }, + { + "epoch": 7.9011421819614025, + "grad_norm": 0.4072374999523163, + "learning_rate": 2.9990666666666667e-05, + "loss": 0.024, + "step": 10034 + }, + { + "epoch": 7.901929893658921, + "grad_norm": 0.47361478209495544, + "learning_rate": 2.9990333333333333e-05, + "loss": 0.0239, + "step": 10035 + }, + { + "epoch": 7.90271760535644, + "grad_norm": 0.7109715342521667, + "learning_rate": 2.9990000000000003e-05, + "loss": 0.0213, + "step": 10036 + }, + { + "epoch": 7.903505317053958, + "grad_norm": 0.22588880360126495, + "learning_rate": 2.9989666666666665e-05, + "loss": 0.0114, + "step": 10037 + }, + { + "epoch": 7.904293028751477, + "grad_norm": 0.312457412481308, + "learning_rate": 2.9989333333333334e-05, + "loss": 0.0208, + "step": 10038 + }, + { + "epoch": 7.905080740448995, + "grad_norm": 0.44180774688720703, + "learning_rate": 2.9989e-05, + "loss": 0.0242, + "step": 10039 + }, + { + "epoch": 7.905868452146514, + "grad_norm": 0.2993430197238922, + "learning_rate": 2.9988666666666666e-05, + "loss": 0.0187, + "step": 10040 + }, + { + "epoch": 7.906656163844033, + "grad_norm": 0.6550106406211853, + "learning_rate": 2.9988333333333336e-05, + "loss": 0.2757, + "step": 10041 + }, + { + "epoch": 7.907443875541552, + "grad_norm": 0.901502788066864, + "learning_rate": 2.9988e-05, + "loss": 0.2568, + "step": 10042 + }, + { + "epoch": 7.9082315872390705, + "grad_norm": 0.7842085361480713, + "learning_rate": 2.9987666666666667e-05, + "loss": 0.1385, + "step": 10043 + }, 
+ { + "epoch": 7.909019298936589, + "grad_norm": 0.627765953540802, + "learning_rate": 2.9987333333333333e-05, + "loss": 0.1156, + "step": 10044 + }, + { + "epoch": 7.909807010634108, + "grad_norm": 0.6312331557273865, + "learning_rate": 2.9987000000000003e-05, + "loss": 0.1223, + "step": 10045 + }, + { + "epoch": 7.910594722331627, + "grad_norm": 0.7615615129470825, + "learning_rate": 2.9986666666666665e-05, + "loss": 0.1114, + "step": 10046 + }, + { + "epoch": 7.911382434029146, + "grad_norm": 0.37394094467163086, + "learning_rate": 2.9986333333333335e-05, + "loss": 0.0336, + "step": 10047 + }, + { + "epoch": 7.912170145726664, + "grad_norm": 0.2838548421859741, + "learning_rate": 2.9986000000000004e-05, + "loss": 0.031, + "step": 10048 + }, + { + "epoch": 7.912957857424183, + "grad_norm": 0.4226374328136444, + "learning_rate": 2.9985666666666666e-05, + "loss": 0.0446, + "step": 10049 + }, + { + "epoch": 7.913745569121701, + "grad_norm": 0.8996577262878418, + "learning_rate": 2.9985333333333336e-05, + "loss": 0.0461, + "step": 10050 + }, + { + "epoch": 7.91453328081922, + "grad_norm": 0.528930127620697, + "learning_rate": 2.9985000000000002e-05, + "loss": 0.0307, + "step": 10051 + }, + { + "epoch": 7.9153209925167385, + "grad_norm": 0.427461713552475, + "learning_rate": 2.9984666666666668e-05, + "loss": 0.0203, + "step": 10052 + }, + { + "epoch": 7.916108704214258, + "grad_norm": 0.27759167551994324, + "learning_rate": 2.9984333333333334e-05, + "loss": 0.0255, + "step": 10053 + }, + { + "epoch": 7.9168964159117765, + "grad_norm": 0.379840224981308, + "learning_rate": 2.9984e-05, + "loss": 0.0213, + "step": 10054 + }, + { + "epoch": 7.917684127609295, + "grad_norm": 1.049493432044983, + "learning_rate": 2.9983666666666665e-05, + "loss": 0.0477, + "step": 10055 + }, + { + "epoch": 7.918471839306814, + "grad_norm": 0.3523655831813812, + "learning_rate": 2.9983333333333335e-05, + "loss": 0.0196, + "step": 10056 + }, + { + "epoch": 7.919259551004332, + "grad_norm": 0.2661159932613373, + "learning_rate": 2.9983e-05, + "loss": 0.0126, + "step": 10057 + }, + { + "epoch": 7.920047262701851, + "grad_norm": 0.28063735365867615, + "learning_rate": 2.9982666666666667e-05, + "loss": 0.0161, + "step": 10058 + }, + { + "epoch": 7.920834974399369, + "grad_norm": 0.3713608980178833, + "learning_rate": 2.9982333333333336e-05, + "loss": 0.0106, + "step": 10059 + }, + { + "epoch": 7.921622686096889, + "grad_norm": 0.43482619524002075, + "learning_rate": 2.9982e-05, + "loss": 0.0296, + "step": 10060 + }, + { + "epoch": 7.922410397794407, + "grad_norm": 0.20232729613780975, + "learning_rate": 2.9981666666666668e-05, + "loss": 0.0146, + "step": 10061 + }, + { + "epoch": 7.923198109491926, + "grad_norm": 0.4871148467063904, + "learning_rate": 2.9981333333333334e-05, + "loss": 0.0272, + "step": 10062 + }, + { + "epoch": 7.9239858211894445, + "grad_norm": 0.5630569458007812, + "learning_rate": 2.9981e-05, + "loss": 0.0183, + "step": 10063 + }, + { + "epoch": 7.924773532886963, + "grad_norm": 0.27318260073661804, + "learning_rate": 2.9980666666666666e-05, + "loss": 0.0253, + "step": 10064 + }, + { + "epoch": 7.9255612445844825, + "grad_norm": 0.23613294959068298, + "learning_rate": 2.9980333333333335e-05, + "loss": 0.018, + "step": 10065 + }, + { + "epoch": 7.926348956282001, + "grad_norm": 0.39785176515579224, + "learning_rate": 2.998e-05, + "loss": 0.0186, + "step": 10066 + }, + { + "epoch": 7.92713666797952, + "grad_norm": 0.3971847593784332, + "learning_rate": 2.9979666666666667e-05, + "loss": 0.0162, + "step": 
10067 + }, + { + "epoch": 7.927924379677038, + "grad_norm": 0.6397718191146851, + "learning_rate": 2.9979333333333336e-05, + "loss": 0.0155, + "step": 10068 + }, + { + "epoch": 7.928712091374557, + "grad_norm": 0.3618886470794678, + "learning_rate": 2.9979e-05, + "loss": 0.0194, + "step": 10069 + }, + { + "epoch": 7.929499803072075, + "grad_norm": 0.3963218927383423, + "learning_rate": 2.9978666666666668e-05, + "loss": 0.0217, + "step": 10070 + }, + { + "epoch": 7.930287514769594, + "grad_norm": 0.3497677743434906, + "learning_rate": 2.9978333333333334e-05, + "loss": 0.0167, + "step": 10071 + }, + { + "epoch": 7.931075226467113, + "grad_norm": 0.37643471360206604, + "learning_rate": 2.9978e-05, + "loss": 0.0281, + "step": 10072 + }, + { + "epoch": 7.931862938164632, + "grad_norm": 0.3826889097690582, + "learning_rate": 2.997766666666667e-05, + "loss": 0.0275, + "step": 10073 + }, + { + "epoch": 7.9326506498621505, + "grad_norm": 0.32273030281066895, + "learning_rate": 2.9977333333333335e-05, + "loss": 0.0509, + "step": 10074 + }, + { + "epoch": 7.933438361559669, + "grad_norm": 0.4514162540435791, + "learning_rate": 2.9977e-05, + "loss": 0.0149, + "step": 10075 + }, + { + "epoch": 7.934226073257188, + "grad_norm": 0.3629794120788574, + "learning_rate": 2.9976666666666667e-05, + "loss": 0.0222, + "step": 10076 + }, + { + "epoch": 7.935013784954706, + "grad_norm": 0.5934845209121704, + "learning_rate": 2.9976333333333336e-05, + "loss": 0.0296, + "step": 10077 + }, + { + "epoch": 7.935801496652226, + "grad_norm": 0.488299161195755, + "learning_rate": 2.9976e-05, + "loss": 0.0421, + "step": 10078 + }, + { + "epoch": 7.936589208349744, + "grad_norm": 0.35287392139434814, + "learning_rate": 2.9975666666666668e-05, + "loss": 0.0243, + "step": 10079 + }, + { + "epoch": 7.937376920047263, + "grad_norm": 0.3971240818500519, + "learning_rate": 2.9975333333333334e-05, + "loss": 0.0178, + "step": 10080 + }, + { + "epoch": 7.938164631744781, + "grad_norm": 0.5919625163078308, + "learning_rate": 2.9975e-05, + "loss": 0.0213, + "step": 10081 + }, + { + "epoch": 7.9389523434423, + "grad_norm": 0.6210979223251343, + "learning_rate": 2.997466666666667e-05, + "loss": 0.0487, + "step": 10082 + }, + { + "epoch": 7.9397400551398185, + "grad_norm": 0.9279066324234009, + "learning_rate": 2.9974333333333332e-05, + "loss": 0.0343, + "step": 10083 + }, + { + "epoch": 7.940527766837338, + "grad_norm": 0.26421302556991577, + "learning_rate": 2.9974e-05, + "loss": 0.0208, + "step": 10084 + }, + { + "epoch": 7.9413154785348565, + "grad_norm": 0.3983015716075897, + "learning_rate": 2.9973666666666667e-05, + "loss": 0.0202, + "step": 10085 + }, + { + "epoch": 7.942103190232375, + "grad_norm": 0.6221725344657898, + "learning_rate": 2.9973333333333333e-05, + "loss": 0.0333, + "step": 10086 + }, + { + "epoch": 7.942890901929894, + "grad_norm": 0.416182279586792, + "learning_rate": 2.9973e-05, + "loss": 0.0234, + "step": 10087 + }, + { + "epoch": 7.943678613627412, + "grad_norm": 0.32593676447868347, + "learning_rate": 2.997266666666667e-05, + "loss": 0.0177, + "step": 10088 + }, + { + "epoch": 7.944466325324931, + "grad_norm": 0.36441245675086975, + "learning_rate": 2.997233333333333e-05, + "loss": 0.0221, + "step": 10089 + }, + { + "epoch": 7.945254037022449, + "grad_norm": 0.5222330689430237, + "learning_rate": 2.9972e-05, + "loss": 0.0161, + "step": 10090 + }, + { + "epoch": 7.946041748719969, + "grad_norm": 1.2049967050552368, + "learning_rate": 2.997166666666667e-05, + "loss": 0.2642, + "step": 10091 + }, + { + "epoch": 
7.946829460417487, + "grad_norm": 0.6319082379341125, + "learning_rate": 2.9971333333333332e-05, + "loss": 0.177, + "step": 10092 + }, + { + "epoch": 7.947617172115006, + "grad_norm": 1.0471487045288086, + "learning_rate": 2.9971e-05, + "loss": 0.1628, + "step": 10093 + }, + { + "epoch": 7.9484048838125245, + "grad_norm": 0.4554738700389862, + "learning_rate": 2.9970666666666667e-05, + "loss": 0.0884, + "step": 10094 + }, + { + "epoch": 7.949192595510043, + "grad_norm": 0.5674762725830078, + "learning_rate": 2.9970333333333333e-05, + "loss": 0.0832, + "step": 10095 + }, + { + "epoch": 7.949980307207562, + "grad_norm": 0.5540615916252136, + "learning_rate": 2.997e-05, + "loss": 0.0638, + "step": 10096 + }, + { + "epoch": 7.950768018905081, + "grad_norm": 1.038414716720581, + "learning_rate": 2.996966666666667e-05, + "loss": 0.076, + "step": 10097 + }, + { + "epoch": 7.9515557306026, + "grad_norm": 0.3530098795890808, + "learning_rate": 2.9969333333333335e-05, + "loss": 0.0218, + "step": 10098 + }, + { + "epoch": 7.952343442300118, + "grad_norm": 0.2717477083206177, + "learning_rate": 2.9969e-05, + "loss": 0.016, + "step": 10099 + }, + { + "epoch": 7.953131153997637, + "grad_norm": 0.2893408238887787, + "learning_rate": 2.996866666666667e-05, + "loss": 0.0333, + "step": 10100 + }, + { + "epoch": 7.953918865695155, + "grad_norm": 0.22000403702259064, + "learning_rate": 2.9968333333333332e-05, + "loss": 0.0209, + "step": 10101 + }, + { + "epoch": 7.954706577392674, + "grad_norm": 0.6150271892547607, + "learning_rate": 2.9968000000000002e-05, + "loss": 0.0857, + "step": 10102 + }, + { + "epoch": 7.955494289090193, + "grad_norm": 0.39216795563697815, + "learning_rate": 2.9967666666666668e-05, + "loss": 0.0263, + "step": 10103 + }, + { + "epoch": 7.956282000787712, + "grad_norm": 0.3324461281299591, + "learning_rate": 2.9967333333333334e-05, + "loss": 0.031, + "step": 10104 + }, + { + "epoch": 7.9570697124852305, + "grad_norm": 0.6761205792427063, + "learning_rate": 2.9967e-05, + "loss": 0.0116, + "step": 10105 + }, + { + "epoch": 7.957857424182749, + "grad_norm": 0.28438037633895874, + "learning_rate": 2.996666666666667e-05, + "loss": 0.017, + "step": 10106 + }, + { + "epoch": 7.958645135880268, + "grad_norm": 0.18695735931396484, + "learning_rate": 2.9966333333333335e-05, + "loss": 0.0139, + "step": 10107 + }, + { + "epoch": 7.959432847577786, + "grad_norm": 0.20005002617835999, + "learning_rate": 2.9966e-05, + "loss": 0.0178, + "step": 10108 + }, + { + "epoch": 7.960220559275305, + "grad_norm": 0.23940551280975342, + "learning_rate": 2.996566666666667e-05, + "loss": 0.0176, + "step": 10109 + }, + { + "epoch": 7.961008270972824, + "grad_norm": 0.2540276050567627, + "learning_rate": 2.9965333333333333e-05, + "loss": 0.0132, + "step": 10110 + }, + { + "epoch": 7.961795982670343, + "grad_norm": 0.4458366334438324, + "learning_rate": 2.9965000000000002e-05, + "loss": 0.0157, + "step": 10111 + }, + { + "epoch": 7.962583694367861, + "grad_norm": 0.34153032302856445, + "learning_rate": 2.9964666666666664e-05, + "loss": 0.0173, + "step": 10112 + }, + { + "epoch": 7.96337140606538, + "grad_norm": 0.39906856417655945, + "learning_rate": 2.9964333333333334e-05, + "loss": 0.0231, + "step": 10113 + }, + { + "epoch": 7.9641591177628985, + "grad_norm": 0.3698543608188629, + "learning_rate": 2.9964e-05, + "loss": 0.0211, + "step": 10114 + }, + { + "epoch": 7.964946829460417, + "grad_norm": 0.2539694905281067, + "learning_rate": 2.9963666666666666e-05, + "loss": 0.0191, + "step": 10115 + }, + { + "epoch": 
7.9657345411579366, + "grad_norm": 0.5265986919403076, + "learning_rate": 2.9963333333333335e-05, + "loss": 0.0217, + "step": 10116 + }, + { + "epoch": 7.966522252855455, + "grad_norm": 1.2154077291488647, + "learning_rate": 2.9963e-05, + "loss": 0.0149, + "step": 10117 + }, + { + "epoch": 7.967309964552974, + "grad_norm": 0.34355413913726807, + "learning_rate": 2.9962666666666667e-05, + "loss": 0.0205, + "step": 10118 + }, + { + "epoch": 7.968097676250492, + "grad_norm": 0.25658828020095825, + "learning_rate": 2.9962333333333333e-05, + "loss": 0.0151, + "step": 10119 + }, + { + "epoch": 7.968885387948011, + "grad_norm": 0.4871200621128082, + "learning_rate": 2.9962000000000002e-05, + "loss": 0.0233, + "step": 10120 + }, + { + "epoch": 7.969673099645529, + "grad_norm": 0.3869940936565399, + "learning_rate": 2.9961666666666665e-05, + "loss": 0.0264, + "step": 10121 + }, + { + "epoch": 7.970460811343049, + "grad_norm": 0.3275604546070099, + "learning_rate": 2.9961333333333334e-05, + "loss": 0.0133, + "step": 10122 + }, + { + "epoch": 7.971248523040567, + "grad_norm": 0.6744339466094971, + "learning_rate": 2.9961000000000003e-05, + "loss": 0.0266, + "step": 10123 + }, + { + "epoch": 7.972036234738086, + "grad_norm": 0.29602184891700745, + "learning_rate": 2.9960666666666666e-05, + "loss": 0.0132, + "step": 10124 + }, + { + "epoch": 7.9728239464356045, + "grad_norm": 0.3270699083805084, + "learning_rate": 2.9960333333333335e-05, + "loss": 0.0268, + "step": 10125 + }, + { + "epoch": 7.973611658133123, + "grad_norm": 0.5475334525108337, + "learning_rate": 2.996e-05, + "loss": 0.025, + "step": 10126 + }, + { + "epoch": 7.974399369830642, + "grad_norm": 2.394449234008789, + "learning_rate": 2.9959666666666667e-05, + "loss": 0.0204, + "step": 10127 + }, + { + "epoch": 7.97518708152816, + "grad_norm": 0.223244309425354, + "learning_rate": 2.9959333333333333e-05, + "loss": 0.0108, + "step": 10128 + }, + { + "epoch": 7.97597479322568, + "grad_norm": 0.9901888370513916, + "learning_rate": 2.9959000000000002e-05, + "loss": 0.0266, + "step": 10129 + }, + { + "epoch": 7.976762504923198, + "grad_norm": 0.370530366897583, + "learning_rate": 2.9958666666666665e-05, + "loss": 0.0159, + "step": 10130 + }, + { + "epoch": 7.977550216620717, + "grad_norm": 0.34362563490867615, + "learning_rate": 2.9958333333333334e-05, + "loss": 0.0182, + "step": 10131 + }, + { + "epoch": 7.978337928318235, + "grad_norm": 0.2835136353969574, + "learning_rate": 2.9958000000000004e-05, + "loss": 0.0186, + "step": 10132 + }, + { + "epoch": 7.979125640015754, + "grad_norm": 0.3225875198841095, + "learning_rate": 2.9957666666666666e-05, + "loss": 0.0222, + "step": 10133 + }, + { + "epoch": 7.979913351713273, + "grad_norm": 0.3703390657901764, + "learning_rate": 2.9957333333333335e-05, + "loss": 0.0238, + "step": 10134 + }, + { + "epoch": 7.980701063410792, + "grad_norm": 0.32370150089263916, + "learning_rate": 2.9957e-05, + "loss": 0.0168, + "step": 10135 + }, + { + "epoch": 7.981488775108311, + "grad_norm": 0.508065402507782, + "learning_rate": 2.9956666666666667e-05, + "loss": 0.0261, + "step": 10136 + }, + { + "epoch": 7.982276486805829, + "grad_norm": 0.33130887150764465, + "learning_rate": 2.9956333333333333e-05, + "loss": 0.0179, + "step": 10137 + }, + { + "epoch": 7.983064198503348, + "grad_norm": 0.3298388123512268, + "learning_rate": 2.9956000000000003e-05, + "loss": 0.0179, + "step": 10138 + }, + { + "epoch": 7.983851910200866, + "grad_norm": 0.5801504850387573, + "learning_rate": 2.9955666666666665e-05, + "loss": 0.0177, + 
"step": 10139 + }, + { + "epoch": 7.984639621898385, + "grad_norm": 0.8603256344795227, + "learning_rate": 2.9955333333333334e-05, + "loss": 0.0535, + "step": 10140 + }, + { + "epoch": 7.985427333595904, + "grad_norm": 0.7337473630905151, + "learning_rate": 2.9955000000000004e-05, + "loss": 0.2612, + "step": 10141 + }, + { + "epoch": 7.986215045293423, + "grad_norm": 0.6231429576873779, + "learning_rate": 2.9954666666666666e-05, + "loss": 0.1387, + "step": 10142 + }, + { + "epoch": 7.987002756990941, + "grad_norm": 0.4206695258617401, + "learning_rate": 2.9954333333333336e-05, + "loss": 0.0416, + "step": 10143 + }, + { + "epoch": 7.98779046868846, + "grad_norm": 0.32448530197143555, + "learning_rate": 2.9953999999999998e-05, + "loss": 0.0268, + "step": 10144 + }, + { + "epoch": 7.9885781803859786, + "grad_norm": 0.29890063405036926, + "learning_rate": 2.9953666666666667e-05, + "loss": 0.0168, + "step": 10145 + }, + { + "epoch": 7.989365892083497, + "grad_norm": 0.5020161867141724, + "learning_rate": 2.9953333333333333e-05, + "loss": 0.013, + "step": 10146 + }, + { + "epoch": 7.990153603781016, + "grad_norm": 0.3826081156730652, + "learning_rate": 2.9953e-05, + "loss": 0.0208, + "step": 10147 + }, + { + "epoch": 7.990941315478535, + "grad_norm": 0.43068644404411316, + "learning_rate": 2.995266666666667e-05, + "loss": 0.0222, + "step": 10148 + }, + { + "epoch": 7.991729027176054, + "grad_norm": 0.3354749083518982, + "learning_rate": 2.9952333333333335e-05, + "loss": 0.0186, + "step": 10149 + }, + { + "epoch": 7.992516738873572, + "grad_norm": 0.410099595785141, + "learning_rate": 2.9952e-05, + "loss": 0.0279, + "step": 10150 + }, + { + "epoch": 7.993304450571091, + "grad_norm": 0.5109843611717224, + "learning_rate": 2.9951666666666666e-05, + "loss": 0.0321, + "step": 10151 + }, + { + "epoch": 7.994092162268609, + "grad_norm": 0.5007470846176147, + "learning_rate": 2.9951333333333336e-05, + "loss": 0.021, + "step": 10152 + }, + { + "epoch": 7.994879873966129, + "grad_norm": 0.17320364713668823, + "learning_rate": 2.9951e-05, + "loss": 0.0114, + "step": 10153 + }, + { + "epoch": 7.995667585663647, + "grad_norm": 0.2647205889225006, + "learning_rate": 2.9950666666666668e-05, + "loss": 0.0219, + "step": 10154 + }, + { + "epoch": 7.996455297361166, + "grad_norm": 0.4291802644729614, + "learning_rate": 2.9950333333333334e-05, + "loss": 0.026, + "step": 10155 + }, + { + "epoch": 7.997243009058685, + "grad_norm": 0.35479283332824707, + "learning_rate": 2.995e-05, + "loss": 0.0313, + "step": 10156 + }, + { + "epoch": 7.998030720756203, + "grad_norm": 0.6302383542060852, + "learning_rate": 2.994966666666667e-05, + "loss": 0.0185, + "step": 10157 + }, + { + "epoch": 7.998818432453722, + "grad_norm": 0.24919483065605164, + "learning_rate": 2.9949333333333335e-05, + "loss": 0.0162, + "step": 10158 + }, + { + "epoch": 7.99960614415124, + "grad_norm": 0.3839191198348999, + "learning_rate": 2.9949e-05, + "loss": 0.022, + "step": 10159 + }, + { + "epoch": 8.0, + "grad_norm": 0.3189985156059265, + "learning_rate": 2.9948666666666667e-05, + "loss": 0.006, + "step": 10160 + }, + { + "epoch": 8.00078771169752, + "grad_norm": 0.9194998145103455, + "learning_rate": 2.9948333333333336e-05, + "loss": 0.2463, + "step": 10161 + }, + { + "epoch": 8.001575423395037, + "grad_norm": 0.5384595990180969, + "learning_rate": 2.9948e-05, + "loss": 0.2011, + "step": 10162 + }, + { + "epoch": 8.002363135092557, + "grad_norm": 0.5615425705909729, + "learning_rate": 2.9947666666666668e-05, + "loss": 0.1737, + "step": 10163 + }, + 
{ + "epoch": 8.003150846790074, + "grad_norm": 0.6900678277015686, + "learning_rate": 2.9947333333333334e-05, + "loss": 0.095, + "step": 10164 + }, + { + "epoch": 8.003938558487594, + "grad_norm": 0.4474695026874542, + "learning_rate": 2.9947e-05, + "loss": 0.073, + "step": 10165 + }, + { + "epoch": 8.004726270185111, + "grad_norm": 0.2778373062610626, + "learning_rate": 2.994666666666667e-05, + "loss": 0.0224, + "step": 10166 + }, + { + "epoch": 8.00551398188263, + "grad_norm": 0.39203158020973206, + "learning_rate": 2.9946333333333335e-05, + "loss": 0.0308, + "step": 10167 + }, + { + "epoch": 8.00630169358015, + "grad_norm": 0.6222851276397705, + "learning_rate": 2.9946e-05, + "loss": 0.0651, + "step": 10168 + }, + { + "epoch": 8.007089405277668, + "grad_norm": 0.4130534827709198, + "learning_rate": 2.9945666666666667e-05, + "loss": 0.0131, + "step": 10169 + }, + { + "epoch": 8.007877116975187, + "grad_norm": 0.5578759908676147, + "learning_rate": 2.9945333333333336e-05, + "loss": 0.033, + "step": 10170 + }, + { + "epoch": 8.008664828672705, + "grad_norm": 0.2711983919143677, + "learning_rate": 2.9945e-05, + "loss": 0.0246, + "step": 10171 + }, + { + "epoch": 8.009452540370225, + "grad_norm": 0.2325834035873413, + "learning_rate": 2.9944666666666668e-05, + "loss": 0.015, + "step": 10172 + }, + { + "epoch": 8.010240252067744, + "grad_norm": 0.4381721019744873, + "learning_rate": 2.9944333333333334e-05, + "loss": 0.0167, + "step": 10173 + }, + { + "epoch": 8.011027963765262, + "grad_norm": 0.2387073040008545, + "learning_rate": 2.9944e-05, + "loss": 0.0123, + "step": 10174 + }, + { + "epoch": 8.011815675462781, + "grad_norm": 0.30531665682792664, + "learning_rate": 2.994366666666667e-05, + "loss": 0.0117, + "step": 10175 + }, + { + "epoch": 8.012603387160299, + "grad_norm": 0.4245263338088989, + "learning_rate": 2.9943333333333332e-05, + "loss": 0.0109, + "step": 10176 + }, + { + "epoch": 8.013391098857818, + "grad_norm": 0.966298520565033, + "learning_rate": 2.9943e-05, + "loss": 0.0221, + "step": 10177 + }, + { + "epoch": 8.014178810555336, + "grad_norm": 0.2896774113178253, + "learning_rate": 2.9942666666666667e-05, + "loss": 0.0125, + "step": 10178 + }, + { + "epoch": 8.014966522252855, + "grad_norm": 0.28557589650154114, + "learning_rate": 2.9942333333333333e-05, + "loss": 0.0189, + "step": 10179 + }, + { + "epoch": 8.015754233950375, + "grad_norm": 0.18668867647647858, + "learning_rate": 2.9942e-05, + "loss": 0.0055, + "step": 10180 + }, + { + "epoch": 8.016541945647893, + "grad_norm": 0.2853369116783142, + "learning_rate": 2.9941666666666668e-05, + "loss": 0.0174, + "step": 10181 + }, + { + "epoch": 8.017329657345412, + "grad_norm": 0.804125189781189, + "learning_rate": 2.9941333333333334e-05, + "loss": 0.0206, + "step": 10182 + }, + { + "epoch": 8.01811736904293, + "grad_norm": 0.2275409996509552, + "learning_rate": 2.9941e-05, + "loss": 0.0094, + "step": 10183 + }, + { + "epoch": 8.01890508074045, + "grad_norm": 0.3692604899406433, + "learning_rate": 2.994066666666667e-05, + "loss": 0.0163, + "step": 10184 + }, + { + "epoch": 8.019692792437967, + "grad_norm": 0.4432673454284668, + "learning_rate": 2.9940333333333332e-05, + "loss": 0.0192, + "step": 10185 + }, + { + "epoch": 8.020480504135486, + "grad_norm": 0.5080938935279846, + "learning_rate": 2.994e-05, + "loss": 0.0184, + "step": 10186 + }, + { + "epoch": 8.021268215833006, + "grad_norm": 0.6262601613998413, + "learning_rate": 2.9939666666666667e-05, + "loss": 0.0299, + "step": 10187 + }, + { + "epoch": 8.022055927530523, + 
"grad_norm": 0.6657440662384033, + "learning_rate": 2.9939333333333333e-05, + "loss": 0.0189, + "step": 10188 + }, + { + "epoch": 8.022843639228043, + "grad_norm": 0.9580446481704712, + "learning_rate": 2.9939e-05, + "loss": 0.0177, + "step": 10189 + }, + { + "epoch": 8.02363135092556, + "grad_norm": 0.22907082736492157, + "learning_rate": 2.993866666666667e-05, + "loss": 0.0166, + "step": 10190 + }, + { + "epoch": 8.02441906262308, + "grad_norm": 0.3063949942588806, + "learning_rate": 2.9938333333333334e-05, + "loss": 0.0119, + "step": 10191 + }, + { + "epoch": 8.0252067743206, + "grad_norm": 0.3200068473815918, + "learning_rate": 2.9938e-05, + "loss": 0.0195, + "step": 10192 + }, + { + "epoch": 8.025994486018117, + "grad_norm": 0.6104007363319397, + "learning_rate": 2.993766666666667e-05, + "loss": 0.0243, + "step": 10193 + }, + { + "epoch": 8.026782197715637, + "grad_norm": 0.2996874749660492, + "learning_rate": 2.9937333333333332e-05, + "loss": 0.02, + "step": 10194 + }, + { + "epoch": 8.027569909413154, + "grad_norm": 0.44567492604255676, + "learning_rate": 2.9937e-05, + "loss": 0.0179, + "step": 10195 + }, + { + "epoch": 8.028357621110674, + "grad_norm": 0.36115026473999023, + "learning_rate": 2.9936666666666667e-05, + "loss": 0.027, + "step": 10196 + }, + { + "epoch": 8.029145332808191, + "grad_norm": 0.35212621092796326, + "learning_rate": 2.9936333333333333e-05, + "loss": 0.0116, + "step": 10197 + }, + { + "epoch": 8.02993304450571, + "grad_norm": 0.2937287986278534, + "learning_rate": 2.9936000000000003e-05, + "loss": 0.0124, + "step": 10198 + }, + { + "epoch": 8.03072075620323, + "grad_norm": 0.47607168555259705, + "learning_rate": 2.993566666666667e-05, + "loss": 0.0169, + "step": 10199 + }, + { + "epoch": 8.031508467900748, + "grad_norm": 0.9479598999023438, + "learning_rate": 2.9935333333333335e-05, + "loss": 0.0147, + "step": 10200 + }, + { + "epoch": 8.032296179598267, + "grad_norm": 0.4828936755657196, + "learning_rate": 2.9935e-05, + "loss": 0.0384, + "step": 10201 + }, + { + "epoch": 8.033083891295785, + "grad_norm": 0.5216270089149475, + "learning_rate": 2.993466666666667e-05, + "loss": 0.0222, + "step": 10202 + }, + { + "epoch": 8.033871602993305, + "grad_norm": 0.30424565076828003, + "learning_rate": 2.9934333333333332e-05, + "loss": 0.0181, + "step": 10203 + }, + { + "epoch": 8.034659314690822, + "grad_norm": 0.2550046443939209, + "learning_rate": 2.9934000000000002e-05, + "loss": 0.0125, + "step": 10204 + }, + { + "epoch": 8.035447026388342, + "grad_norm": 0.20822645723819733, + "learning_rate": 2.9933666666666664e-05, + "loss": 0.0125, + "step": 10205 + }, + { + "epoch": 8.036234738085861, + "grad_norm": 0.3599098026752472, + "learning_rate": 2.9933333333333334e-05, + "loss": 0.0227, + "step": 10206 + }, + { + "epoch": 8.037022449783379, + "grad_norm": 0.6384773850440979, + "learning_rate": 2.9933000000000003e-05, + "loss": 0.0186, + "step": 10207 + }, + { + "epoch": 8.037810161480898, + "grad_norm": 0.25584667921066284, + "learning_rate": 2.9932666666666665e-05, + "loss": 0.0131, + "step": 10208 + }, + { + "epoch": 8.038597873178416, + "grad_norm": 0.4329455494880676, + "learning_rate": 2.9932333333333335e-05, + "loss": 0.0167, + "step": 10209 + }, + { + "epoch": 8.039385584875935, + "grad_norm": 0.4418920576572418, + "learning_rate": 2.9932e-05, + "loss": 0.0314, + "step": 10210 + }, + { + "epoch": 8.040173296573455, + "grad_norm": 1.1998074054718018, + "learning_rate": 2.9931666666666667e-05, + "loss": 0.3444, + "step": 10211 + }, + { + "epoch": 
8.040961008270973, + "grad_norm": 0.7490217089653015, + "learning_rate": 2.9931333333333333e-05, + "loss": 0.1783, + "step": 10212 + }, + { + "epoch": 8.041748719968492, + "grad_norm": 0.5958465933799744, + "learning_rate": 2.9931000000000002e-05, + "loss": 0.1125, + "step": 10213 + }, + { + "epoch": 8.04253643166601, + "grad_norm": 0.8203449249267578, + "learning_rate": 2.9930666666666668e-05, + "loss": 0.1731, + "step": 10214 + }, + { + "epoch": 8.04332414336353, + "grad_norm": 0.5753868818283081, + "learning_rate": 2.9930333333333334e-05, + "loss": 0.127, + "step": 10215 + }, + { + "epoch": 8.044111855061047, + "grad_norm": 0.6203922033309937, + "learning_rate": 2.9930000000000003e-05, + "loss": 0.0828, + "step": 10216 + }, + { + "epoch": 8.044899566758566, + "grad_norm": 0.35652920603752136, + "learning_rate": 2.9929666666666666e-05, + "loss": 0.0937, + "step": 10217 + }, + { + "epoch": 8.045687278456086, + "grad_norm": 0.6388605833053589, + "learning_rate": 2.9929333333333335e-05, + "loss": 0.0312, + "step": 10218 + }, + { + "epoch": 8.046474990153603, + "grad_norm": 0.26398196816444397, + "learning_rate": 2.9929e-05, + "loss": 0.0205, + "step": 10219 + }, + { + "epoch": 8.047262701851123, + "grad_norm": 0.19294720888137817, + "learning_rate": 2.9928666666666667e-05, + "loss": 0.0187, + "step": 10220 + }, + { + "epoch": 8.04805041354864, + "grad_norm": 0.32789409160614014, + "learning_rate": 2.9928333333333333e-05, + "loss": 0.0439, + "step": 10221 + }, + { + "epoch": 8.04883812524616, + "grad_norm": 0.30728447437286377, + "learning_rate": 2.9928000000000002e-05, + "loss": 0.017, + "step": 10222 + }, + { + "epoch": 8.04962583694368, + "grad_norm": 0.2662770748138428, + "learning_rate": 2.9927666666666668e-05, + "loss": 0.0219, + "step": 10223 + }, + { + "epoch": 8.050413548641197, + "grad_norm": 0.35597020387649536, + "learning_rate": 2.9927333333333334e-05, + "loss": 0.0165, + "step": 10224 + }, + { + "epoch": 8.051201260338717, + "grad_norm": 0.34626373648643494, + "learning_rate": 2.9927000000000003e-05, + "loss": 0.0237, + "step": 10225 + }, + { + "epoch": 8.051988972036234, + "grad_norm": 0.21200600266456604, + "learning_rate": 2.9926666666666666e-05, + "loss": 0.0124, + "step": 10226 + }, + { + "epoch": 8.052776683733754, + "grad_norm": 0.2530820369720459, + "learning_rate": 2.9926333333333335e-05, + "loss": 0.0093, + "step": 10227 + }, + { + "epoch": 8.053564395431271, + "grad_norm": 0.1593167632818222, + "learning_rate": 2.9926e-05, + "loss": 0.0079, + "step": 10228 + }, + { + "epoch": 8.054352107128791, + "grad_norm": 0.349813848733902, + "learning_rate": 2.9925666666666667e-05, + "loss": 0.0336, + "step": 10229 + }, + { + "epoch": 8.05513981882631, + "grad_norm": 0.16703693568706512, + "learning_rate": 2.9925333333333333e-05, + "loss": 0.0063, + "step": 10230 + }, + { + "epoch": 8.055927530523828, + "grad_norm": 0.41472816467285156, + "learning_rate": 2.9925000000000002e-05, + "loss": 0.0116, + "step": 10231 + }, + { + "epoch": 8.056715242221347, + "grad_norm": 0.39166775345802307, + "learning_rate": 2.9924666666666668e-05, + "loss": 0.0214, + "step": 10232 + }, + { + "epoch": 8.057502953918865, + "grad_norm": 0.29738253355026245, + "learning_rate": 2.9924333333333334e-05, + "loss": 0.02, + "step": 10233 + }, + { + "epoch": 8.058290665616385, + "grad_norm": 0.27335575222969055, + "learning_rate": 2.9924e-05, + "loss": 0.0183, + "step": 10234 + }, + { + "epoch": 8.059078377313902, + "grad_norm": 0.4235578179359436, + "learning_rate": 2.9923666666666666e-05, + "loss": 0.0452, 
+ "step": 10235 + }, + { + "epoch": 8.059866089011422, + "grad_norm": 0.38066181540489197, + "learning_rate": 2.9923333333333335e-05, + "loss": 0.016, + "step": 10236 + }, + { + "epoch": 8.060653800708941, + "grad_norm": 0.26323774456977844, + "learning_rate": 2.9922999999999998e-05, + "loss": 0.0165, + "step": 10237 + }, + { + "epoch": 8.061441512406459, + "grad_norm": 0.2850700914859772, + "learning_rate": 2.9922666666666667e-05, + "loss": 0.0127, + "step": 10238 + }, + { + "epoch": 8.062229224103978, + "grad_norm": 0.1850004941225052, + "learning_rate": 2.9922333333333333e-05, + "loss": 0.0089, + "step": 10239 + }, + { + "epoch": 8.063016935801496, + "grad_norm": 0.4786926805973053, + "learning_rate": 2.9922e-05, + "loss": 0.014, + "step": 10240 + }, + { + "epoch": 8.063804647499015, + "grad_norm": 0.40199610590934753, + "learning_rate": 2.992166666666667e-05, + "loss": 0.0139, + "step": 10241 + }, + { + "epoch": 8.064592359196535, + "grad_norm": 0.27797743678092957, + "learning_rate": 2.9921333333333334e-05, + "loss": 0.0144, + "step": 10242 + }, + { + "epoch": 8.065380070894053, + "grad_norm": 0.45722803473472595, + "learning_rate": 2.9921e-05, + "loss": 0.0101, + "step": 10243 + }, + { + "epoch": 8.066167782591572, + "grad_norm": 0.2571103274822235, + "learning_rate": 2.9920666666666666e-05, + "loss": 0.0132, + "step": 10244 + }, + { + "epoch": 8.06695549428909, + "grad_norm": 0.3177914321422577, + "learning_rate": 2.9920333333333336e-05, + "loss": 0.0166, + "step": 10245 + }, + { + "epoch": 8.06774320598661, + "grad_norm": 0.7594206929206848, + "learning_rate": 2.9919999999999998e-05, + "loss": 0.018, + "step": 10246 + }, + { + "epoch": 8.068530917684127, + "grad_norm": 0.22458398342132568, + "learning_rate": 2.9919666666666667e-05, + "loss": 0.012, + "step": 10247 + }, + { + "epoch": 8.069318629381646, + "grad_norm": 0.4348165988922119, + "learning_rate": 2.9919333333333337e-05, + "loss": 0.023, + "step": 10248 + }, + { + "epoch": 8.070106341079166, + "grad_norm": 0.5295814275741577, + "learning_rate": 2.9919e-05, + "loss": 0.0271, + "step": 10249 + }, + { + "epoch": 8.070894052776683, + "grad_norm": 0.38505131006240845, + "learning_rate": 2.991866666666667e-05, + "loss": 0.0095, + "step": 10250 + }, + { + "epoch": 8.071681764474203, + "grad_norm": 0.47454720735549927, + "learning_rate": 2.9918333333333335e-05, + "loss": 0.0231, + "step": 10251 + }, + { + "epoch": 8.07246947617172, + "grad_norm": 0.4150013029575348, + "learning_rate": 2.9918e-05, + "loss": 0.0199, + "step": 10252 + }, + { + "epoch": 8.07325718786924, + "grad_norm": 0.28089502453804016, + "learning_rate": 2.9917666666666666e-05, + "loss": 0.012, + "step": 10253 + }, + { + "epoch": 8.074044899566758, + "grad_norm": 0.356823593378067, + "learning_rate": 2.9917333333333336e-05, + "loss": 0.0175, + "step": 10254 + }, + { + "epoch": 8.074832611264277, + "grad_norm": 0.39370694756507874, + "learning_rate": 2.9917e-05, + "loss": 0.0186, + "step": 10255 + }, + { + "epoch": 8.075620322961797, + "grad_norm": 0.732435405254364, + "learning_rate": 2.9916666666666668e-05, + "loss": 0.0215, + "step": 10256 + }, + { + "epoch": 8.076408034659314, + "grad_norm": 0.35179632902145386, + "learning_rate": 2.9916333333333337e-05, + "loss": 0.0208, + "step": 10257 + }, + { + "epoch": 8.077195746356834, + "grad_norm": 0.18869397044181824, + "learning_rate": 2.9916e-05, + "loss": 0.0076, + "step": 10258 + }, + { + "epoch": 8.077983458054351, + "grad_norm": 0.48112159967422485, + "learning_rate": 2.991566666666667e-05, + "loss": 0.0168, + 
"step": 10259 + }, + { + "epoch": 8.078771169751871, + "grad_norm": 0.4235551357269287, + "learning_rate": 2.9915333333333335e-05, + "loss": 0.0143, + "step": 10260 + }, + { + "epoch": 8.07955888144939, + "grad_norm": 0.7755119800567627, + "learning_rate": 2.9915e-05, + "loss": 0.2192, + "step": 10261 + }, + { + "epoch": 8.080346593146908, + "grad_norm": 0.9564999341964722, + "learning_rate": 2.9914666666666667e-05, + "loss": 0.1989, + "step": 10262 + }, + { + "epoch": 8.081134304844428, + "grad_norm": 0.8694396018981934, + "learning_rate": 2.9914333333333336e-05, + "loss": 0.1288, + "step": 10263 + }, + { + "epoch": 8.081922016541945, + "grad_norm": 0.8710492253303528, + "learning_rate": 2.9914000000000002e-05, + "loss": 0.1176, + "step": 10264 + }, + { + "epoch": 8.082709728239465, + "grad_norm": 0.5327661633491516, + "learning_rate": 2.9913666666666668e-05, + "loss": 0.0676, + "step": 10265 + }, + { + "epoch": 8.083497439936982, + "grad_norm": 0.5332239866256714, + "learning_rate": 2.9913333333333334e-05, + "loss": 0.0499, + "step": 10266 + }, + { + "epoch": 8.084285151634502, + "grad_norm": 0.2794540822505951, + "learning_rate": 2.9913e-05, + "loss": 0.0211, + "step": 10267 + }, + { + "epoch": 8.085072863332021, + "grad_norm": 0.4160428047180176, + "learning_rate": 2.991266666666667e-05, + "loss": 0.0212, + "step": 10268 + }, + { + "epoch": 8.085860575029539, + "grad_norm": 0.40023618936538696, + "learning_rate": 2.991233333333333e-05, + "loss": 0.0209, + "step": 10269 + }, + { + "epoch": 8.086648286727058, + "grad_norm": 0.31184372305870056, + "learning_rate": 2.9912e-05, + "loss": 0.02, + "step": 10270 + }, + { + "epoch": 8.087435998424576, + "grad_norm": 0.34038233757019043, + "learning_rate": 2.9911666666666667e-05, + "loss": 0.0182, + "step": 10271 + }, + { + "epoch": 8.088223710122096, + "grad_norm": 0.3324332535266876, + "learning_rate": 2.9911333333333333e-05, + "loss": 0.0235, + "step": 10272 + }, + { + "epoch": 8.089011421819613, + "grad_norm": 0.2477327138185501, + "learning_rate": 2.9911000000000002e-05, + "loss": 0.0071, + "step": 10273 + }, + { + "epoch": 8.089799133517133, + "grad_norm": 0.19268397986888885, + "learning_rate": 2.9910666666666668e-05, + "loss": 0.0103, + "step": 10274 + }, + { + "epoch": 8.090586845214652, + "grad_norm": 0.16586384177207947, + "learning_rate": 2.9910333333333334e-05, + "loss": 0.0136, + "step": 10275 + }, + { + "epoch": 8.09137455691217, + "grad_norm": 0.32804593443870544, + "learning_rate": 2.991e-05, + "loss": 0.0165, + "step": 10276 + }, + { + "epoch": 8.09216226860969, + "grad_norm": 0.21316325664520264, + "learning_rate": 2.990966666666667e-05, + "loss": 0.0113, + "step": 10277 + }, + { + "epoch": 8.092949980307207, + "grad_norm": 0.37515994906425476, + "learning_rate": 2.9909333333333332e-05, + "loss": 0.0207, + "step": 10278 + }, + { + "epoch": 8.093737692004726, + "grad_norm": 0.5912442803382874, + "learning_rate": 2.9909e-05, + "loss": 0.0087, + "step": 10279 + }, + { + "epoch": 8.094525403702246, + "grad_norm": 0.4961969554424286, + "learning_rate": 2.9908666666666667e-05, + "loss": 0.0165, + "step": 10280 + }, + { + "epoch": 8.095313115399764, + "grad_norm": 1.4628514051437378, + "learning_rate": 2.9908333333333333e-05, + "loss": 0.0213, + "step": 10281 + }, + { + "epoch": 8.096100827097283, + "grad_norm": 0.1502506136894226, + "learning_rate": 2.9908000000000002e-05, + "loss": 0.0109, + "step": 10282 + }, + { + "epoch": 8.0968885387948, + "grad_norm": 0.4311656057834625, + "learning_rate": 2.9907666666666668e-05, + "loss": 
0.0165, + "step": 10283 + }, + { + "epoch": 8.09767625049232, + "grad_norm": 0.5567402839660645, + "learning_rate": 2.9907333333333334e-05, + "loss": 0.0174, + "step": 10284 + }, + { + "epoch": 8.098463962189838, + "grad_norm": 0.3892802894115448, + "learning_rate": 2.9907e-05, + "loss": 0.0165, + "step": 10285 + }, + { + "epoch": 8.099251673887357, + "grad_norm": 0.2212117612361908, + "learning_rate": 2.990666666666667e-05, + "loss": 0.0126, + "step": 10286 + }, + { + "epoch": 8.100039385584877, + "grad_norm": 0.18923574686050415, + "learning_rate": 2.9906333333333332e-05, + "loss": 0.0138, + "step": 10287 + }, + { + "epoch": 8.100827097282394, + "grad_norm": 0.35901689529418945, + "learning_rate": 2.9906e-05, + "loss": 0.0185, + "step": 10288 + }, + { + "epoch": 8.101614808979914, + "grad_norm": 0.38425761461257935, + "learning_rate": 2.9905666666666667e-05, + "loss": 0.0169, + "step": 10289 + }, + { + "epoch": 8.102402520677431, + "grad_norm": 0.6975796818733215, + "learning_rate": 2.9905333333333333e-05, + "loss": 0.0257, + "step": 10290 + }, + { + "epoch": 8.103190232374951, + "grad_norm": 0.5706655979156494, + "learning_rate": 2.9905000000000003e-05, + "loss": 0.0097, + "step": 10291 + }, + { + "epoch": 8.103977944072469, + "grad_norm": 0.5789849162101746, + "learning_rate": 2.990466666666667e-05, + "loss": 0.0237, + "step": 10292 + }, + { + "epoch": 8.104765655769988, + "grad_norm": 0.20858469605445862, + "learning_rate": 2.9904333333333334e-05, + "loss": 0.0089, + "step": 10293 + }, + { + "epoch": 8.105553367467508, + "grad_norm": 0.46498754620552063, + "learning_rate": 2.9904e-05, + "loss": 0.0195, + "step": 10294 + }, + { + "epoch": 8.106341079165025, + "grad_norm": 0.31988975405693054, + "learning_rate": 2.9903666666666666e-05, + "loss": 0.013, + "step": 10295 + }, + { + "epoch": 8.107128790862545, + "grad_norm": 0.3709053099155426, + "learning_rate": 2.9903333333333332e-05, + "loss": 0.0152, + "step": 10296 + }, + { + "epoch": 8.107916502560062, + "grad_norm": 0.4860718846321106, + "learning_rate": 2.9903e-05, + "loss": 0.0143, + "step": 10297 + }, + { + "epoch": 8.108704214257582, + "grad_norm": 0.3752463161945343, + "learning_rate": 2.9902666666666667e-05, + "loss": 0.0147, + "step": 10298 + }, + { + "epoch": 8.109491925955101, + "grad_norm": 0.3754386901855469, + "learning_rate": 2.9902333333333333e-05, + "loss": 0.017, + "step": 10299 + }, + { + "epoch": 8.110279637652619, + "grad_norm": 0.26076605916023254, + "learning_rate": 2.9902000000000003e-05, + "loss": 0.0129, + "step": 10300 + }, + { + "epoch": 8.111067349350138, + "grad_norm": 0.8730480074882507, + "learning_rate": 2.9901666666666665e-05, + "loss": 0.0164, + "step": 10301 + }, + { + "epoch": 8.111855061047656, + "grad_norm": 0.7609413266181946, + "learning_rate": 2.9901333333333335e-05, + "loss": 0.0233, + "step": 10302 + }, + { + "epoch": 8.112642772745176, + "grad_norm": 0.3944171369075775, + "learning_rate": 2.9901e-05, + "loss": 0.0174, + "step": 10303 + }, + { + "epoch": 8.113430484442693, + "grad_norm": 0.5769505500793457, + "learning_rate": 2.9900666666666666e-05, + "loss": 0.0175, + "step": 10304 + }, + { + "epoch": 8.114218196140213, + "grad_norm": 0.5408366322517395, + "learning_rate": 2.9900333333333332e-05, + "loss": 0.0255, + "step": 10305 + }, + { + "epoch": 8.115005907837732, + "grad_norm": 0.23367516696453094, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.0124, + "step": 10306 + }, + { + "epoch": 8.11579361953525, + "grad_norm": 0.6860462427139282, + "learning_rate": 
2.9899666666666668e-05, + "loss": 0.0204, + "step": 10307 + }, + { + "epoch": 8.11658133123277, + "grad_norm": 0.36058834195137024, + "learning_rate": 2.9899333333333334e-05, + "loss": 0.0228, + "step": 10308 + }, + { + "epoch": 8.117369042930287, + "grad_norm": 0.4933391511440277, + "learning_rate": 2.9899000000000003e-05, + "loss": 0.0283, + "step": 10309 + }, + { + "epoch": 8.118156754627806, + "grad_norm": 0.5651419758796692, + "learning_rate": 2.9898666666666665e-05, + "loss": 0.0214, + "step": 10310 + }, + { + "epoch": 8.118944466325324, + "grad_norm": 0.92777019739151, + "learning_rate": 2.9898333333333335e-05, + "loss": 0.2799, + "step": 10311 + }, + { + "epoch": 8.119732178022844, + "grad_norm": 0.6147975325584412, + "learning_rate": 2.9898e-05, + "loss": 0.1339, + "step": 10312 + }, + { + "epoch": 8.120519889720363, + "grad_norm": 0.7315796613693237, + "learning_rate": 2.9897666666666667e-05, + "loss": 0.2061, + "step": 10313 + }, + { + "epoch": 8.12130760141788, + "grad_norm": 0.9801881313323975, + "learning_rate": 2.9897333333333336e-05, + "loss": 0.1244, + "step": 10314 + }, + { + "epoch": 8.1220953131154, + "grad_norm": 0.8003073334693909, + "learning_rate": 2.9897000000000002e-05, + "loss": 0.1652, + "step": 10315 + }, + { + "epoch": 8.122883024812918, + "grad_norm": 0.6599928736686707, + "learning_rate": 2.9896666666666668e-05, + "loss": 0.0755, + "step": 10316 + }, + { + "epoch": 8.123670736510437, + "grad_norm": 0.27035826444625854, + "learning_rate": 2.9896333333333334e-05, + "loss": 0.0306, + "step": 10317 + }, + { + "epoch": 8.124458448207957, + "grad_norm": 0.27442875504493713, + "learning_rate": 2.9896000000000003e-05, + "loss": 0.0293, + "step": 10318 + }, + { + "epoch": 8.125246159905474, + "grad_norm": 0.3282085061073303, + "learning_rate": 2.9895666666666666e-05, + "loss": 0.0185, + "step": 10319 + }, + { + "epoch": 8.126033871602994, + "grad_norm": 0.2823207974433899, + "learning_rate": 2.9895333333333335e-05, + "loss": 0.0336, + "step": 10320 + }, + { + "epoch": 8.126821583300512, + "grad_norm": 0.26794883608818054, + "learning_rate": 2.9895e-05, + "loss": 0.0202, + "step": 10321 + }, + { + "epoch": 8.127609294998031, + "grad_norm": 0.22826483845710754, + "learning_rate": 2.9894666666666667e-05, + "loss": 0.0159, + "step": 10322 + }, + { + "epoch": 8.128397006695549, + "grad_norm": 0.279645174741745, + "learning_rate": 2.9894333333333336e-05, + "loss": 0.0183, + "step": 10323 + }, + { + "epoch": 8.129184718393068, + "grad_norm": 0.24703866243362427, + "learning_rate": 2.9894e-05, + "loss": 0.0176, + "step": 10324 + }, + { + "epoch": 8.129972430090588, + "grad_norm": 0.21563154458999634, + "learning_rate": 2.9893666666666668e-05, + "loss": 0.0148, + "step": 10325 + }, + { + "epoch": 8.130760141788105, + "grad_norm": 0.24278292059898376, + "learning_rate": 2.9893333333333334e-05, + "loss": 0.011, + "step": 10326 + }, + { + "epoch": 8.131547853485625, + "grad_norm": 0.34128206968307495, + "learning_rate": 2.9893e-05, + "loss": 0.0232, + "step": 10327 + }, + { + "epoch": 8.132335565183142, + "grad_norm": 0.2669439911842346, + "learning_rate": 2.9892666666666666e-05, + "loss": 0.0106, + "step": 10328 + }, + { + "epoch": 8.133123276880662, + "grad_norm": 0.19152428209781647, + "learning_rate": 2.9892333333333335e-05, + "loss": 0.0131, + "step": 10329 + }, + { + "epoch": 8.13391098857818, + "grad_norm": 0.2182084023952484, + "learning_rate": 2.9891999999999998e-05, + "loss": 0.0179, + "step": 10330 + }, + { + "epoch": 8.134698700275699, + "grad_norm": 
0.2348005175590515, + "learning_rate": 2.9891666666666667e-05, + "loss": 0.0182, + "step": 10331 + }, + { + "epoch": 8.135486411973218, + "grad_norm": 0.45706284046173096, + "learning_rate": 2.9891333333333336e-05, + "loss": 0.0185, + "step": 10332 + }, + { + "epoch": 8.136274123670736, + "grad_norm": 0.4788086414337158, + "learning_rate": 2.9891e-05, + "loss": 0.0153, + "step": 10333 + }, + { + "epoch": 8.137061835368256, + "grad_norm": 0.3057752847671509, + "learning_rate": 2.9890666666666668e-05, + "loss": 0.0111, + "step": 10334 + }, + { + "epoch": 8.137849547065773, + "grad_norm": 0.40509089827537537, + "learning_rate": 2.9890333333333334e-05, + "loss": 0.0172, + "step": 10335 + }, + { + "epoch": 8.138637258763293, + "grad_norm": 0.29632335901260376, + "learning_rate": 2.989e-05, + "loss": 0.0207, + "step": 10336 + }, + { + "epoch": 8.139424970460812, + "grad_norm": 0.6030861735343933, + "learning_rate": 2.9889666666666666e-05, + "loss": 0.0262, + "step": 10337 + }, + { + "epoch": 8.14021268215833, + "grad_norm": 0.182813823223114, + "learning_rate": 2.9889333333333335e-05, + "loss": 0.013, + "step": 10338 + }, + { + "epoch": 8.14100039385585, + "grad_norm": 0.858906626701355, + "learning_rate": 2.9889e-05, + "loss": 0.0316, + "step": 10339 + }, + { + "epoch": 8.141788105553367, + "grad_norm": 0.21971814334392548, + "learning_rate": 2.9888666666666667e-05, + "loss": 0.0118, + "step": 10340 + }, + { + "epoch": 8.142575817250886, + "grad_norm": 0.43000999093055725, + "learning_rate": 2.9888333333333337e-05, + "loss": 0.0227, + "step": 10341 + }, + { + "epoch": 8.143363528948404, + "grad_norm": 0.339648962020874, + "learning_rate": 2.9888e-05, + "loss": 0.0115, + "step": 10342 + }, + { + "epoch": 8.144151240645924, + "grad_norm": 0.3128739297389984, + "learning_rate": 2.988766666666667e-05, + "loss": 0.0142, + "step": 10343 + }, + { + "epoch": 8.144938952343443, + "grad_norm": 0.24964632093906403, + "learning_rate": 2.9887333333333334e-05, + "loss": 0.0229, + "step": 10344 + }, + { + "epoch": 8.14572666404096, + "grad_norm": 0.8363447785377502, + "learning_rate": 2.9887e-05, + "loss": 0.0179, + "step": 10345 + }, + { + "epoch": 8.14651437573848, + "grad_norm": 0.44501131772994995, + "learning_rate": 2.9886666666666666e-05, + "loss": 0.0287, + "step": 10346 + }, + { + "epoch": 8.147302087435998, + "grad_norm": 0.22439850866794586, + "learning_rate": 2.9886333333333336e-05, + "loss": 0.012, + "step": 10347 + }, + { + "epoch": 8.148089799133517, + "grad_norm": 0.2968738079071045, + "learning_rate": 2.9886e-05, + "loss": 0.015, + "step": 10348 + }, + { + "epoch": 8.148877510831035, + "grad_norm": 0.46798184514045715, + "learning_rate": 2.9885666666666667e-05, + "loss": 0.0184, + "step": 10349 + }, + { + "epoch": 8.149665222528554, + "grad_norm": 0.24010537564754486, + "learning_rate": 2.9885333333333337e-05, + "loss": 0.0139, + "step": 10350 + }, + { + "epoch": 8.150452934226074, + "grad_norm": 0.8671597838401794, + "learning_rate": 2.9885e-05, + "loss": 0.0255, + "step": 10351 + }, + { + "epoch": 8.151240645923592, + "grad_norm": 0.5681260228157043, + "learning_rate": 2.988466666666667e-05, + "loss": 0.0221, + "step": 10352 + }, + { + "epoch": 8.152028357621111, + "grad_norm": 0.4050300419330597, + "learning_rate": 2.9884333333333335e-05, + "loss": 0.0259, + "step": 10353 + }, + { + "epoch": 8.152816069318629, + "grad_norm": 0.30908751487731934, + "learning_rate": 2.9884e-05, + "loss": 0.0159, + "step": 10354 + }, + { + "epoch": 8.153603781016148, + "grad_norm": 0.3829069137573242, + 
"learning_rate": 2.9883666666666666e-05, + "loss": 0.0203, + "step": 10355 + }, + { + "epoch": 8.154391492713668, + "grad_norm": 0.35258132219314575, + "learning_rate": 2.9883333333333332e-05, + "loss": 0.018, + "step": 10356 + }, + { + "epoch": 8.155179204411185, + "grad_norm": 0.2286834567785263, + "learning_rate": 2.9883000000000002e-05, + "loss": 0.0176, + "step": 10357 + }, + { + "epoch": 8.155966916108705, + "grad_norm": 0.25774306058883667, + "learning_rate": 2.9882666666666668e-05, + "loss": 0.0168, + "step": 10358 + }, + { + "epoch": 8.156754627806222, + "grad_norm": 0.39393487572669983, + "learning_rate": 2.9882333333333334e-05, + "loss": 0.0183, + "step": 10359 + }, + { + "epoch": 8.157542339503742, + "grad_norm": 0.8741686940193176, + "learning_rate": 2.9882e-05, + "loss": 0.0167, + "step": 10360 + }, + { + "epoch": 8.15833005120126, + "grad_norm": 0.9404981136322021, + "learning_rate": 2.988166666666667e-05, + "loss": 0.2744, + "step": 10361 + }, + { + "epoch": 8.159117762898779, + "grad_norm": 0.6657750606536865, + "learning_rate": 2.988133333333333e-05, + "loss": 0.1909, + "step": 10362 + }, + { + "epoch": 8.159905474596298, + "grad_norm": 0.6756545901298523, + "learning_rate": 2.9881e-05, + "loss": 0.1773, + "step": 10363 + }, + { + "epoch": 8.160693186293816, + "grad_norm": 0.6386479139328003, + "learning_rate": 2.988066666666667e-05, + "loss": 0.1103, + "step": 10364 + }, + { + "epoch": 8.161480897991336, + "grad_norm": 0.5115734338760376, + "learning_rate": 2.9880333333333333e-05, + "loss": 0.0742, + "step": 10365 + }, + { + "epoch": 8.162268609688853, + "grad_norm": 0.47949132323265076, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.0364, + "step": 10366 + }, + { + "epoch": 8.163056321386373, + "grad_norm": 0.48135024309158325, + "learning_rate": 2.9879666666666668e-05, + "loss": 0.0255, + "step": 10367 + }, + { + "epoch": 8.16384403308389, + "grad_norm": 0.33679237961769104, + "learning_rate": 2.9879333333333334e-05, + "loss": 0.0252, + "step": 10368 + }, + { + "epoch": 8.16463174478141, + "grad_norm": 0.23328308761119843, + "learning_rate": 2.9879e-05, + "loss": 0.0215, + "step": 10369 + }, + { + "epoch": 8.16541945647893, + "grad_norm": 0.22649076581001282, + "learning_rate": 2.987866666666667e-05, + "loss": 0.0217, + "step": 10370 + }, + { + "epoch": 8.166207168176447, + "grad_norm": 0.3037770092487335, + "learning_rate": 2.987833333333333e-05, + "loss": 0.016, + "step": 10371 + }, + { + "epoch": 8.166994879873966, + "grad_norm": 0.6223933696746826, + "learning_rate": 2.9878e-05, + "loss": 0.0311, + "step": 10372 + }, + { + "epoch": 8.167782591571484, + "grad_norm": 0.3116408586502075, + "learning_rate": 2.987766666666667e-05, + "loss": 0.0142, + "step": 10373 + }, + { + "epoch": 8.168570303269004, + "grad_norm": 0.6625848412513733, + "learning_rate": 2.9877333333333333e-05, + "loss": 0.0142, + "step": 10374 + }, + { + "epoch": 8.169358014966523, + "grad_norm": 0.38300591707229614, + "learning_rate": 2.9877000000000002e-05, + "loss": 0.0299, + "step": 10375 + }, + { + "epoch": 8.17014572666404, + "grad_norm": 0.26139315962791443, + "learning_rate": 2.9876666666666668e-05, + "loss": 0.0173, + "step": 10376 + }, + { + "epoch": 8.17093343836156, + "grad_norm": 0.4398151934146881, + "learning_rate": 2.9876333333333334e-05, + "loss": 0.0186, + "step": 10377 + }, + { + "epoch": 8.171721150059078, + "grad_norm": 0.2104344666004181, + "learning_rate": 2.9876e-05, + "loss": 0.0092, + "step": 10378 + }, + { + "epoch": 8.172508861756597, + "grad_norm": 
0.45195043087005615, + "learning_rate": 2.987566666666667e-05, + "loss": 0.0314, + "step": 10379 + }, + { + "epoch": 8.173296573454115, + "grad_norm": 0.30751144886016846, + "learning_rate": 2.9875333333333332e-05, + "loss": 0.017, + "step": 10380 + }, + { + "epoch": 8.174084285151634, + "grad_norm": 0.2732676565647125, + "learning_rate": 2.9875e-05, + "loss": 0.0169, + "step": 10381 + }, + { + "epoch": 8.174871996849154, + "grad_norm": 0.38715261220932007, + "learning_rate": 2.987466666666667e-05, + "loss": 0.0166, + "step": 10382 + }, + { + "epoch": 8.175659708546672, + "grad_norm": 0.8578219413757324, + "learning_rate": 2.9874333333333333e-05, + "loss": 0.0225, + "step": 10383 + }, + { + "epoch": 8.176447420244191, + "grad_norm": 0.287971168756485, + "learning_rate": 2.9874000000000002e-05, + "loss": 0.0235, + "step": 10384 + }, + { + "epoch": 8.177235131941709, + "grad_norm": 1.1630918979644775, + "learning_rate": 2.9873666666666665e-05, + "loss": 0.0162, + "step": 10385 + }, + { + "epoch": 8.178022843639228, + "grad_norm": 0.3569315969944, + "learning_rate": 2.9873333333333334e-05, + "loss": 0.0199, + "step": 10386 + }, + { + "epoch": 8.178810555336748, + "grad_norm": 0.17130562663078308, + "learning_rate": 2.9873e-05, + "loss": 0.0128, + "step": 10387 + }, + { + "epoch": 8.179598267034265, + "grad_norm": 0.37489134073257446, + "learning_rate": 2.9872666666666666e-05, + "loss": 0.0317, + "step": 10388 + }, + { + "epoch": 8.180385978731785, + "grad_norm": 0.4196975827217102, + "learning_rate": 2.9872333333333335e-05, + "loss": 0.0359, + "step": 10389 + }, + { + "epoch": 8.181173690429302, + "grad_norm": 0.40644022822380066, + "learning_rate": 2.9872e-05, + "loss": 0.0162, + "step": 10390 + }, + { + "epoch": 8.181961402126822, + "grad_norm": 0.5709950923919678, + "learning_rate": 2.9871666666666667e-05, + "loss": 0.0267, + "step": 10391 + }, + { + "epoch": 8.18274911382434, + "grad_norm": 0.24473421275615692, + "learning_rate": 2.9871333333333333e-05, + "loss": 0.0217, + "step": 10392 + }, + { + "epoch": 8.183536825521859, + "grad_norm": 0.1782781183719635, + "learning_rate": 2.9871000000000003e-05, + "loss": 0.0153, + "step": 10393 + }, + { + "epoch": 8.184324537219378, + "grad_norm": 0.40607136487960815, + "learning_rate": 2.9870666666666665e-05, + "loss": 0.0121, + "step": 10394 + }, + { + "epoch": 8.185112248916896, + "grad_norm": 0.32413211464881897, + "learning_rate": 2.9870333333333334e-05, + "loss": 0.0284, + "step": 10395 + }, + { + "epoch": 8.185899960614416, + "grad_norm": 0.15707148611545563, + "learning_rate": 2.987e-05, + "loss": 0.0106, + "step": 10396 + }, + { + "epoch": 8.186687672311933, + "grad_norm": 0.24485492706298828, + "learning_rate": 2.9869666666666666e-05, + "loss": 0.0167, + "step": 10397 + }, + { + "epoch": 8.187475384009453, + "grad_norm": 0.28456932306289673, + "learning_rate": 2.9869333333333336e-05, + "loss": 0.0161, + "step": 10398 + }, + { + "epoch": 8.18826309570697, + "grad_norm": 0.5844255685806274, + "learning_rate": 2.9869e-05, + "loss": 0.0152, + "step": 10399 + }, + { + "epoch": 8.18905080740449, + "grad_norm": 0.33800652623176575, + "learning_rate": 2.9868666666666667e-05, + "loss": 0.021, + "step": 10400 + }, + { + "epoch": 8.18983851910201, + "grad_norm": 0.3740233778953552, + "learning_rate": 2.9868333333333333e-05, + "loss": 0.0133, + "step": 10401 + }, + { + "epoch": 8.190626230799527, + "grad_norm": 0.37843337655067444, + "learning_rate": 2.9868000000000003e-05, + "loss": 0.017, + "step": 10402 + }, + { + "epoch": 8.191413942497046, + 
"grad_norm": 0.29832541942596436, + "learning_rate": 2.9867666666666665e-05, + "loss": 0.0165, + "step": 10403 + }, + { + "epoch": 8.192201654194564, + "grad_norm": 0.2115269899368286, + "learning_rate": 2.9867333333333335e-05, + "loss": 0.0105, + "step": 10404 + }, + { + "epoch": 8.192989365892084, + "grad_norm": 1.35945463180542, + "learning_rate": 2.9867e-05, + "loss": 0.0203, + "step": 10405 + }, + { + "epoch": 8.193777077589603, + "grad_norm": 0.2672954201698303, + "learning_rate": 2.9866666666666666e-05, + "loss": 0.0143, + "step": 10406 + }, + { + "epoch": 8.19456478928712, + "grad_norm": 0.34780535101890564, + "learning_rate": 2.9866333333333336e-05, + "loss": 0.0176, + "step": 10407 + }, + { + "epoch": 8.19535250098464, + "grad_norm": 0.29689672589302063, + "learning_rate": 2.9866000000000002e-05, + "loss": 0.0161, + "step": 10408 + }, + { + "epoch": 8.196140212682158, + "grad_norm": 1.0491951704025269, + "learning_rate": 2.9865666666666668e-05, + "loss": 0.025, + "step": 10409 + }, + { + "epoch": 8.196927924379677, + "grad_norm": 0.6083102226257324, + "learning_rate": 2.9865333333333334e-05, + "loss": 0.0288, + "step": 10410 + }, + { + "epoch": 8.197715636077195, + "grad_norm": 1.0310370922088623, + "learning_rate": 2.9865000000000003e-05, + "loss": 0.3082, + "step": 10411 + }, + { + "epoch": 8.198503347774714, + "grad_norm": 0.5881170630455017, + "learning_rate": 2.9864666666666665e-05, + "loss": 0.1824, + "step": 10412 + }, + { + "epoch": 8.199291059472234, + "grad_norm": 0.677493155002594, + "learning_rate": 2.9864333333333335e-05, + "loss": 0.1172, + "step": 10413 + }, + { + "epoch": 8.200078771169752, + "grad_norm": 0.495048850774765, + "learning_rate": 2.9864000000000004e-05, + "loss": 0.112, + "step": 10414 + }, + { + "epoch": 8.200866482867271, + "grad_norm": 0.6196361184120178, + "learning_rate": 2.9863666666666667e-05, + "loss": 0.079, + "step": 10415 + }, + { + "epoch": 8.201654194564789, + "grad_norm": 0.35115885734558105, + "learning_rate": 2.9863333333333336e-05, + "loss": 0.0354, + "step": 10416 + }, + { + "epoch": 8.202441906262308, + "grad_norm": 0.3673415780067444, + "learning_rate": 2.9863e-05, + "loss": 0.0483, + "step": 10417 + }, + { + "epoch": 8.203229617959826, + "grad_norm": 0.6089237928390503, + "learning_rate": 2.9862666666666668e-05, + "loss": 0.0818, + "step": 10418 + }, + { + "epoch": 8.204017329657345, + "grad_norm": 0.4350326359272003, + "learning_rate": 2.9862333333333334e-05, + "loss": 0.0205, + "step": 10419 + }, + { + "epoch": 8.204805041354865, + "grad_norm": 0.3120214343070984, + "learning_rate": 2.9862e-05, + "loss": 0.0196, + "step": 10420 + }, + { + "epoch": 8.205592753052382, + "grad_norm": 0.3568422198295593, + "learning_rate": 2.9861666666666666e-05, + "loss": 0.0139, + "step": 10421 + }, + { + "epoch": 8.206380464749902, + "grad_norm": 0.549401044845581, + "learning_rate": 2.9861333333333335e-05, + "loss": 0.0253, + "step": 10422 + }, + { + "epoch": 8.20716817644742, + "grad_norm": 0.3892384469509125, + "learning_rate": 2.9861e-05, + "loss": 0.018, + "step": 10423 + }, + { + "epoch": 8.207955888144939, + "grad_norm": 0.3041032552719116, + "learning_rate": 2.9860666666666667e-05, + "loss": 0.0183, + "step": 10424 + }, + { + "epoch": 8.208743599842458, + "grad_norm": 0.17619521915912628, + "learning_rate": 2.9860333333333336e-05, + "loss": 0.0095, + "step": 10425 + }, + { + "epoch": 8.209531311539976, + "grad_norm": 0.4718153476715088, + "learning_rate": 2.986e-05, + "loss": 0.0339, + "step": 10426 + }, + { + "epoch": 8.210319023237496, 
+ "grad_norm": 0.2131820172071457, + "learning_rate": 2.9859666666666668e-05, + "loss": 0.0143, + "step": 10427 + }, + { + "epoch": 8.211106734935013, + "grad_norm": 0.1769915521144867, + "learning_rate": 2.9859333333333334e-05, + "loss": 0.0115, + "step": 10428 + }, + { + "epoch": 8.211894446632533, + "grad_norm": 0.35751551389694214, + "learning_rate": 2.9859e-05, + "loss": 0.0134, + "step": 10429 + }, + { + "epoch": 8.21268215833005, + "grad_norm": 0.880347490310669, + "learning_rate": 2.9858666666666666e-05, + "loss": 0.0235, + "step": 10430 + }, + { + "epoch": 8.21346987002757, + "grad_norm": 0.30855095386505127, + "learning_rate": 2.9858333333333335e-05, + "loss": 0.0179, + "step": 10431 + }, + { + "epoch": 8.21425758172509, + "grad_norm": 0.2689335346221924, + "learning_rate": 2.9858e-05, + "loss": 0.015, + "step": 10432 + }, + { + "epoch": 8.215045293422607, + "grad_norm": 0.28946205973625183, + "learning_rate": 2.9857666666666667e-05, + "loss": 0.0175, + "step": 10433 + }, + { + "epoch": 8.215833005120126, + "grad_norm": 0.49231043457984924, + "learning_rate": 2.9857333333333336e-05, + "loss": 0.0143, + "step": 10434 + }, + { + "epoch": 8.216620716817644, + "grad_norm": 0.2908076345920563, + "learning_rate": 2.9857e-05, + "loss": 0.0144, + "step": 10435 + }, + { + "epoch": 8.217408428515164, + "grad_norm": 0.3453325927257538, + "learning_rate": 2.9856666666666668e-05, + "loss": 0.0176, + "step": 10436 + }, + { + "epoch": 8.218196140212681, + "grad_norm": 0.4302380084991455, + "learning_rate": 2.9856333333333334e-05, + "loss": 0.0111, + "step": 10437 + }, + { + "epoch": 8.2189838519102, + "grad_norm": 0.7425429224967957, + "learning_rate": 2.9856e-05, + "loss": 0.0132, + "step": 10438 + }, + { + "epoch": 8.21977156360772, + "grad_norm": 0.24565130472183228, + "learning_rate": 2.985566666666667e-05, + "loss": 0.0171, + "step": 10439 + }, + { + "epoch": 8.220559275305238, + "grad_norm": 1.5062533617019653, + "learning_rate": 2.9855333333333335e-05, + "loss": 0.0359, + "step": 10440 + }, + { + "epoch": 8.221346987002757, + "grad_norm": 0.21495556831359863, + "learning_rate": 2.9855e-05, + "loss": 0.0103, + "step": 10441 + }, + { + "epoch": 8.222134698700275, + "grad_norm": 0.31788086891174316, + "learning_rate": 2.9854666666666667e-05, + "loss": 0.0179, + "step": 10442 + }, + { + "epoch": 8.222922410397794, + "grad_norm": 0.39422062039375305, + "learning_rate": 2.9854333333333337e-05, + "loss": 0.0247, + "step": 10443 + }, + { + "epoch": 8.223710122095314, + "grad_norm": 0.6112008690834045, + "learning_rate": 2.9854e-05, + "loss": 0.0178, + "step": 10444 + }, + { + "epoch": 8.224497833792832, + "grad_norm": 0.5119112133979797, + "learning_rate": 2.985366666666667e-05, + "loss": 0.0218, + "step": 10445 + }, + { + "epoch": 8.225285545490351, + "grad_norm": 0.36460933089256287, + "learning_rate": 2.985333333333333e-05, + "loss": 0.02, + "step": 10446 + }, + { + "epoch": 8.226073257187869, + "grad_norm": 0.646962583065033, + "learning_rate": 2.9853e-05, + "loss": 0.0264, + "step": 10447 + }, + { + "epoch": 8.226860968885388, + "grad_norm": 0.2905716300010681, + "learning_rate": 2.985266666666667e-05, + "loss": 0.0155, + "step": 10448 + }, + { + "epoch": 8.227648680582906, + "grad_norm": 0.2315973937511444, + "learning_rate": 2.9852333333333332e-05, + "loss": 0.0161, + "step": 10449 + }, + { + "epoch": 8.228436392280425, + "grad_norm": 0.21110914647579193, + "learning_rate": 2.9852e-05, + "loss": 0.0104, + "step": 10450 + }, + { + "epoch": 8.229224103977945, + "grad_norm": 
0.5347280502319336, + "learning_rate": 2.9851666666666667e-05, + "loss": 0.0162, + "step": 10451 + }, + { + "epoch": 8.230011815675462, + "grad_norm": 0.494447261095047, + "learning_rate": 2.9851333333333333e-05, + "loss": 0.0191, + "step": 10452 + }, + { + "epoch": 8.230799527372982, + "grad_norm": 0.41303181648254395, + "learning_rate": 2.9851e-05, + "loss": 0.0219, + "step": 10453 + }, + { + "epoch": 8.2315872390705, + "grad_norm": 0.4228985011577606, + "learning_rate": 2.985066666666667e-05, + "loss": 0.0268, + "step": 10454 + }, + { + "epoch": 8.232374950768019, + "grad_norm": 0.7302096486091614, + "learning_rate": 2.985033333333333e-05, + "loss": 0.0228, + "step": 10455 + }, + { + "epoch": 8.233162662465537, + "grad_norm": 0.6263740658760071, + "learning_rate": 2.985e-05, + "loss": 0.0253, + "step": 10456 + }, + { + "epoch": 8.233950374163056, + "grad_norm": 0.4000450372695923, + "learning_rate": 2.984966666666667e-05, + "loss": 0.0173, + "step": 10457 + }, + { + "epoch": 8.234738085860576, + "grad_norm": 0.2574838697910309, + "learning_rate": 2.9849333333333332e-05, + "loss": 0.0091, + "step": 10458 + }, + { + "epoch": 8.235525797558093, + "grad_norm": 0.5962544083595276, + "learning_rate": 2.9849000000000002e-05, + "loss": 0.0193, + "step": 10459 + }, + { + "epoch": 8.236313509255613, + "grad_norm": 0.6565743684768677, + "learning_rate": 2.9848666666666668e-05, + "loss": 0.0325, + "step": 10460 + }, + { + "epoch": 8.23710122095313, + "grad_norm": 0.886336088180542, + "learning_rate": 2.9848333333333334e-05, + "loss": 0.2846, + "step": 10461 + }, + { + "epoch": 8.23788893265065, + "grad_norm": 0.6724815368652344, + "learning_rate": 2.9848e-05, + "loss": 0.1745, + "step": 10462 + }, + { + "epoch": 8.23867664434817, + "grad_norm": 0.4765651822090149, + "learning_rate": 2.984766666666667e-05, + "loss": 0.1088, + "step": 10463 + }, + { + "epoch": 8.239464356045687, + "grad_norm": 0.7183516025543213, + "learning_rate": 2.9847333333333335e-05, + "loss": 0.1339, + "step": 10464 + }, + { + "epoch": 8.240252067743207, + "grad_norm": 0.749940812587738, + "learning_rate": 2.9847e-05, + "loss": 0.088, + "step": 10465 + }, + { + "epoch": 8.241039779440724, + "grad_norm": 0.5086843967437744, + "learning_rate": 2.984666666666667e-05, + "loss": 0.0314, + "step": 10466 + }, + { + "epoch": 8.241827491138244, + "grad_norm": 0.29895979166030884, + "learning_rate": 2.9846333333333333e-05, + "loss": 0.0249, + "step": 10467 + }, + { + "epoch": 8.242615202835761, + "grad_norm": 0.42716699838638306, + "learning_rate": 2.9846000000000002e-05, + "loss": 0.0228, + "step": 10468 + }, + { + "epoch": 8.24340291453328, + "grad_norm": 0.29286590218544006, + "learning_rate": 2.9845666666666668e-05, + "loss": 0.0181, + "step": 10469 + }, + { + "epoch": 8.2441906262308, + "grad_norm": 0.33467453718185425, + "learning_rate": 2.9845333333333334e-05, + "loss": 0.0185, + "step": 10470 + }, + { + "epoch": 8.244978337928318, + "grad_norm": 0.25840359926223755, + "learning_rate": 2.9845e-05, + "loss": 0.0147, + "step": 10471 + }, + { + "epoch": 8.245766049625837, + "grad_norm": 0.3634125292301178, + "learning_rate": 2.984466666666667e-05, + "loss": 0.0268, + "step": 10472 + }, + { + "epoch": 8.246553761323355, + "grad_norm": 0.42682456970214844, + "learning_rate": 2.9844333333333335e-05, + "loss": 0.0156, + "step": 10473 + }, + { + "epoch": 8.247341473020875, + "grad_norm": 0.22838366031646729, + "learning_rate": 2.9844e-05, + "loss": 0.0164, + "step": 10474 + }, + { + "epoch": 8.248129184718394, + "grad_norm": 
0.24474339187145233, + "learning_rate": 2.9843666666666667e-05, + "loss": 0.0102, + "step": 10475 + }, + { + "epoch": 8.248916896415912, + "grad_norm": 0.34466463327407837, + "learning_rate": 2.9843333333333333e-05, + "loss": 0.0261, + "step": 10476 + }, + { + "epoch": 8.249704608113431, + "grad_norm": 0.20582929253578186, + "learning_rate": 2.9843000000000002e-05, + "loss": 0.0153, + "step": 10477 + }, + { + "epoch": 8.250492319810949, + "grad_norm": 0.42448022961616516, + "learning_rate": 2.9842666666666665e-05, + "loss": 0.0152, + "step": 10478 + }, + { + "epoch": 8.251280031508468, + "grad_norm": 0.534959614276886, + "learning_rate": 2.9842333333333334e-05, + "loss": 0.0203, + "step": 10479 + }, + { + "epoch": 8.252067743205986, + "grad_norm": 0.22823788225650787, + "learning_rate": 2.9842e-05, + "loss": 0.0191, + "step": 10480 + }, + { + "epoch": 8.252855454903505, + "grad_norm": 0.3624500334262848, + "learning_rate": 2.9841666666666666e-05, + "loss": 0.0186, + "step": 10481 + }, + { + "epoch": 8.253643166601025, + "grad_norm": 0.36491572856903076, + "learning_rate": 2.9841333333333335e-05, + "loss": 0.016, + "step": 10482 + }, + { + "epoch": 8.254430878298542, + "grad_norm": 0.26491811871528625, + "learning_rate": 2.9841e-05, + "loss": 0.0126, + "step": 10483 + }, + { + "epoch": 8.255218589996062, + "grad_norm": 0.32639873027801514, + "learning_rate": 2.9840666666666667e-05, + "loss": 0.0133, + "step": 10484 + }, + { + "epoch": 8.25600630169358, + "grad_norm": 0.6183876395225525, + "learning_rate": 2.9840333333333333e-05, + "loss": 0.0909, + "step": 10485 + }, + { + "epoch": 8.256794013391099, + "grad_norm": 0.11379077285528183, + "learning_rate": 2.9840000000000002e-05, + "loss": 0.0056, + "step": 10486 + }, + { + "epoch": 8.257581725088617, + "grad_norm": 0.8239457011222839, + "learning_rate": 2.9839666666666665e-05, + "loss": 0.0175, + "step": 10487 + }, + { + "epoch": 8.258369436786136, + "grad_norm": 0.22985190153121948, + "learning_rate": 2.9839333333333334e-05, + "loss": 0.0087, + "step": 10488 + }, + { + "epoch": 8.259157148483656, + "grad_norm": 0.394131064414978, + "learning_rate": 2.9839000000000003e-05, + "loss": 0.0184, + "step": 10489 + }, + { + "epoch": 8.259944860181173, + "grad_norm": 0.29664650559425354, + "learning_rate": 2.9838666666666666e-05, + "loss": 0.0151, + "step": 10490 + }, + { + "epoch": 8.260732571878693, + "grad_norm": 0.40132537484169006, + "learning_rate": 2.9838333333333335e-05, + "loss": 0.0141, + "step": 10491 + }, + { + "epoch": 8.26152028357621, + "grad_norm": 0.29755890369415283, + "learning_rate": 2.9838e-05, + "loss": 0.0132, + "step": 10492 + }, + { + "epoch": 8.26230799527373, + "grad_norm": 0.5002989768981934, + "learning_rate": 2.9837666666666667e-05, + "loss": 0.0216, + "step": 10493 + }, + { + "epoch": 8.26309570697125, + "grad_norm": 0.3670302629470825, + "learning_rate": 2.9837333333333333e-05, + "loss": 0.021, + "step": 10494 + }, + { + "epoch": 8.263883418668767, + "grad_norm": 0.27928459644317627, + "learning_rate": 2.9837000000000002e-05, + "loss": 0.0143, + "step": 10495 + }, + { + "epoch": 8.264671130366287, + "grad_norm": 0.41846075654029846, + "learning_rate": 2.9836666666666665e-05, + "loss": 0.0131, + "step": 10496 + }, + { + "epoch": 8.265458842063804, + "grad_norm": 0.43608030676841736, + "learning_rate": 2.9836333333333334e-05, + "loss": 0.0145, + "step": 10497 + }, + { + "epoch": 8.266246553761324, + "grad_norm": 0.319440096616745, + "learning_rate": 2.9836000000000004e-05, + "loss": 0.0157, + "step": 10498 + }, + { + 
"epoch": 8.267034265458841, + "grad_norm": 0.39590147137641907, + "learning_rate": 2.9835666666666666e-05, + "loss": 0.0211, + "step": 10499 + }, + { + "epoch": 8.26782197715636, + "grad_norm": 1.2464874982833862, + "learning_rate": 2.9835333333333336e-05, + "loss": 0.0378, + "step": 10500 + }, + { + "epoch": 8.26860968885388, + "grad_norm": 0.18213622272014618, + "learning_rate": 2.9835e-05, + "loss": 0.0095, + "step": 10501 + }, + { + "epoch": 8.269397400551398, + "grad_norm": 0.3471669852733612, + "learning_rate": 2.9834666666666667e-05, + "loss": 0.0227, + "step": 10502 + }, + { + "epoch": 8.270185112248917, + "grad_norm": 0.345306932926178, + "learning_rate": 2.9834333333333333e-05, + "loss": 0.0138, + "step": 10503 + }, + { + "epoch": 8.270972823946435, + "grad_norm": 0.3115245997905731, + "learning_rate": 2.9834000000000003e-05, + "loss": 0.0251, + "step": 10504 + }, + { + "epoch": 8.271760535643955, + "grad_norm": 0.2742913067340851, + "learning_rate": 2.9833666666666665e-05, + "loss": 0.0219, + "step": 10505 + }, + { + "epoch": 8.272548247341472, + "grad_norm": 0.5988137125968933, + "learning_rate": 2.9833333333333335e-05, + "loss": 0.0322, + "step": 10506 + }, + { + "epoch": 8.273335959038992, + "grad_norm": 0.5687007904052734, + "learning_rate": 2.9833e-05, + "loss": 0.0431, + "step": 10507 + }, + { + "epoch": 8.274123670736511, + "grad_norm": 0.3829110562801361, + "learning_rate": 2.9832666666666666e-05, + "loss": 0.0159, + "step": 10508 + }, + { + "epoch": 8.274911382434029, + "grad_norm": 0.32351621985435486, + "learning_rate": 2.9832333333333336e-05, + "loss": 0.0266, + "step": 10509 + }, + { + "epoch": 8.275699094131548, + "grad_norm": 2.019090175628662, + "learning_rate": 2.9831999999999998e-05, + "loss": 0.0334, + "step": 10510 + }, + { + "epoch": 8.276486805829066, + "grad_norm": 0.9476379752159119, + "learning_rate": 2.9831666666666668e-05, + "loss": 0.2746, + "step": 10511 + }, + { + "epoch": 8.277274517526585, + "grad_norm": 0.786779522895813, + "learning_rate": 2.9831333333333334e-05, + "loss": 0.2062, + "step": 10512 + }, + { + "epoch": 8.278062229224105, + "grad_norm": 0.4649220108985901, + "learning_rate": 2.9831e-05, + "loss": 0.1304, + "step": 10513 + }, + { + "epoch": 8.278849940921623, + "grad_norm": 0.6584292650222778, + "learning_rate": 2.983066666666667e-05, + "loss": 0.1116, + "step": 10514 + }, + { + "epoch": 8.279637652619142, + "grad_norm": 0.5439666509628296, + "learning_rate": 2.9830333333333335e-05, + "loss": 0.1417, + "step": 10515 + }, + { + "epoch": 8.28042536431666, + "grad_norm": 0.6109281778335571, + "learning_rate": 2.983e-05, + "loss": 0.0861, + "step": 10516 + }, + { + "epoch": 8.281213076014179, + "grad_norm": 0.29642388224601746, + "learning_rate": 2.9829666666666667e-05, + "loss": 0.0499, + "step": 10517 + }, + { + "epoch": 8.282000787711697, + "grad_norm": 0.3207237720489502, + "learning_rate": 2.9829333333333336e-05, + "loss": 0.0237, + "step": 10518 + }, + { + "epoch": 8.282788499409216, + "grad_norm": 0.2672717273235321, + "learning_rate": 2.9829e-05, + "loss": 0.019, + "step": 10519 + }, + { + "epoch": 8.283576211106736, + "grad_norm": 0.2943597137928009, + "learning_rate": 2.9828666666666668e-05, + "loss": 0.0134, + "step": 10520 + }, + { + "epoch": 8.284363922804253, + "grad_norm": 0.2521344721317291, + "learning_rate": 2.9828333333333334e-05, + "loss": 0.0211, + "step": 10521 + }, + { + "epoch": 8.285151634501773, + "grad_norm": 0.3420548439025879, + "learning_rate": 2.9828e-05, + "loss": 0.0165, + "step": 10522 + }, + { + 
"epoch": 8.28593934619929, + "grad_norm": 0.17643024027347565, + "learning_rate": 2.982766666666667e-05, + "loss": 0.0135, + "step": 10523 + }, + { + "epoch": 8.28672705789681, + "grad_norm": 0.21830834448337555, + "learning_rate": 2.9827333333333335e-05, + "loss": 0.015, + "step": 10524 + }, + { + "epoch": 8.287514769594328, + "grad_norm": 0.15949103236198425, + "learning_rate": 2.9827e-05, + "loss": 0.0101, + "step": 10525 + }, + { + "epoch": 8.288302481291847, + "grad_norm": 0.1883346140384674, + "learning_rate": 2.9826666666666667e-05, + "loss": 0.0226, + "step": 10526 + }, + { + "epoch": 8.289090192989367, + "grad_norm": 0.2461141049861908, + "learning_rate": 2.9826333333333336e-05, + "loss": 0.0187, + "step": 10527 + }, + { + "epoch": 8.289877904686884, + "grad_norm": 0.24969229102134705, + "learning_rate": 2.9826e-05, + "loss": 0.014, + "step": 10528 + }, + { + "epoch": 8.290665616384404, + "grad_norm": 0.2501179873943329, + "learning_rate": 2.9825666666666668e-05, + "loss": 0.0244, + "step": 10529 + }, + { + "epoch": 8.291453328081921, + "grad_norm": 0.18687458336353302, + "learning_rate": 2.9825333333333334e-05, + "loss": 0.006, + "step": 10530 + }, + { + "epoch": 8.29224103977944, + "grad_norm": 0.27286940813064575, + "learning_rate": 2.9825e-05, + "loss": 0.0137, + "step": 10531 + }, + { + "epoch": 8.29302875147696, + "grad_norm": 0.16923896968364716, + "learning_rate": 2.982466666666667e-05, + "loss": 0.0139, + "step": 10532 + }, + { + "epoch": 8.293816463174478, + "grad_norm": 0.27593716979026794, + "learning_rate": 2.9824333333333335e-05, + "loss": 0.0503, + "step": 10533 + }, + { + "epoch": 8.294604174871997, + "grad_norm": 0.2987075448036194, + "learning_rate": 2.9824e-05, + "loss": 0.0107, + "step": 10534 + }, + { + "epoch": 8.295391886569515, + "grad_norm": 0.20493541657924652, + "learning_rate": 2.9823666666666667e-05, + "loss": 0.0094, + "step": 10535 + }, + { + "epoch": 8.296179598267035, + "grad_norm": 0.22835414111614227, + "learning_rate": 2.9823333333333333e-05, + "loss": 0.012, + "step": 10536 + }, + { + "epoch": 8.296967309964552, + "grad_norm": 0.2880689799785614, + "learning_rate": 2.9823e-05, + "loss": 0.014, + "step": 10537 + }, + { + "epoch": 8.297755021662072, + "grad_norm": 0.452187716960907, + "learning_rate": 2.9822666666666668e-05, + "loss": 0.0191, + "step": 10538 + }, + { + "epoch": 8.298542733359591, + "grad_norm": 0.2822670042514801, + "learning_rate": 2.9822333333333334e-05, + "loss": 0.0157, + "step": 10539 + }, + { + "epoch": 8.299330445057109, + "grad_norm": 0.2701793313026428, + "learning_rate": 2.9822e-05, + "loss": 0.0153, + "step": 10540 + }, + { + "epoch": 8.300118156754628, + "grad_norm": 0.370132178068161, + "learning_rate": 2.982166666666667e-05, + "loss": 0.0174, + "step": 10541 + }, + { + "epoch": 8.300905868452146, + "grad_norm": 0.25163352489471436, + "learning_rate": 2.9821333333333332e-05, + "loss": 0.0078, + "step": 10542 + }, + { + "epoch": 8.301693580149665, + "grad_norm": 0.17673085629940033, + "learning_rate": 2.9821e-05, + "loss": 0.0089, + "step": 10543 + }, + { + "epoch": 8.302481291847183, + "grad_norm": 0.31134578585624695, + "learning_rate": 2.9820666666666667e-05, + "loss": 0.0115, + "step": 10544 + }, + { + "epoch": 8.303269003544703, + "grad_norm": 0.3378169536590576, + "learning_rate": 2.9820333333333333e-05, + "loss": 0.0226, + "step": 10545 + }, + { + "epoch": 8.304056715242222, + "grad_norm": 0.41930434107780457, + "learning_rate": 2.982e-05, + "loss": 0.0162, + "step": 10546 + }, + { + "epoch": 8.30484442693974, 
+ "grad_norm": 0.1748904436826706, + "learning_rate": 2.981966666666667e-05, + "loss": 0.0154, + "step": 10547 + }, + { + "epoch": 8.30563213863726, + "grad_norm": 0.41560760140419006, + "learning_rate": 2.9819333333333334e-05, + "loss": 0.0215, + "step": 10548 + }, + { + "epoch": 8.306419850334777, + "grad_norm": 0.42327943444252014, + "learning_rate": 2.9819e-05, + "loss": 0.0153, + "step": 10549 + }, + { + "epoch": 8.307207562032296, + "grad_norm": 0.6812291145324707, + "learning_rate": 2.981866666666667e-05, + "loss": 0.0231, + "step": 10550 + }, + { + "epoch": 8.307995273729816, + "grad_norm": 0.3225046694278717, + "learning_rate": 2.9818333333333332e-05, + "loss": 0.0218, + "step": 10551 + }, + { + "epoch": 8.308782985427333, + "grad_norm": 0.22187864780426025, + "learning_rate": 2.9818e-05, + "loss": 0.0084, + "step": 10552 + }, + { + "epoch": 8.309570697124853, + "grad_norm": 0.20963795483112335, + "learning_rate": 2.9817666666666667e-05, + "loss": 0.0119, + "step": 10553 + }, + { + "epoch": 8.31035840882237, + "grad_norm": 0.19805285334587097, + "learning_rate": 2.9817333333333333e-05, + "loss": 0.0104, + "step": 10554 + }, + { + "epoch": 8.31114612051989, + "grad_norm": 0.20342157781124115, + "learning_rate": 2.9817e-05, + "loss": 0.0157, + "step": 10555 + }, + { + "epoch": 8.311933832217408, + "grad_norm": 0.974614679813385, + "learning_rate": 2.981666666666667e-05, + "loss": 0.0377, + "step": 10556 + }, + { + "epoch": 8.312721543914927, + "grad_norm": 0.3031672537326813, + "learning_rate": 2.9816333333333335e-05, + "loss": 0.0167, + "step": 10557 + }, + { + "epoch": 8.313509255612447, + "grad_norm": 0.5182365775108337, + "learning_rate": 2.9816e-05, + "loss": 0.019, + "step": 10558 + }, + { + "epoch": 8.314296967309964, + "grad_norm": 0.3913588225841522, + "learning_rate": 2.981566666666667e-05, + "loss": 0.0232, + "step": 10559 + }, + { + "epoch": 8.315084679007484, + "grad_norm": 0.22272513806819916, + "learning_rate": 2.9815333333333332e-05, + "loss": 0.0128, + "step": 10560 + }, + { + "epoch": 8.315872390705001, + "grad_norm": 1.0373690128326416, + "learning_rate": 2.9815e-05, + "loss": 0.2688, + "step": 10561 + }, + { + "epoch": 8.31666010240252, + "grad_norm": 0.727147102355957, + "learning_rate": 2.9814666666666668e-05, + "loss": 0.2134, + "step": 10562 + }, + { + "epoch": 8.317447814100039, + "grad_norm": 0.5861023664474487, + "learning_rate": 2.9814333333333334e-05, + "loss": 0.1635, + "step": 10563 + }, + { + "epoch": 8.318235525797558, + "grad_norm": 0.5524757504463196, + "learning_rate": 2.9814000000000003e-05, + "loss": 0.1157, + "step": 10564 + }, + { + "epoch": 8.319023237495077, + "grad_norm": 0.5788220763206482, + "learning_rate": 2.981366666666667e-05, + "loss": 0.0943, + "step": 10565 + }, + { + "epoch": 8.319810949192595, + "grad_norm": 0.9078844785690308, + "learning_rate": 2.9813333333333335e-05, + "loss": 0.2149, + "step": 10566 + }, + { + "epoch": 8.320598660890115, + "grad_norm": 0.32108911871910095, + "learning_rate": 2.9813e-05, + "loss": 0.0329, + "step": 10567 + }, + { + "epoch": 8.321386372587632, + "grad_norm": 0.46616628766059875, + "learning_rate": 2.9812666666666667e-05, + "loss": 0.0293, + "step": 10568 + }, + { + "epoch": 8.322174084285152, + "grad_norm": 0.286925345659256, + "learning_rate": 2.9812333333333333e-05, + "loss": 0.0242, + "step": 10569 + }, + { + "epoch": 8.322961795982671, + "grad_norm": 0.14280208945274353, + "learning_rate": 2.9812000000000002e-05, + "loss": 0.0122, + "step": 10570 + }, + { + "epoch": 8.323749507680189, + 
"grad_norm": 0.24818840622901917, + "learning_rate": 2.9811666666666664e-05, + "loss": 0.0193, + "step": 10571 + }, + { + "epoch": 8.324537219377708, + "grad_norm": 0.24827419221401215, + "learning_rate": 2.9811333333333334e-05, + "loss": 0.0369, + "step": 10572 + }, + { + "epoch": 8.325324931075226, + "grad_norm": 0.1983553022146225, + "learning_rate": 2.9811000000000003e-05, + "loss": 0.0175, + "step": 10573 + }, + { + "epoch": 8.326112642772745, + "grad_norm": 0.3498326241970062, + "learning_rate": 2.9810666666666666e-05, + "loss": 0.015, + "step": 10574 + }, + { + "epoch": 8.326900354470263, + "grad_norm": 0.20500679314136505, + "learning_rate": 2.9810333333333335e-05, + "loss": 0.0132, + "step": 10575 + }, + { + "epoch": 8.327688066167783, + "grad_norm": 0.23243360221385956, + "learning_rate": 2.981e-05, + "loss": 0.0205, + "step": 10576 + }, + { + "epoch": 8.328475777865302, + "grad_norm": 0.5560227036476135, + "learning_rate": 2.9809666666666667e-05, + "loss": 0.0239, + "step": 10577 + }, + { + "epoch": 8.32926348956282, + "grad_norm": 0.3468717336654663, + "learning_rate": 2.9809333333333333e-05, + "loss": 0.0227, + "step": 10578 + }, + { + "epoch": 8.33005120126034, + "grad_norm": 0.3288763165473938, + "learning_rate": 2.9809000000000002e-05, + "loss": 0.0179, + "step": 10579 + }, + { + "epoch": 8.330838912957857, + "grad_norm": 0.2990677058696747, + "learning_rate": 2.9808666666666665e-05, + "loss": 0.0142, + "step": 10580 + }, + { + "epoch": 8.331626624655376, + "grad_norm": 0.20004770159721375, + "learning_rate": 2.9808333333333334e-05, + "loss": 0.0125, + "step": 10581 + }, + { + "epoch": 8.332414336352894, + "grad_norm": 0.35595566034317017, + "learning_rate": 2.9808000000000003e-05, + "loss": 0.023, + "step": 10582 + }, + { + "epoch": 8.333202048050413, + "grad_norm": 0.8544878959655762, + "learning_rate": 2.9807666666666666e-05, + "loss": 0.0215, + "step": 10583 + }, + { + "epoch": 8.333989759747933, + "grad_norm": 0.2513749897480011, + "learning_rate": 2.9807333333333335e-05, + "loss": 0.0147, + "step": 10584 + }, + { + "epoch": 8.33477747144545, + "grad_norm": 0.3942696750164032, + "learning_rate": 2.9807e-05, + "loss": 0.0185, + "step": 10585 + }, + { + "epoch": 8.33556518314297, + "grad_norm": 0.21680666506290436, + "learning_rate": 2.9806666666666667e-05, + "loss": 0.0116, + "step": 10586 + }, + { + "epoch": 8.336352894840488, + "grad_norm": 0.3417578339576721, + "learning_rate": 2.9806333333333333e-05, + "loss": 0.0486, + "step": 10587 + }, + { + "epoch": 8.337140606538007, + "grad_norm": 0.48997727036476135, + "learning_rate": 2.9806000000000002e-05, + "loss": 0.011, + "step": 10588 + }, + { + "epoch": 8.337928318235527, + "grad_norm": 0.2760676145553589, + "learning_rate": 2.9805666666666668e-05, + "loss": 0.007, + "step": 10589 + }, + { + "epoch": 8.338716029933044, + "grad_norm": 0.3546636998653412, + "learning_rate": 2.9805333333333334e-05, + "loss": 0.0181, + "step": 10590 + }, + { + "epoch": 8.339503741630564, + "grad_norm": 0.3235403597354889, + "learning_rate": 2.9805000000000003e-05, + "loss": 0.0083, + "step": 10591 + }, + { + "epoch": 8.340291453328081, + "grad_norm": 0.310741126537323, + "learning_rate": 2.9804666666666666e-05, + "loss": 0.014, + "step": 10592 + }, + { + "epoch": 8.3410791650256, + "grad_norm": 0.4861251413822174, + "learning_rate": 2.9804333333333335e-05, + "loss": 0.0139, + "step": 10593 + }, + { + "epoch": 8.341866876723119, + "grad_norm": 0.28897973895072937, + "learning_rate": 2.9804e-05, + "loss": 0.0139, + "step": 10594 + }, + { + 
"epoch": 8.342654588420638, + "grad_norm": 0.5940023064613342, + "learning_rate": 2.9803666666666667e-05, + "loss": 0.0235, + "step": 10595 + }, + { + "epoch": 8.343442300118157, + "grad_norm": 0.2321949601173401, + "learning_rate": 2.9803333333333333e-05, + "loss": 0.01, + "step": 10596 + }, + { + "epoch": 8.344230011815675, + "grad_norm": 0.6942517757415771, + "learning_rate": 2.9803e-05, + "loss": 0.0256, + "step": 10597 + }, + { + "epoch": 8.345017723513195, + "grad_norm": 0.2867853343486786, + "learning_rate": 2.980266666666667e-05, + "loss": 0.0181, + "step": 10598 + }, + { + "epoch": 8.345805435210712, + "grad_norm": 0.46663036942481995, + "learning_rate": 2.9802333333333334e-05, + "loss": 0.0168, + "step": 10599 + }, + { + "epoch": 8.346593146908232, + "grad_norm": 0.42042839527130127, + "learning_rate": 2.9802e-05, + "loss": 0.0151, + "step": 10600 + }, + { + "epoch": 8.34738085860575, + "grad_norm": 0.7580640912055969, + "learning_rate": 2.9801666666666666e-05, + "loss": 0.0174, + "step": 10601 + }, + { + "epoch": 8.348168570303269, + "grad_norm": 0.4539075493812561, + "learning_rate": 2.9801333333333336e-05, + "loss": 0.0174, + "step": 10602 + }, + { + "epoch": 8.348956282000788, + "grad_norm": 0.27859359979629517, + "learning_rate": 2.9800999999999998e-05, + "loss": 0.0147, + "step": 10603 + }, + { + "epoch": 8.349743993698306, + "grad_norm": 0.43631452322006226, + "learning_rate": 2.9800666666666667e-05, + "loss": 0.0365, + "step": 10604 + }, + { + "epoch": 8.350531705395825, + "grad_norm": 0.38825446367263794, + "learning_rate": 2.9800333333333333e-05, + "loss": 0.019, + "step": 10605 + }, + { + "epoch": 8.351319417093343, + "grad_norm": 0.38263553380966187, + "learning_rate": 2.98e-05, + "loss": 0.0197, + "step": 10606 + }, + { + "epoch": 8.352107128790863, + "grad_norm": 0.4277424216270447, + "learning_rate": 2.979966666666667e-05, + "loss": 0.0204, + "step": 10607 + }, + { + "epoch": 8.352894840488382, + "grad_norm": 0.49209064245224, + "learning_rate": 2.9799333333333335e-05, + "loss": 0.0162, + "step": 10608 + }, + { + "epoch": 8.3536825521859, + "grad_norm": 0.35255613923072815, + "learning_rate": 2.9799e-05, + "loss": 0.0221, + "step": 10609 + }, + { + "epoch": 8.35447026388342, + "grad_norm": 0.3474361300468445, + "learning_rate": 2.9798666666666666e-05, + "loss": 0.0209, + "step": 10610 + }, + { + "epoch": 8.355257975580937, + "grad_norm": 0.9868287444114685, + "learning_rate": 2.9798333333333336e-05, + "loss": 0.2817, + "step": 10611 + }, + { + "epoch": 8.356045687278456, + "grad_norm": 0.6316584944725037, + "learning_rate": 2.9797999999999998e-05, + "loss": 0.1746, + "step": 10612 + }, + { + "epoch": 8.356833398975974, + "grad_norm": 0.7431633472442627, + "learning_rate": 2.9797666666666668e-05, + "loss": 0.1559, + "step": 10613 + }, + { + "epoch": 8.357621110673493, + "grad_norm": 0.44950228929519653, + "learning_rate": 2.9797333333333337e-05, + "loss": 0.0966, + "step": 10614 + }, + { + "epoch": 8.358408822371013, + "grad_norm": 0.9590532779693604, + "learning_rate": 2.9797e-05, + "loss": 0.0632, + "step": 10615 + }, + { + "epoch": 8.35919653406853, + "grad_norm": 0.39598792791366577, + "learning_rate": 2.979666666666667e-05, + "loss": 0.0607, + "step": 10616 + }, + { + "epoch": 8.35998424576605, + "grad_norm": 0.31250259280204773, + "learning_rate": 2.9796333333333335e-05, + "loss": 0.03, + "step": 10617 + }, + { + "epoch": 8.360771957463568, + "grad_norm": 0.2013184130191803, + "learning_rate": 2.9796e-05, + "loss": 0.0197, + "step": 10618 + }, + { + "epoch": 
8.361559669161087, + "grad_norm": 0.2115132063627243, + "learning_rate": 2.9795666666666667e-05, + "loss": 0.0132, + "step": 10619 + }, + { + "epoch": 8.362347380858605, + "grad_norm": 0.47750285267829895, + "learning_rate": 2.9795333333333336e-05, + "loss": 0.0305, + "step": 10620 + }, + { + "epoch": 8.363135092556124, + "grad_norm": 0.23422500491142273, + "learning_rate": 2.9795e-05, + "loss": 0.025, + "step": 10621 + }, + { + "epoch": 8.363922804253644, + "grad_norm": 0.3236654996871948, + "learning_rate": 2.9794666666666668e-05, + "loss": 0.0126, + "step": 10622 + }, + { + "epoch": 8.364710515951161, + "grad_norm": 0.6852855086326599, + "learning_rate": 2.9794333333333337e-05, + "loss": 0.0888, + "step": 10623 + }, + { + "epoch": 8.365498227648681, + "grad_norm": 0.32013949751853943, + "learning_rate": 2.9794e-05, + "loss": 0.0202, + "step": 10624 + }, + { + "epoch": 8.366285939346199, + "grad_norm": 0.3068051338195801, + "learning_rate": 2.979366666666667e-05, + "loss": 0.018, + "step": 10625 + }, + { + "epoch": 8.367073651043718, + "grad_norm": 0.26991209387779236, + "learning_rate": 2.979333333333333e-05, + "loss": 0.0157, + "step": 10626 + }, + { + "epoch": 8.367861362741237, + "grad_norm": 0.238408163189888, + "learning_rate": 2.9793e-05, + "loss": 0.0132, + "step": 10627 + }, + { + "epoch": 8.368649074438755, + "grad_norm": 0.4542933702468872, + "learning_rate": 2.9792666666666667e-05, + "loss": 0.0163, + "step": 10628 + }, + { + "epoch": 8.369436786136275, + "grad_norm": 0.5173882842063904, + "learning_rate": 2.9792333333333333e-05, + "loss": 0.0211, + "step": 10629 + }, + { + "epoch": 8.370224497833792, + "grad_norm": 0.2727032005786896, + "learning_rate": 2.9792e-05, + "loss": 0.0151, + "step": 10630 + }, + { + "epoch": 8.371012209531312, + "grad_norm": 0.24207040667533875, + "learning_rate": 2.9791666666666668e-05, + "loss": 0.0163, + "step": 10631 + }, + { + "epoch": 8.37179992122883, + "grad_norm": 0.3237028121948242, + "learning_rate": 2.9791333333333334e-05, + "loss": 0.0208, + "step": 10632 + }, + { + "epoch": 8.372587632926349, + "grad_norm": 0.30184149742126465, + "learning_rate": 2.9791e-05, + "loss": 0.0204, + "step": 10633 + }, + { + "epoch": 8.373375344623868, + "grad_norm": 0.18697760999202728, + "learning_rate": 2.979066666666667e-05, + "loss": 0.0135, + "step": 10634 + }, + { + "epoch": 8.374163056321386, + "grad_norm": 0.1683509200811386, + "learning_rate": 2.9790333333333332e-05, + "loss": 0.0102, + "step": 10635 + }, + { + "epoch": 8.374950768018905, + "grad_norm": 0.49531859159469604, + "learning_rate": 2.979e-05, + "loss": 0.0143, + "step": 10636 + }, + { + "epoch": 8.375738479716423, + "grad_norm": 0.36861473321914673, + "learning_rate": 2.9789666666666667e-05, + "loss": 0.0163, + "step": 10637 + }, + { + "epoch": 8.376526191413943, + "grad_norm": 0.3476718068122864, + "learning_rate": 2.9789333333333333e-05, + "loss": 0.0154, + "step": 10638 + }, + { + "epoch": 8.37731390311146, + "grad_norm": 0.3724827766418457, + "learning_rate": 2.9789000000000002e-05, + "loss": 0.0201, + "step": 10639 + }, + { + "epoch": 8.37810161480898, + "grad_norm": 0.3546150326728821, + "learning_rate": 2.9788666666666668e-05, + "loss": 0.0205, + "step": 10640 + }, + { + "epoch": 8.3788893265065, + "grad_norm": 0.5168182849884033, + "learning_rate": 2.9788333333333334e-05, + "loss": 0.0176, + "step": 10641 + }, + { + "epoch": 8.379677038204017, + "grad_norm": 0.4366934895515442, + "learning_rate": 2.9788e-05, + "loss": 0.0174, + "step": 10642 + }, + { + "epoch": 
8.380464749901536, + "grad_norm": 0.09629233926534653, + "learning_rate": 2.978766666666667e-05, + "loss": 0.0036, + "step": 10643 + }, + { + "epoch": 8.381252461599054, + "grad_norm": 0.19844572246074677, + "learning_rate": 2.9787333333333332e-05, + "loss": 0.0157, + "step": 10644 + }, + { + "epoch": 8.382040173296573, + "grad_norm": 0.33960357308387756, + "learning_rate": 2.9787e-05, + "loss": 0.0178, + "step": 10645 + }, + { + "epoch": 8.382827884994093, + "grad_norm": 0.4090791642665863, + "learning_rate": 2.9786666666666667e-05, + "loss": 0.0155, + "step": 10646 + }, + { + "epoch": 8.38361559669161, + "grad_norm": 0.3443642258644104, + "learning_rate": 2.9786333333333333e-05, + "loss": 0.0103, + "step": 10647 + }, + { + "epoch": 8.38440330838913, + "grad_norm": 0.3113538324832916, + "learning_rate": 2.9786000000000002e-05, + "loss": 0.0202, + "step": 10648 + }, + { + "epoch": 8.385191020086648, + "grad_norm": 0.37388357520103455, + "learning_rate": 2.978566666666667e-05, + "loss": 0.0129, + "step": 10649 + }, + { + "epoch": 8.385978731784167, + "grad_norm": 0.28155896067619324, + "learning_rate": 2.9785333333333334e-05, + "loss": 0.013, + "step": 10650 + }, + { + "epoch": 8.386766443481685, + "grad_norm": 0.4453347325325012, + "learning_rate": 2.9785e-05, + "loss": 0.022, + "step": 10651 + }, + { + "epoch": 8.387554155179204, + "grad_norm": 0.3741285502910614, + "learning_rate": 2.978466666666667e-05, + "loss": 0.0208, + "step": 10652 + }, + { + "epoch": 8.388341866876724, + "grad_norm": 0.8414475917816162, + "learning_rate": 2.9784333333333332e-05, + "loss": 0.0166, + "step": 10653 + }, + { + "epoch": 8.389129578574241, + "grad_norm": 0.35315701365470886, + "learning_rate": 2.9784e-05, + "loss": 0.021, + "step": 10654 + }, + { + "epoch": 8.389917290271761, + "grad_norm": 0.627868115901947, + "learning_rate": 2.9783666666666667e-05, + "loss": 0.0225, + "step": 10655 + }, + { + "epoch": 8.390705001969279, + "grad_norm": 0.7735432982444763, + "learning_rate": 2.9783333333333333e-05, + "loss": 0.0268, + "step": 10656 + }, + { + "epoch": 8.391492713666798, + "grad_norm": 0.24571210145950317, + "learning_rate": 2.9783000000000003e-05, + "loss": 0.0148, + "step": 10657 + }, + { + "epoch": 8.392280425364318, + "grad_norm": 0.5012595653533936, + "learning_rate": 2.9782666666666665e-05, + "loss": 0.0221, + "step": 10658 + }, + { + "epoch": 8.393068137061835, + "grad_norm": 0.3419361114501953, + "learning_rate": 2.9782333333333335e-05, + "loss": 0.0115, + "step": 10659 + }, + { + "epoch": 8.393855848759355, + "grad_norm": 0.6576665043830872, + "learning_rate": 2.9782e-05, + "loss": 0.0217, + "step": 10660 + }, + { + "epoch": 8.394643560456872, + "grad_norm": 0.7183488011360168, + "learning_rate": 2.9781666666666666e-05, + "loss": 0.1718, + "step": 10661 + }, + { + "epoch": 8.395431272154392, + "grad_norm": 0.5902436971664429, + "learning_rate": 2.9781333333333332e-05, + "loss": 0.1315, + "step": 10662 + }, + { + "epoch": 8.39621898385191, + "grad_norm": 0.6582280993461609, + "learning_rate": 2.9781e-05, + "loss": 0.1241, + "step": 10663 + }, + { + "epoch": 8.397006695549429, + "grad_norm": 0.5443679094314575, + "learning_rate": 2.9780666666666668e-05, + "loss": 0.0848, + "step": 10664 + }, + { + "epoch": 8.397794407246948, + "grad_norm": 0.5800449848175049, + "learning_rate": 2.9780333333333334e-05, + "loss": 0.1026, + "step": 10665 + }, + { + "epoch": 8.398582118944466, + "grad_norm": 1.30591881275177, + "learning_rate": 2.9780000000000003e-05, + "loss": 0.1015, + "step": 10666 + }, + { + 
"epoch": 8.399369830641986, + "grad_norm": 0.3059385120868683, + "learning_rate": 2.9779666666666665e-05, + "loss": 0.0253, + "step": 10667 + }, + { + "epoch": 8.400157542339503, + "grad_norm": 0.3137441873550415, + "learning_rate": 2.9779333333333335e-05, + "loss": 0.0257, + "step": 10668 + }, + { + "epoch": 8.400945254037023, + "grad_norm": 0.6368328928947449, + "learning_rate": 2.9779e-05, + "loss": 0.0542, + "step": 10669 + }, + { + "epoch": 8.40173296573454, + "grad_norm": 0.6652997136116028, + "learning_rate": 2.9778666666666667e-05, + "loss": 0.0244, + "step": 10670 + }, + { + "epoch": 8.40252067743206, + "grad_norm": 0.31018465757369995, + "learning_rate": 2.9778333333333333e-05, + "loss": 0.0223, + "step": 10671 + }, + { + "epoch": 8.40330838912958, + "grad_norm": 0.2820208668708801, + "learning_rate": 2.9778000000000002e-05, + "loss": 0.0123, + "step": 10672 + }, + { + "epoch": 8.404096100827097, + "grad_norm": 0.4227636754512787, + "learning_rate": 2.9777666666666668e-05, + "loss": 0.0241, + "step": 10673 + }, + { + "epoch": 8.404883812524616, + "grad_norm": 0.22821921110153198, + "learning_rate": 2.9777333333333334e-05, + "loss": 0.0091, + "step": 10674 + }, + { + "epoch": 8.405671524222134, + "grad_norm": 0.24046176671981812, + "learning_rate": 2.9777000000000003e-05, + "loss": 0.0172, + "step": 10675 + }, + { + "epoch": 8.406459235919653, + "grad_norm": 0.2819768488407135, + "learning_rate": 2.9776666666666666e-05, + "loss": 0.0143, + "step": 10676 + }, + { + "epoch": 8.407246947617173, + "grad_norm": 0.2540547549724579, + "learning_rate": 2.9776333333333335e-05, + "loss": 0.0112, + "step": 10677 + }, + { + "epoch": 8.40803465931469, + "grad_norm": 0.335245817899704, + "learning_rate": 2.9776e-05, + "loss": 0.0134, + "step": 10678 + }, + { + "epoch": 8.40882237101221, + "grad_norm": 0.2404085248708725, + "learning_rate": 2.9775666666666667e-05, + "loss": 0.0215, + "step": 10679 + }, + { + "epoch": 8.409610082709728, + "grad_norm": 0.1664569079875946, + "learning_rate": 2.9775333333333333e-05, + "loss": 0.0103, + "step": 10680 + }, + { + "epoch": 8.410397794407247, + "grad_norm": 0.5102702975273132, + "learning_rate": 2.9775000000000002e-05, + "loss": 0.0169, + "step": 10681 + }, + { + "epoch": 8.411185506104765, + "grad_norm": 0.6758787035942078, + "learning_rate": 2.9774666666666668e-05, + "loss": 0.018, + "step": 10682 + }, + { + "epoch": 8.411973217802284, + "grad_norm": 0.2670678496360779, + "learning_rate": 2.9774333333333334e-05, + "loss": 0.0248, + "step": 10683 + }, + { + "epoch": 8.412760929499804, + "grad_norm": 0.5011562705039978, + "learning_rate": 2.9774000000000003e-05, + "loss": 0.0271, + "step": 10684 + }, + { + "epoch": 8.413548641197321, + "grad_norm": 0.26888927817344666, + "learning_rate": 2.9773666666666666e-05, + "loss": 0.0184, + "step": 10685 + }, + { + "epoch": 8.414336352894841, + "grad_norm": 0.3800218105316162, + "learning_rate": 2.9773333333333335e-05, + "loss": 0.0168, + "step": 10686 + }, + { + "epoch": 8.415124064592359, + "grad_norm": 0.41918957233428955, + "learning_rate": 2.9772999999999998e-05, + "loss": 0.0175, + "step": 10687 + }, + { + "epoch": 8.415911776289878, + "grad_norm": 0.3548583388328552, + "learning_rate": 2.9772666666666667e-05, + "loss": 0.0166, + "step": 10688 + }, + { + "epoch": 8.416699487987396, + "grad_norm": 0.5502747893333435, + "learning_rate": 2.9772333333333336e-05, + "loss": 0.0111, + "step": 10689 + }, + { + "epoch": 8.417487199684915, + "grad_norm": 0.3748393952846527, + "learning_rate": 2.9772e-05, + "loss": 
0.013, + "step": 10690 + }, + { + "epoch": 8.418274911382435, + "grad_norm": 0.3247656226158142, + "learning_rate": 2.9771666666666668e-05, + "loss": 0.0195, + "step": 10691 + }, + { + "epoch": 8.419062623079952, + "grad_norm": 0.4449993371963501, + "learning_rate": 2.9771333333333334e-05, + "loss": 0.029, + "step": 10692 + }, + { + "epoch": 8.419850334777472, + "grad_norm": 0.3570934236049652, + "learning_rate": 2.9771e-05, + "loss": 0.0176, + "step": 10693 + }, + { + "epoch": 8.42063804647499, + "grad_norm": 0.5927059650421143, + "learning_rate": 2.9770666666666666e-05, + "loss": 0.0204, + "step": 10694 + }, + { + "epoch": 8.421425758172509, + "grad_norm": 0.25410208106040955, + "learning_rate": 2.9770333333333335e-05, + "loss": 0.0118, + "step": 10695 + }, + { + "epoch": 8.422213469870028, + "grad_norm": 1.0569348335266113, + "learning_rate": 2.9769999999999998e-05, + "loss": 0.0284, + "step": 10696 + }, + { + "epoch": 8.423001181567546, + "grad_norm": 0.9747653007507324, + "learning_rate": 2.9769666666666667e-05, + "loss": 0.019, + "step": 10697 + }, + { + "epoch": 8.423788893265066, + "grad_norm": 0.26572319865226746, + "learning_rate": 2.9769333333333337e-05, + "loss": 0.018, + "step": 10698 + }, + { + "epoch": 8.424576604962583, + "grad_norm": 0.26529085636138916, + "learning_rate": 2.9769e-05, + "loss": 0.012, + "step": 10699 + }, + { + "epoch": 8.425364316660103, + "grad_norm": 0.30606457591056824, + "learning_rate": 2.976866666666667e-05, + "loss": 0.0167, + "step": 10700 + }, + { + "epoch": 8.42615202835762, + "grad_norm": 0.3711000084877014, + "learning_rate": 2.9768333333333334e-05, + "loss": 0.0181, + "step": 10701 + }, + { + "epoch": 8.42693974005514, + "grad_norm": 0.40109214186668396, + "learning_rate": 2.9768e-05, + "loss": 0.0193, + "step": 10702 + }, + { + "epoch": 8.42772745175266, + "grad_norm": 0.259465754032135, + "learning_rate": 2.9767666666666666e-05, + "loss": 0.017, + "step": 10703 + }, + { + "epoch": 8.428515163450177, + "grad_norm": 0.5159271955490112, + "learning_rate": 2.9767333333333336e-05, + "loss": 0.0245, + "step": 10704 + }, + { + "epoch": 8.429302875147696, + "grad_norm": 0.3316991329193115, + "learning_rate": 2.9766999999999998e-05, + "loss": 0.0139, + "step": 10705 + }, + { + "epoch": 8.430090586845214, + "grad_norm": 0.43960946798324585, + "learning_rate": 2.9766666666666667e-05, + "loss": 0.0198, + "step": 10706 + }, + { + "epoch": 8.430878298542734, + "grad_norm": 0.5624348521232605, + "learning_rate": 2.9766333333333337e-05, + "loss": 0.0271, + "step": 10707 + }, + { + "epoch": 8.431666010240253, + "grad_norm": 1.2365645170211792, + "learning_rate": 2.9766e-05, + "loss": 0.0239, + "step": 10708 + }, + { + "epoch": 8.43245372193777, + "grad_norm": 0.33849605917930603, + "learning_rate": 2.976566666666667e-05, + "loss": 0.0158, + "step": 10709 + }, + { + "epoch": 8.43324143363529, + "grad_norm": 0.892874538898468, + "learning_rate": 2.9765333333333335e-05, + "loss": 0.0349, + "step": 10710 + }, + { + "epoch": 8.434029145332808, + "grad_norm": 0.7242209911346436, + "learning_rate": 2.9765e-05, + "loss": 0.2144, + "step": 10711 + }, + { + "epoch": 8.434816857030327, + "grad_norm": 0.7721160650253296, + "learning_rate": 2.9764666666666666e-05, + "loss": 0.1991, + "step": 10712 + }, + { + "epoch": 8.435604568727845, + "grad_norm": 0.7433951497077942, + "learning_rate": 2.9764333333333336e-05, + "loss": 0.1377, + "step": 10713 + }, + { + "epoch": 8.436392280425364, + "grad_norm": 0.6859279870986938, + "learning_rate": 2.9764e-05, + "loss": 0.1349, + 
"step": 10714 + }, + { + "epoch": 8.437179992122884, + "grad_norm": 0.5546268820762634, + "learning_rate": 2.9763666666666668e-05, + "loss": 0.0698, + "step": 10715 + }, + { + "epoch": 8.437967703820402, + "grad_norm": 0.41933006048202515, + "learning_rate": 2.9763333333333337e-05, + "loss": 0.0358, + "step": 10716 + }, + { + "epoch": 8.438755415517921, + "grad_norm": 0.48514583706855774, + "learning_rate": 2.9763e-05, + "loss": 0.0775, + "step": 10717 + }, + { + "epoch": 8.439543127215439, + "grad_norm": 0.46959438920021057, + "learning_rate": 2.976266666666667e-05, + "loss": 0.0369, + "step": 10718 + }, + { + "epoch": 8.440330838912958, + "grad_norm": 0.6037197709083557, + "learning_rate": 2.976233333333333e-05, + "loss": 0.0205, + "step": 10719 + }, + { + "epoch": 8.441118550610476, + "grad_norm": 0.6921390891075134, + "learning_rate": 2.9762e-05, + "loss": 0.0255, + "step": 10720 + }, + { + "epoch": 8.441906262307995, + "grad_norm": 0.21431978046894073, + "learning_rate": 2.9761666666666667e-05, + "loss": 0.0113, + "step": 10721 + }, + { + "epoch": 8.442693974005515, + "grad_norm": 0.39436236023902893, + "learning_rate": 2.9761333333333333e-05, + "loss": 0.021, + "step": 10722 + }, + { + "epoch": 8.443481685703032, + "grad_norm": 0.48595666885375977, + "learning_rate": 2.9761000000000002e-05, + "loss": 0.0184, + "step": 10723 + }, + { + "epoch": 8.444269397400552, + "grad_norm": 0.19178670644760132, + "learning_rate": 2.9760666666666668e-05, + "loss": 0.0097, + "step": 10724 + }, + { + "epoch": 8.44505710909807, + "grad_norm": 0.2658499479293823, + "learning_rate": 2.9760333333333334e-05, + "loss": 0.0089, + "step": 10725 + }, + { + "epoch": 8.445844820795589, + "grad_norm": 0.5021863579750061, + "learning_rate": 2.976e-05, + "loss": 0.0215, + "step": 10726 + }, + { + "epoch": 8.446632532493108, + "grad_norm": 0.3821256756782532, + "learning_rate": 2.975966666666667e-05, + "loss": 0.0201, + "step": 10727 + }, + { + "epoch": 8.447420244190626, + "grad_norm": 0.37169328331947327, + "learning_rate": 2.975933333333333e-05, + "loss": 0.0263, + "step": 10728 + }, + { + "epoch": 8.448207955888146, + "grad_norm": 0.49452975392341614, + "learning_rate": 2.9759e-05, + "loss": 0.0166, + "step": 10729 + }, + { + "epoch": 8.448995667585663, + "grad_norm": 0.32917508482933044, + "learning_rate": 2.9758666666666667e-05, + "loss": 0.0111, + "step": 10730 + }, + { + "epoch": 8.449783379283183, + "grad_norm": 0.3233838975429535, + "learning_rate": 2.9758333333333333e-05, + "loss": 0.012, + "step": 10731 + }, + { + "epoch": 8.4505710909807, + "grad_norm": 0.28679049015045166, + "learning_rate": 2.9758000000000002e-05, + "loss": 0.0176, + "step": 10732 + }, + { + "epoch": 8.45135880267822, + "grad_norm": 0.3967803716659546, + "learning_rate": 2.9757666666666668e-05, + "loss": 0.0205, + "step": 10733 + }, + { + "epoch": 8.45214651437574, + "grad_norm": 0.2598329186439514, + "learning_rate": 2.9757333333333334e-05, + "loss": 0.0127, + "step": 10734 + }, + { + "epoch": 8.452934226073257, + "grad_norm": 0.31961584091186523, + "learning_rate": 2.9757e-05, + "loss": 0.0222, + "step": 10735 + }, + { + "epoch": 8.453721937770776, + "grad_norm": 0.2127685397863388, + "learning_rate": 2.975666666666667e-05, + "loss": 0.0121, + "step": 10736 + }, + { + "epoch": 8.454509649468294, + "grad_norm": 0.22461813688278198, + "learning_rate": 2.9756333333333332e-05, + "loss": 0.0115, + "step": 10737 + }, + { + "epoch": 8.455297361165814, + "grad_norm": 0.2519998550415039, + "learning_rate": 2.9756e-05, + "loss": 0.0212, + 
"step": 10738 + }, + { + "epoch": 8.456085072863331, + "grad_norm": 0.3581370413303375, + "learning_rate": 2.975566666666667e-05, + "loss": 0.0247, + "step": 10739 + }, + { + "epoch": 8.45687278456085, + "grad_norm": 0.4366936683654785, + "learning_rate": 2.9755333333333333e-05, + "loss": 0.015, + "step": 10740 + }, + { + "epoch": 8.45766049625837, + "grad_norm": 0.3561399579048157, + "learning_rate": 2.9755000000000002e-05, + "loss": 0.0194, + "step": 10741 + }, + { + "epoch": 8.458448207955888, + "grad_norm": 0.2691282331943512, + "learning_rate": 2.9754666666666668e-05, + "loss": 0.0136, + "step": 10742 + }, + { + "epoch": 8.459235919653407, + "grad_norm": 0.1723324954509735, + "learning_rate": 2.9754333333333334e-05, + "loss": 0.0139, + "step": 10743 + }, + { + "epoch": 8.460023631350925, + "grad_norm": 0.3245251774787903, + "learning_rate": 2.9754e-05, + "loss": 0.031, + "step": 10744 + }, + { + "epoch": 8.460811343048444, + "grad_norm": 0.24310733377933502, + "learning_rate": 2.975366666666667e-05, + "loss": 0.0158, + "step": 10745 + }, + { + "epoch": 8.461599054745964, + "grad_norm": 0.6500071287155151, + "learning_rate": 2.9753333333333332e-05, + "loss": 0.0138, + "step": 10746 + }, + { + "epoch": 8.462386766443482, + "grad_norm": 0.2057361602783203, + "learning_rate": 2.9753e-05, + "loss": 0.0124, + "step": 10747 + }, + { + "epoch": 8.463174478141001, + "grad_norm": 0.5370217561721802, + "learning_rate": 2.9752666666666667e-05, + "loss": 0.0187, + "step": 10748 + }, + { + "epoch": 8.463962189838519, + "grad_norm": 0.36771276593208313, + "learning_rate": 2.9752333333333333e-05, + "loss": 0.0244, + "step": 10749 + }, + { + "epoch": 8.464749901536038, + "grad_norm": 0.25882580876350403, + "learning_rate": 2.9752000000000002e-05, + "loss": 0.0177, + "step": 10750 + }, + { + "epoch": 8.465537613233556, + "grad_norm": 0.36469897627830505, + "learning_rate": 2.9751666666666665e-05, + "loss": 0.0193, + "step": 10751 + }, + { + "epoch": 8.466325324931075, + "grad_norm": 0.27566099166870117, + "learning_rate": 2.9751333333333334e-05, + "loss": 0.0149, + "step": 10752 + }, + { + "epoch": 8.467113036628595, + "grad_norm": 0.38629359006881714, + "learning_rate": 2.9751e-05, + "loss": 0.0142, + "step": 10753 + }, + { + "epoch": 8.467900748326112, + "grad_norm": 0.3919450342655182, + "learning_rate": 2.9750666666666666e-05, + "loss": 0.0217, + "step": 10754 + }, + { + "epoch": 8.468688460023632, + "grad_norm": 0.3476554751396179, + "learning_rate": 2.9750333333333332e-05, + "loss": 0.0218, + "step": 10755 + }, + { + "epoch": 8.46947617172115, + "grad_norm": 0.31900525093078613, + "learning_rate": 2.975e-05, + "loss": 0.0172, + "step": 10756 + }, + { + "epoch": 8.470263883418669, + "grad_norm": 0.8937358856201172, + "learning_rate": 2.9749666666666667e-05, + "loss": 0.0322, + "step": 10757 + }, + { + "epoch": 8.471051595116187, + "grad_norm": 0.3935023546218872, + "learning_rate": 2.9749333333333333e-05, + "loss": 0.0203, + "step": 10758 + }, + { + "epoch": 8.471839306813706, + "grad_norm": 0.3042829632759094, + "learning_rate": 2.9749000000000003e-05, + "loss": 0.0175, + "step": 10759 + }, + { + "epoch": 8.472627018511226, + "grad_norm": 0.5959113836288452, + "learning_rate": 2.9748666666666665e-05, + "loss": 0.021, + "step": 10760 + }, + { + "epoch": 8.473414730208743, + "grad_norm": 1.0057592391967773, + "learning_rate": 2.9748333333333335e-05, + "loss": 0.2121, + "step": 10761 + }, + { + "epoch": 8.474202441906263, + "grad_norm": 0.9036368727684021, + "learning_rate": 2.9748e-05, + "loss": 
0.1736, + "step": 10762 + }, + { + "epoch": 8.47499015360378, + "grad_norm": 0.6830325126647949, + "learning_rate": 2.9747666666666666e-05, + "loss": 0.1744, + "step": 10763 + }, + { + "epoch": 8.4757778653013, + "grad_norm": 0.9407737851142883, + "learning_rate": 2.9747333333333336e-05, + "loss": 0.1759, + "step": 10764 + }, + { + "epoch": 8.47656557699882, + "grad_norm": 0.47678670287132263, + "learning_rate": 2.9747e-05, + "loss": 0.0612, + "step": 10765 + }, + { + "epoch": 8.477353288696337, + "grad_norm": 0.8569664359092712, + "learning_rate": 2.9746666666666668e-05, + "loss": 0.0926, + "step": 10766 + }, + { + "epoch": 8.478141000393856, + "grad_norm": 0.40510091185569763, + "learning_rate": 2.9746333333333334e-05, + "loss": 0.03, + "step": 10767 + }, + { + "epoch": 8.478928712091374, + "grad_norm": 0.35329657793045044, + "learning_rate": 2.9746000000000003e-05, + "loss": 0.0483, + "step": 10768 + }, + { + "epoch": 8.479716423788894, + "grad_norm": 0.5157356262207031, + "learning_rate": 2.9745666666666665e-05, + "loss": 0.0305, + "step": 10769 + }, + { + "epoch": 8.480504135486411, + "grad_norm": 0.3476882576942444, + "learning_rate": 2.9745333333333335e-05, + "loss": 0.0147, + "step": 10770 + }, + { + "epoch": 8.48129184718393, + "grad_norm": 0.40923628211021423, + "learning_rate": 2.9745e-05, + "loss": 0.0301, + "step": 10771 + }, + { + "epoch": 8.48207955888145, + "grad_norm": 0.24437032639980316, + "learning_rate": 2.9744666666666667e-05, + "loss": 0.0205, + "step": 10772 + }, + { + "epoch": 8.482867270578968, + "grad_norm": 0.396363765001297, + "learning_rate": 2.9744333333333336e-05, + "loss": 0.0196, + "step": 10773 + }, + { + "epoch": 8.483654982276487, + "grad_norm": 0.19362348318099976, + "learning_rate": 2.9744000000000002e-05, + "loss": 0.0131, + "step": 10774 + }, + { + "epoch": 8.484442693974005, + "grad_norm": 0.2352532595396042, + "learning_rate": 2.9743666666666668e-05, + "loss": 0.0166, + "step": 10775 + }, + { + "epoch": 8.485230405671524, + "grad_norm": 0.298120379447937, + "learning_rate": 2.9743333333333334e-05, + "loss": 0.0263, + "step": 10776 + }, + { + "epoch": 8.486018117369042, + "grad_norm": 0.3646581768989563, + "learning_rate": 2.9743000000000003e-05, + "loss": 0.0237, + "step": 10777 + }, + { + "epoch": 8.486805829066562, + "grad_norm": 0.17937442660331726, + "learning_rate": 2.9742666666666666e-05, + "loss": 0.0142, + "step": 10778 + }, + { + "epoch": 8.487593540764081, + "grad_norm": 0.18940822780132294, + "learning_rate": 2.9742333333333335e-05, + "loss": 0.0103, + "step": 10779 + }, + { + "epoch": 8.488381252461599, + "grad_norm": 0.22696395218372345, + "learning_rate": 2.9742e-05, + "loss": 0.0138, + "step": 10780 + }, + { + "epoch": 8.489168964159118, + "grad_norm": 0.4230320453643799, + "learning_rate": 2.9741666666666667e-05, + "loss": 0.0151, + "step": 10781 + }, + { + "epoch": 8.489956675856636, + "grad_norm": 0.20646406710147858, + "learning_rate": 2.9741333333333336e-05, + "loss": 0.0098, + "step": 10782 + }, + { + "epoch": 8.490744387554155, + "grad_norm": 0.3556465208530426, + "learning_rate": 2.9741e-05, + "loss": 0.014, + "step": 10783 + }, + { + "epoch": 8.491532099251675, + "grad_norm": 0.3138037919998169, + "learning_rate": 2.9740666666666668e-05, + "loss": 0.0141, + "step": 10784 + }, + { + "epoch": 8.492319810949192, + "grad_norm": 0.36824244260787964, + "learning_rate": 2.9740333333333334e-05, + "loss": 0.0188, + "step": 10785 + }, + { + "epoch": 8.493107522646712, + "grad_norm": 0.4065912067890167, + "learning_rate": 2.974e-05, + 
"loss": 0.0201, + "step": 10786 + }, + { + "epoch": 8.49389523434423, + "grad_norm": 0.28938326239585876, + "learning_rate": 2.9739666666666666e-05, + "loss": 0.0185, + "step": 10787 + }, + { + "epoch": 8.494682946041749, + "grad_norm": 0.4606494605541229, + "learning_rate": 2.9739333333333335e-05, + "loss": 0.0163, + "step": 10788 + }, + { + "epoch": 8.495470657739267, + "grad_norm": 0.4696942865848541, + "learning_rate": 2.9739e-05, + "loss": 0.0154, + "step": 10789 + }, + { + "epoch": 8.496258369436786, + "grad_norm": 0.43996551632881165, + "learning_rate": 2.9738666666666667e-05, + "loss": 0.02, + "step": 10790 + }, + { + "epoch": 8.497046081134306, + "grad_norm": 0.3635314702987671, + "learning_rate": 2.9738333333333336e-05, + "loss": 0.0152, + "step": 10791 + }, + { + "epoch": 8.497833792831823, + "grad_norm": 0.2934000790119171, + "learning_rate": 2.9738e-05, + "loss": 0.0159, + "step": 10792 + }, + { + "epoch": 8.498621504529343, + "grad_norm": 0.474261075258255, + "learning_rate": 2.9737666666666668e-05, + "loss": 0.0154, + "step": 10793 + }, + { + "epoch": 8.49940921622686, + "grad_norm": 0.3099822700023651, + "learning_rate": 2.9737333333333334e-05, + "loss": 0.02, + "step": 10794 + }, + { + "epoch": 8.50019692792438, + "grad_norm": 0.3099801242351532, + "learning_rate": 2.9737e-05, + "loss": 0.0232, + "step": 10795 + }, + { + "epoch": 8.500984639621898, + "grad_norm": 0.2891607880592346, + "learning_rate": 2.9736666666666666e-05, + "loss": 0.0128, + "step": 10796 + }, + { + "epoch": 8.501772351319417, + "grad_norm": 0.19075177609920502, + "learning_rate": 2.9736333333333335e-05, + "loss": 0.0104, + "step": 10797 + }, + { + "epoch": 8.502560063016936, + "grad_norm": 0.4061930179595947, + "learning_rate": 2.9736e-05, + "loss": 0.0186, + "step": 10798 + }, + { + "epoch": 8.503347774714454, + "grad_norm": 0.49073484539985657, + "learning_rate": 2.9735666666666667e-05, + "loss": 0.0276, + "step": 10799 + }, + { + "epoch": 8.504135486411974, + "grad_norm": 0.37687918543815613, + "learning_rate": 2.9735333333333337e-05, + "loss": 0.0163, + "step": 10800 + }, + { + "epoch": 8.504923198109491, + "grad_norm": 0.22416245937347412, + "learning_rate": 2.9735e-05, + "loss": 0.0133, + "step": 10801 + }, + { + "epoch": 8.50571090980701, + "grad_norm": 0.3031943738460541, + "learning_rate": 2.973466666666667e-05, + "loss": 0.0121, + "step": 10802 + }, + { + "epoch": 8.50649862150453, + "grad_norm": 0.23948955535888672, + "learning_rate": 2.9734333333333334e-05, + "loss": 0.0124, + "step": 10803 + }, + { + "epoch": 8.507286333202048, + "grad_norm": 0.2740027904510498, + "learning_rate": 2.9734e-05, + "loss": 0.0151, + "step": 10804 + }, + { + "epoch": 8.508074044899567, + "grad_norm": 0.33493518829345703, + "learning_rate": 2.9733666666666666e-05, + "loss": 0.0226, + "step": 10805 + }, + { + "epoch": 8.508861756597085, + "grad_norm": 0.47770336270332336, + "learning_rate": 2.9733333333333336e-05, + "loss": 0.0206, + "step": 10806 + }, + { + "epoch": 8.509649468294604, + "grad_norm": 0.21402034163475037, + "learning_rate": 2.9733e-05, + "loss": 0.013, + "step": 10807 + }, + { + "epoch": 8.510437179992122, + "grad_norm": 0.289905846118927, + "learning_rate": 2.9732666666666667e-05, + "loss": 0.0209, + "step": 10808 + }, + { + "epoch": 8.511224891689642, + "grad_norm": 0.3462779223918915, + "learning_rate": 2.9732333333333333e-05, + "loss": 0.0233, + "step": 10809 + }, + { + "epoch": 8.512012603387161, + "grad_norm": 0.4782494604587555, + "learning_rate": 2.9732e-05, + "loss": 0.025, + "step": 
10810 + }, + { + "epoch": 8.512800315084679, + "grad_norm": 0.6851412653923035, + "learning_rate": 2.973166666666667e-05, + "loss": 0.1939, + "step": 10811 + }, + { + "epoch": 8.513588026782198, + "grad_norm": 0.509722888469696, + "learning_rate": 2.973133333333333e-05, + "loss": 0.1588, + "step": 10812 + }, + { + "epoch": 8.514375738479716, + "grad_norm": 0.5252811908721924, + "learning_rate": 2.9731e-05, + "loss": 0.127, + "step": 10813 + }, + { + "epoch": 8.515163450177235, + "grad_norm": 0.6887207627296448, + "learning_rate": 2.973066666666667e-05, + "loss": 0.096, + "step": 10814 + }, + { + "epoch": 8.515951161874753, + "grad_norm": 0.6535071134567261, + "learning_rate": 2.9730333333333332e-05, + "loss": 0.1123, + "step": 10815 + }, + { + "epoch": 8.516738873572272, + "grad_norm": 1.3075568675994873, + "learning_rate": 2.973e-05, + "loss": 0.093, + "step": 10816 + }, + { + "epoch": 8.517526585269792, + "grad_norm": 0.43973293900489807, + "learning_rate": 2.9729666666666668e-05, + "loss": 0.0312, + "step": 10817 + }, + { + "epoch": 8.51831429696731, + "grad_norm": 0.4781407415866852, + "learning_rate": 2.9729333333333334e-05, + "loss": 0.0409, + "step": 10818 + }, + { + "epoch": 8.519102008664829, + "grad_norm": 0.43682175874710083, + "learning_rate": 2.9729e-05, + "loss": 0.0251, + "step": 10819 + }, + { + "epoch": 8.519889720362347, + "grad_norm": 0.46993938088417053, + "learning_rate": 2.972866666666667e-05, + "loss": 0.0237, + "step": 10820 + }, + { + "epoch": 8.520677432059866, + "grad_norm": 0.24956834316253662, + "learning_rate": 2.972833333333333e-05, + "loss": 0.0177, + "step": 10821 + }, + { + "epoch": 8.521465143757386, + "grad_norm": 0.21365241706371307, + "learning_rate": 2.9728e-05, + "loss": 0.0182, + "step": 10822 + }, + { + "epoch": 8.522252855454903, + "grad_norm": 0.19377660751342773, + "learning_rate": 2.972766666666667e-05, + "loss": 0.0119, + "step": 10823 + }, + { + "epoch": 8.523040567152423, + "grad_norm": 0.14207758009433746, + "learning_rate": 2.9727333333333333e-05, + "loss": 0.007, + "step": 10824 + }, + { + "epoch": 8.52382827884994, + "grad_norm": 0.3105674684047699, + "learning_rate": 2.9727000000000002e-05, + "loss": 0.0219, + "step": 10825 + }, + { + "epoch": 8.52461599054746, + "grad_norm": 0.42507103085517883, + "learning_rate": 2.9726666666666668e-05, + "loss": 0.0187, + "step": 10826 + }, + { + "epoch": 8.525403702244978, + "grad_norm": 0.34534043073654175, + "learning_rate": 2.9726333333333334e-05, + "loss": 0.0212, + "step": 10827 + }, + { + "epoch": 8.526191413942497, + "grad_norm": 0.39465367794036865, + "learning_rate": 2.9726e-05, + "loss": 0.0173, + "step": 10828 + }, + { + "epoch": 8.526979125640016, + "grad_norm": 0.214223250746727, + "learning_rate": 2.972566666666667e-05, + "loss": 0.0109, + "step": 10829 + }, + { + "epoch": 8.527766837337534, + "grad_norm": 0.2992638945579529, + "learning_rate": 2.9725333333333335e-05, + "loss": 0.0148, + "step": 10830 + }, + { + "epoch": 8.528554549035054, + "grad_norm": 0.4973450005054474, + "learning_rate": 2.9725e-05, + "loss": 0.0241, + "step": 10831 + }, + { + "epoch": 8.529342260732571, + "grad_norm": 0.23072127997875214, + "learning_rate": 2.972466666666667e-05, + "loss": 0.0169, + "step": 10832 + }, + { + "epoch": 8.53012997243009, + "grad_norm": 0.2921766936779022, + "learning_rate": 2.9724333333333333e-05, + "loss": 0.0158, + "step": 10833 + }, + { + "epoch": 8.530917684127608, + "grad_norm": 0.36294087767601013, + "learning_rate": 2.9724000000000002e-05, + "loss": 0.0215, + "step": 10834 + 
}, + { + "epoch": 8.531705395825128, + "grad_norm": 0.3061384856700897, + "learning_rate": 2.9723666666666668e-05, + "loss": 0.0167, + "step": 10835 + }, + { + "epoch": 8.532493107522647, + "grad_norm": 0.3903306722640991, + "learning_rate": 2.9723333333333334e-05, + "loss": 0.0198, + "step": 10836 + }, + { + "epoch": 8.533280819220165, + "grad_norm": 0.19549039006233215, + "learning_rate": 2.9723e-05, + "loss": 0.0098, + "step": 10837 + }, + { + "epoch": 8.534068530917684, + "grad_norm": 0.20155128836631775, + "learning_rate": 2.9722666666666666e-05, + "loss": 0.0145, + "step": 10838 + }, + { + "epoch": 8.534856242615202, + "grad_norm": 0.21537327766418457, + "learning_rate": 2.9722333333333335e-05, + "loss": 0.0127, + "step": 10839 + }, + { + "epoch": 8.535643954312722, + "grad_norm": 0.30568909645080566, + "learning_rate": 2.9722e-05, + "loss": 0.0153, + "step": 10840 + }, + { + "epoch": 8.536431666010241, + "grad_norm": 1.229660153388977, + "learning_rate": 2.9721666666666667e-05, + "loss": 0.0223, + "step": 10841 + }, + { + "epoch": 8.537219377707759, + "grad_norm": 0.39011672139167786, + "learning_rate": 2.9721333333333333e-05, + "loss": 0.0137, + "step": 10842 + }, + { + "epoch": 8.538007089405278, + "grad_norm": 0.3730747401714325, + "learning_rate": 2.9721000000000002e-05, + "loss": 0.0222, + "step": 10843 + }, + { + "epoch": 8.538794801102796, + "grad_norm": 0.2399895191192627, + "learning_rate": 2.9720666666666665e-05, + "loss": 0.013, + "step": 10844 + }, + { + "epoch": 8.539582512800315, + "grad_norm": 0.5803890228271484, + "learning_rate": 2.9720333333333334e-05, + "loss": 0.0221, + "step": 10845 + }, + { + "epoch": 8.540370224497833, + "grad_norm": 0.4556335508823395, + "learning_rate": 2.972e-05, + "loss": 0.0124, + "step": 10846 + }, + { + "epoch": 8.541157936195352, + "grad_norm": 0.5272601246833801, + "learning_rate": 2.9719666666666666e-05, + "loss": 0.0217, + "step": 10847 + }, + { + "epoch": 8.541945647892872, + "grad_norm": 0.29182168841362, + "learning_rate": 2.9719333333333335e-05, + "loss": 0.0292, + "step": 10848 + }, + { + "epoch": 8.54273335959039, + "grad_norm": 1.1024450063705444, + "learning_rate": 2.9719e-05, + "loss": 0.0168, + "step": 10849 + }, + { + "epoch": 8.543521071287909, + "grad_norm": 0.34217944741249084, + "learning_rate": 2.9718666666666667e-05, + "loss": 0.0254, + "step": 10850 + }, + { + "epoch": 8.544308782985427, + "grad_norm": 0.36669209599494934, + "learning_rate": 2.9718333333333333e-05, + "loss": 0.0177, + "step": 10851 + }, + { + "epoch": 8.545096494682946, + "grad_norm": 0.3112242519855499, + "learning_rate": 2.9718000000000002e-05, + "loss": 0.0143, + "step": 10852 + }, + { + "epoch": 8.545884206380464, + "grad_norm": 2.5493688583374023, + "learning_rate": 2.9717666666666665e-05, + "loss": 0.0171, + "step": 10853 + }, + { + "epoch": 8.546671918077983, + "grad_norm": 0.3214631974697113, + "learning_rate": 2.9717333333333334e-05, + "loss": 0.014, + "step": 10854 + }, + { + "epoch": 8.547459629775503, + "grad_norm": 0.20122700929641724, + "learning_rate": 2.9717e-05, + "loss": 0.0181, + "step": 10855 + }, + { + "epoch": 8.54824734147302, + "grad_norm": 0.12083903700113297, + "learning_rate": 2.9716666666666666e-05, + "loss": 0.0083, + "step": 10856 + }, + { + "epoch": 8.54903505317054, + "grad_norm": 0.3182195723056793, + "learning_rate": 2.9716333333333336e-05, + "loss": 0.0191, + "step": 10857 + }, + { + "epoch": 8.549822764868058, + "grad_norm": 0.8944652080535889, + "learning_rate": 2.9716e-05, + "loss": 0.028, + "step": 10858 + }, 
+ { + "epoch": 8.550610476565577, + "grad_norm": 1.2068216800689697, + "learning_rate": 2.9715666666666667e-05, + "loss": 0.0595, + "step": 10859 + }, + { + "epoch": 8.551398188263097, + "grad_norm": 0.44815903902053833, + "learning_rate": 2.9715333333333333e-05, + "loss": 0.0191, + "step": 10860 + }, + { + "epoch": 8.552185899960614, + "grad_norm": 0.7723113298416138, + "learning_rate": 2.9715000000000003e-05, + "loss": 0.2534, + "step": 10861 + }, + { + "epoch": 8.552973611658134, + "grad_norm": 0.653835654258728, + "learning_rate": 2.9714666666666665e-05, + "loss": 0.1421, + "step": 10862 + }, + { + "epoch": 8.553761323355651, + "grad_norm": 0.5147488117218018, + "learning_rate": 2.9714333333333335e-05, + "loss": 0.1349, + "step": 10863 + }, + { + "epoch": 8.55454903505317, + "grad_norm": 0.6199316382408142, + "learning_rate": 2.9714000000000004e-05, + "loss": 0.1126, + "step": 10864 + }, + { + "epoch": 8.555336746750688, + "grad_norm": 0.4188935160636902, + "learning_rate": 2.9713666666666666e-05, + "loss": 0.0645, + "step": 10865 + }, + { + "epoch": 8.556124458448208, + "grad_norm": 0.33075234293937683, + "learning_rate": 2.9713333333333336e-05, + "loss": 0.0365, + "step": 10866 + }, + { + "epoch": 8.556912170145727, + "grad_norm": 0.2993888556957245, + "learning_rate": 2.9713e-05, + "loss": 0.045, + "step": 10867 + }, + { + "epoch": 8.557699881843245, + "grad_norm": 0.18599992990493774, + "learning_rate": 2.9712666666666668e-05, + "loss": 0.0143, + "step": 10868 + }, + { + "epoch": 8.558487593540764, + "grad_norm": 0.1981397271156311, + "learning_rate": 2.9712333333333334e-05, + "loss": 0.0105, + "step": 10869 + }, + { + "epoch": 8.559275305238282, + "grad_norm": 0.29757311940193176, + "learning_rate": 2.9712e-05, + "loss": 0.0261, + "step": 10870 + }, + { + "epoch": 8.560063016935802, + "grad_norm": 0.40028855204582214, + "learning_rate": 2.9711666666666665e-05, + "loss": 0.0342, + "step": 10871 + }, + { + "epoch": 8.56085072863332, + "grad_norm": 0.4448317587375641, + "learning_rate": 2.9711333333333335e-05, + "loss": 0.0254, + "step": 10872 + }, + { + "epoch": 8.561638440330839, + "grad_norm": 1.0471373796463013, + "learning_rate": 2.9711e-05, + "loss": 0.0576, + "step": 10873 + }, + { + "epoch": 8.562426152028358, + "grad_norm": 0.2611372470855713, + "learning_rate": 2.9710666666666667e-05, + "loss": 0.0136, + "step": 10874 + }, + { + "epoch": 8.563213863725876, + "grad_norm": 0.6397662162780762, + "learning_rate": 2.9710333333333336e-05, + "loss": 0.0295, + "step": 10875 + }, + { + "epoch": 8.564001575423395, + "grad_norm": 0.8154506683349609, + "learning_rate": 2.971e-05, + "loss": 0.0334, + "step": 10876 + }, + { + "epoch": 8.564789287120913, + "grad_norm": 0.21720294654369354, + "learning_rate": 2.9709666666666668e-05, + "loss": 0.0108, + "step": 10877 + }, + { + "epoch": 8.565576998818432, + "grad_norm": 0.4388860762119293, + "learning_rate": 2.9709333333333334e-05, + "loss": 0.0197, + "step": 10878 + }, + { + "epoch": 8.566364710515952, + "grad_norm": 0.2633827030658722, + "learning_rate": 2.9709e-05, + "loss": 0.0174, + "step": 10879 + }, + { + "epoch": 8.56715242221347, + "grad_norm": 0.48513925075531006, + "learning_rate": 2.970866666666667e-05, + "loss": 0.0263, + "step": 10880 + }, + { + "epoch": 8.567940133910989, + "grad_norm": 0.3284083902835846, + "learning_rate": 2.9708333333333335e-05, + "loss": 0.0142, + "step": 10881 + }, + { + "epoch": 8.568727845608507, + "grad_norm": 0.2519615888595581, + "learning_rate": 2.9708e-05, + "loss": 0.0102, + "step": 10882 + }, + 
{ + "epoch": 8.569515557306026, + "grad_norm": 0.3547723889350891, + "learning_rate": 2.9707666666666667e-05, + "loss": 0.016, + "step": 10883 + }, + { + "epoch": 8.570303269003544, + "grad_norm": 0.2684028148651123, + "learning_rate": 2.9707333333333336e-05, + "loss": 0.0142, + "step": 10884 + }, + { + "epoch": 8.571090980701063, + "grad_norm": 0.3447440564632416, + "learning_rate": 2.9707e-05, + "loss": 0.0192, + "step": 10885 + }, + { + "epoch": 8.571878692398583, + "grad_norm": 0.26598504185676575, + "learning_rate": 2.9706666666666668e-05, + "loss": 0.015, + "step": 10886 + }, + { + "epoch": 8.5726664040961, + "grad_norm": 0.28737783432006836, + "learning_rate": 2.9706333333333334e-05, + "loss": 0.0173, + "step": 10887 + }, + { + "epoch": 8.57345411579362, + "grad_norm": 0.5868796706199646, + "learning_rate": 2.9706e-05, + "loss": 0.0192, + "step": 10888 + }, + { + "epoch": 8.574241827491138, + "grad_norm": 0.27000686526298523, + "learning_rate": 2.970566666666667e-05, + "loss": 0.0098, + "step": 10889 + }, + { + "epoch": 8.575029539188657, + "grad_norm": 0.2242773026227951, + "learning_rate": 2.9705333333333335e-05, + "loss": 0.0146, + "step": 10890 + }, + { + "epoch": 8.575817250886175, + "grad_norm": 0.48673197627067566, + "learning_rate": 2.9705e-05, + "loss": 0.0218, + "step": 10891 + }, + { + "epoch": 8.576604962583694, + "grad_norm": 1.0311408042907715, + "learning_rate": 2.9704666666666667e-05, + "loss": 0.0251, + "step": 10892 + }, + { + "epoch": 8.577392674281214, + "grad_norm": 0.21431595087051392, + "learning_rate": 2.9704333333333336e-05, + "loss": 0.0104, + "step": 10893 + }, + { + "epoch": 8.578180385978731, + "grad_norm": 0.8827276825904846, + "learning_rate": 2.9704e-05, + "loss": 0.0192, + "step": 10894 + }, + { + "epoch": 8.57896809767625, + "grad_norm": 0.21023043990135193, + "learning_rate": 2.9703666666666668e-05, + "loss": 0.0107, + "step": 10895 + }, + { + "epoch": 8.579755809373768, + "grad_norm": 0.22836905717849731, + "learning_rate": 2.9703333333333334e-05, + "loss": 0.012, + "step": 10896 + }, + { + "epoch": 8.580543521071288, + "grad_norm": 0.31539517641067505, + "learning_rate": 2.9703e-05, + "loss": 0.0177, + "step": 10897 + }, + { + "epoch": 8.581331232768807, + "grad_norm": 0.42677268385887146, + "learning_rate": 2.970266666666667e-05, + "loss": 0.0255, + "step": 10898 + }, + { + "epoch": 8.582118944466325, + "grad_norm": 0.19817082583904266, + "learning_rate": 2.9702333333333332e-05, + "loss": 0.0115, + "step": 10899 + }, + { + "epoch": 8.582906656163845, + "grad_norm": 0.30026814341545105, + "learning_rate": 2.9702e-05, + "loss": 0.0167, + "step": 10900 + }, + { + "epoch": 8.583694367861362, + "grad_norm": 0.23362088203430176, + "learning_rate": 2.9701666666666667e-05, + "loss": 0.0145, + "step": 10901 + }, + { + "epoch": 8.584482079558882, + "grad_norm": 0.09949596226215363, + "learning_rate": 2.9701333333333333e-05, + "loss": 0.005, + "step": 10902 + }, + { + "epoch": 8.5852697912564, + "grad_norm": 0.2673881947994232, + "learning_rate": 2.9701e-05, + "loss": 0.0143, + "step": 10903 + }, + { + "epoch": 8.586057502953919, + "grad_norm": 0.3272136151790619, + "learning_rate": 2.970066666666667e-05, + "loss": 0.0197, + "step": 10904 + }, + { + "epoch": 8.586845214651438, + "grad_norm": 0.30257898569107056, + "learning_rate": 2.9700333333333334e-05, + "loss": 0.0158, + "step": 10905 + }, + { + "epoch": 8.587632926348956, + "grad_norm": 0.4460357129573822, + "learning_rate": 2.97e-05, + "loss": 0.0191, + "step": 10906 + }, + { + "epoch": 
8.588420638046475, + "grad_norm": 0.33490660786628723, + "learning_rate": 2.969966666666667e-05, + "loss": 0.0165, + "step": 10907 + }, + { + "epoch": 8.589208349743993, + "grad_norm": 0.16876614093780518, + "learning_rate": 2.9699333333333332e-05, + "loss": 0.0069, + "step": 10908 + }, + { + "epoch": 8.589996061441513, + "grad_norm": 0.6390381455421448, + "learning_rate": 2.9699e-05, + "loss": 0.0257, + "step": 10909 + }, + { + "epoch": 8.59078377313903, + "grad_norm": 1.0406156778335571, + "learning_rate": 2.9698666666666667e-05, + "loss": 0.0209, + "step": 10910 + }, + { + "epoch": 8.59157148483655, + "grad_norm": 0.9657480120658875, + "learning_rate": 2.9698333333333333e-05, + "loss": 0.2083, + "step": 10911 + }, + { + "epoch": 8.592359196534069, + "grad_norm": 0.8259944319725037, + "learning_rate": 2.9698e-05, + "loss": 0.1971, + "step": 10912 + }, + { + "epoch": 8.593146908231587, + "grad_norm": 0.7526848912239075, + "learning_rate": 2.969766666666667e-05, + "loss": 0.125, + "step": 10913 + }, + { + "epoch": 8.593934619929106, + "grad_norm": 0.5437930822372437, + "learning_rate": 2.9697333333333335e-05, + "loss": 0.1184, + "step": 10914 + }, + { + "epoch": 8.594722331626624, + "grad_norm": 1.1172784566879272, + "learning_rate": 2.9697e-05, + "loss": 0.1067, + "step": 10915 + }, + { + "epoch": 8.595510043324143, + "grad_norm": 0.5249778628349304, + "learning_rate": 2.969666666666667e-05, + "loss": 0.0617, + "step": 10916 + }, + { + "epoch": 8.596297755021663, + "grad_norm": 0.3362478017807007, + "learning_rate": 2.9696333333333332e-05, + "loss": 0.0369, + "step": 10917 + }, + { + "epoch": 8.59708546671918, + "grad_norm": 0.4709383249282837, + "learning_rate": 2.9696e-05, + "loss": 0.0551, + "step": 10918 + }, + { + "epoch": 8.5978731784167, + "grad_norm": 0.20525038242340088, + "learning_rate": 2.9695666666666668e-05, + "loss": 0.0235, + "step": 10919 + }, + { + "epoch": 8.598660890114218, + "grad_norm": 0.37299832701683044, + "learning_rate": 2.9695333333333334e-05, + "loss": 0.0218, + "step": 10920 + }, + { + "epoch": 8.599448601811737, + "grad_norm": 0.3296608626842499, + "learning_rate": 2.9695e-05, + "loss": 0.0173, + "step": 10921 + }, + { + "epoch": 8.600236313509257, + "grad_norm": 0.6465900540351868, + "learning_rate": 2.969466666666667e-05, + "loss": 0.0277, + "step": 10922 + }, + { + "epoch": 8.601024025206774, + "grad_norm": 0.32955309748649597, + "learning_rate": 2.9694333333333335e-05, + "loss": 0.042, + "step": 10923 + }, + { + "epoch": 8.601811736904294, + "grad_norm": 0.2572714686393738, + "learning_rate": 2.9694e-05, + "loss": 0.015, + "step": 10924 + }, + { + "epoch": 8.602599448601811, + "grad_norm": 0.2645775377750397, + "learning_rate": 2.969366666666667e-05, + "loss": 0.0116, + "step": 10925 + }, + { + "epoch": 8.60338716029933, + "grad_norm": 0.2367963343858719, + "learning_rate": 2.9693333333333333e-05, + "loss": 0.0166, + "step": 10926 + }, + { + "epoch": 8.604174871996848, + "grad_norm": 0.28238746523857117, + "learning_rate": 2.9693000000000002e-05, + "loss": 0.0202, + "step": 10927 + }, + { + "epoch": 8.604962583694368, + "grad_norm": 0.28132182359695435, + "learning_rate": 2.9692666666666668e-05, + "loss": 0.0217, + "step": 10928 + }, + { + "epoch": 8.605750295391886, + "grad_norm": 0.1853845715522766, + "learning_rate": 2.9692333333333334e-05, + "loss": 0.0119, + "step": 10929 + }, + { + "epoch": 8.606538007089405, + "grad_norm": 0.2788648307323456, + "learning_rate": 2.9692000000000003e-05, + "loss": 0.0166, + "step": 10930 + }, + { + "epoch": 
8.607325718786925, + "grad_norm": 0.49382081627845764, + "learning_rate": 2.9691666666666666e-05, + "loss": 0.0152, + "step": 10931 + }, + { + "epoch": 8.608113430484442, + "grad_norm": 0.4738923907279968, + "learning_rate": 2.9691333333333335e-05, + "loss": 0.0227, + "step": 10932 + }, + { + "epoch": 8.608901142181962, + "grad_norm": 0.23127494752407074, + "learning_rate": 2.9691e-05, + "loss": 0.0202, + "step": 10933 + }, + { + "epoch": 8.60968885387948, + "grad_norm": 0.19786405563354492, + "learning_rate": 2.9690666666666667e-05, + "loss": 0.013, + "step": 10934 + }, + { + "epoch": 8.610476565576999, + "grad_norm": 0.38919639587402344, + "learning_rate": 2.9690333333333333e-05, + "loss": 0.0222, + "step": 10935 + }, + { + "epoch": 8.611264277274518, + "grad_norm": 0.24511264264583588, + "learning_rate": 2.9690000000000002e-05, + "loss": 0.0073, + "step": 10936 + }, + { + "epoch": 8.612051988972036, + "grad_norm": 0.42663809657096863, + "learning_rate": 2.9689666666666665e-05, + "loss": 0.0183, + "step": 10937 + }, + { + "epoch": 8.612839700669555, + "grad_norm": 0.1517314463853836, + "learning_rate": 2.9689333333333334e-05, + "loss": 0.0121, + "step": 10938 + }, + { + "epoch": 8.613627412367073, + "grad_norm": 0.3401576280593872, + "learning_rate": 2.9689000000000003e-05, + "loss": 0.0102, + "step": 10939 + }, + { + "epoch": 8.614415124064593, + "grad_norm": 0.4816720187664032, + "learning_rate": 2.9688666666666666e-05, + "loss": 0.0152, + "step": 10940 + }, + { + "epoch": 8.615202835762112, + "grad_norm": 0.21655143797397614, + "learning_rate": 2.9688333333333335e-05, + "loss": 0.0118, + "step": 10941 + }, + { + "epoch": 8.61599054745963, + "grad_norm": 0.5504953265190125, + "learning_rate": 2.9688e-05, + "loss": 0.0125, + "step": 10942 + }, + { + "epoch": 8.61677825915715, + "grad_norm": 0.39398428797721863, + "learning_rate": 2.9687666666666667e-05, + "loss": 0.0217, + "step": 10943 + }, + { + "epoch": 8.617565970854667, + "grad_norm": 0.3597140908241272, + "learning_rate": 2.9687333333333333e-05, + "loss": 0.0191, + "step": 10944 + }, + { + "epoch": 8.618353682552186, + "grad_norm": 0.4104452431201935, + "learning_rate": 2.9687000000000002e-05, + "loss": 0.0095, + "step": 10945 + }, + { + "epoch": 8.619141394249704, + "grad_norm": 0.46118196845054626, + "learning_rate": 2.9686666666666665e-05, + "loss": 0.0196, + "step": 10946 + }, + { + "epoch": 8.619929105947223, + "grad_norm": 0.2311372011899948, + "learning_rate": 2.9686333333333334e-05, + "loss": 0.0053, + "step": 10947 + }, + { + "epoch": 8.620716817644743, + "grad_norm": 0.17377282679080963, + "learning_rate": 2.9686000000000003e-05, + "loss": 0.0107, + "step": 10948 + }, + { + "epoch": 8.62150452934226, + "grad_norm": 0.3485450744628906, + "learning_rate": 2.9685666666666666e-05, + "loss": 0.0171, + "step": 10949 + }, + { + "epoch": 8.62229224103978, + "grad_norm": 0.383931040763855, + "learning_rate": 2.9685333333333335e-05, + "loss": 0.021, + "step": 10950 + }, + { + "epoch": 8.623079952737298, + "grad_norm": 0.531337320804596, + "learning_rate": 2.9685e-05, + "loss": 0.0252, + "step": 10951 + }, + { + "epoch": 8.623867664434817, + "grad_norm": 0.33041542768478394, + "learning_rate": 2.9684666666666667e-05, + "loss": 0.0134, + "step": 10952 + }, + { + "epoch": 8.624655376132335, + "grad_norm": 0.8483511805534363, + "learning_rate": 2.9684333333333333e-05, + "loss": 0.0181, + "step": 10953 + }, + { + "epoch": 8.625443087829854, + "grad_norm": 0.6189489960670471, + "learning_rate": 2.9684000000000002e-05, + "loss": 0.0257, 
+ "step": 10954 + }, + { + "epoch": 8.626230799527374, + "grad_norm": 0.41576334834098816, + "learning_rate": 2.968366666666667e-05, + "loss": 0.0209, + "step": 10955 + }, + { + "epoch": 8.627018511224891, + "grad_norm": 0.31664279103279114, + "learning_rate": 2.9683333333333334e-05, + "loss": 0.0093, + "step": 10956 + }, + { + "epoch": 8.62780622292241, + "grad_norm": 0.30010437965393066, + "learning_rate": 2.9683000000000004e-05, + "loss": 0.0153, + "step": 10957 + }, + { + "epoch": 8.628593934619929, + "grad_norm": 0.46287646889686584, + "learning_rate": 2.9682666666666666e-05, + "loss": 0.0235, + "step": 10958 + }, + { + "epoch": 8.629381646317448, + "grad_norm": 0.7038161158561707, + "learning_rate": 2.9682333333333335e-05, + "loss": 0.0165, + "step": 10959 + }, + { + "epoch": 8.630169358014967, + "grad_norm": 0.563539981842041, + "learning_rate": 2.9681999999999998e-05, + "loss": 0.0204, + "step": 10960 + }, + { + "epoch": 8.630957069712485, + "grad_norm": 0.7218595743179321, + "learning_rate": 2.9681666666666667e-05, + "loss": 0.2532, + "step": 10961 + }, + { + "epoch": 8.631744781410005, + "grad_norm": 0.7994888424873352, + "learning_rate": 2.9681333333333333e-05, + "loss": 0.1796, + "step": 10962 + }, + { + "epoch": 8.632532493107522, + "grad_norm": 0.7023782134056091, + "learning_rate": 2.9681e-05, + "loss": 0.1924, + "step": 10963 + }, + { + "epoch": 8.633320204805042, + "grad_norm": 0.5520655512809753, + "learning_rate": 2.968066666666667e-05, + "loss": 0.0966, + "step": 10964 + }, + { + "epoch": 8.63410791650256, + "grad_norm": 0.45052775740623474, + "learning_rate": 2.9680333333333334e-05, + "loss": 0.0729, + "step": 10965 + }, + { + "epoch": 8.634895628200079, + "grad_norm": 0.3931734561920166, + "learning_rate": 2.968e-05, + "loss": 0.0277, + "step": 10966 + }, + { + "epoch": 8.635683339897598, + "grad_norm": 0.3973797857761383, + "learning_rate": 2.9679666666666666e-05, + "loss": 0.0334, + "step": 10967 + }, + { + "epoch": 8.636471051595116, + "grad_norm": 0.3786875903606415, + "learning_rate": 2.9679333333333336e-05, + "loss": 0.0485, + "step": 10968 + }, + { + "epoch": 8.637258763292635, + "grad_norm": 0.525698184967041, + "learning_rate": 2.9678999999999998e-05, + "loss": 0.0272, + "step": 10969 + }, + { + "epoch": 8.638046474990153, + "grad_norm": 0.28396594524383545, + "learning_rate": 2.9678666666666668e-05, + "loss": 0.0228, + "step": 10970 + }, + { + "epoch": 8.638834186687673, + "grad_norm": 0.573300302028656, + "learning_rate": 2.9678333333333334e-05, + "loss": 0.0203, + "step": 10971 + }, + { + "epoch": 8.63962189838519, + "grad_norm": 0.16106022894382477, + "learning_rate": 2.9678e-05, + "loss": 0.0141, + "step": 10972 + }, + { + "epoch": 8.64040961008271, + "grad_norm": 1.807296633720398, + "learning_rate": 2.967766666666667e-05, + "loss": 0.0737, + "step": 10973 + }, + { + "epoch": 8.64119732178023, + "grad_norm": 0.13696794211864471, + "learning_rate": 2.9677333333333335e-05, + "loss": 0.0077, + "step": 10974 + }, + { + "epoch": 8.641985033477747, + "grad_norm": 0.1670578271150589, + "learning_rate": 2.9677e-05, + "loss": 0.0122, + "step": 10975 + }, + { + "epoch": 8.642772745175266, + "grad_norm": 0.1989860087633133, + "learning_rate": 2.9676666666666667e-05, + "loss": 0.0167, + "step": 10976 + }, + { + "epoch": 8.643560456872784, + "grad_norm": 0.2561444640159607, + "learning_rate": 2.9676333333333336e-05, + "loss": 0.0188, + "step": 10977 + }, + { + "epoch": 8.644348168570303, + "grad_norm": 0.18615922331809998, + "learning_rate": 2.9676e-05, + "loss": 
0.0129, + "step": 10978 + }, + { + "epoch": 8.645135880267823, + "grad_norm": 0.37074050307273865, + "learning_rate": 2.9675666666666668e-05, + "loss": 0.0139, + "step": 10979 + }, + { + "epoch": 8.64592359196534, + "grad_norm": 0.30917397141456604, + "learning_rate": 2.9675333333333337e-05, + "loss": 0.0165, + "step": 10980 + }, + { + "epoch": 8.64671130366286, + "grad_norm": 0.3903804421424866, + "learning_rate": 2.9675e-05, + "loss": 0.0369, + "step": 10981 + }, + { + "epoch": 8.647499015360378, + "grad_norm": 0.1950106918811798, + "learning_rate": 2.967466666666667e-05, + "loss": 0.0128, + "step": 10982 + }, + { + "epoch": 8.648286727057897, + "grad_norm": 0.2587393522262573, + "learning_rate": 2.9674333333333335e-05, + "loss": 0.013, + "step": 10983 + }, + { + "epoch": 8.649074438755415, + "grad_norm": 0.19996896386146545, + "learning_rate": 2.9674e-05, + "loss": 0.0153, + "step": 10984 + }, + { + "epoch": 8.649862150452934, + "grad_norm": 0.4600755274295807, + "learning_rate": 2.9673666666666667e-05, + "loss": 0.0206, + "step": 10985 + }, + { + "epoch": 8.650649862150454, + "grad_norm": 0.38296470046043396, + "learning_rate": 2.9673333333333336e-05, + "loss": 0.0362, + "step": 10986 + }, + { + "epoch": 8.651437573847971, + "grad_norm": 0.304080069065094, + "learning_rate": 2.9673e-05, + "loss": 0.0161, + "step": 10987 + }, + { + "epoch": 8.65222528554549, + "grad_norm": 0.29237890243530273, + "learning_rate": 2.9672666666666668e-05, + "loss": 0.0174, + "step": 10988 + }, + { + "epoch": 8.653012997243009, + "grad_norm": 0.46671056747436523, + "learning_rate": 2.9672333333333334e-05, + "loss": 0.0163, + "step": 10989 + }, + { + "epoch": 8.653800708940528, + "grad_norm": 0.3902857005596161, + "learning_rate": 2.9672e-05, + "loss": 0.0254, + "step": 10990 + }, + { + "epoch": 8.654588420638046, + "grad_norm": 0.3908444941043854, + "learning_rate": 2.967166666666667e-05, + "loss": 0.0337, + "step": 10991 + }, + { + "epoch": 8.655376132335565, + "grad_norm": 0.3781626522541046, + "learning_rate": 2.9671333333333332e-05, + "loss": 0.0191, + "step": 10992 + }, + { + "epoch": 8.656163844033085, + "grad_norm": 0.13851433992385864, + "learning_rate": 2.9671e-05, + "loss": 0.0085, + "step": 10993 + }, + { + "epoch": 8.656951555730602, + "grad_norm": 0.3328271508216858, + "learning_rate": 2.9670666666666667e-05, + "loss": 0.0173, + "step": 10994 + }, + { + "epoch": 8.657739267428122, + "grad_norm": 0.40621474385261536, + "learning_rate": 2.9670333333333333e-05, + "loss": 0.0116, + "step": 10995 + }, + { + "epoch": 8.65852697912564, + "grad_norm": 0.21904979646205902, + "learning_rate": 2.967e-05, + "loss": 0.01, + "step": 10996 + }, + { + "epoch": 8.659314690823159, + "grad_norm": 0.564613401889801, + "learning_rate": 2.9669666666666668e-05, + "loss": 0.0257, + "step": 10997 + }, + { + "epoch": 8.660102402520678, + "grad_norm": 0.4866170883178711, + "learning_rate": 2.9669333333333334e-05, + "loss": 0.0192, + "step": 10998 + }, + { + "epoch": 8.660890114218196, + "grad_norm": 0.3906688392162323, + "learning_rate": 2.9669e-05, + "loss": 0.0189, + "step": 10999 + }, + { + "epoch": 8.661677825915715, + "grad_norm": 0.7100082635879517, + "learning_rate": 2.966866666666667e-05, + "loss": 0.0237, + "step": 11000 + }, + { + "epoch": 8.661677825915715, + "eval_cer": 0.1259040360836768, + "eval_loss": 0.3652730882167816, + "eval_runtime": 16.8685, + "eval_samples_per_second": 18.022, + "eval_steps_per_second": 0.593, + "eval_wer": 0.4293937068303914, + "step": 11000 + }, + { + "epoch": 8.662465537613233, + 
"grad_norm": 0.443218857049942, + "learning_rate": 2.9668333333333332e-05, + "loss": 0.0132, + "step": 11001 + }, + { + "epoch": 8.663253249310753, + "grad_norm": 0.29289543628692627, + "learning_rate": 2.9668e-05, + "loss": 0.0138, + "step": 11002 + }, + { + "epoch": 8.66404096100827, + "grad_norm": 0.4263511896133423, + "learning_rate": 2.9667666666666667e-05, + "loss": 0.0247, + "step": 11003 + }, + { + "epoch": 8.66482867270579, + "grad_norm": 0.3136260211467743, + "learning_rate": 2.9667333333333333e-05, + "loss": 0.018, + "step": 11004 + }, + { + "epoch": 8.66561638440331, + "grad_norm": 0.39550212025642395, + "learning_rate": 2.9667000000000002e-05, + "loss": 0.0138, + "step": 11005 + }, + { + "epoch": 8.666404096100827, + "grad_norm": 0.20593085885047913, + "learning_rate": 2.966666666666667e-05, + "loss": 0.0117, + "step": 11006 + }, + { + "epoch": 8.667191807798346, + "grad_norm": 0.751373291015625, + "learning_rate": 2.9666333333333334e-05, + "loss": 0.0156, + "step": 11007 + }, + { + "epoch": 8.667979519495864, + "grad_norm": 0.6233245730400085, + "learning_rate": 2.9666e-05, + "loss": 0.0286, + "step": 11008 + }, + { + "epoch": 8.668767231193383, + "grad_norm": 1.2647929191589355, + "learning_rate": 2.966566666666667e-05, + "loss": 0.0243, + "step": 11009 + }, + { + "epoch": 8.669554942890901, + "grad_norm": 0.24827887117862701, + "learning_rate": 2.9665333333333332e-05, + "loss": 0.0096, + "step": 11010 + }, + { + "epoch": 8.67034265458842, + "grad_norm": 1.3756550550460815, + "learning_rate": 2.9665e-05, + "loss": 0.3991, + "step": 11011 + }, + { + "epoch": 8.67113036628594, + "grad_norm": 0.8019303679466248, + "learning_rate": 2.9664666666666667e-05, + "loss": 0.2183, + "step": 11012 + }, + { + "epoch": 8.671918077983458, + "grad_norm": 0.5079783797264099, + "learning_rate": 2.9664333333333333e-05, + "loss": 0.1054, + "step": 11013 + }, + { + "epoch": 8.672705789680977, + "grad_norm": 0.686543345451355, + "learning_rate": 2.9664000000000003e-05, + "loss": 0.126, + "step": 11014 + }, + { + "epoch": 8.673493501378495, + "grad_norm": 0.4559345841407776, + "learning_rate": 2.966366666666667e-05, + "loss": 0.0747, + "step": 11015 + }, + { + "epoch": 8.674281213076014, + "grad_norm": 0.37128403782844543, + "learning_rate": 2.9663333333333334e-05, + "loss": 0.0407, + "step": 11016 + }, + { + "epoch": 8.675068924773534, + "grad_norm": 0.39450889825820923, + "learning_rate": 2.9663e-05, + "loss": 0.0266, + "step": 11017 + }, + { + "epoch": 8.675856636471051, + "grad_norm": 0.41569259762763977, + "learning_rate": 2.966266666666667e-05, + "loss": 0.0373, + "step": 11018 + }, + { + "epoch": 8.67664434816857, + "grad_norm": 0.5147281885147095, + "learning_rate": 2.9662333333333332e-05, + "loss": 0.0539, + "step": 11019 + }, + { + "epoch": 8.677432059866089, + "grad_norm": 0.3097142279148102, + "learning_rate": 2.9662e-05, + "loss": 0.0245, + "step": 11020 + }, + { + "epoch": 8.678219771563608, + "grad_norm": 0.24714972078800201, + "learning_rate": 2.9661666666666664e-05, + "loss": 0.0183, + "step": 11021 + }, + { + "epoch": 8.679007483261126, + "grad_norm": 0.25181686878204346, + "learning_rate": 2.9661333333333333e-05, + "loss": 0.0161, + "step": 11022 + }, + { + "epoch": 8.679795194958645, + "grad_norm": 0.3301362693309784, + "learning_rate": 2.9661000000000003e-05, + "loss": 0.0212, + "step": 11023 + }, + { + "epoch": 8.680582906656165, + "grad_norm": 1.2179269790649414, + "learning_rate": 2.9660666666666665e-05, + "loss": 0.037, + "step": 11024 + }, + { + "epoch": 8.681370618353682, 
+ "grad_norm": 0.20670321583747864, + "learning_rate": 2.9660333333333335e-05, + "loss": 0.0222, + "step": 11025 + }, + { + "epoch": 8.682158330051202, + "grad_norm": 0.3139294981956482, + "learning_rate": 2.966e-05, + "loss": 0.0244, + "step": 11026 + }, + { + "epoch": 8.68294604174872, + "grad_norm": 0.3574301302433014, + "learning_rate": 2.9659666666666667e-05, + "loss": 0.0121, + "step": 11027 + }, + { + "epoch": 8.683733753446239, + "grad_norm": 0.33746981620788574, + "learning_rate": 2.9659333333333332e-05, + "loss": 0.0203, + "step": 11028 + }, + { + "epoch": 8.684521465143757, + "grad_norm": 0.6787299513816833, + "learning_rate": 2.9659000000000002e-05, + "loss": 0.0216, + "step": 11029 + }, + { + "epoch": 8.685309176841276, + "grad_norm": 0.16425997018814087, + "learning_rate": 2.9658666666666668e-05, + "loss": 0.0101, + "step": 11030 + }, + { + "epoch": 8.686096888538795, + "grad_norm": 0.39093276858329773, + "learning_rate": 2.9658333333333334e-05, + "loss": 0.0118, + "step": 11031 + }, + { + "epoch": 8.686884600236313, + "grad_norm": 0.1758860945701599, + "learning_rate": 2.9658000000000003e-05, + "loss": 0.0104, + "step": 11032 + }, + { + "epoch": 8.687672311933833, + "grad_norm": 0.2854546904563904, + "learning_rate": 2.9657666666666666e-05, + "loss": 0.0099, + "step": 11033 + }, + { + "epoch": 8.68846002363135, + "grad_norm": 0.5903995037078857, + "learning_rate": 2.9657333333333335e-05, + "loss": 0.0102, + "step": 11034 + }, + { + "epoch": 8.68924773532887, + "grad_norm": 0.40349307656288147, + "learning_rate": 2.9657e-05, + "loss": 0.0279, + "step": 11035 + }, + { + "epoch": 8.69003544702639, + "grad_norm": 0.29045599699020386, + "learning_rate": 2.9656666666666667e-05, + "loss": 0.0131, + "step": 11036 + }, + { + "epoch": 8.690823158723907, + "grad_norm": 0.34862610697746277, + "learning_rate": 2.9656333333333333e-05, + "loss": 0.0111, + "step": 11037 + }, + { + "epoch": 8.691610870421426, + "grad_norm": 0.7488780617713928, + "learning_rate": 2.9656000000000002e-05, + "loss": 0.0256, + "step": 11038 + }, + { + "epoch": 8.692398582118944, + "grad_norm": 0.42176347970962524, + "learning_rate": 2.9655666666666668e-05, + "loss": 0.018, + "step": 11039 + }, + { + "epoch": 8.693186293816463, + "grad_norm": 0.16865885257720947, + "learning_rate": 2.9655333333333334e-05, + "loss": 0.0092, + "step": 11040 + }, + { + "epoch": 8.693974005513981, + "grad_norm": 0.42227497696876526, + "learning_rate": 2.9655000000000003e-05, + "loss": 0.0242, + "step": 11041 + }, + { + "epoch": 8.6947617172115, + "grad_norm": 0.2815741300582886, + "learning_rate": 2.9654666666666666e-05, + "loss": 0.0193, + "step": 11042 + }, + { + "epoch": 8.69554942890902, + "grad_norm": 0.30721428990364075, + "learning_rate": 2.9654333333333335e-05, + "loss": 0.0169, + "step": 11043 + }, + { + "epoch": 8.696337140606538, + "grad_norm": 1.1493804454803467, + "learning_rate": 2.9654e-05, + "loss": 0.0233, + "step": 11044 + }, + { + "epoch": 8.697124852304057, + "grad_norm": 0.4068741500377655, + "learning_rate": 2.9653666666666667e-05, + "loss": 0.0156, + "step": 11045 + }, + { + "epoch": 8.697912564001575, + "grad_norm": 0.7823633551597595, + "learning_rate": 2.9653333333333333e-05, + "loss": 0.0238, + "step": 11046 + }, + { + "epoch": 8.698700275699094, + "grad_norm": 0.5244719386100769, + "learning_rate": 2.9653000000000002e-05, + "loss": 0.0245, + "step": 11047 + }, + { + "epoch": 8.699487987396612, + "grad_norm": 0.48672425746917725, + "learning_rate": 2.9652666666666668e-05, + "loss": 0.0234, + "step": 11048 + 
}, + { + "epoch": 8.700275699094131, + "grad_norm": 0.7184216976165771, + "learning_rate": 2.9652333333333334e-05, + "loss": 0.0231, + "step": 11049 + }, + { + "epoch": 8.701063410791651, + "grad_norm": 0.5418495535850525, + "learning_rate": 2.9652e-05, + "loss": 0.0192, + "step": 11050 + }, + { + "epoch": 8.701851122489169, + "grad_norm": 0.9434211254119873, + "learning_rate": 2.9651666666666666e-05, + "loss": 0.0312, + "step": 11051 + }, + { + "epoch": 8.702638834186688, + "grad_norm": 0.7898208498954773, + "learning_rate": 2.9651333333333335e-05, + "loss": 0.0206, + "step": 11052 + }, + { + "epoch": 8.703426545884206, + "grad_norm": 0.4133720099925995, + "learning_rate": 2.9650999999999998e-05, + "loss": 0.015, + "step": 11053 + }, + { + "epoch": 8.704214257581725, + "grad_norm": 0.502413809299469, + "learning_rate": 2.9650666666666667e-05, + "loss": 0.0305, + "step": 11054 + }, + { + "epoch": 8.705001969279245, + "grad_norm": 1.4176465272903442, + "learning_rate": 2.9650333333333336e-05, + "loss": 0.0256, + "step": 11055 + }, + { + "epoch": 8.705789680976762, + "grad_norm": 0.2744583785533905, + "learning_rate": 2.965e-05, + "loss": 0.0131, + "step": 11056 + }, + { + "epoch": 8.706577392674282, + "grad_norm": 0.3048623204231262, + "learning_rate": 2.964966666666667e-05, + "loss": 0.022, + "step": 11057 + }, + { + "epoch": 8.7073651043718, + "grad_norm": 0.1865616887807846, + "learning_rate": 2.9649333333333334e-05, + "loss": 0.0111, + "step": 11058 + }, + { + "epoch": 8.708152816069319, + "grad_norm": 0.47987449169158936, + "learning_rate": 2.9649e-05, + "loss": 0.0301, + "step": 11059 + }, + { + "epoch": 8.708940527766837, + "grad_norm": 0.5396548509597778, + "learning_rate": 2.9648666666666666e-05, + "loss": 0.0319, + "step": 11060 + }, + { + "epoch": 8.709728239464356, + "grad_norm": 0.5714070200920105, + "learning_rate": 2.9648333333333335e-05, + "loss": 0.2374, + "step": 11061 + }, + { + "epoch": 8.710515951161875, + "grad_norm": 0.7201754450798035, + "learning_rate": 2.9647999999999998e-05, + "loss": 0.2057, + "step": 11062 + }, + { + "epoch": 8.711303662859393, + "grad_norm": 0.712701678276062, + "learning_rate": 2.9647666666666667e-05, + "loss": 0.1278, + "step": 11063 + }, + { + "epoch": 8.712091374556913, + "grad_norm": 0.555408775806427, + "learning_rate": 2.9647333333333337e-05, + "loss": 0.0963, + "step": 11064 + }, + { + "epoch": 8.71287908625443, + "grad_norm": 0.5153411626815796, + "learning_rate": 2.9647e-05, + "loss": 0.0974, + "step": 11065 + }, + { + "epoch": 8.71366679795195, + "grad_norm": 0.7403603196144104, + "learning_rate": 2.964666666666667e-05, + "loss": 0.0826, + "step": 11066 + }, + { + "epoch": 8.714454509649467, + "grad_norm": 0.4014451503753662, + "learning_rate": 2.9646333333333334e-05, + "loss": 0.05, + "step": 11067 + }, + { + "epoch": 8.715242221346987, + "grad_norm": 0.39654412865638733, + "learning_rate": 2.9646e-05, + "loss": 0.0316, + "step": 11068 + }, + { + "epoch": 8.716029933044506, + "grad_norm": 0.3006378710269928, + "learning_rate": 2.9645666666666666e-05, + "loss": 0.028, + "step": 11069 + }, + { + "epoch": 8.716817644742024, + "grad_norm": 0.15672644972801208, + "learning_rate": 2.9645333333333336e-05, + "loss": 0.0205, + "step": 11070 + }, + { + "epoch": 8.717605356439543, + "grad_norm": 0.40335020422935486, + "learning_rate": 2.9644999999999998e-05, + "loss": 0.0339, + "step": 11071 + }, + { + "epoch": 8.718393068137061, + "grad_norm": 0.2062625139951706, + "learning_rate": 2.9644666666666668e-05, + "loss": 0.0151, + "step": 11072 + 
}, + { + "epoch": 8.71918077983458, + "grad_norm": 0.24845820665359497, + "learning_rate": 2.9644333333333337e-05, + "loss": 0.0153, + "step": 11073 + }, + { + "epoch": 8.7199684915321, + "grad_norm": 0.2464263141155243, + "learning_rate": 2.9644e-05, + "loss": 0.0128, + "step": 11074 + }, + { + "epoch": 8.720756203229618, + "grad_norm": 0.2308516800403595, + "learning_rate": 2.964366666666667e-05, + "loss": 0.0268, + "step": 11075 + }, + { + "epoch": 8.721543914927137, + "grad_norm": 0.17758554220199585, + "learning_rate": 2.9643333333333335e-05, + "loss": 0.0108, + "step": 11076 + }, + { + "epoch": 8.722331626624655, + "grad_norm": 0.3577635586261749, + "learning_rate": 2.9643e-05, + "loss": 0.0267, + "step": 11077 + }, + { + "epoch": 8.723119338322174, + "grad_norm": 0.2409444898366928, + "learning_rate": 2.9642666666666667e-05, + "loss": 0.0098, + "step": 11078 + }, + { + "epoch": 8.723907050019692, + "grad_norm": 0.33393776416778564, + "learning_rate": 2.9642333333333336e-05, + "loss": 0.0251, + "step": 11079 + }, + { + "epoch": 8.724694761717211, + "grad_norm": 0.2090577930212021, + "learning_rate": 2.9642000000000002e-05, + "loss": 0.0147, + "step": 11080 + }, + { + "epoch": 8.725482473414731, + "grad_norm": 0.237937793135643, + "learning_rate": 2.9641666666666668e-05, + "loss": 0.0172, + "step": 11081 + }, + { + "epoch": 8.726270185112249, + "grad_norm": 0.25607773661613464, + "learning_rate": 2.9641333333333334e-05, + "loss": 0.017, + "step": 11082 + }, + { + "epoch": 8.727057896809768, + "grad_norm": 0.28254494071006775, + "learning_rate": 2.9641e-05, + "loss": 0.0125, + "step": 11083 + }, + { + "epoch": 8.727845608507286, + "grad_norm": 0.3296395242214203, + "learning_rate": 2.964066666666667e-05, + "loss": 0.0193, + "step": 11084 + }, + { + "epoch": 8.728633320204805, + "grad_norm": 0.20588341355323792, + "learning_rate": 2.964033333333333e-05, + "loss": 0.0139, + "step": 11085 + }, + { + "epoch": 8.729421031902323, + "grad_norm": 0.12713879346847534, + "learning_rate": 2.964e-05, + "loss": 0.0056, + "step": 11086 + }, + { + "epoch": 8.730208743599842, + "grad_norm": 0.4155350923538208, + "learning_rate": 2.9639666666666667e-05, + "loss": 0.0199, + "step": 11087 + }, + { + "epoch": 8.730996455297362, + "grad_norm": 0.3544866740703583, + "learning_rate": 2.9639333333333333e-05, + "loss": 0.0122, + "step": 11088 + }, + { + "epoch": 8.73178416699488, + "grad_norm": 0.2745567262172699, + "learning_rate": 2.9639000000000002e-05, + "loss": 0.015, + "step": 11089 + }, + { + "epoch": 8.732571878692399, + "grad_norm": 0.38283514976501465, + "learning_rate": 2.9638666666666668e-05, + "loss": 0.019, + "step": 11090 + }, + { + "epoch": 8.733359590389917, + "grad_norm": 0.3804533779621124, + "learning_rate": 2.9638333333333334e-05, + "loss": 0.02, + "step": 11091 + }, + { + "epoch": 8.734147302087436, + "grad_norm": 0.3819330632686615, + "learning_rate": 2.9638e-05, + "loss": 0.0151, + "step": 11092 + }, + { + "epoch": 8.734935013784956, + "grad_norm": 0.2961790859699249, + "learning_rate": 2.963766666666667e-05, + "loss": 0.0165, + "step": 11093 + }, + { + "epoch": 8.735722725482473, + "grad_norm": 0.26486119627952576, + "learning_rate": 2.963733333333333e-05, + "loss": 0.01, + "step": 11094 + }, + { + "epoch": 8.736510437179993, + "grad_norm": 0.35718804597854614, + "learning_rate": 2.9637e-05, + "loss": 0.027, + "step": 11095 + }, + { + "epoch": 8.73729814887751, + "grad_norm": 0.41138336062431335, + "learning_rate": 2.9636666666666667e-05, + "loss": 0.0197, + "step": 11096 + }, + { + 
"epoch": 8.73808586057503, + "grad_norm": 0.3840198516845703, + "learning_rate": 2.9636333333333333e-05, + "loss": 0.0244, + "step": 11097 + }, + { + "epoch": 8.738873572272547, + "grad_norm": 0.3401906192302704, + "learning_rate": 2.9636000000000002e-05, + "loss": 0.0169, + "step": 11098 + }, + { + "epoch": 8.739661283970067, + "grad_norm": 0.2483128160238266, + "learning_rate": 2.9635666666666668e-05, + "loss": 0.0173, + "step": 11099 + }, + { + "epoch": 8.740448995667586, + "grad_norm": 0.532294511795044, + "learning_rate": 2.9635333333333334e-05, + "loss": 0.0236, + "step": 11100 + }, + { + "epoch": 8.741236707365104, + "grad_norm": 0.4468952417373657, + "learning_rate": 2.9635e-05, + "loss": 0.024, + "step": 11101 + }, + { + "epoch": 8.742024419062624, + "grad_norm": 0.2689838111400604, + "learning_rate": 2.963466666666667e-05, + "loss": 0.0136, + "step": 11102 + }, + { + "epoch": 8.742812130760141, + "grad_norm": 0.3274592161178589, + "learning_rate": 2.9634333333333332e-05, + "loss": 0.0167, + "step": 11103 + }, + { + "epoch": 8.74359984245766, + "grad_norm": 0.3368755280971527, + "learning_rate": 2.9634e-05, + "loss": 0.0277, + "step": 11104 + }, + { + "epoch": 8.744387554155178, + "grad_norm": 0.4776070713996887, + "learning_rate": 2.963366666666667e-05, + "loss": 0.016, + "step": 11105 + }, + { + "epoch": 8.745175265852698, + "grad_norm": 0.20880042016506195, + "learning_rate": 2.9633333333333333e-05, + "loss": 0.0321, + "step": 11106 + }, + { + "epoch": 8.745962977550217, + "grad_norm": 0.4847213327884674, + "learning_rate": 2.9633000000000002e-05, + "loss": 0.0391, + "step": 11107 + }, + { + "epoch": 8.746750689247735, + "grad_norm": 0.3675191402435303, + "learning_rate": 2.963266666666667e-05, + "loss": 0.0183, + "step": 11108 + }, + { + "epoch": 8.747538400945254, + "grad_norm": 0.7927159070968628, + "learning_rate": 2.9632333333333334e-05, + "loss": 0.0451, + "step": 11109 + }, + { + "epoch": 8.748326112642772, + "grad_norm": 0.377970814704895, + "learning_rate": 2.9632e-05, + "loss": 0.019, + "step": 11110 + }, + { + "epoch": 8.749113824340292, + "grad_norm": 0.9682193994522095, + "learning_rate": 2.9631666666666666e-05, + "loss": 0.285, + "step": 11111 + }, + { + "epoch": 8.749901536037811, + "grad_norm": 0.6362496614456177, + "learning_rate": 2.9631333333333332e-05, + "loss": 0.1654, + "step": 11112 + }, + { + "epoch": 8.750689247735329, + "grad_norm": 0.4942246079444885, + "learning_rate": 2.9631e-05, + "loss": 0.1326, + "step": 11113 + }, + { + "epoch": 8.751476959432848, + "grad_norm": 0.7021231055259705, + "learning_rate": 2.9630666666666667e-05, + "loss": 0.1213, + "step": 11114 + }, + { + "epoch": 8.752264671130366, + "grad_norm": 0.5954614877700806, + "learning_rate": 2.9630333333333333e-05, + "loss": 0.1217, + "step": 11115 + }, + { + "epoch": 8.753052382827885, + "grad_norm": 0.3259143531322479, + "learning_rate": 2.9630000000000003e-05, + "loss": 0.0499, + "step": 11116 + }, + { + "epoch": 8.753840094525403, + "grad_norm": 1.0517828464508057, + "learning_rate": 2.9629666666666665e-05, + "loss": 0.0896, + "step": 11117 + }, + { + "epoch": 8.754627806222922, + "grad_norm": 0.5198004841804504, + "learning_rate": 2.9629333333333334e-05, + "loss": 0.0344, + "step": 11118 + }, + { + "epoch": 8.755415517920442, + "grad_norm": 0.3236861228942871, + "learning_rate": 2.9629e-05, + "loss": 0.0397, + "step": 11119 + }, + { + "epoch": 8.75620322961796, + "grad_norm": 0.2958207428455353, + "learning_rate": 2.9628666666666666e-05, + "loss": 0.024, + "step": 11120 + }, + { + 
"epoch": 8.756990941315479, + "grad_norm": 0.3582890033721924, + "learning_rate": 2.9628333333333332e-05, + "loss": 0.0117, + "step": 11121 + }, + { + "epoch": 8.757778653012997, + "grad_norm": 0.3045893907546997, + "learning_rate": 2.9628e-05, + "loss": 0.0137, + "step": 11122 + }, + { + "epoch": 8.758566364710516, + "grad_norm": 0.4289858639240265, + "learning_rate": 2.9627666666666668e-05, + "loss": 0.0637, + "step": 11123 + }, + { + "epoch": 8.759354076408034, + "grad_norm": 0.42881643772125244, + "learning_rate": 2.9627333333333333e-05, + "loss": 0.0133, + "step": 11124 + }, + { + "epoch": 8.760141788105553, + "grad_norm": 0.3439445495605469, + "learning_rate": 2.9627000000000003e-05, + "loss": 0.0182, + "step": 11125 + }, + { + "epoch": 8.760929499803073, + "grad_norm": 0.19802184402942657, + "learning_rate": 2.9626666666666665e-05, + "loss": 0.0116, + "step": 11126 + }, + { + "epoch": 8.76171721150059, + "grad_norm": 0.21476425230503082, + "learning_rate": 2.9626333333333335e-05, + "loss": 0.0118, + "step": 11127 + }, + { + "epoch": 8.76250492319811, + "grad_norm": 0.1939752697944641, + "learning_rate": 2.9626e-05, + "loss": 0.015, + "step": 11128 + }, + { + "epoch": 8.763292634895627, + "grad_norm": 0.6791943907737732, + "learning_rate": 2.9625666666666667e-05, + "loss": 0.0219, + "step": 11129 + }, + { + "epoch": 8.764080346593147, + "grad_norm": 0.26211851835250854, + "learning_rate": 2.9625333333333336e-05, + "loss": 0.0165, + "step": 11130 + }, + { + "epoch": 8.764868058290666, + "grad_norm": 0.1859394758939743, + "learning_rate": 2.9625000000000002e-05, + "loss": 0.0151, + "step": 11131 + }, + { + "epoch": 8.765655769988184, + "grad_norm": 0.5017255544662476, + "learning_rate": 2.9624666666666668e-05, + "loss": 0.0238, + "step": 11132 + }, + { + "epoch": 8.766443481685704, + "grad_norm": 0.23423072695732117, + "learning_rate": 2.9624333333333334e-05, + "loss": 0.0078, + "step": 11133 + }, + { + "epoch": 8.767231193383221, + "grad_norm": 0.6608293056488037, + "learning_rate": 2.9624000000000003e-05, + "loss": 0.0285, + "step": 11134 + }, + { + "epoch": 8.76801890508074, + "grad_norm": 0.6325976252555847, + "learning_rate": 2.9623666666666666e-05, + "loss": 0.0311, + "step": 11135 + }, + { + "epoch": 8.768806616778258, + "grad_norm": 0.25109434127807617, + "learning_rate": 2.9623333333333335e-05, + "loss": 0.017, + "step": 11136 + }, + { + "epoch": 8.769594328475778, + "grad_norm": 0.3264521658420563, + "learning_rate": 2.9623e-05, + "loss": 0.0139, + "step": 11137 + }, + { + "epoch": 8.770382040173297, + "grad_norm": 0.3172730803489685, + "learning_rate": 2.9622666666666667e-05, + "loss": 0.0207, + "step": 11138 + }, + { + "epoch": 8.771169751870815, + "grad_norm": 0.183767631649971, + "learning_rate": 2.9622333333333336e-05, + "loss": 0.0097, + "step": 11139 + }, + { + "epoch": 8.771957463568334, + "grad_norm": 0.25304803252220154, + "learning_rate": 2.9622000000000002e-05, + "loss": 0.0175, + "step": 11140 + }, + { + "epoch": 8.772745175265852, + "grad_norm": 0.38346412777900696, + "learning_rate": 2.9621666666666668e-05, + "loss": 0.0159, + "step": 11141 + }, + { + "epoch": 8.773532886963372, + "grad_norm": 0.2811683416366577, + "learning_rate": 2.9621333333333334e-05, + "loss": 0.0136, + "step": 11142 + }, + { + "epoch": 8.77432059866089, + "grad_norm": 0.30865392088890076, + "learning_rate": 2.9621e-05, + "loss": 0.015, + "step": 11143 + }, + { + "epoch": 8.775108310358409, + "grad_norm": 0.3476986885070801, + "learning_rate": 2.9620666666666666e-05, + "loss": 0.0146, + 
"step": 11144 + }, + { + "epoch": 8.775896022055928, + "grad_norm": 0.24573244154453278, + "learning_rate": 2.9620333333333335e-05, + "loss": 0.0138, + "step": 11145 + }, + { + "epoch": 8.776683733753446, + "grad_norm": 0.326140433549881, + "learning_rate": 2.9619999999999998e-05, + "loss": 0.0106, + "step": 11146 + }, + { + "epoch": 8.777471445450965, + "grad_norm": 0.196415513753891, + "learning_rate": 2.9619666666666667e-05, + "loss": 0.0154, + "step": 11147 + }, + { + "epoch": 8.778259157148483, + "grad_norm": 0.3064744472503662, + "learning_rate": 2.9619333333333336e-05, + "loss": 0.0135, + "step": 11148 + }, + { + "epoch": 8.779046868846002, + "grad_norm": 0.5655571818351746, + "learning_rate": 2.9619e-05, + "loss": 0.02, + "step": 11149 + }, + { + "epoch": 8.779834580543522, + "grad_norm": 0.27784857153892517, + "learning_rate": 2.9618666666666668e-05, + "loss": 0.0115, + "step": 11150 + }, + { + "epoch": 8.78062229224104, + "grad_norm": 0.23228348791599274, + "learning_rate": 2.9618333333333334e-05, + "loss": 0.0111, + "step": 11151 + }, + { + "epoch": 8.781410003938559, + "grad_norm": 0.5260331034660339, + "learning_rate": 2.9618e-05, + "loss": 0.0352, + "step": 11152 + }, + { + "epoch": 8.782197715636077, + "grad_norm": 0.225691020488739, + "learning_rate": 2.9617666666666666e-05, + "loss": 0.0138, + "step": 11153 + }, + { + "epoch": 8.782985427333596, + "grad_norm": 0.3228612244129181, + "learning_rate": 2.9617333333333335e-05, + "loss": 0.0163, + "step": 11154 + }, + { + "epoch": 8.783773139031114, + "grad_norm": 0.392828106880188, + "learning_rate": 2.9617e-05, + "loss": 0.0151, + "step": 11155 + }, + { + "epoch": 8.784560850728633, + "grad_norm": 0.8171967267990112, + "learning_rate": 2.9616666666666667e-05, + "loss": 0.0128, + "step": 11156 + }, + { + "epoch": 8.785348562426153, + "grad_norm": 0.44472745060920715, + "learning_rate": 2.9616333333333336e-05, + "loss": 0.0202, + "step": 11157 + }, + { + "epoch": 8.78613627412367, + "grad_norm": 0.5206484198570251, + "learning_rate": 2.9616e-05, + "loss": 0.0168, + "step": 11158 + }, + { + "epoch": 8.78692398582119, + "grad_norm": 0.6168463230133057, + "learning_rate": 2.961566666666667e-05, + "loss": 0.0195, + "step": 11159 + }, + { + "epoch": 8.787711697518708, + "grad_norm": 0.8025426864624023, + "learning_rate": 2.9615333333333334e-05, + "loss": 0.0343, + "step": 11160 + }, + { + "epoch": 8.788499409216227, + "grad_norm": 0.665148138999939, + "learning_rate": 2.9615e-05, + "loss": 0.2231, + "step": 11161 + }, + { + "epoch": 8.789287120913745, + "grad_norm": 0.7691815495491028, + "learning_rate": 2.9614666666666666e-05, + "loss": 0.1792, + "step": 11162 + }, + { + "epoch": 8.790074832611264, + "grad_norm": 0.5709341168403625, + "learning_rate": 2.9614333333333335e-05, + "loss": 0.1488, + "step": 11163 + }, + { + "epoch": 8.790862544308784, + "grad_norm": 0.6092702746391296, + "learning_rate": 2.9614e-05, + "loss": 0.1075, + "step": 11164 + }, + { + "epoch": 8.791650256006301, + "grad_norm": 0.6922469735145569, + "learning_rate": 2.9613666666666667e-05, + "loss": 0.0756, + "step": 11165 + }, + { + "epoch": 8.79243796770382, + "grad_norm": 0.3088442087173462, + "learning_rate": 2.9613333333333337e-05, + "loss": 0.0584, + "step": 11166 + }, + { + "epoch": 8.793225679401338, + "grad_norm": 0.2438308447599411, + "learning_rate": 2.9613e-05, + "loss": 0.0292, + "step": 11167 + }, + { + "epoch": 8.794013391098858, + "grad_norm": 0.3187370002269745, + "learning_rate": 2.961266666666667e-05, + "loss": 0.0308, + "step": 11168 + }, + { 
+ "epoch": 8.794801102796377, + "grad_norm": 0.40842872858047485, + "learning_rate": 2.9612333333333334e-05, + "loss": 0.0338, + "step": 11169 + }, + { + "epoch": 8.795588814493895, + "grad_norm": 0.26695016026496887, + "learning_rate": 2.9612e-05, + "loss": 0.0212, + "step": 11170 + }, + { + "epoch": 8.796376526191414, + "grad_norm": 0.15900492668151855, + "learning_rate": 2.9611666666666666e-05, + "loss": 0.0095, + "step": 11171 + }, + { + "epoch": 8.797164237888932, + "grad_norm": 0.1876889318227768, + "learning_rate": 2.9611333333333332e-05, + "loss": 0.0143, + "step": 11172 + }, + { + "epoch": 8.797951949586452, + "grad_norm": 0.16723962128162384, + "learning_rate": 2.9611e-05, + "loss": 0.0115, + "step": 11173 + }, + { + "epoch": 8.798739661283971, + "grad_norm": 0.1846005916595459, + "learning_rate": 2.9610666666666668e-05, + "loss": 0.0162, + "step": 11174 + }, + { + "epoch": 8.799527372981489, + "grad_norm": 0.22838279604911804, + "learning_rate": 2.9610333333333333e-05, + "loss": 0.0118, + "step": 11175 + }, + { + "epoch": 8.800315084679008, + "grad_norm": 0.1626073271036148, + "learning_rate": 2.961e-05, + "loss": 0.01, + "step": 11176 + }, + { + "epoch": 8.801102796376526, + "grad_norm": 0.6050353646278381, + "learning_rate": 2.960966666666667e-05, + "loss": 0.0177, + "step": 11177 + }, + { + "epoch": 8.801890508074045, + "grad_norm": 0.263462632894516, + "learning_rate": 2.960933333333333e-05, + "loss": 0.0249, + "step": 11178 + }, + { + "epoch": 8.802678219771563, + "grad_norm": 0.24374346435070038, + "learning_rate": 2.9609e-05, + "loss": 0.0174, + "step": 11179 + }, + { + "epoch": 8.803465931469082, + "grad_norm": 0.3237876892089844, + "learning_rate": 2.960866666666667e-05, + "loss": 0.0121, + "step": 11180 + }, + { + "epoch": 8.8042536431666, + "grad_norm": 0.18627069890499115, + "learning_rate": 2.9608333333333332e-05, + "loss": 0.0059, + "step": 11181 + }, + { + "epoch": 8.80504135486412, + "grad_norm": 0.2844371497631073, + "learning_rate": 2.9608000000000002e-05, + "loss": 0.0132, + "step": 11182 + }, + { + "epoch": 8.805829066561639, + "grad_norm": 0.3255869448184967, + "learning_rate": 2.9607666666666668e-05, + "loss": 0.0176, + "step": 11183 + }, + { + "epoch": 8.806616778259157, + "grad_norm": 0.25634095072746277, + "learning_rate": 2.9607333333333334e-05, + "loss": 0.0217, + "step": 11184 + }, + { + "epoch": 8.807404489956676, + "grad_norm": 0.332544207572937, + "learning_rate": 2.9607e-05, + "loss": 0.0132, + "step": 11185 + }, + { + "epoch": 8.808192201654194, + "grad_norm": 0.19937558472156525, + "learning_rate": 2.960666666666667e-05, + "loss": 0.0097, + "step": 11186 + }, + { + "epoch": 8.808979913351713, + "grad_norm": 0.25519946217536926, + "learning_rate": 2.960633333333333e-05, + "loss": 0.0224, + "step": 11187 + }, + { + "epoch": 8.809767625049233, + "grad_norm": 0.3693377375602722, + "learning_rate": 2.9606e-05, + "loss": 0.0291, + "step": 11188 + }, + { + "epoch": 8.81055533674675, + "grad_norm": 0.8236566781997681, + "learning_rate": 2.960566666666667e-05, + "loss": 0.012, + "step": 11189 + }, + { + "epoch": 8.81134304844427, + "grad_norm": 0.355374276638031, + "learning_rate": 2.9605333333333333e-05, + "loss": 0.0121, + "step": 11190 + }, + { + "epoch": 8.812130760141788, + "grad_norm": 0.36468505859375, + "learning_rate": 2.9605000000000002e-05, + "loss": 0.0167, + "step": 11191 + }, + { + "epoch": 8.812918471839307, + "grad_norm": 0.5231823921203613, + "learning_rate": 2.9604666666666668e-05, + "loss": 0.0217, + "step": 11192 + }, + { + "epoch": 
8.813706183536826, + "grad_norm": 0.21096043288707733, + "learning_rate": 2.9604333333333334e-05, + "loss": 0.0152, + "step": 11193 + }, + { + "epoch": 8.814493895234344, + "grad_norm": 0.4179982841014862, + "learning_rate": 2.9604e-05, + "loss": 0.0524, + "step": 11194 + }, + { + "epoch": 8.815281606931864, + "grad_norm": 0.526261568069458, + "learning_rate": 2.960366666666667e-05, + "loss": 0.0249, + "step": 11195 + }, + { + "epoch": 8.816069318629381, + "grad_norm": 0.26172107458114624, + "learning_rate": 2.960333333333333e-05, + "loss": 0.0185, + "step": 11196 + }, + { + "epoch": 8.8168570303269, + "grad_norm": 0.5130993723869324, + "learning_rate": 2.9603e-05, + "loss": 0.029, + "step": 11197 + }, + { + "epoch": 8.817644742024418, + "grad_norm": 0.20350848138332367, + "learning_rate": 2.960266666666667e-05, + "loss": 0.0101, + "step": 11198 + }, + { + "epoch": 8.818432453721938, + "grad_norm": 0.26220816373825073, + "learning_rate": 2.9602333333333333e-05, + "loss": 0.0116, + "step": 11199 + }, + { + "epoch": 8.819220165419457, + "grad_norm": 0.5971957445144653, + "learning_rate": 2.9602000000000002e-05, + "loss": 0.0333, + "step": 11200 + }, + { + "epoch": 8.820007877116975, + "grad_norm": 0.2345408797264099, + "learning_rate": 2.9601666666666665e-05, + "loss": 0.0134, + "step": 11201 + }, + { + "epoch": 8.820795588814494, + "grad_norm": 0.49811360239982605, + "learning_rate": 2.9601333333333334e-05, + "loss": 0.0205, + "step": 11202 + }, + { + "epoch": 8.821583300512012, + "grad_norm": 0.1713055819272995, + "learning_rate": 2.9601e-05, + "loss": 0.0159, + "step": 11203 + }, + { + "epoch": 8.822371012209532, + "grad_norm": 0.18047010898590088, + "learning_rate": 2.9600666666666666e-05, + "loss": 0.0068, + "step": 11204 + }, + { + "epoch": 8.82315872390705, + "grad_norm": 0.477077841758728, + "learning_rate": 2.9600333333333335e-05, + "loss": 0.0208, + "step": 11205 + }, + { + "epoch": 8.823946435604569, + "grad_norm": 0.30791565775871277, + "learning_rate": 2.96e-05, + "loss": 0.0114, + "step": 11206 + }, + { + "epoch": 8.824734147302088, + "grad_norm": 0.3660251498222351, + "learning_rate": 2.9599666666666667e-05, + "loss": 0.0171, + "step": 11207 + }, + { + "epoch": 8.825521858999606, + "grad_norm": 0.47423189878463745, + "learning_rate": 2.9599333333333333e-05, + "loss": 0.0143, + "step": 11208 + }, + { + "epoch": 8.826309570697125, + "grad_norm": 0.5736004114151001, + "learning_rate": 2.9599000000000002e-05, + "loss": 0.0314, + "step": 11209 + }, + { + "epoch": 8.827097282394643, + "grad_norm": 0.39753472805023193, + "learning_rate": 2.9598666666666665e-05, + "loss": 0.0225, + "step": 11210 + }, + { + "epoch": 8.827884994092162, + "grad_norm": 0.8816331624984741, + "learning_rate": 2.9598333333333334e-05, + "loss": 0.2652, + "step": 11211 + }, + { + "epoch": 8.828672705789682, + "grad_norm": 0.6904330253601074, + "learning_rate": 2.9598e-05, + "loss": 0.1999, + "step": 11212 + }, + { + "epoch": 8.8294604174872, + "grad_norm": 0.6175860166549683, + "learning_rate": 2.9597666666666666e-05, + "loss": 0.1504, + "step": 11213 + }, + { + "epoch": 8.830248129184719, + "grad_norm": 1.1590875387191772, + "learning_rate": 2.9597333333333335e-05, + "loss": 0.1626, + "step": 11214 + }, + { + "epoch": 8.831035840882237, + "grad_norm": 0.4644884765148163, + "learning_rate": 2.9597e-05, + "loss": 0.0911, + "step": 11215 + }, + { + "epoch": 8.831823552579756, + "grad_norm": 0.31432604789733887, + "learning_rate": 2.9596666666666667e-05, + "loss": 0.0368, + "step": 11216 + }, + { + "epoch": 
8.832611264277274, + "grad_norm": 0.38912516832351685, + "learning_rate": 2.9596333333333333e-05, + "loss": 0.0298, + "step": 11217 + }, + { + "epoch": 8.833398975974793, + "grad_norm": 0.6062111258506775, + "learning_rate": 2.9596000000000003e-05, + "loss": 0.0296, + "step": 11218 + }, + { + "epoch": 8.834186687672313, + "grad_norm": 0.3407188951969147, + "learning_rate": 2.9595666666666665e-05, + "loss": 0.0104, + "step": 11219 + }, + { + "epoch": 8.83497439936983, + "grad_norm": 0.26200196146965027, + "learning_rate": 2.9595333333333334e-05, + "loss": 0.02, + "step": 11220 + }, + { + "epoch": 8.83576211106735, + "grad_norm": 0.6634745001792908, + "learning_rate": 2.9595e-05, + "loss": 0.0228, + "step": 11221 + }, + { + "epoch": 8.836549822764868, + "grad_norm": 0.4012109041213989, + "learning_rate": 2.9594666666666666e-05, + "loss": 0.0453, + "step": 11222 + }, + { + "epoch": 8.837337534462387, + "grad_norm": 0.28217387199401855, + "learning_rate": 2.9594333333333336e-05, + "loss": 0.0172, + "step": 11223 + }, + { + "epoch": 8.838125246159905, + "grad_norm": 0.29438209533691406, + "learning_rate": 2.9594e-05, + "loss": 0.0202, + "step": 11224 + }, + { + "epoch": 8.838912957857424, + "grad_norm": 0.21653103828430176, + "learning_rate": 2.9593666666666668e-05, + "loss": 0.0159, + "step": 11225 + }, + { + "epoch": 8.839700669554944, + "grad_norm": 0.8249861001968384, + "learning_rate": 2.9593333333333333e-05, + "loss": 0.0335, + "step": 11226 + }, + { + "epoch": 8.840488381252461, + "grad_norm": 0.2858060598373413, + "learning_rate": 2.9593000000000003e-05, + "loss": 0.0207, + "step": 11227 + }, + { + "epoch": 8.84127609294998, + "grad_norm": 0.32140660285949707, + "learning_rate": 2.9592666666666665e-05, + "loss": 0.0199, + "step": 11228 + }, + { + "epoch": 8.842063804647498, + "grad_norm": 0.28616225719451904, + "learning_rate": 2.9592333333333335e-05, + "loss": 0.0102, + "step": 11229 + }, + { + "epoch": 8.842851516345018, + "grad_norm": 0.21439404785633087, + "learning_rate": 2.9592000000000004e-05, + "loss": 0.0121, + "step": 11230 + }, + { + "epoch": 8.843639228042537, + "grad_norm": 0.22160986065864563, + "learning_rate": 2.9591666666666667e-05, + "loss": 0.0158, + "step": 11231 + }, + { + "epoch": 8.844426939740055, + "grad_norm": 0.46804550290107727, + "learning_rate": 2.9591333333333336e-05, + "loss": 0.0169, + "step": 11232 + }, + { + "epoch": 8.845214651437574, + "grad_norm": 0.382292777299881, + "learning_rate": 2.9591e-05, + "loss": 0.0134, + "step": 11233 + }, + { + "epoch": 8.846002363135092, + "grad_norm": 0.41105586290359497, + "learning_rate": 2.9590666666666668e-05, + "loss": 0.0203, + "step": 11234 + }, + { + "epoch": 8.846790074832612, + "grad_norm": 0.21173015236854553, + "learning_rate": 2.9590333333333334e-05, + "loss": 0.017, + "step": 11235 + }, + { + "epoch": 8.84757778653013, + "grad_norm": 0.3714445233345032, + "learning_rate": 2.959e-05, + "loss": 0.0164, + "step": 11236 + }, + { + "epoch": 8.848365498227649, + "grad_norm": 0.43005651235580444, + "learning_rate": 2.9589666666666666e-05, + "loss": 0.0215, + "step": 11237 + }, + { + "epoch": 8.849153209925168, + "grad_norm": 0.26822561025619507, + "learning_rate": 2.9589333333333335e-05, + "loss": 0.017, + "step": 11238 + }, + { + "epoch": 8.849940921622686, + "grad_norm": 0.2740590572357178, + "learning_rate": 2.9589e-05, + "loss": 0.0202, + "step": 11239 + }, + { + "epoch": 8.850728633320205, + "grad_norm": 0.42884427309036255, + "learning_rate": 2.9588666666666667e-05, + "loss": 0.0226, + "step": 11240 + }, 
+ { + "epoch": 8.851516345017723, + "grad_norm": 0.26930299401283264, + "learning_rate": 2.9588333333333336e-05, + "loss": 0.0144, + "step": 11241 + }, + { + "epoch": 8.852304056715242, + "grad_norm": 0.22699248790740967, + "learning_rate": 2.9588e-05, + "loss": 0.0139, + "step": 11242 + }, + { + "epoch": 8.85309176841276, + "grad_norm": 0.37579602003097534, + "learning_rate": 2.9587666666666668e-05, + "loss": 0.0217, + "step": 11243 + }, + { + "epoch": 8.85387948011028, + "grad_norm": 0.414106160402298, + "learning_rate": 2.9587333333333334e-05, + "loss": 0.0177, + "step": 11244 + }, + { + "epoch": 8.854667191807799, + "grad_norm": 0.09348130226135254, + "learning_rate": 2.9587e-05, + "loss": 0.0041, + "step": 11245 + }, + { + "epoch": 8.855454903505317, + "grad_norm": 0.22017058730125427, + "learning_rate": 2.9586666666666666e-05, + "loss": 0.0069, + "step": 11246 + }, + { + "epoch": 8.856242615202836, + "grad_norm": 0.27899348735809326, + "learning_rate": 2.9586333333333335e-05, + "loss": 0.0162, + "step": 11247 + }, + { + "epoch": 8.857030326900354, + "grad_norm": 0.5117331147193909, + "learning_rate": 2.9586e-05, + "loss": 0.0255, + "step": 11248 + }, + { + "epoch": 8.857818038597873, + "grad_norm": 0.2822231650352478, + "learning_rate": 2.9585666666666667e-05, + "loss": 0.0173, + "step": 11249 + }, + { + "epoch": 8.858605750295393, + "grad_norm": 0.32099422812461853, + "learning_rate": 2.9585333333333336e-05, + "loss": 0.0168, + "step": 11250 + }, + { + "epoch": 8.85939346199291, + "grad_norm": 0.611150324344635, + "learning_rate": 2.9585e-05, + "loss": 0.0191, + "step": 11251 + }, + { + "epoch": 8.86018117369043, + "grad_norm": 0.28143876791000366, + "learning_rate": 2.9584666666666668e-05, + "loss": 0.014, + "step": 11252 + }, + { + "epoch": 8.860968885387948, + "grad_norm": 0.37670835852622986, + "learning_rate": 2.9584333333333334e-05, + "loss": 0.0184, + "step": 11253 + }, + { + "epoch": 8.861756597085467, + "grad_norm": 0.29147446155548096, + "learning_rate": 2.9584e-05, + "loss": 0.0145, + "step": 11254 + }, + { + "epoch": 8.862544308782985, + "grad_norm": 0.22162841260433197, + "learning_rate": 2.958366666666667e-05, + "loss": 0.0158, + "step": 11255 + }, + { + "epoch": 8.863332020480504, + "grad_norm": 0.297034353017807, + "learning_rate": 2.9583333333333335e-05, + "loss": 0.0187, + "step": 11256 + }, + { + "epoch": 8.864119732178024, + "grad_norm": 0.6178835034370422, + "learning_rate": 2.9583e-05, + "loss": 0.019, + "step": 11257 + }, + { + "epoch": 8.864907443875541, + "grad_norm": 0.7281622886657715, + "learning_rate": 2.9582666666666667e-05, + "loss": 0.0174, + "step": 11258 + }, + { + "epoch": 8.86569515557306, + "grad_norm": 1.1171796321868896, + "learning_rate": 2.9582333333333336e-05, + "loss": 0.0374, + "step": 11259 + }, + { + "epoch": 8.866482867270578, + "grad_norm": 0.30954402685165405, + "learning_rate": 2.9582e-05, + "loss": 0.0157, + "step": 11260 + }, + { + "epoch": 8.867270578968098, + "grad_norm": 0.75484699010849, + "learning_rate": 2.958166666666667e-05, + "loss": 0.224, + "step": 11261 + }, + { + "epoch": 8.868058290665616, + "grad_norm": 0.5240325927734375, + "learning_rate": 2.958133333333333e-05, + "loss": 0.1171, + "step": 11262 + }, + { + "epoch": 8.868846002363135, + "grad_norm": 0.615807056427002, + "learning_rate": 2.9581e-05, + "loss": 0.2089, + "step": 11263 + }, + { + "epoch": 8.869633714060654, + "grad_norm": 0.5727880001068115, + "learning_rate": 2.958066666666667e-05, + "loss": 0.0922, + "step": 11264 + }, + { + "epoch": 
8.870421425758172, + "grad_norm": 0.5223478078842163, + "learning_rate": 2.9580333333333332e-05, + "loss": 0.0638, + "step": 11265 + }, + { + "epoch": 8.871209137455692, + "grad_norm": 0.45634451508522034, + "learning_rate": 2.958e-05, + "loss": 0.041, + "step": 11266 + }, + { + "epoch": 8.87199684915321, + "grad_norm": 0.14993451535701752, + "learning_rate": 2.9579666666666667e-05, + "loss": 0.0146, + "step": 11267 + }, + { + "epoch": 8.872784560850729, + "grad_norm": 0.383778840303421, + "learning_rate": 2.9579333333333333e-05, + "loss": 0.0349, + "step": 11268 + }, + { + "epoch": 8.873572272548248, + "grad_norm": 0.3810864984989166, + "learning_rate": 2.9579e-05, + "loss": 0.0382, + "step": 11269 + }, + { + "epoch": 8.874359984245766, + "grad_norm": 0.2337159812450409, + "learning_rate": 2.957866666666667e-05, + "loss": 0.0167, + "step": 11270 + }, + { + "epoch": 8.875147695943285, + "grad_norm": 0.20504429936408997, + "learning_rate": 2.9578333333333334e-05, + "loss": 0.0175, + "step": 11271 + }, + { + "epoch": 8.875935407640803, + "grad_norm": 0.2904835343360901, + "learning_rate": 2.9578e-05, + "loss": 0.0188, + "step": 11272 + }, + { + "epoch": 8.876723119338322, + "grad_norm": 0.26143917441368103, + "learning_rate": 2.957766666666667e-05, + "loss": 0.0173, + "step": 11273 + }, + { + "epoch": 8.87751083103584, + "grad_norm": 0.17596310377120972, + "learning_rate": 2.9577333333333332e-05, + "loss": 0.0115, + "step": 11274 + }, + { + "epoch": 8.87829854273336, + "grad_norm": 0.5862122178077698, + "learning_rate": 2.9577e-05, + "loss": 0.0168, + "step": 11275 + }, + { + "epoch": 8.879086254430879, + "grad_norm": 1.0136315822601318, + "learning_rate": 2.9576666666666668e-05, + "loss": 0.0223, + "step": 11276 + }, + { + "epoch": 8.879873966128397, + "grad_norm": 0.23699809610843658, + "learning_rate": 2.9576333333333333e-05, + "loss": 0.0141, + "step": 11277 + }, + { + "epoch": 8.880661677825916, + "grad_norm": 0.2287236452102661, + "learning_rate": 2.9576e-05, + "loss": 0.0105, + "step": 11278 + }, + { + "epoch": 8.881449389523434, + "grad_norm": 0.1826314926147461, + "learning_rate": 2.957566666666667e-05, + "loss": 0.0119, + "step": 11279 + }, + { + "epoch": 8.882237101220953, + "grad_norm": 0.2777906358242035, + "learning_rate": 2.9575333333333335e-05, + "loss": 0.0111, + "step": 11280 + }, + { + "epoch": 8.883024812918471, + "grad_norm": 0.2522366940975189, + "learning_rate": 2.9575e-05, + "loss": 0.0145, + "step": 11281 + }, + { + "epoch": 8.88381252461599, + "grad_norm": 0.9028351902961731, + "learning_rate": 2.957466666666667e-05, + "loss": 0.0346, + "step": 11282 + }, + { + "epoch": 8.88460023631351, + "grad_norm": 0.2004554271697998, + "learning_rate": 2.9574333333333332e-05, + "loss": 0.0163, + "step": 11283 + }, + { + "epoch": 8.885387948011028, + "grad_norm": 0.579574704170227, + "learning_rate": 2.9574000000000002e-05, + "loss": 0.0273, + "step": 11284 + }, + { + "epoch": 8.886175659708547, + "grad_norm": 0.32071661949157715, + "learning_rate": 2.9573666666666668e-05, + "loss": 0.0112, + "step": 11285 + }, + { + "epoch": 8.886963371406065, + "grad_norm": 0.40938249230384827, + "learning_rate": 2.9573333333333334e-05, + "loss": 0.0134, + "step": 11286 + }, + { + "epoch": 8.887751083103584, + "grad_norm": 1.1165554523468018, + "learning_rate": 2.9573e-05, + "loss": 0.035, + "step": 11287 + }, + { + "epoch": 8.888538794801104, + "grad_norm": 0.4485686123371124, + "learning_rate": 2.957266666666667e-05, + "loss": 0.0203, + "step": 11288 + }, + { + "epoch": 8.889326506498621, + 
"grad_norm": 0.3634614944458008, + "learning_rate": 2.9572333333333335e-05, + "loss": 0.0149, + "step": 11289 + }, + { + "epoch": 8.89011421819614, + "grad_norm": 0.37784308195114136, + "learning_rate": 2.9572e-05, + "loss": 0.02, + "step": 11290 + }, + { + "epoch": 8.890901929893658, + "grad_norm": 0.18675561249256134, + "learning_rate": 2.957166666666667e-05, + "loss": 0.0094, + "step": 11291 + }, + { + "epoch": 8.891689641591178, + "grad_norm": 0.1946256011724472, + "learning_rate": 2.9571333333333333e-05, + "loss": 0.0179, + "step": 11292 + }, + { + "epoch": 8.892477353288696, + "grad_norm": 0.2798627018928528, + "learning_rate": 2.9571000000000002e-05, + "loss": 0.0209, + "step": 11293 + }, + { + "epoch": 8.893265064986215, + "grad_norm": 0.3981491029262543, + "learning_rate": 2.9570666666666665e-05, + "loss": 0.0226, + "step": 11294 + }, + { + "epoch": 8.894052776683735, + "grad_norm": 0.3430628776550293, + "learning_rate": 2.9570333333333334e-05, + "loss": 0.0218, + "step": 11295 + }, + { + "epoch": 8.894840488381252, + "grad_norm": 0.4380897581577301, + "learning_rate": 2.957e-05, + "loss": 0.0247, + "step": 11296 + }, + { + "epoch": 8.895628200078772, + "grad_norm": 0.2487953156232834, + "learning_rate": 2.9569666666666666e-05, + "loss": 0.0141, + "step": 11297 + }, + { + "epoch": 8.89641591177629, + "grad_norm": 0.5445981621742249, + "learning_rate": 2.9569333333333335e-05, + "loss": 0.0159, + "step": 11298 + }, + { + "epoch": 8.897203623473809, + "grad_norm": 0.19984805583953857, + "learning_rate": 2.9569e-05, + "loss": 0.0114, + "step": 11299 + }, + { + "epoch": 8.897991335171326, + "grad_norm": 0.4904729425907135, + "learning_rate": 2.9568666666666667e-05, + "loss": 0.0182, + "step": 11300 + }, + { + "epoch": 8.898779046868846, + "grad_norm": 0.22345301508903503, + "learning_rate": 2.9568333333333333e-05, + "loss": 0.0132, + "step": 11301 + }, + { + "epoch": 8.899566758566365, + "grad_norm": 0.17655684053897858, + "learning_rate": 2.9568000000000002e-05, + "loss": 0.0119, + "step": 11302 + }, + { + "epoch": 8.900354470263883, + "grad_norm": 1.1008186340332031, + "learning_rate": 2.9567666666666665e-05, + "loss": 0.027, + "step": 11303 + }, + { + "epoch": 8.901142181961402, + "grad_norm": 0.21911725401878357, + "learning_rate": 2.9567333333333334e-05, + "loss": 0.0106, + "step": 11304 + }, + { + "epoch": 8.90192989365892, + "grad_norm": 0.5173270106315613, + "learning_rate": 2.9567000000000003e-05, + "loss": 0.0166, + "step": 11305 + }, + { + "epoch": 8.90271760535644, + "grad_norm": 0.1657489538192749, + "learning_rate": 2.9566666666666666e-05, + "loss": 0.0106, + "step": 11306 + }, + { + "epoch": 8.903505317053959, + "grad_norm": 0.2579795718193054, + "learning_rate": 2.9566333333333335e-05, + "loss": 0.0079, + "step": 11307 + }, + { + "epoch": 8.904293028751477, + "grad_norm": 0.35389143228530884, + "learning_rate": 2.9566e-05, + "loss": 0.0209, + "step": 11308 + }, + { + "epoch": 8.905080740448996, + "grad_norm": 0.4923263192176819, + "learning_rate": 2.9565666666666667e-05, + "loss": 0.0269, + "step": 11309 + }, + { + "epoch": 8.905868452146514, + "grad_norm": 0.48955902457237244, + "learning_rate": 2.9565333333333333e-05, + "loss": 0.019, + "step": 11310 + }, + { + "epoch": 8.906656163844033, + "grad_norm": 0.7560325264930725, + "learning_rate": 2.9565000000000002e-05, + "loss": 0.2428, + "step": 11311 + }, + { + "epoch": 8.907443875541551, + "grad_norm": 0.8799266815185547, + "learning_rate": 2.9564666666666665e-05, + "loss": 0.2229, + "step": 11312 + }, + { + "epoch": 
8.90823158723907, + "grad_norm": 0.5772503018379211, + "learning_rate": 2.9564333333333334e-05, + "loss": 0.1044, + "step": 11313 + }, + { + "epoch": 8.90901929893659, + "grad_norm": 0.7180315256118774, + "learning_rate": 2.9564000000000004e-05, + "loss": 0.1245, + "step": 11314 + }, + { + "epoch": 8.909807010634108, + "grad_norm": 1.6266435384750366, + "learning_rate": 2.9563666666666666e-05, + "loss": 0.0879, + "step": 11315 + }, + { + "epoch": 8.910594722331627, + "grad_norm": 0.8039340972900391, + "learning_rate": 2.9563333333333335e-05, + "loss": 0.0583, + "step": 11316 + }, + { + "epoch": 8.911382434029145, + "grad_norm": 0.28180935978889465, + "learning_rate": 2.9563e-05, + "loss": 0.0222, + "step": 11317 + }, + { + "epoch": 8.912170145726664, + "grad_norm": 0.2909870147705078, + "learning_rate": 2.9562666666666667e-05, + "loss": 0.0262, + "step": 11318 + }, + { + "epoch": 8.912957857424182, + "grad_norm": 0.31443288922309875, + "learning_rate": 2.9562333333333333e-05, + "loss": 0.0172, + "step": 11319 + }, + { + "epoch": 8.913745569121701, + "grad_norm": 0.3457794189453125, + "learning_rate": 2.9562000000000003e-05, + "loss": 0.0272, + "step": 11320 + }, + { + "epoch": 8.91453328081922, + "grad_norm": 0.7534240484237671, + "learning_rate": 2.956166666666667e-05, + "loss": 0.021, + "step": 11321 + }, + { + "epoch": 8.915320992516738, + "grad_norm": 0.2491845339536667, + "learning_rate": 2.9561333333333334e-05, + "loss": 0.0177, + "step": 11322 + }, + { + "epoch": 8.916108704214258, + "grad_norm": 0.2501870095729828, + "learning_rate": 2.9561e-05, + "loss": 0.0161, + "step": 11323 + }, + { + "epoch": 8.916896415911776, + "grad_norm": 0.6266215443611145, + "learning_rate": 2.9560666666666666e-05, + "loss": 0.0119, + "step": 11324 + }, + { + "epoch": 8.917684127609295, + "grad_norm": 0.29793184995651245, + "learning_rate": 2.9560333333333336e-05, + "loss": 0.0162, + "step": 11325 + }, + { + "epoch": 8.918471839306815, + "grad_norm": 0.12857410311698914, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.0111, + "step": 11326 + }, + { + "epoch": 8.919259551004332, + "grad_norm": 0.28370973467826843, + "learning_rate": 2.9559666666666668e-05, + "loss": 0.019, + "step": 11327 + }, + { + "epoch": 8.920047262701852, + "grad_norm": 0.2858757972717285, + "learning_rate": 2.9559333333333333e-05, + "loss": 0.0145, + "step": 11328 + }, + { + "epoch": 8.92083497439937, + "grad_norm": 0.549110472202301, + "learning_rate": 2.9559e-05, + "loss": 0.0132, + "step": 11329 + }, + { + "epoch": 8.921622686096889, + "grad_norm": 0.27790629863739014, + "learning_rate": 2.955866666666667e-05, + "loss": 0.0161, + "step": 11330 + }, + { + "epoch": 8.922410397794406, + "grad_norm": 0.2368282973766327, + "learning_rate": 2.9558333333333335e-05, + "loss": 0.0171, + "step": 11331 + }, + { + "epoch": 8.923198109491926, + "grad_norm": 0.3892620801925659, + "learning_rate": 2.9558e-05, + "loss": 0.0126, + "step": 11332 + }, + { + "epoch": 8.923985821189445, + "grad_norm": 0.7702925205230713, + "learning_rate": 2.9557666666666667e-05, + "loss": 0.0634, + "step": 11333 + }, + { + "epoch": 8.924773532886963, + "grad_norm": 0.2010396271944046, + "learning_rate": 2.9557333333333336e-05, + "loss": 0.0137, + "step": 11334 + }, + { + "epoch": 8.925561244584483, + "grad_norm": 0.19280149042606354, + "learning_rate": 2.9557e-05, + "loss": 0.009, + "step": 11335 + }, + { + "epoch": 8.926348956282, + "grad_norm": 0.2855478823184967, + "learning_rate": 2.9556666666666668e-05, + "loss": 0.0125, + "step": 11336 + }, + { + 
"epoch": 8.92713666797952, + "grad_norm": 0.24529516696929932, + "learning_rate": 2.9556333333333334e-05, + "loss": 0.0117, + "step": 11337 + }, + { + "epoch": 8.927924379677037, + "grad_norm": 0.4468950033187866, + "learning_rate": 2.9556e-05, + "loss": 0.0189, + "step": 11338 + }, + { + "epoch": 8.928712091374557, + "grad_norm": 0.18947404623031616, + "learning_rate": 2.955566666666667e-05, + "loss": 0.0099, + "step": 11339 + }, + { + "epoch": 8.929499803072076, + "grad_norm": 0.47200319170951843, + "learning_rate": 2.9555333333333335e-05, + "loss": 0.0196, + "step": 11340 + }, + { + "epoch": 8.930287514769594, + "grad_norm": 0.16405542194843292, + "learning_rate": 2.9555e-05, + "loss": 0.0071, + "step": 11341 + }, + { + "epoch": 8.931075226467113, + "grad_norm": 0.2058214396238327, + "learning_rate": 2.9554666666666667e-05, + "loss": 0.0158, + "step": 11342 + }, + { + "epoch": 8.931862938164631, + "grad_norm": 0.2702392041683197, + "learning_rate": 2.9554333333333336e-05, + "loss": 0.0251, + "step": 11343 + }, + { + "epoch": 8.93265064986215, + "grad_norm": 0.18741631507873535, + "learning_rate": 2.9554e-05, + "loss": 0.0145, + "step": 11344 + }, + { + "epoch": 8.93343836155967, + "grad_norm": 0.24178136885166168, + "learning_rate": 2.9553666666666668e-05, + "loss": 0.0188, + "step": 11345 + }, + { + "epoch": 8.934226073257188, + "grad_norm": 0.6385942697525024, + "learning_rate": 2.9553333333333334e-05, + "loss": 0.0238, + "step": 11346 + }, + { + "epoch": 8.935013784954707, + "grad_norm": 0.26355400681495667, + "learning_rate": 2.9553e-05, + "loss": 0.0206, + "step": 11347 + }, + { + "epoch": 8.935801496652225, + "grad_norm": 0.9304701089859009, + "learning_rate": 2.955266666666667e-05, + "loss": 0.0305, + "step": 11348 + }, + { + "epoch": 8.936589208349744, + "grad_norm": 0.1600780189037323, + "learning_rate": 2.9552333333333335e-05, + "loss": 0.01, + "step": 11349 + }, + { + "epoch": 8.937376920047262, + "grad_norm": 0.23099657893180847, + "learning_rate": 2.9552e-05, + "loss": 0.0163, + "step": 11350 + }, + { + "epoch": 8.938164631744781, + "grad_norm": 0.2504134476184845, + "learning_rate": 2.9551666666666667e-05, + "loss": 0.0184, + "step": 11351 + }, + { + "epoch": 8.9389523434423, + "grad_norm": 0.2114628553390503, + "learning_rate": 2.9551333333333333e-05, + "loss": 0.0205, + "step": 11352 + }, + { + "epoch": 8.939740055139819, + "grad_norm": 0.27128881216049194, + "learning_rate": 2.9551e-05, + "loss": 0.018, + "step": 11353 + }, + { + "epoch": 8.940527766837338, + "grad_norm": 0.28969457745552063, + "learning_rate": 2.9550666666666668e-05, + "loss": 0.0247, + "step": 11354 + }, + { + "epoch": 8.941315478534856, + "grad_norm": 0.3807699978351593, + "learning_rate": 2.9550333333333334e-05, + "loss": 0.0264, + "step": 11355 + }, + { + "epoch": 8.942103190232375, + "grad_norm": 0.4985915422439575, + "learning_rate": 2.955e-05, + "loss": 0.0275, + "step": 11356 + }, + { + "epoch": 8.942890901929893, + "grad_norm": 0.2955717146396637, + "learning_rate": 2.954966666666667e-05, + "loss": 0.0084, + "step": 11357 + }, + { + "epoch": 8.943678613627412, + "grad_norm": 0.37561169266700745, + "learning_rate": 2.9549333333333332e-05, + "loss": 0.0173, + "step": 11358 + }, + { + "epoch": 8.944466325324932, + "grad_norm": 0.3877526521682739, + "learning_rate": 2.9549e-05, + "loss": 0.0203, + "step": 11359 + }, + { + "epoch": 8.94525403702245, + "grad_norm": 0.4264317452907562, + "learning_rate": 2.9548666666666667e-05, + "loss": 0.0185, + "step": 11360 + }, + { + "epoch": 8.946041748719969, 
+ "grad_norm": 0.9589032530784607, + "learning_rate": 2.9548333333333333e-05, + "loss": 0.3191, + "step": 11361 + }, + { + "epoch": 8.946829460417487, + "grad_norm": 0.5672476887702942, + "learning_rate": 2.9548e-05, + "loss": 0.1974, + "step": 11362 + }, + { + "epoch": 8.947617172115006, + "grad_norm": 0.6161143183708191, + "learning_rate": 2.954766666666667e-05, + "loss": 0.209, + "step": 11363 + }, + { + "epoch": 8.948404883812525, + "grad_norm": 0.678166925907135, + "learning_rate": 2.9547333333333334e-05, + "loss": 0.1511, + "step": 11364 + }, + { + "epoch": 8.949192595510043, + "grad_norm": 0.5526218414306641, + "learning_rate": 2.9547e-05, + "loss": 0.0774, + "step": 11365 + }, + { + "epoch": 8.949980307207563, + "grad_norm": 0.43376660346984863, + "learning_rate": 2.954666666666667e-05, + "loss": 0.033, + "step": 11366 + }, + { + "epoch": 8.95076801890508, + "grad_norm": 0.618813157081604, + "learning_rate": 2.9546333333333332e-05, + "loss": 0.0565, + "step": 11367 + }, + { + "epoch": 8.9515557306026, + "grad_norm": 0.21904943883419037, + "learning_rate": 2.9546e-05, + "loss": 0.026, + "step": 11368 + }, + { + "epoch": 8.952343442300117, + "grad_norm": 0.23043043911457062, + "learning_rate": 2.9545666666666667e-05, + "loss": 0.0519, + "step": 11369 + }, + { + "epoch": 8.953131153997637, + "grad_norm": 0.635569155216217, + "learning_rate": 2.9545333333333333e-05, + "loss": 0.0253, + "step": 11370 + }, + { + "epoch": 8.953918865695156, + "grad_norm": 0.38301903009414673, + "learning_rate": 2.9545000000000003e-05, + "loss": 0.0253, + "step": 11371 + }, + { + "epoch": 8.954706577392674, + "grad_norm": 0.27053871750831604, + "learning_rate": 2.954466666666667e-05, + "loss": 0.0146, + "step": 11372 + }, + { + "epoch": 8.955494289090193, + "grad_norm": 0.29693442583084106, + "learning_rate": 2.9544333333333334e-05, + "loss": 0.0169, + "step": 11373 + }, + { + "epoch": 8.956282000787711, + "grad_norm": 0.5132130980491638, + "learning_rate": 2.9544e-05, + "loss": 0.0176, + "step": 11374 + }, + { + "epoch": 8.95706971248523, + "grad_norm": 0.28163331747055054, + "learning_rate": 2.954366666666667e-05, + "loss": 0.0124, + "step": 11375 + }, + { + "epoch": 8.957857424182748, + "grad_norm": 0.5693454146385193, + "learning_rate": 2.9543333333333332e-05, + "loss": 0.0175, + "step": 11376 + }, + { + "epoch": 8.958645135880268, + "grad_norm": 0.3762919008731842, + "learning_rate": 2.9543e-05, + "loss": 0.0303, + "step": 11377 + }, + { + "epoch": 8.959432847577787, + "grad_norm": 0.16272951662540436, + "learning_rate": 2.9542666666666668e-05, + "loss": 0.0099, + "step": 11378 + }, + { + "epoch": 8.960220559275305, + "grad_norm": 0.3071151077747345, + "learning_rate": 2.9542333333333333e-05, + "loss": 0.0228, + "step": 11379 + }, + { + "epoch": 8.961008270972824, + "grad_norm": 0.30470943450927734, + "learning_rate": 2.9542000000000003e-05, + "loss": 0.0174, + "step": 11380 + }, + { + "epoch": 8.961795982670342, + "grad_norm": 0.1704731434583664, + "learning_rate": 2.954166666666667e-05, + "loss": 0.0169, + "step": 11381 + }, + { + "epoch": 8.962583694367861, + "grad_norm": 0.3034401834011078, + "learning_rate": 2.9541333333333335e-05, + "loss": 0.0263, + "step": 11382 + }, + { + "epoch": 8.96337140606538, + "grad_norm": 0.409677654504776, + "learning_rate": 2.9541e-05, + "loss": 0.0248, + "step": 11383 + }, + { + "epoch": 8.964159117762899, + "grad_norm": 0.7795356512069702, + "learning_rate": 2.9540666666666667e-05, + "loss": 0.0175, + "step": 11384 + }, + { + "epoch": 8.964946829460418, + 
"grad_norm": 0.5539795756340027, + "learning_rate": 2.9540333333333332e-05, + "loss": 0.022, + "step": 11385 + }, + { + "epoch": 8.965734541157936, + "grad_norm": 0.3092123866081238, + "learning_rate": 2.9540000000000002e-05, + "loss": 0.0146, + "step": 11386 + }, + { + "epoch": 8.966522252855455, + "grad_norm": 0.15592174232006073, + "learning_rate": 2.9539666666666664e-05, + "loss": 0.0117, + "step": 11387 + }, + { + "epoch": 8.967309964552973, + "grad_norm": 0.24489828944206238, + "learning_rate": 2.9539333333333334e-05, + "loss": 0.0212, + "step": 11388 + }, + { + "epoch": 8.968097676250492, + "grad_norm": 0.5412663221359253, + "learning_rate": 2.9539000000000003e-05, + "loss": 0.0153, + "step": 11389 + }, + { + "epoch": 8.968885387948012, + "grad_norm": 0.25638964772224426, + "learning_rate": 2.9538666666666666e-05, + "loss": 0.01, + "step": 11390 + }, + { + "epoch": 8.96967309964553, + "grad_norm": 0.20222847163677216, + "learning_rate": 2.9538333333333335e-05, + "loss": 0.0125, + "step": 11391 + }, + { + "epoch": 8.970460811343049, + "grad_norm": 0.254538893699646, + "learning_rate": 2.9538e-05, + "loss": 0.0156, + "step": 11392 + }, + { + "epoch": 8.971248523040567, + "grad_norm": 0.5043157935142517, + "learning_rate": 2.9537666666666667e-05, + "loss": 0.025, + "step": 11393 + }, + { + "epoch": 8.972036234738086, + "grad_norm": 0.31187158823013306, + "learning_rate": 2.9537333333333333e-05, + "loss": 0.0123, + "step": 11394 + }, + { + "epoch": 8.972823946435604, + "grad_norm": 0.350592702627182, + "learning_rate": 2.9537000000000002e-05, + "loss": 0.0152, + "step": 11395 + }, + { + "epoch": 8.973611658133123, + "grad_norm": 0.5406784415245056, + "learning_rate": 2.9536666666666668e-05, + "loss": 0.0156, + "step": 11396 + }, + { + "epoch": 8.974399369830643, + "grad_norm": 0.2810452878475189, + "learning_rate": 2.9536333333333334e-05, + "loss": 0.021, + "step": 11397 + }, + { + "epoch": 8.97518708152816, + "grad_norm": 0.633430004119873, + "learning_rate": 2.9536000000000003e-05, + "loss": 0.0173, + "step": 11398 + }, + { + "epoch": 8.97597479322568, + "grad_norm": 0.4958157539367676, + "learning_rate": 2.9535666666666666e-05, + "loss": 0.0192, + "step": 11399 + }, + { + "epoch": 8.976762504923197, + "grad_norm": 0.3636205494403839, + "learning_rate": 2.9535333333333335e-05, + "loss": 0.0185, + "step": 11400 + }, + { + "epoch": 8.977550216620717, + "grad_norm": 0.280838280916214, + "learning_rate": 2.9535e-05, + "loss": 0.0206, + "step": 11401 + }, + { + "epoch": 8.978337928318236, + "grad_norm": 0.318044513463974, + "learning_rate": 2.9534666666666667e-05, + "loss": 0.0202, + "step": 11402 + }, + { + "epoch": 8.979125640015754, + "grad_norm": 0.3881853222846985, + "learning_rate": 2.9534333333333333e-05, + "loss": 0.0166, + "step": 11403 + }, + { + "epoch": 8.979913351713273, + "grad_norm": 0.15846195816993713, + "learning_rate": 2.9534000000000002e-05, + "loss": 0.0067, + "step": 11404 + }, + { + "epoch": 8.980701063410791, + "grad_norm": 0.39159107208251953, + "learning_rate": 2.9533666666666668e-05, + "loss": 0.0224, + "step": 11405 + }, + { + "epoch": 8.98148877510831, + "grad_norm": 0.3332929313182831, + "learning_rate": 2.9533333333333334e-05, + "loss": 0.0266, + "step": 11406 + }, + { + "epoch": 8.982276486805828, + "grad_norm": 0.2746429443359375, + "learning_rate": 2.9533000000000003e-05, + "loss": 0.0171, + "step": 11407 + }, + { + "epoch": 8.983064198503348, + "grad_norm": 0.45953258872032166, + "learning_rate": 2.9532666666666666e-05, + "loss": 0.0186, + "step": 11408 + 
}, + { + "epoch": 8.983851910200867, + "grad_norm": 0.5115112662315369, + "learning_rate": 2.9532333333333335e-05, + "loss": 0.026, + "step": 11409 + }, + { + "epoch": 8.984639621898385, + "grad_norm": 0.6416048407554626, + "learning_rate": 2.9532e-05, + "loss": 0.0661, + "step": 11410 + }, + { + "epoch": 8.985427333595904, + "grad_norm": 0.8238231539726257, + "learning_rate": 2.9531666666666667e-05, + "loss": 0.2372, + "step": 11411 + }, + { + "epoch": 8.986215045293422, + "grad_norm": 0.5227060914039612, + "learning_rate": 2.9531333333333333e-05, + "loss": 0.0855, + "step": 11412 + }, + { + "epoch": 8.987002756990941, + "grad_norm": 0.28371042013168335, + "learning_rate": 2.9531e-05, + "loss": 0.0295, + "step": 11413 + }, + { + "epoch": 8.987790468688459, + "grad_norm": 0.27653321623802185, + "learning_rate": 2.953066666666667e-05, + "loss": 0.0199, + "step": 11414 + }, + { + "epoch": 8.988578180385979, + "grad_norm": 0.2885122001171112, + "learning_rate": 2.9530333333333334e-05, + "loss": 0.0123, + "step": 11415 + }, + { + "epoch": 8.989365892083498, + "grad_norm": 0.43754813075065613, + "learning_rate": 2.953e-05, + "loss": 0.0202, + "step": 11416 + }, + { + "epoch": 8.990153603781016, + "grad_norm": 0.3542238473892212, + "learning_rate": 2.9529666666666666e-05, + "loss": 0.0232, + "step": 11417 + }, + { + "epoch": 8.990941315478535, + "grad_norm": 0.2542170584201813, + "learning_rate": 2.9529333333333335e-05, + "loss": 0.0239, + "step": 11418 + }, + { + "epoch": 8.991729027176053, + "grad_norm": 0.3712163269519806, + "learning_rate": 2.9528999999999998e-05, + "loss": 0.021, + "step": 11419 + }, + { + "epoch": 8.992516738873572, + "grad_norm": 0.22111505270004272, + "learning_rate": 2.9528666666666667e-05, + "loss": 0.0168, + "step": 11420 + }, + { + "epoch": 8.993304450571092, + "grad_norm": 0.2042853683233261, + "learning_rate": 2.9528333333333337e-05, + "loss": 0.0233, + "step": 11421 + }, + { + "epoch": 8.99409216226861, + "grad_norm": 0.31610870361328125, + "learning_rate": 2.9528e-05, + "loss": 0.0169, + "step": 11422 + }, + { + "epoch": 8.994879873966129, + "grad_norm": 0.2068406641483307, + "learning_rate": 2.952766666666667e-05, + "loss": 0.0125, + "step": 11423 + }, + { + "epoch": 8.995667585663647, + "grad_norm": 0.2216462939977646, + "learning_rate": 2.9527333333333334e-05, + "loss": 0.0138, + "step": 11424 + }, + { + "epoch": 8.996455297361166, + "grad_norm": 0.2830745577812195, + "learning_rate": 2.9527e-05, + "loss": 0.0185, + "step": 11425 + }, + { + "epoch": 8.997243009058685, + "grad_norm": 0.3665384352207184, + "learning_rate": 2.9526666666666666e-05, + "loss": 0.0218, + "step": 11426 + }, + { + "epoch": 8.998030720756203, + "grad_norm": 0.27342814207077026, + "learning_rate": 2.9526333333333336e-05, + "loss": 0.0168, + "step": 11427 + }, + { + "epoch": 8.998818432453723, + "grad_norm": 0.33991438150405884, + "learning_rate": 2.9525999999999998e-05, + "loss": 0.0169, + "step": 11428 + }, + { + "epoch": 8.99960614415124, + "grad_norm": 0.5906062722206116, + "learning_rate": 2.9525666666666668e-05, + "loss": 0.0191, + "step": 11429 + }, + { + "epoch": 9.0, + "grad_norm": 0.5043011903762817, + "learning_rate": 2.9525333333333337e-05, + "loss": 0.0216, + "step": 11430 + }, + { + "epoch": 9.00078771169752, + "grad_norm": 0.9565243721008301, + "learning_rate": 2.9525e-05, + "loss": 0.2364, + "step": 11431 + }, + { + "epoch": 9.001575423395037, + "grad_norm": 0.48970282077789307, + "learning_rate": 2.952466666666667e-05, + "loss": 0.1884, + "step": 11432 + }, + { + "epoch": 
9.002363135092557, + "grad_norm": 0.7559833526611328, + "learning_rate": 2.9524333333333335e-05, + "loss": 0.1952, + "step": 11433 + }, + { + "epoch": 9.003150846790074, + "grad_norm": 0.45129212737083435, + "learning_rate": 2.9524e-05, + "loss": 0.085, + "step": 11434 + }, + { + "epoch": 9.003938558487594, + "grad_norm": 0.5642337799072266, + "learning_rate": 2.9523666666666667e-05, + "loss": 0.0937, + "step": 11435 + }, + { + "epoch": 9.004726270185111, + "grad_norm": 0.5797455906867981, + "learning_rate": 2.9523333333333336e-05, + "loss": 0.0818, + "step": 11436 + }, + { + "epoch": 9.00551398188263, + "grad_norm": 0.22228233516216278, + "learning_rate": 2.9523e-05, + "loss": 0.0123, + "step": 11437 + }, + { + "epoch": 9.00630169358015, + "grad_norm": 0.2005392163991928, + "learning_rate": 2.9522666666666668e-05, + "loss": 0.0192, + "step": 11438 + }, + { + "epoch": 9.007089405277668, + "grad_norm": 0.23658134043216705, + "learning_rate": 2.9522333333333337e-05, + "loss": 0.0115, + "step": 11439 + }, + { + "epoch": 9.007877116975187, + "grad_norm": 0.36169031262397766, + "learning_rate": 2.9522e-05, + "loss": 0.0224, + "step": 11440 + }, + { + "epoch": 9.008664828672705, + "grad_norm": 0.28831014037132263, + "learning_rate": 2.952166666666667e-05, + "loss": 0.0145, + "step": 11441 + }, + { + "epoch": 9.009452540370225, + "grad_norm": 0.26451918482780457, + "learning_rate": 2.9521333333333335e-05, + "loss": 0.0178, + "step": 11442 + }, + { + "epoch": 9.010240252067744, + "grad_norm": 0.2508682310581207, + "learning_rate": 2.9521e-05, + "loss": 0.0147, + "step": 11443 + }, + { + "epoch": 9.011027963765262, + "grad_norm": 0.46556684374809265, + "learning_rate": 2.9520666666666667e-05, + "loss": 0.0139, + "step": 11444 + }, + { + "epoch": 9.011815675462781, + "grad_norm": 0.3080891966819763, + "learning_rate": 2.9520333333333333e-05, + "loss": 0.0142, + "step": 11445 + }, + { + "epoch": 9.012603387160299, + "grad_norm": 0.24828477203845978, + "learning_rate": 2.9520000000000002e-05, + "loss": 0.0086, + "step": 11446 + }, + { + "epoch": 9.013391098857818, + "grad_norm": 0.2770034670829773, + "learning_rate": 2.9519666666666668e-05, + "loss": 0.0183, + "step": 11447 + }, + { + "epoch": 9.014178810555336, + "grad_norm": 0.3977292776107788, + "learning_rate": 2.9519333333333334e-05, + "loss": 0.0097, + "step": 11448 + }, + { + "epoch": 9.014966522252855, + "grad_norm": 0.2422541379928589, + "learning_rate": 2.9519e-05, + "loss": 0.0165, + "step": 11449 + }, + { + "epoch": 9.015754233950375, + "grad_norm": 0.4063398241996765, + "learning_rate": 2.951866666666667e-05, + "loss": 0.0104, + "step": 11450 + }, + { + "epoch": 9.016541945647893, + "grad_norm": 0.17590075731277466, + "learning_rate": 2.951833333333333e-05, + "loss": 0.0104, + "step": 11451 + }, + { + "epoch": 9.017329657345412, + "grad_norm": 0.19048257172107697, + "learning_rate": 2.9518e-05, + "loss": 0.0137, + "step": 11452 + }, + { + "epoch": 9.01811736904293, + "grad_norm": 0.41066521406173706, + "learning_rate": 2.9517666666666667e-05, + "loss": 0.0191, + "step": 11453 + }, + { + "epoch": 9.01890508074045, + "grad_norm": 0.6621385216712952, + "learning_rate": 2.9517333333333333e-05, + "loss": 0.0135, + "step": 11454 + }, + { + "epoch": 9.019692792437967, + "grad_norm": 0.3529075086116791, + "learning_rate": 2.9517000000000002e-05, + "loss": 0.0135, + "step": 11455 + }, + { + "epoch": 9.020480504135486, + "grad_norm": 0.17256909608840942, + "learning_rate": 2.9516666666666668e-05, + "loss": 0.0087, + "step": 11456 + }, + { + 
"epoch": 9.021268215833006, + "grad_norm": 0.6066980361938477, + "learning_rate": 2.9516333333333334e-05, + "loss": 0.0126, + "step": 11457 + }, + { + "epoch": 9.022055927530523, + "grad_norm": 0.4412175416946411, + "learning_rate": 2.9516e-05, + "loss": 0.0203, + "step": 11458 + }, + { + "epoch": 9.022843639228043, + "grad_norm": 0.22752337157726288, + "learning_rate": 2.951566666666667e-05, + "loss": 0.0072, + "step": 11459 + }, + { + "epoch": 9.02363135092556, + "grad_norm": 0.2653147876262665, + "learning_rate": 2.9515333333333332e-05, + "loss": 0.0117, + "step": 11460 + }, + { + "epoch": 9.02441906262308, + "grad_norm": 0.34508219361305237, + "learning_rate": 2.9515e-05, + "loss": 0.0187, + "step": 11461 + }, + { + "epoch": 9.0252067743206, + "grad_norm": 0.2136310636997223, + "learning_rate": 2.9514666666666667e-05, + "loss": 0.0133, + "step": 11462 + }, + { + "epoch": 9.025994486018117, + "grad_norm": 0.196456640958786, + "learning_rate": 2.9514333333333333e-05, + "loss": 0.0078, + "step": 11463 + }, + { + "epoch": 9.026782197715637, + "grad_norm": 0.2864961326122284, + "learning_rate": 2.9514000000000002e-05, + "loss": 0.0116, + "step": 11464 + }, + { + "epoch": 9.027569909413154, + "grad_norm": 0.5967380404472351, + "learning_rate": 2.9513666666666668e-05, + "loss": 0.0185, + "step": 11465 + }, + { + "epoch": 9.028357621110674, + "grad_norm": 0.34835201501846313, + "learning_rate": 2.9513333333333334e-05, + "loss": 0.0096, + "step": 11466 + }, + { + "epoch": 9.029145332808191, + "grad_norm": 0.3740369379520416, + "learning_rate": 2.9513e-05, + "loss": 0.0138, + "step": 11467 + }, + { + "epoch": 9.02993304450571, + "grad_norm": 0.22127258777618408, + "learning_rate": 2.951266666666667e-05, + "loss": 0.0121, + "step": 11468 + }, + { + "epoch": 9.03072075620323, + "grad_norm": 0.2275819480419159, + "learning_rate": 2.9512333333333332e-05, + "loss": 0.0103, + "step": 11469 + }, + { + "epoch": 9.031508467900748, + "grad_norm": 0.39106932282447815, + "learning_rate": 2.9512e-05, + "loss": 0.0123, + "step": 11470 + }, + { + "epoch": 9.032296179598267, + "grad_norm": 0.17077642679214478, + "learning_rate": 2.951166666666667e-05, + "loss": 0.0108, + "step": 11471 + }, + { + "epoch": 9.033083891295785, + "grad_norm": 0.5555424690246582, + "learning_rate": 2.9511333333333333e-05, + "loss": 0.0097, + "step": 11472 + }, + { + "epoch": 9.033871602993305, + "grad_norm": 0.777265191078186, + "learning_rate": 2.9511000000000003e-05, + "loss": 0.0206, + "step": 11473 + }, + { + "epoch": 9.034659314690822, + "grad_norm": 0.26381736993789673, + "learning_rate": 2.9510666666666665e-05, + "loss": 0.0171, + "step": 11474 + }, + { + "epoch": 9.035447026388342, + "grad_norm": 0.256952166557312, + "learning_rate": 2.9510333333333334e-05, + "loss": 0.0258, + "step": 11475 + }, + { + "epoch": 9.036234738085861, + "grad_norm": 0.2593379020690918, + "learning_rate": 2.951e-05, + "loss": 0.0113, + "step": 11476 + }, + { + "epoch": 9.037022449783379, + "grad_norm": 0.6170002222061157, + "learning_rate": 2.9509666666666666e-05, + "loss": 0.0109, + "step": 11477 + }, + { + "epoch": 9.037810161480898, + "grad_norm": 0.23898626863956451, + "learning_rate": 2.9509333333333332e-05, + "loss": 0.0148, + "step": 11478 + }, + { + "epoch": 9.038597873178416, + "grad_norm": 0.5002512335777283, + "learning_rate": 2.9509e-05, + "loss": 0.0146, + "step": 11479 + }, + { + "epoch": 9.039385584875935, + "grad_norm": 1.0871285200119019, + "learning_rate": 2.9508666666666668e-05, + "loss": 0.0181, + "step": 11480 + }, + { + 
"epoch": 9.040173296573455, + "grad_norm": 0.6603729724884033, + "learning_rate": 2.9508333333333333e-05, + "loss": 0.1891, + "step": 11481 + }, + { + "epoch": 9.040961008270973, + "grad_norm": 0.7647644281387329, + "learning_rate": 2.9508000000000003e-05, + "loss": 0.2056, + "step": 11482 + }, + { + "epoch": 9.041748719968492, + "grad_norm": 0.625383734703064, + "learning_rate": 2.9507666666666665e-05, + "loss": 0.1212, + "step": 11483 + }, + { + "epoch": 9.04253643166601, + "grad_norm": 0.5337325930595398, + "learning_rate": 2.9507333333333335e-05, + "loss": 0.0766, + "step": 11484 + }, + { + "epoch": 9.04332414336353, + "grad_norm": 0.3250081539154053, + "learning_rate": 2.9507e-05, + "loss": 0.0556, + "step": 11485 + }, + { + "epoch": 9.044111855061047, + "grad_norm": 0.1721099615097046, + "learning_rate": 2.9506666666666667e-05, + "loss": 0.0204, + "step": 11486 + }, + { + "epoch": 9.044899566758566, + "grad_norm": 0.22041171789169312, + "learning_rate": 2.9506333333333332e-05, + "loss": 0.0426, + "step": 11487 + }, + { + "epoch": 9.045687278456086, + "grad_norm": 0.2932591736316681, + "learning_rate": 2.9506000000000002e-05, + "loss": 0.0138, + "step": 11488 + }, + { + "epoch": 9.046474990153603, + "grad_norm": 0.18802247941493988, + "learning_rate": 2.9505666666666668e-05, + "loss": 0.026, + "step": 11489 + }, + { + "epoch": 9.047262701851123, + "grad_norm": 0.2733822166919708, + "learning_rate": 2.9505333333333334e-05, + "loss": 0.0262, + "step": 11490 + }, + { + "epoch": 9.04805041354864, + "grad_norm": 0.3958023190498352, + "learning_rate": 2.9505000000000003e-05, + "loss": 0.0133, + "step": 11491 + }, + { + "epoch": 9.04883812524616, + "grad_norm": 0.16965971887111664, + "learning_rate": 2.9504666666666666e-05, + "loss": 0.0122, + "step": 11492 + }, + { + "epoch": 9.04962583694368, + "grad_norm": 0.19453082978725433, + "learning_rate": 2.9504333333333335e-05, + "loss": 0.0379, + "step": 11493 + }, + { + "epoch": 9.050413548641197, + "grad_norm": 0.25531843304634094, + "learning_rate": 2.9504e-05, + "loss": 0.0111, + "step": 11494 + }, + { + "epoch": 9.051201260338717, + "grad_norm": 0.13770107924938202, + "learning_rate": 2.9503666666666667e-05, + "loss": 0.006, + "step": 11495 + }, + { + "epoch": 9.051988972036234, + "grad_norm": 0.19734786450862885, + "learning_rate": 2.9503333333333336e-05, + "loss": 0.0131, + "step": 11496 + }, + { + "epoch": 9.052776683733754, + "grad_norm": 0.16433049738407135, + "learning_rate": 2.9503000000000002e-05, + "loss": 0.014, + "step": 11497 + }, + { + "epoch": 9.053564395431271, + "grad_norm": 0.19580619037151337, + "learning_rate": 2.9502666666666668e-05, + "loss": 0.0111, + "step": 11498 + }, + { + "epoch": 9.054352107128791, + "grad_norm": 0.3147854804992676, + "learning_rate": 2.9502333333333334e-05, + "loss": 0.0171, + "step": 11499 + }, + { + "epoch": 9.05513981882631, + "grad_norm": 0.18674205243587494, + "learning_rate": 2.9502000000000003e-05, + "loss": 0.0072, + "step": 11500 + }, + { + "epoch": 9.055927530523828, + "grad_norm": 0.2059805691242218, + "learning_rate": 2.9501666666666666e-05, + "loss": 0.0113, + "step": 11501 + }, + { + "epoch": 9.056715242221347, + "grad_norm": 0.1610412299633026, + "learning_rate": 2.9501333333333335e-05, + "loss": 0.0121, + "step": 11502 + }, + { + "epoch": 9.057502953918865, + "grad_norm": 0.4382907748222351, + "learning_rate": 2.9500999999999998e-05, + "loss": 0.0158, + "step": 11503 + }, + { + "epoch": 9.058290665616385, + "grad_norm": 0.2694394588470459, + "learning_rate": 2.9500666666666667e-05, 
+ "loss": 0.013, + "step": 11504 + }, + { + "epoch": 9.059078377313902, + "grad_norm": 0.3415243625640869, + "learning_rate": 2.9500333333333336e-05, + "loss": 0.0142, + "step": 11505 + }, + { + "epoch": 9.059866089011422, + "grad_norm": 0.2516098916530609, + "learning_rate": 2.95e-05, + "loss": 0.0118, + "step": 11506 + }, + { + "epoch": 9.060653800708941, + "grad_norm": 0.31004229187965393, + "learning_rate": 2.9499666666666668e-05, + "loss": 0.0102, + "step": 11507 + }, + { + "epoch": 9.061441512406459, + "grad_norm": 0.20696979761123657, + "learning_rate": 2.9499333333333334e-05, + "loss": 0.0112, + "step": 11508 + }, + { + "epoch": 9.062229224103978, + "grad_norm": 0.19210268557071686, + "learning_rate": 2.9499e-05, + "loss": 0.0118, + "step": 11509 + }, + { + "epoch": 9.063016935801496, + "grad_norm": 0.20274779200553894, + "learning_rate": 2.9498666666666666e-05, + "loss": 0.0089, + "step": 11510 + }, + { + "epoch": 9.063804647499015, + "grad_norm": 0.3182900846004486, + "learning_rate": 2.9498333333333335e-05, + "loss": 0.0158, + "step": 11511 + }, + { + "epoch": 9.064592359196535, + "grad_norm": 0.27627280354499817, + "learning_rate": 2.9497999999999998e-05, + "loss": 0.0135, + "step": 11512 + }, + { + "epoch": 9.065380070894053, + "grad_norm": 0.3533881902694702, + "learning_rate": 2.9497666666666667e-05, + "loss": 0.0156, + "step": 11513 + }, + { + "epoch": 9.066167782591572, + "grad_norm": 0.340813010931015, + "learning_rate": 2.9497333333333336e-05, + "loss": 0.0123, + "step": 11514 + }, + { + "epoch": 9.06695549428909, + "grad_norm": 0.19056616723537445, + "learning_rate": 2.9497e-05, + "loss": 0.011, + "step": 11515 + }, + { + "epoch": 9.06774320598661, + "grad_norm": 0.3053174614906311, + "learning_rate": 2.9496666666666668e-05, + "loss": 0.0172, + "step": 11516 + }, + { + "epoch": 9.068530917684127, + "grad_norm": 0.20347775518894196, + "learning_rate": 2.9496333333333334e-05, + "loss": 0.0079, + "step": 11517 + }, + { + "epoch": 9.069318629381646, + "grad_norm": 0.4042183756828308, + "learning_rate": 2.9496e-05, + "loss": 0.0148, + "step": 11518 + }, + { + "epoch": 9.070106341079166, + "grad_norm": 0.29118311405181885, + "learning_rate": 2.9495666666666666e-05, + "loss": 0.0196, + "step": 11519 + }, + { + "epoch": 9.070894052776683, + "grad_norm": 0.6216691732406616, + "learning_rate": 2.9495333333333335e-05, + "loss": 0.0166, + "step": 11520 + }, + { + "epoch": 9.071681764474203, + "grad_norm": 0.10968466103076935, + "learning_rate": 2.9495e-05, + "loss": 0.0061, + "step": 11521 + }, + { + "epoch": 9.07246947617172, + "grad_norm": 0.3339836597442627, + "learning_rate": 2.9494666666666667e-05, + "loss": 0.0207, + "step": 11522 + }, + { + "epoch": 9.07325718786924, + "grad_norm": 0.8926977515220642, + "learning_rate": 2.9494333333333337e-05, + "loss": 0.0134, + "step": 11523 + }, + { + "epoch": 9.074044899566758, + "grad_norm": 0.4037485718727112, + "learning_rate": 2.9494e-05, + "loss": 0.0147, + "step": 11524 + }, + { + "epoch": 9.074832611264277, + "grad_norm": 0.5158495903015137, + "learning_rate": 2.949366666666667e-05, + "loss": 0.017, + "step": 11525 + }, + { + "epoch": 9.075620322961797, + "grad_norm": 0.3203570544719696, + "learning_rate": 2.9493333333333334e-05, + "loss": 0.0109, + "step": 11526 + }, + { + "epoch": 9.076408034659314, + "grad_norm": 0.5460478663444519, + "learning_rate": 2.9493e-05, + "loss": 0.0262, + "step": 11527 + }, + { + "epoch": 9.077195746356834, + "grad_norm": 0.33743616938591003, + "learning_rate": 2.9492666666666666e-05, + "loss": 
0.0117, + "step": 11528 + }, + { + "epoch": 9.077983458054351, + "grad_norm": 0.3712500333786011, + "learning_rate": 2.9492333333333336e-05, + "loss": 0.0203, + "step": 11529 + }, + { + "epoch": 9.078771169751871, + "grad_norm": 0.2384544461965561, + "learning_rate": 2.9492e-05, + "loss": 0.0105, + "step": 11530 + }, + { + "epoch": 9.07955888144939, + "grad_norm": 0.7446715831756592, + "learning_rate": 2.9491666666666667e-05, + "loss": 0.2205, + "step": 11531 + }, + { + "epoch": 9.080346593146908, + "grad_norm": 0.5236197113990784, + "learning_rate": 2.9491333333333337e-05, + "loss": 0.1917, + "step": 11532 + }, + { + "epoch": 9.081134304844428, + "grad_norm": 0.5904142260551453, + "learning_rate": 2.9491e-05, + "loss": 0.1329, + "step": 11533 + }, + { + "epoch": 9.081922016541945, + "grad_norm": 0.39754340052604675, + "learning_rate": 2.949066666666667e-05, + "loss": 0.0802, + "step": 11534 + }, + { + "epoch": 9.082709728239465, + "grad_norm": 0.5181896090507507, + "learning_rate": 2.949033333333333e-05, + "loss": 0.0744, + "step": 11535 + }, + { + "epoch": 9.083497439936982, + "grad_norm": 0.4448355436325073, + "learning_rate": 2.949e-05, + "loss": 0.0569, + "step": 11536 + }, + { + "epoch": 9.084285151634502, + "grad_norm": 0.17378626763820648, + "learning_rate": 2.9489666666666666e-05, + "loss": 0.0202, + "step": 11537 + }, + { + "epoch": 9.085072863332021, + "grad_norm": 0.35044676065444946, + "learning_rate": 2.9489333333333332e-05, + "loss": 0.0342, + "step": 11538 + }, + { + "epoch": 9.085860575029539, + "grad_norm": 0.25510022044181824, + "learning_rate": 2.9489000000000002e-05, + "loss": 0.0179, + "step": 11539 + }, + { + "epoch": 9.086648286727058, + "grad_norm": 0.28699201345443726, + "learning_rate": 2.9488666666666668e-05, + "loss": 0.0259, + "step": 11540 + }, + { + "epoch": 9.087435998424576, + "grad_norm": 0.35936498641967773, + "learning_rate": 2.9488333333333334e-05, + "loss": 0.025, + "step": 11541 + }, + { + "epoch": 9.088223710122096, + "grad_norm": 0.1648491770029068, + "learning_rate": 2.9488e-05, + "loss": 0.0135, + "step": 11542 + }, + { + "epoch": 9.089011421819613, + "grad_norm": 0.11214587092399597, + "learning_rate": 2.948766666666667e-05, + "loss": 0.0101, + "step": 11543 + }, + { + "epoch": 9.089799133517133, + "grad_norm": 0.3677920997142792, + "learning_rate": 2.948733333333333e-05, + "loss": 0.0194, + "step": 11544 + }, + { + "epoch": 9.090586845214652, + "grad_norm": 0.14972592890262604, + "learning_rate": 2.9487e-05, + "loss": 0.0085, + "step": 11545 + }, + { + "epoch": 9.09137455691217, + "grad_norm": 0.2917299270629883, + "learning_rate": 2.948666666666667e-05, + "loss": 0.0178, + "step": 11546 + }, + { + "epoch": 9.09216226860969, + "grad_norm": 0.29632166028022766, + "learning_rate": 2.9486333333333333e-05, + "loss": 0.0211, + "step": 11547 + }, + { + "epoch": 9.092949980307207, + "grad_norm": 0.24113042652606964, + "learning_rate": 2.9486000000000002e-05, + "loss": 0.009, + "step": 11548 + }, + { + "epoch": 9.093737692004726, + "grad_norm": 0.28849542140960693, + "learning_rate": 2.9485666666666668e-05, + "loss": 0.0116, + "step": 11549 + }, + { + "epoch": 9.094525403702246, + "grad_norm": 0.24627509713172913, + "learning_rate": 2.9485333333333334e-05, + "loss": 0.0115, + "step": 11550 + }, + { + "epoch": 9.095313115399764, + "grad_norm": 0.20979663729667664, + "learning_rate": 2.9485e-05, + "loss": 0.0153, + "step": 11551 + }, + { + "epoch": 9.096100827097283, + "grad_norm": 0.6969618201255798, + "learning_rate": 2.948466666666667e-05, + "loss": 
0.0127, + "step": 11552 + }, + { + "epoch": 9.0968885387948, + "grad_norm": 0.47589564323425293, + "learning_rate": 2.948433333333333e-05, + "loss": 0.0127, + "step": 11553 + }, + { + "epoch": 9.09767625049232, + "grad_norm": 0.31280627846717834, + "learning_rate": 2.9484e-05, + "loss": 0.0124, + "step": 11554 + }, + { + "epoch": 9.098463962189838, + "grad_norm": 0.379274845123291, + "learning_rate": 2.948366666666667e-05, + "loss": 0.0235, + "step": 11555 + }, + { + "epoch": 9.099251673887357, + "grad_norm": 0.43550658226013184, + "learning_rate": 2.9483333333333333e-05, + "loss": 0.0552, + "step": 11556 + }, + { + "epoch": 9.100039385584877, + "grad_norm": 0.4354303181171417, + "learning_rate": 2.9483000000000002e-05, + "loss": 0.0111, + "step": 11557 + }, + { + "epoch": 9.100827097282394, + "grad_norm": 0.2561132311820984, + "learning_rate": 2.9482666666666668e-05, + "loss": 0.0178, + "step": 11558 + }, + { + "epoch": 9.101614808979914, + "grad_norm": 0.23453263938426971, + "learning_rate": 2.9482333333333334e-05, + "loss": 0.0165, + "step": 11559 + }, + { + "epoch": 9.102402520677431, + "grad_norm": 0.24541474878787994, + "learning_rate": 2.9482e-05, + "loss": 0.0135, + "step": 11560 + }, + { + "epoch": 9.103190232374951, + "grad_norm": 0.34315574169158936, + "learning_rate": 2.948166666666667e-05, + "loss": 0.0129, + "step": 11561 + }, + { + "epoch": 9.103977944072469, + "grad_norm": 0.27767032384872437, + "learning_rate": 2.9481333333333332e-05, + "loss": 0.0092, + "step": 11562 + }, + { + "epoch": 9.104765655769988, + "grad_norm": 0.2589886784553528, + "learning_rate": 2.9481e-05, + "loss": 0.0104, + "step": 11563 + }, + { + "epoch": 9.105553367467508, + "grad_norm": 0.2984495460987091, + "learning_rate": 2.9480666666666667e-05, + "loss": 0.0114, + "step": 11564 + }, + { + "epoch": 9.106341079165025, + "grad_norm": 0.321758896112442, + "learning_rate": 2.9480333333333333e-05, + "loss": 0.0169, + "step": 11565 + }, + { + "epoch": 9.107128790862545, + "grad_norm": 0.28177785873413086, + "learning_rate": 2.9480000000000002e-05, + "loss": 0.0166, + "step": 11566 + }, + { + "epoch": 9.107916502560062, + "grad_norm": 0.32482394576072693, + "learning_rate": 2.9479666666666665e-05, + "loss": 0.0211, + "step": 11567 + }, + { + "epoch": 9.108704214257582, + "grad_norm": 0.24018719792366028, + "learning_rate": 2.9479333333333334e-05, + "loss": 0.0109, + "step": 11568 + }, + { + "epoch": 9.109491925955101, + "grad_norm": 0.3632972538471222, + "learning_rate": 2.9479e-05, + "loss": 0.0164, + "step": 11569 + }, + { + "epoch": 9.110279637652619, + "grad_norm": 0.1595558375120163, + "learning_rate": 2.9478666666666666e-05, + "loss": 0.0097, + "step": 11570 + }, + { + "epoch": 9.111067349350138, + "grad_norm": 0.21295872330665588, + "learning_rate": 2.9478333333333335e-05, + "loss": 0.0123, + "step": 11571 + }, + { + "epoch": 9.111855061047656, + "grad_norm": 0.22575101256370544, + "learning_rate": 2.9478e-05, + "loss": 0.0131, + "step": 11572 + }, + { + "epoch": 9.112642772745176, + "grad_norm": 0.4294453561306, + "learning_rate": 2.9477666666666667e-05, + "loss": 0.0098, + "step": 11573 + }, + { + "epoch": 9.113430484442693, + "grad_norm": 0.4408138692378998, + "learning_rate": 2.9477333333333333e-05, + "loss": 0.0239, + "step": 11574 + }, + { + "epoch": 9.114218196140213, + "grad_norm": 0.2550303041934967, + "learning_rate": 2.9477000000000003e-05, + "loss": 0.0123, + "step": 11575 + }, + { + "epoch": 9.115005907837732, + "grad_norm": 0.349372535943985, + "learning_rate": 2.9476666666666665e-05, 
+ "loss": 0.0109, + "step": 11576 + }, + { + "epoch": 9.11579361953525, + "grad_norm": 0.2339864820241928, + "learning_rate": 2.9476333333333334e-05, + "loss": 0.01, + "step": 11577 + }, + { + "epoch": 9.11658133123277, + "grad_norm": 0.29993584752082825, + "learning_rate": 2.9476e-05, + "loss": 0.0114, + "step": 11578 + }, + { + "epoch": 9.117369042930287, + "grad_norm": 0.3474239408969879, + "learning_rate": 2.9475666666666666e-05, + "loss": 0.0164, + "step": 11579 + }, + { + "epoch": 9.118156754627806, + "grad_norm": 0.6293314099311829, + "learning_rate": 2.9475333333333336e-05, + "loss": 0.029, + "step": 11580 + }, + { + "epoch": 9.118944466325324, + "grad_norm": 0.7651095390319824, + "learning_rate": 2.9475e-05, + "loss": 0.2135, + "step": 11581 + }, + { + "epoch": 9.119732178022844, + "grad_norm": 0.5473427176475525, + "learning_rate": 2.9474666666666667e-05, + "loss": 0.1433, + "step": 11582 + }, + { + "epoch": 9.120519889720363, + "grad_norm": 0.9583653807640076, + "learning_rate": 2.9474333333333333e-05, + "loss": 0.1013, + "step": 11583 + }, + { + "epoch": 9.12130760141788, + "grad_norm": 0.6571617722511292, + "learning_rate": 2.9474000000000003e-05, + "loss": 0.0774, + "step": 11584 + }, + { + "epoch": 9.1220953131154, + "grad_norm": 0.641615629196167, + "learning_rate": 2.9473666666666665e-05, + "loss": 0.0551, + "step": 11585 + }, + { + "epoch": 9.122883024812918, + "grad_norm": 0.42226335406303406, + "learning_rate": 2.9473333333333335e-05, + "loss": 0.0333, + "step": 11586 + }, + { + "epoch": 9.123670736510437, + "grad_norm": 0.23161298036575317, + "learning_rate": 2.9473e-05, + "loss": 0.0164, + "step": 11587 + }, + { + "epoch": 9.124458448207957, + "grad_norm": 0.39622610807418823, + "learning_rate": 2.9472666666666666e-05, + "loss": 0.0525, + "step": 11588 + }, + { + "epoch": 9.125246159905474, + "grad_norm": 0.36048951745033264, + "learning_rate": 2.9472333333333336e-05, + "loss": 0.0198, + "step": 11589 + }, + { + "epoch": 9.126033871602994, + "grad_norm": 0.20875409245491028, + "learning_rate": 2.9472000000000002e-05, + "loss": 0.0116, + "step": 11590 + }, + { + "epoch": 9.126821583300512, + "grad_norm": 0.16572803258895874, + "learning_rate": 2.9471666666666668e-05, + "loss": 0.0156, + "step": 11591 + }, + { + "epoch": 9.127609294998031, + "grad_norm": 0.2737003564834595, + "learning_rate": 2.9471333333333334e-05, + "loss": 0.0246, + "step": 11592 + }, + { + "epoch": 9.128397006695549, + "grad_norm": 0.2108999788761139, + "learning_rate": 2.9471000000000003e-05, + "loss": 0.0179, + "step": 11593 + }, + { + "epoch": 9.129184718393068, + "grad_norm": 0.43589287996292114, + "learning_rate": 2.9470666666666665e-05, + "loss": 0.016, + "step": 11594 + }, + { + "epoch": 9.129972430090588, + "grad_norm": 1.5500718355178833, + "learning_rate": 2.9470333333333335e-05, + "loss": 0.026, + "step": 11595 + }, + { + "epoch": 9.130760141788105, + "grad_norm": 0.4286811947822571, + "learning_rate": 2.947e-05, + "loss": 0.0222, + "step": 11596 + }, + { + "epoch": 9.131547853485625, + "grad_norm": 0.22528640925884247, + "learning_rate": 2.9469666666666667e-05, + "loss": 0.0154, + "step": 11597 + }, + { + "epoch": 9.132335565183142, + "grad_norm": 0.37306103110313416, + "learning_rate": 2.9469333333333336e-05, + "loss": 0.0149, + "step": 11598 + }, + { + "epoch": 9.133123276880662, + "grad_norm": 0.11341919749975204, + "learning_rate": 2.9469e-05, + "loss": 0.0077, + "step": 11599 + }, + { + "epoch": 9.13391098857818, + "grad_norm": 0.2578762471675873, + "learning_rate": 
2.9468666666666668e-05, + "loss": 0.0137, + "step": 11600 + }, + { + "epoch": 9.134698700275699, + "grad_norm": 0.6975134015083313, + "learning_rate": 2.9468333333333334e-05, + "loss": 0.022, + "step": 11601 + }, + { + "epoch": 9.135486411973218, + "grad_norm": 0.19093641638755798, + "learning_rate": 2.9468e-05, + "loss": 0.009, + "step": 11602 + }, + { + "epoch": 9.136274123670736, + "grad_norm": 0.2817373275756836, + "learning_rate": 2.9467666666666666e-05, + "loss": 0.011, + "step": 11603 + }, + { + "epoch": 9.137061835368256, + "grad_norm": 0.5303912162780762, + "learning_rate": 2.9467333333333335e-05, + "loss": 0.0216, + "step": 11604 + }, + { + "epoch": 9.137849547065773, + "grad_norm": 0.30334392189979553, + "learning_rate": 2.9467e-05, + "loss": 0.017, + "step": 11605 + }, + { + "epoch": 9.138637258763293, + "grad_norm": 0.21828676760196686, + "learning_rate": 2.9466666666666667e-05, + "loss": 0.0152, + "step": 11606 + }, + { + "epoch": 9.139424970460812, + "grad_norm": 0.9685545563697815, + "learning_rate": 2.9466333333333336e-05, + "loss": 0.0373, + "step": 11607 + }, + { + "epoch": 9.14021268215833, + "grad_norm": 0.28511735796928406, + "learning_rate": 2.9466e-05, + "loss": 0.0156, + "step": 11608 + }, + { + "epoch": 9.14100039385585, + "grad_norm": 0.28764259815216064, + "learning_rate": 2.9465666666666668e-05, + "loss": 0.0135, + "step": 11609 + }, + { + "epoch": 9.141788105553367, + "grad_norm": 0.24891753494739532, + "learning_rate": 2.9465333333333334e-05, + "loss": 0.0222, + "step": 11610 + }, + { + "epoch": 9.142575817250886, + "grad_norm": 0.4256175756454468, + "learning_rate": 2.9465e-05, + "loss": 0.0221, + "step": 11611 + }, + { + "epoch": 9.143363528948404, + "grad_norm": 0.2715255916118622, + "learning_rate": 2.9464666666666666e-05, + "loss": 0.0178, + "step": 11612 + }, + { + "epoch": 9.144151240645924, + "grad_norm": 0.14334654808044434, + "learning_rate": 2.9464333333333335e-05, + "loss": 0.0097, + "step": 11613 + }, + { + "epoch": 9.144938952343443, + "grad_norm": 1.0961048603057861, + "learning_rate": 2.9464e-05, + "loss": 0.0175, + "step": 11614 + }, + { + "epoch": 9.14572666404096, + "grad_norm": 0.3104472756385803, + "learning_rate": 2.9463666666666667e-05, + "loss": 0.0193, + "step": 11615 + }, + { + "epoch": 9.14651437573848, + "grad_norm": 0.15967205166816711, + "learning_rate": 2.9463333333333336e-05, + "loss": 0.0089, + "step": 11616 + }, + { + "epoch": 9.147302087435998, + "grad_norm": 0.15499916672706604, + "learning_rate": 2.9463e-05, + "loss": 0.0107, + "step": 11617 + }, + { + "epoch": 9.148089799133517, + "grad_norm": 0.3770545721054077, + "learning_rate": 2.9462666666666668e-05, + "loss": 0.0264, + "step": 11618 + }, + { + "epoch": 9.148877510831035, + "grad_norm": 0.09586817026138306, + "learning_rate": 2.9462333333333334e-05, + "loss": 0.0053, + "step": 11619 + }, + { + "epoch": 9.149665222528554, + "grad_norm": 0.2457297444343567, + "learning_rate": 2.9462e-05, + "loss": 0.0079, + "step": 11620 + }, + { + "epoch": 9.150452934226074, + "grad_norm": 0.18474030494689941, + "learning_rate": 2.946166666666667e-05, + "loss": 0.0105, + "step": 11621 + }, + { + "epoch": 9.151240645923592, + "grad_norm": 0.24393393099308014, + "learning_rate": 2.9461333333333335e-05, + "loss": 0.0112, + "step": 11622 + }, + { + "epoch": 9.152028357621111, + "grad_norm": 0.3827402889728546, + "learning_rate": 2.9461e-05, + "loss": 0.0086, + "step": 11623 + }, + { + "epoch": 9.152816069318629, + "grad_norm": 0.4348389804363251, + "learning_rate": 2.9460666666666667e-05, 
+ "loss": 0.0247, + "step": 11624 + }, + { + "epoch": 9.153603781016148, + "grad_norm": 0.28710228204727173, + "learning_rate": 2.9460333333333333e-05, + "loss": 0.0218, + "step": 11625 + }, + { + "epoch": 9.154391492713668, + "grad_norm": 0.20817779004573822, + "learning_rate": 2.946e-05, + "loss": 0.0145, + "step": 11626 + }, + { + "epoch": 9.155179204411185, + "grad_norm": 0.379294216632843, + "learning_rate": 2.945966666666667e-05, + "loss": 0.0134, + "step": 11627 + }, + { + "epoch": 9.155966916108705, + "grad_norm": 0.2208326756954193, + "learning_rate": 2.945933333333333e-05, + "loss": 0.0169, + "step": 11628 + }, + { + "epoch": 9.156754627806222, + "grad_norm": 0.3558538556098938, + "learning_rate": 2.9459e-05, + "loss": 0.0176, + "step": 11629 + }, + { + "epoch": 9.157542339503742, + "grad_norm": 0.6085386276245117, + "learning_rate": 2.945866666666667e-05, + "loss": 0.0175, + "step": 11630 + }, + { + "epoch": 9.15833005120126, + "grad_norm": 0.5830380916595459, + "learning_rate": 2.9458333333333332e-05, + "loss": 0.2189, + "step": 11631 + }, + { + "epoch": 9.159117762898779, + "grad_norm": 0.69068843126297, + "learning_rate": 2.9458e-05, + "loss": 0.1915, + "step": 11632 + }, + { + "epoch": 9.159905474596298, + "grad_norm": 0.49938932061195374, + "learning_rate": 2.9457666666666667e-05, + "loss": 0.1233, + "step": 11633 + }, + { + "epoch": 9.160693186293816, + "grad_norm": 0.4695441722869873, + "learning_rate": 2.9457333333333333e-05, + "loss": 0.1192, + "step": 11634 + }, + { + "epoch": 9.161480897991336, + "grad_norm": 0.4328548014163971, + "learning_rate": 2.9457e-05, + "loss": 0.0772, + "step": 11635 + }, + { + "epoch": 9.162268609688853, + "grad_norm": 0.6490544676780701, + "learning_rate": 2.945666666666667e-05, + "loss": 0.1133, + "step": 11636 + }, + { + "epoch": 9.163056321386373, + "grad_norm": 0.32648470997810364, + "learning_rate": 2.945633333333333e-05, + "loss": 0.0462, + "step": 11637 + }, + { + "epoch": 9.16384403308389, + "grad_norm": 0.318563848733902, + "learning_rate": 2.9456e-05, + "loss": 0.0166, + "step": 11638 + }, + { + "epoch": 9.16463174478141, + "grad_norm": 0.29340672492980957, + "learning_rate": 2.945566666666667e-05, + "loss": 0.0202, + "step": 11639 + }, + { + "epoch": 9.16541945647893, + "grad_norm": 0.3720873296260834, + "learning_rate": 2.9455333333333332e-05, + "loss": 0.0209, + "step": 11640 + }, + { + "epoch": 9.166207168176447, + "grad_norm": 0.24550887942314148, + "learning_rate": 2.9455000000000002e-05, + "loss": 0.014, + "step": 11641 + }, + { + "epoch": 9.166994879873966, + "grad_norm": 0.17729873955249786, + "learning_rate": 2.9454666666666668e-05, + "loss": 0.0102, + "step": 11642 + }, + { + "epoch": 9.167782591571484, + "grad_norm": 0.2680634558200836, + "learning_rate": 2.9454333333333334e-05, + "loss": 0.0168, + "step": 11643 + }, + { + "epoch": 9.168570303269004, + "grad_norm": 0.09267501533031464, + "learning_rate": 2.9454e-05, + "loss": 0.0056, + "step": 11644 + }, + { + "epoch": 9.169358014966523, + "grad_norm": 0.47551319003105164, + "learning_rate": 2.945366666666667e-05, + "loss": 0.0231, + "step": 11645 + }, + { + "epoch": 9.17014572666404, + "grad_norm": 0.18368549644947052, + "learning_rate": 2.9453333333333335e-05, + "loss": 0.0138, + "step": 11646 + }, + { + "epoch": 9.17093343836156, + "grad_norm": 0.29334595799446106, + "learning_rate": 2.9453e-05, + "loss": 0.0193, + "step": 11647 + }, + { + "epoch": 9.171721150059078, + "grad_norm": 0.22412508726119995, + "learning_rate": 2.945266666666667e-05, + "loss": 0.0183, + 
"step": 11648 + }, + { + "epoch": 9.172508861756597, + "grad_norm": 0.26042822003364563, + "learning_rate": 2.9452333333333333e-05, + "loss": 0.0103, + "step": 11649 + }, + { + "epoch": 9.173296573454115, + "grad_norm": 0.4282515347003937, + "learning_rate": 2.9452000000000002e-05, + "loss": 0.0164, + "step": 11650 + }, + { + "epoch": 9.174084285151634, + "grad_norm": 0.3518877923488617, + "learning_rate": 2.9451666666666668e-05, + "loss": 0.0185, + "step": 11651 + }, + { + "epoch": 9.174871996849154, + "grad_norm": 0.6034036874771118, + "learning_rate": 2.9451333333333334e-05, + "loss": 0.0119, + "step": 11652 + }, + { + "epoch": 9.175659708546672, + "grad_norm": 0.12123478949069977, + "learning_rate": 2.9451e-05, + "loss": 0.0066, + "step": 11653 + }, + { + "epoch": 9.176447420244191, + "grad_norm": 0.4758981764316559, + "learning_rate": 2.945066666666667e-05, + "loss": 0.0174, + "step": 11654 + }, + { + "epoch": 9.177235131941709, + "grad_norm": 0.3987779915332794, + "learning_rate": 2.9450333333333335e-05, + "loss": 0.0143, + "step": 11655 + }, + { + "epoch": 9.178022843639228, + "grad_norm": 0.18672512471675873, + "learning_rate": 2.945e-05, + "loss": 0.0099, + "step": 11656 + }, + { + "epoch": 9.178810555336748, + "grad_norm": 0.4397216737270355, + "learning_rate": 2.9449666666666667e-05, + "loss": 0.0295, + "step": 11657 + }, + { + "epoch": 9.179598267034265, + "grad_norm": 0.1697971522808075, + "learning_rate": 2.9449333333333333e-05, + "loss": 0.0103, + "step": 11658 + }, + { + "epoch": 9.180385978731785, + "grad_norm": 0.30526456236839294, + "learning_rate": 2.9449000000000002e-05, + "loss": 0.0124, + "step": 11659 + }, + { + "epoch": 9.181173690429302, + "grad_norm": 0.3160611093044281, + "learning_rate": 2.9448666666666665e-05, + "loss": 0.0172, + "step": 11660 + }, + { + "epoch": 9.181961402126822, + "grad_norm": 0.2361966371536255, + "learning_rate": 2.9448333333333334e-05, + "loss": 0.0105, + "step": 11661 + }, + { + "epoch": 9.18274911382434, + "grad_norm": 0.3826940655708313, + "learning_rate": 2.9448e-05, + "loss": 0.018, + "step": 11662 + }, + { + "epoch": 9.183536825521859, + "grad_norm": 0.2151699811220169, + "learning_rate": 2.9447666666666666e-05, + "loss": 0.0057, + "step": 11663 + }, + { + "epoch": 9.184324537219378, + "grad_norm": 0.3420741558074951, + "learning_rate": 2.9447333333333335e-05, + "loss": 0.0228, + "step": 11664 + }, + { + "epoch": 9.185112248916896, + "grad_norm": 0.38949835300445557, + "learning_rate": 2.9447e-05, + "loss": 0.0184, + "step": 11665 + }, + { + "epoch": 9.185899960614416, + "grad_norm": 1.440316915512085, + "learning_rate": 2.9446666666666667e-05, + "loss": 0.0168, + "step": 11666 + }, + { + "epoch": 9.186687672311933, + "grad_norm": 0.23845449090003967, + "learning_rate": 2.9446333333333333e-05, + "loss": 0.0114, + "step": 11667 + }, + { + "epoch": 9.187475384009453, + "grad_norm": 0.2722768485546112, + "learning_rate": 2.9446000000000002e-05, + "loss": 0.0128, + "step": 11668 + }, + { + "epoch": 9.18826309570697, + "grad_norm": 0.4153118133544922, + "learning_rate": 2.9445666666666665e-05, + "loss": 0.0127, + "step": 11669 + }, + { + "epoch": 9.18905080740449, + "grad_norm": 0.18962687253952026, + "learning_rate": 2.9445333333333334e-05, + "loss": 0.0069, + "step": 11670 + }, + { + "epoch": 9.18983851910201, + "grad_norm": 0.35189422965049744, + "learning_rate": 2.9445000000000004e-05, + "loss": 0.012, + "step": 11671 + }, + { + "epoch": 9.190626230799527, + "grad_norm": 0.32247796654701233, + "learning_rate": 
2.9444666666666666e-05, + "loss": 0.0163, + "step": 11672 + }, + { + "epoch": 9.191413942497046, + "grad_norm": 0.22066500782966614, + "learning_rate": 2.9444333333333335e-05, + "loss": 0.0138, + "step": 11673 + }, + { + "epoch": 9.192201654194564, + "grad_norm": 0.31228122115135193, + "learning_rate": 2.9444e-05, + "loss": 0.0204, + "step": 11674 + }, + { + "epoch": 9.192989365892084, + "grad_norm": 0.31346645951271057, + "learning_rate": 2.9443666666666667e-05, + "loss": 0.0144, + "step": 11675 + }, + { + "epoch": 9.193777077589603, + "grad_norm": 0.2936857342720032, + "learning_rate": 2.9443333333333333e-05, + "loss": 0.0297, + "step": 11676 + }, + { + "epoch": 9.19456478928712, + "grad_norm": 0.25577297806739807, + "learning_rate": 2.9443000000000003e-05, + "loss": 0.0151, + "step": 11677 + }, + { + "epoch": 9.19535250098464, + "grad_norm": 0.30347827076911926, + "learning_rate": 2.9442666666666665e-05, + "loss": 0.0116, + "step": 11678 + }, + { + "epoch": 9.196140212682158, + "grad_norm": 0.4068261981010437, + "learning_rate": 2.9442333333333334e-05, + "loss": 0.0201, + "step": 11679 + }, + { + "epoch": 9.196927924379677, + "grad_norm": 0.31018298864364624, + "learning_rate": 2.9442000000000004e-05, + "loss": 0.0145, + "step": 11680 + }, + { + "epoch": 9.197715636077195, + "grad_norm": 1.0356429815292358, + "learning_rate": 2.9441666666666666e-05, + "loss": 0.2146, + "step": 11681 + }, + { + "epoch": 9.198503347774714, + "grad_norm": 0.5786553621292114, + "learning_rate": 2.9441333333333336e-05, + "loss": 0.1992, + "step": 11682 + }, + { + "epoch": 9.199291059472234, + "grad_norm": 0.4287036955356598, + "learning_rate": 2.9441e-05, + "loss": 0.1107, + "step": 11683 + }, + { + "epoch": 9.200078771169752, + "grad_norm": 0.56180340051651, + "learning_rate": 2.9440666666666667e-05, + "loss": 0.1198, + "step": 11684 + }, + { + "epoch": 9.200866482867271, + "grad_norm": 0.4156320095062256, + "learning_rate": 2.9440333333333333e-05, + "loss": 0.0519, + "step": 11685 + }, + { + "epoch": 9.201654194564789, + "grad_norm": 0.5717890858650208, + "learning_rate": 2.944e-05, + "loss": 0.1333, + "step": 11686 + }, + { + "epoch": 9.202441906262308, + "grad_norm": 0.3065141439437866, + "learning_rate": 2.9439666666666665e-05, + "loss": 0.0273, + "step": 11687 + }, + { + "epoch": 9.203229617959826, + "grad_norm": 0.22269587218761444, + "learning_rate": 2.9439333333333335e-05, + "loss": 0.0139, + "step": 11688 + }, + { + "epoch": 9.204017329657345, + "grad_norm": 0.3005319833755493, + "learning_rate": 2.9439e-05, + "loss": 0.0286, + "step": 11689 + }, + { + "epoch": 9.204805041354865, + "grad_norm": 0.18271639943122864, + "learning_rate": 2.9438666666666666e-05, + "loss": 0.0133, + "step": 11690 + }, + { + "epoch": 9.205592753052382, + "grad_norm": 0.28358110785484314, + "learning_rate": 2.9438333333333336e-05, + "loss": 0.0244, + "step": 11691 + }, + { + "epoch": 9.206380464749902, + "grad_norm": 0.2888738214969635, + "learning_rate": 2.9438e-05, + "loss": 0.0169, + "step": 11692 + }, + { + "epoch": 9.20716817644742, + "grad_norm": 0.2939455211162567, + "learning_rate": 2.9437666666666668e-05, + "loss": 0.0113, + "step": 11693 + }, + { + "epoch": 9.207955888144939, + "grad_norm": 0.14830897748470306, + "learning_rate": 2.9437333333333334e-05, + "loss": 0.0073, + "step": 11694 + }, + { + "epoch": 9.208743599842458, + "grad_norm": 0.35661599040031433, + "learning_rate": 2.9437e-05, + "loss": 0.0169, + "step": 11695 + }, + { + "epoch": 9.209531311539976, + "grad_norm": 0.1927947700023651, + 
"learning_rate": 2.943666666666667e-05, + "loss": 0.0127, + "step": 11696 + }, + { + "epoch": 9.210319023237496, + "grad_norm": 0.22231005132198334, + "learning_rate": 2.9436333333333335e-05, + "loss": 0.012, + "step": 11697 + }, + { + "epoch": 9.211106734935013, + "grad_norm": 0.691202700138092, + "learning_rate": 2.9436e-05, + "loss": 0.0192, + "step": 11698 + }, + { + "epoch": 9.211894446632533, + "grad_norm": 0.1379944384098053, + "learning_rate": 2.9435666666666667e-05, + "loss": 0.0096, + "step": 11699 + }, + { + "epoch": 9.21268215833005, + "grad_norm": 0.20782214403152466, + "learning_rate": 2.9435333333333336e-05, + "loss": 0.012, + "step": 11700 + }, + { + "epoch": 9.21346987002757, + "grad_norm": 0.6277168989181519, + "learning_rate": 2.9435e-05, + "loss": 0.0228, + "step": 11701 + }, + { + "epoch": 9.21425758172509, + "grad_norm": 0.2760486304759979, + "learning_rate": 2.9434666666666668e-05, + "loss": 0.013, + "step": 11702 + }, + { + "epoch": 9.215045293422607, + "grad_norm": 0.2684324085712433, + "learning_rate": 2.9434333333333334e-05, + "loss": 0.0164, + "step": 11703 + }, + { + "epoch": 9.215833005120126, + "grad_norm": 0.4003715217113495, + "learning_rate": 2.9434e-05, + "loss": 0.032, + "step": 11704 + }, + { + "epoch": 9.216620716817644, + "grad_norm": 0.2118326872587204, + "learning_rate": 2.943366666666667e-05, + "loss": 0.0076, + "step": 11705 + }, + { + "epoch": 9.217408428515164, + "grad_norm": 0.21400532126426697, + "learning_rate": 2.9433333333333335e-05, + "loss": 0.0113, + "step": 11706 + }, + { + "epoch": 9.218196140212681, + "grad_norm": 0.32969990372657776, + "learning_rate": 2.9433e-05, + "loss": 0.0193, + "step": 11707 + }, + { + "epoch": 9.2189838519102, + "grad_norm": 0.39098790287971497, + "learning_rate": 2.9432666666666667e-05, + "loss": 0.0089, + "step": 11708 + }, + { + "epoch": 9.21977156360772, + "grad_norm": 0.24603751301765442, + "learning_rate": 2.9432333333333336e-05, + "loss": 0.015, + "step": 11709 + }, + { + "epoch": 9.220559275305238, + "grad_norm": 0.3395046889781952, + "learning_rate": 2.9432e-05, + "loss": 0.0166, + "step": 11710 + }, + { + "epoch": 9.221346987002757, + "grad_norm": 0.18675999343395233, + "learning_rate": 2.9431666666666668e-05, + "loss": 0.0102, + "step": 11711 + }, + { + "epoch": 9.222134698700275, + "grad_norm": 0.1830844134092331, + "learning_rate": 2.9431333333333334e-05, + "loss": 0.0091, + "step": 11712 + }, + { + "epoch": 9.222922410397794, + "grad_norm": 0.2815774381160736, + "learning_rate": 2.9431e-05, + "loss": 0.0144, + "step": 11713 + }, + { + "epoch": 9.223710122095314, + "grad_norm": 0.5375900268554688, + "learning_rate": 2.943066666666667e-05, + "loss": 0.0202, + "step": 11714 + }, + { + "epoch": 9.224497833792832, + "grad_norm": 0.2783139646053314, + "learning_rate": 2.9430333333333332e-05, + "loss": 0.0139, + "step": 11715 + }, + { + "epoch": 9.225285545490351, + "grad_norm": 0.25527137517929077, + "learning_rate": 2.943e-05, + "loss": 0.0106, + "step": 11716 + }, + { + "epoch": 9.226073257187869, + "grad_norm": 0.5294356942176819, + "learning_rate": 2.9429666666666667e-05, + "loss": 0.0165, + "step": 11717 + }, + { + "epoch": 9.226860968885388, + "grad_norm": 0.4398120641708374, + "learning_rate": 2.9429333333333333e-05, + "loss": 0.0157, + "step": 11718 + }, + { + "epoch": 9.227648680582906, + "grad_norm": 0.25155216455459595, + "learning_rate": 2.9429e-05, + "loss": 0.0123, + "step": 11719 + }, + { + "epoch": 9.228436392280425, + "grad_norm": 0.15385104715824127, + "learning_rate": 
2.9428666666666668e-05, + "loss": 0.0085, + "step": 11720 + }, + { + "epoch": 9.229224103977945, + "grad_norm": 0.2503533661365509, + "learning_rate": 2.9428333333333334e-05, + "loss": 0.0121, + "step": 11721 + }, + { + "epoch": 9.230011815675462, + "grad_norm": 0.24156485497951508, + "learning_rate": 2.9428e-05, + "loss": 0.0094, + "step": 11722 + }, + { + "epoch": 9.230799527372982, + "grad_norm": 0.16243134438991547, + "learning_rate": 2.942766666666667e-05, + "loss": 0.0108, + "step": 11723 + }, + { + "epoch": 9.2315872390705, + "grad_norm": 0.3959098756313324, + "learning_rate": 2.9427333333333332e-05, + "loss": 0.0243, + "step": 11724 + }, + { + "epoch": 9.232374950768019, + "grad_norm": 0.4083205759525299, + "learning_rate": 2.9427e-05, + "loss": 0.023, + "step": 11725 + }, + { + "epoch": 9.233162662465537, + "grad_norm": 0.2695927321910858, + "learning_rate": 2.9426666666666667e-05, + "loss": 0.0125, + "step": 11726 + }, + { + "epoch": 9.233950374163056, + "grad_norm": 0.3556275963783264, + "learning_rate": 2.9426333333333333e-05, + "loss": 0.0151, + "step": 11727 + }, + { + "epoch": 9.234738085860576, + "grad_norm": 0.5920835137367249, + "learning_rate": 2.9426e-05, + "loss": 0.032, + "step": 11728 + }, + { + "epoch": 9.235525797558093, + "grad_norm": 0.2626240849494934, + "learning_rate": 2.942566666666667e-05, + "loss": 0.0161, + "step": 11729 + }, + { + "epoch": 9.236313509255613, + "grad_norm": 0.34442344307899475, + "learning_rate": 2.9425333333333334e-05, + "loss": 0.018, + "step": 11730 + }, + { + "epoch": 9.23710122095313, + "grad_norm": 0.5760664343833923, + "learning_rate": 2.9425e-05, + "loss": 0.217, + "step": 11731 + }, + { + "epoch": 9.23788893265065, + "grad_norm": 0.709701657295227, + "learning_rate": 2.942466666666667e-05, + "loss": 0.1673, + "step": 11732 + }, + { + "epoch": 9.23867664434817, + "grad_norm": 0.3826198875904083, + "learning_rate": 2.9424333333333332e-05, + "loss": 0.1289, + "step": 11733 + }, + { + "epoch": 9.239464356045687, + "grad_norm": 0.48403409123420715, + "learning_rate": 2.9424e-05, + "loss": 0.1025, + "step": 11734 + }, + { + "epoch": 9.240252067743207, + "grad_norm": 0.31830865144729614, + "learning_rate": 2.9423666666666667e-05, + "loss": 0.0631, + "step": 11735 + }, + { + "epoch": 9.241039779440724, + "grad_norm": 0.4890585243701935, + "learning_rate": 2.9423333333333333e-05, + "loss": 0.0445, + "step": 11736 + }, + { + "epoch": 9.241827491138244, + "grad_norm": 0.3290810286998749, + "learning_rate": 2.9423e-05, + "loss": 0.0194, + "step": 11737 + }, + { + "epoch": 9.242615202835761, + "grad_norm": 0.21989215910434723, + "learning_rate": 2.942266666666667e-05, + "loss": 0.0195, + "step": 11738 + }, + { + "epoch": 9.24340291453328, + "grad_norm": 0.505785346031189, + "learning_rate": 2.9422333333333335e-05, + "loss": 0.0232, + "step": 11739 + }, + { + "epoch": 9.2441906262308, + "grad_norm": 0.31946927309036255, + "learning_rate": 2.9422e-05, + "loss": 0.0141, + "step": 11740 + }, + { + "epoch": 9.244978337928318, + "grad_norm": 0.42139574885368347, + "learning_rate": 2.942166666666667e-05, + "loss": 0.0165, + "step": 11741 + }, + { + "epoch": 9.245766049625837, + "grad_norm": 0.2254931777715683, + "learning_rate": 2.9421333333333332e-05, + "loss": 0.0141, + "step": 11742 + }, + { + "epoch": 9.246553761323355, + "grad_norm": 0.2100120335817337, + "learning_rate": 2.9421000000000002e-05, + "loss": 0.0132, + "step": 11743 + }, + { + "epoch": 9.247341473020875, + "grad_norm": 0.30860236287117004, + "learning_rate": 2.9420666666666668e-05, 
+ "loss": 0.0135, + "step": 11744 + }, + { + "epoch": 9.248129184718394, + "grad_norm": 0.4228977560997009, + "learning_rate": 2.9420333333333334e-05, + "loss": 0.0088, + "step": 11745 + }, + { + "epoch": 9.248916896415912, + "grad_norm": 0.30130138993263245, + "learning_rate": 2.9420000000000003e-05, + "loss": 0.0206, + "step": 11746 + }, + { + "epoch": 9.249704608113431, + "grad_norm": 0.1551612913608551, + "learning_rate": 2.9419666666666665e-05, + "loss": 0.0107, + "step": 11747 + }, + { + "epoch": 9.250492319810949, + "grad_norm": 0.2276759147644043, + "learning_rate": 2.9419333333333335e-05, + "loss": 0.0148, + "step": 11748 + }, + { + "epoch": 9.251280031508468, + "grad_norm": 0.5365098714828491, + "learning_rate": 2.9419e-05, + "loss": 0.0176, + "step": 11749 + }, + { + "epoch": 9.252067743205986, + "grad_norm": 0.46360424160957336, + "learning_rate": 2.9418666666666667e-05, + "loss": 0.0123, + "step": 11750 + }, + { + "epoch": 9.252855454903505, + "grad_norm": 0.42217403650283813, + "learning_rate": 2.9418333333333333e-05, + "loss": 0.0233, + "step": 11751 + }, + { + "epoch": 9.253643166601025, + "grad_norm": 0.3401978313922882, + "learning_rate": 2.9418000000000002e-05, + "loss": 0.0094, + "step": 11752 + }, + { + "epoch": 9.254430878298542, + "grad_norm": 0.4111584722995758, + "learning_rate": 2.9417666666666664e-05, + "loss": 0.029, + "step": 11753 + }, + { + "epoch": 9.255218589996062, + "grad_norm": 0.13516876101493835, + "learning_rate": 2.9417333333333334e-05, + "loss": 0.0076, + "step": 11754 + }, + { + "epoch": 9.25600630169358, + "grad_norm": 0.4284442365169525, + "learning_rate": 2.9417000000000003e-05, + "loss": 0.0154, + "step": 11755 + }, + { + "epoch": 9.256794013391099, + "grad_norm": 0.23051930963993073, + "learning_rate": 2.9416666666666666e-05, + "loss": 0.0204, + "step": 11756 + }, + { + "epoch": 9.257581725088617, + "grad_norm": 0.2506980895996094, + "learning_rate": 2.9416333333333335e-05, + "loss": 0.0106, + "step": 11757 + }, + { + "epoch": 9.258369436786136, + "grad_norm": 0.30369722843170166, + "learning_rate": 2.9416e-05, + "loss": 0.0159, + "step": 11758 + }, + { + "epoch": 9.259157148483656, + "grad_norm": 0.7298219799995422, + "learning_rate": 2.9415666666666667e-05, + "loss": 0.0175, + "step": 11759 + }, + { + "epoch": 9.259944860181173, + "grad_norm": 0.35565003752708435, + "learning_rate": 2.9415333333333333e-05, + "loss": 0.017, + "step": 11760 + }, + { + "epoch": 9.260732571878693, + "grad_norm": 0.3003367781639099, + "learning_rate": 2.9415000000000002e-05, + "loss": 0.019, + "step": 11761 + }, + { + "epoch": 9.26152028357621, + "grad_norm": 0.359587699174881, + "learning_rate": 2.9414666666666668e-05, + "loss": 0.0132, + "step": 11762 + }, + { + "epoch": 9.26230799527373, + "grad_norm": 0.2207503765821457, + "learning_rate": 2.9414333333333334e-05, + "loss": 0.0158, + "step": 11763 + }, + { + "epoch": 9.26309570697125, + "grad_norm": 0.3104201853275299, + "learning_rate": 2.9414000000000003e-05, + "loss": 0.0202, + "step": 11764 + }, + { + "epoch": 9.263883418668767, + "grad_norm": 0.5418708324432373, + "learning_rate": 2.9413666666666666e-05, + "loss": 0.016, + "step": 11765 + }, + { + "epoch": 9.264671130366287, + "grad_norm": 0.23504631221294403, + "learning_rate": 2.9413333333333335e-05, + "loss": 0.0132, + "step": 11766 + }, + { + "epoch": 9.265458842063804, + "grad_norm": 0.3668391704559326, + "learning_rate": 2.9413e-05, + "loss": 0.0161, + "step": 11767 + }, + { + "epoch": 9.266246553761324, + "grad_norm": 0.27138403058052063, + 
"learning_rate": 2.9412666666666667e-05, + "loss": 0.0137, + "step": 11768 + }, + { + "epoch": 9.267034265458841, + "grad_norm": 0.4986845850944519, + "learning_rate": 2.9412333333333333e-05, + "loss": 0.0131, + "step": 11769 + }, + { + "epoch": 9.26782197715636, + "grad_norm": 0.19926658272743225, + "learning_rate": 2.9412000000000002e-05, + "loss": 0.0103, + "step": 11770 + }, + { + "epoch": 9.26860968885388, + "grad_norm": 0.20494835078716278, + "learning_rate": 2.9411666666666668e-05, + "loss": 0.0104, + "step": 11771 + }, + { + "epoch": 9.269397400551398, + "grad_norm": 0.3069514334201813, + "learning_rate": 2.9411333333333334e-05, + "loss": 0.0094, + "step": 11772 + }, + { + "epoch": 9.270185112248917, + "grad_norm": 0.44741371273994446, + "learning_rate": 2.9411000000000004e-05, + "loss": 0.0212, + "step": 11773 + }, + { + "epoch": 9.270972823946435, + "grad_norm": 0.36756712198257446, + "learning_rate": 2.9410666666666666e-05, + "loss": 0.016, + "step": 11774 + }, + { + "epoch": 9.271760535643955, + "grad_norm": 0.48818591237068176, + "learning_rate": 2.9410333333333335e-05, + "loss": 0.0204, + "step": 11775 + }, + { + "epoch": 9.272548247341472, + "grad_norm": 0.18001453578472137, + "learning_rate": 2.9409999999999998e-05, + "loss": 0.0071, + "step": 11776 + }, + { + "epoch": 9.273335959038992, + "grad_norm": 0.2670212388038635, + "learning_rate": 2.9409666666666667e-05, + "loss": 0.0105, + "step": 11777 + }, + { + "epoch": 9.274123670736511, + "grad_norm": 0.2400531768798828, + "learning_rate": 2.9409333333333333e-05, + "loss": 0.0108, + "step": 11778 + }, + { + "epoch": 9.274911382434029, + "grad_norm": 0.37726083397865295, + "learning_rate": 2.9409e-05, + "loss": 0.0181, + "step": 11779 + }, + { + "epoch": 9.275699094131548, + "grad_norm": 0.3675541579723358, + "learning_rate": 2.940866666666667e-05, + "loss": 0.0164, + "step": 11780 + }, + { + "epoch": 9.276486805829066, + "grad_norm": 0.6788457632064819, + "learning_rate": 2.9408333333333334e-05, + "loss": 0.2025, + "step": 11781 + }, + { + "epoch": 9.277274517526585, + "grad_norm": 0.596764862537384, + "learning_rate": 2.9408e-05, + "loss": 0.1427, + "step": 11782 + }, + { + "epoch": 9.278062229224105, + "grad_norm": 0.5084652304649353, + "learning_rate": 2.9407666666666666e-05, + "loss": 0.1362, + "step": 11783 + }, + { + "epoch": 9.278849940921623, + "grad_norm": 0.5138649344444275, + "learning_rate": 2.9407333333333336e-05, + "loss": 0.1352, + "step": 11784 + }, + { + "epoch": 9.279637652619142, + "grad_norm": 0.42282453179359436, + "learning_rate": 2.9406999999999998e-05, + "loss": 0.0782, + "step": 11785 + }, + { + "epoch": 9.28042536431666, + "grad_norm": 0.4921681880950928, + "learning_rate": 2.9406666666666667e-05, + "loss": 0.0978, + "step": 11786 + }, + { + "epoch": 9.281213076014179, + "grad_norm": 0.16409367322921753, + "learning_rate": 2.9406333333333333e-05, + "loss": 0.0159, + "step": 11787 + }, + { + "epoch": 9.282000787711697, + "grad_norm": 0.2297312617301941, + "learning_rate": 2.9406e-05, + "loss": 0.0351, + "step": 11788 + }, + { + "epoch": 9.282788499409216, + "grad_norm": 0.24316726624965668, + "learning_rate": 2.940566666666667e-05, + "loss": 0.0194, + "step": 11789 + }, + { + "epoch": 9.283576211106736, + "grad_norm": 0.16899895668029785, + "learning_rate": 2.9405333333333335e-05, + "loss": 0.0151, + "step": 11790 + }, + { + "epoch": 9.284363922804253, + "grad_norm": 0.14522667229175568, + "learning_rate": 2.9405e-05, + "loss": 0.0115, + "step": 11791 + }, + { + "epoch": 9.285151634501773, + 
"grad_norm": 0.5184698104858398, + "learning_rate": 2.9404666666666666e-05, + "loss": 0.0225, + "step": 11792 + }, + { + "epoch": 9.28593934619929, + "grad_norm": 0.18968798220157623, + "learning_rate": 2.9404333333333336e-05, + "loss": 0.0166, + "step": 11793 + }, + { + "epoch": 9.28672705789681, + "grad_norm": 0.3957008123397827, + "learning_rate": 2.9404e-05, + "loss": 0.0135, + "step": 11794 + }, + { + "epoch": 9.287514769594328, + "grad_norm": 0.1967475563287735, + "learning_rate": 2.9403666666666668e-05, + "loss": 0.0166, + "step": 11795 + }, + { + "epoch": 9.288302481291847, + "grad_norm": 0.18586388230323792, + "learning_rate": 2.9403333333333337e-05, + "loss": 0.0135, + "step": 11796 + }, + { + "epoch": 9.289090192989367, + "grad_norm": 0.26435568928718567, + "learning_rate": 2.9403e-05, + "loss": 0.0165, + "step": 11797 + }, + { + "epoch": 9.289877904686884, + "grad_norm": 0.2431444674730301, + "learning_rate": 2.940266666666667e-05, + "loss": 0.0092, + "step": 11798 + }, + { + "epoch": 9.290665616384404, + "grad_norm": 0.1314718872308731, + "learning_rate": 2.9402333333333335e-05, + "loss": 0.0115, + "step": 11799 + }, + { + "epoch": 9.291453328081921, + "grad_norm": 0.5653291344642639, + "learning_rate": 2.9402e-05, + "loss": 0.0255, + "step": 11800 + }, + { + "epoch": 9.29224103977944, + "grad_norm": 0.3327479660511017, + "learning_rate": 2.9401666666666667e-05, + "loss": 0.0112, + "step": 11801 + }, + { + "epoch": 9.29302875147696, + "grad_norm": 0.18023252487182617, + "learning_rate": 2.9401333333333336e-05, + "loss": 0.0142, + "step": 11802 + }, + { + "epoch": 9.293816463174478, + "grad_norm": 0.14383256435394287, + "learning_rate": 2.9401e-05, + "loss": 0.013, + "step": 11803 + }, + { + "epoch": 9.294604174871997, + "grad_norm": 0.4012189209461212, + "learning_rate": 2.9400666666666668e-05, + "loss": 0.0209, + "step": 11804 + }, + { + "epoch": 9.295391886569515, + "grad_norm": 0.16960525512695312, + "learning_rate": 2.9400333333333337e-05, + "loss": 0.0161, + "step": 11805 + }, + { + "epoch": 9.296179598267035, + "grad_norm": 0.43577831983566284, + "learning_rate": 2.94e-05, + "loss": 0.0219, + "step": 11806 + }, + { + "epoch": 9.296967309964552, + "grad_norm": 0.16399775445461273, + "learning_rate": 2.939966666666667e-05, + "loss": 0.0093, + "step": 11807 + }, + { + "epoch": 9.297755021662072, + "grad_norm": 0.3026774525642395, + "learning_rate": 2.939933333333333e-05, + "loss": 0.0129, + "step": 11808 + }, + { + "epoch": 9.298542733359591, + "grad_norm": 0.7528718709945679, + "learning_rate": 2.9399e-05, + "loss": 0.0133, + "step": 11809 + }, + { + "epoch": 9.299330445057109, + "grad_norm": 0.36118412017822266, + "learning_rate": 2.9398666666666667e-05, + "loss": 0.015, + "step": 11810 + }, + { + "epoch": 9.300118156754628, + "grad_norm": 0.2624693810939789, + "learning_rate": 2.9398333333333333e-05, + "loss": 0.0108, + "step": 11811 + }, + { + "epoch": 9.300905868452146, + "grad_norm": 0.12620897591114044, + "learning_rate": 2.9398000000000002e-05, + "loss": 0.0067, + "step": 11812 + }, + { + "epoch": 9.301693580149665, + "grad_norm": 0.2061038762331009, + "learning_rate": 2.9397666666666668e-05, + "loss": 0.0111, + "step": 11813 + }, + { + "epoch": 9.302481291847183, + "grad_norm": 0.412126362323761, + "learning_rate": 2.9397333333333334e-05, + "loss": 0.0209, + "step": 11814 + }, + { + "epoch": 9.303269003544703, + "grad_norm": 0.23181354999542236, + "learning_rate": 2.9397e-05, + "loss": 0.0141, + "step": 11815 + }, + { + "epoch": 9.304056715242222, + "grad_norm": 
0.33150842785835266, + "learning_rate": 2.939666666666667e-05, + "loss": 0.0143, + "step": 11816 + }, + { + "epoch": 9.30484442693974, + "grad_norm": 0.2079276144504547, + "learning_rate": 2.9396333333333332e-05, + "loss": 0.011, + "step": 11817 + }, + { + "epoch": 9.30563213863726, + "grad_norm": 0.24909712374210358, + "learning_rate": 2.9396e-05, + "loss": 0.0148, + "step": 11818 + }, + { + "epoch": 9.306419850334777, + "grad_norm": 0.24101777374744415, + "learning_rate": 2.9395666666666667e-05, + "loss": 0.0081, + "step": 11819 + }, + { + "epoch": 9.307207562032296, + "grad_norm": 0.459449827671051, + "learning_rate": 2.9395333333333333e-05, + "loss": 0.0179, + "step": 11820 + }, + { + "epoch": 9.307995273729816, + "grad_norm": 0.46269163489341736, + "learning_rate": 2.9395000000000002e-05, + "loss": 0.0123, + "step": 11821 + }, + { + "epoch": 9.308782985427333, + "grad_norm": 1.4620603322982788, + "learning_rate": 2.9394666666666668e-05, + "loss": 0.0198, + "step": 11822 + }, + { + "epoch": 9.309570697124853, + "grad_norm": 0.40077540278434753, + "learning_rate": 2.9394333333333334e-05, + "loss": 0.0149, + "step": 11823 + }, + { + "epoch": 9.31035840882237, + "grad_norm": 0.3056811988353729, + "learning_rate": 2.9394e-05, + "loss": 0.0143, + "step": 11824 + }, + { + "epoch": 9.31114612051989, + "grad_norm": 0.5227823257446289, + "learning_rate": 2.939366666666667e-05, + "loss": 0.0174, + "step": 11825 + }, + { + "epoch": 9.311933832217408, + "grad_norm": 0.4400835931301117, + "learning_rate": 2.9393333333333332e-05, + "loss": 0.0167, + "step": 11826 + }, + { + "epoch": 9.312721543914927, + "grad_norm": 0.7918035984039307, + "learning_rate": 2.9393e-05, + "loss": 0.045, + "step": 11827 + }, + { + "epoch": 9.313509255612447, + "grad_norm": 0.8421259522438049, + "learning_rate": 2.9392666666666667e-05, + "loss": 0.0271, + "step": 11828 + }, + { + "epoch": 9.314296967309964, + "grad_norm": 0.2756236493587494, + "learning_rate": 2.9392333333333333e-05, + "loss": 0.0164, + "step": 11829 + }, + { + "epoch": 9.315084679007484, + "grad_norm": 0.7153330445289612, + "learning_rate": 2.9392000000000003e-05, + "loss": 0.0383, + "step": 11830 + }, + { + "epoch": 9.315872390705001, + "grad_norm": 0.6273420453071594, + "learning_rate": 2.939166666666667e-05, + "loss": 0.1973, + "step": 11831 + }, + { + "epoch": 9.31666010240252, + "grad_norm": 0.5496479272842407, + "learning_rate": 2.9391333333333334e-05, + "loss": 0.1499, + "step": 11832 + }, + { + "epoch": 9.317447814100039, + "grad_norm": 0.568600058555603, + "learning_rate": 2.9391e-05, + "loss": 0.1857, + "step": 11833 + }, + { + "epoch": 9.318235525797558, + "grad_norm": 0.5716743469238281, + "learning_rate": 2.939066666666667e-05, + "loss": 0.1501, + "step": 11834 + }, + { + "epoch": 9.319023237495077, + "grad_norm": 0.4675544500350952, + "learning_rate": 2.9390333333333332e-05, + "loss": 0.0899, + "step": 11835 + }, + { + "epoch": 9.319810949192595, + "grad_norm": 0.24261006712913513, + "learning_rate": 2.939e-05, + "loss": 0.0392, + "step": 11836 + }, + { + "epoch": 9.320598660890115, + "grad_norm": 0.34150230884552, + "learning_rate": 2.9389666666666667e-05, + "loss": 0.0267, + "step": 11837 + }, + { + "epoch": 9.321386372587632, + "grad_norm": 0.27354660630226135, + "learning_rate": 2.9389333333333333e-05, + "loss": 0.0302, + "step": 11838 + }, + { + "epoch": 9.322174084285152, + "grad_norm": 0.24740685522556305, + "learning_rate": 2.9389000000000003e-05, + "loss": 0.0114, + "step": 11839 + }, + { + "epoch": 9.322961795982671, + 
"grad_norm": 0.24331510066986084, + "learning_rate": 2.9388666666666665e-05, + "loss": 0.0171, + "step": 11840 + }, + { + "epoch": 9.323749507680189, + "grad_norm": 0.3153528571128845, + "learning_rate": 2.9388333333333335e-05, + "loss": 0.0145, + "step": 11841 + }, + { + "epoch": 9.324537219377708, + "grad_norm": 0.2756885588169098, + "learning_rate": 2.9388e-05, + "loss": 0.0147, + "step": 11842 + }, + { + "epoch": 9.325324931075226, + "grad_norm": 0.22473162412643433, + "learning_rate": 2.9387666666666666e-05, + "loss": 0.0139, + "step": 11843 + }, + { + "epoch": 9.326112642772745, + "grad_norm": 0.20932336151599884, + "learning_rate": 2.9387333333333332e-05, + "loss": 0.0097, + "step": 11844 + }, + { + "epoch": 9.326900354470263, + "grad_norm": 0.22804805636405945, + "learning_rate": 2.9387000000000002e-05, + "loss": 0.0195, + "step": 11845 + }, + { + "epoch": 9.327688066167783, + "grad_norm": 0.1771724969148636, + "learning_rate": 2.9386666666666668e-05, + "loss": 0.0116, + "step": 11846 + }, + { + "epoch": 9.328475777865302, + "grad_norm": 0.2771623134613037, + "learning_rate": 2.9386333333333334e-05, + "loss": 0.0152, + "step": 11847 + }, + { + "epoch": 9.32926348956282, + "grad_norm": 0.4385918378829956, + "learning_rate": 2.9386000000000003e-05, + "loss": 0.0127, + "step": 11848 + }, + { + "epoch": 9.33005120126034, + "grad_norm": 0.4396028220653534, + "learning_rate": 2.9385666666666665e-05, + "loss": 0.0207, + "step": 11849 + }, + { + "epoch": 9.330838912957857, + "grad_norm": 0.38665780425071716, + "learning_rate": 2.9385333333333335e-05, + "loss": 0.0192, + "step": 11850 + }, + { + "epoch": 9.331626624655376, + "grad_norm": 0.37322938442230225, + "learning_rate": 2.9385e-05, + "loss": 0.0197, + "step": 11851 + }, + { + "epoch": 9.332414336352894, + "grad_norm": 0.16671860218048096, + "learning_rate": 2.9384666666666667e-05, + "loss": 0.0163, + "step": 11852 + }, + { + "epoch": 9.333202048050413, + "grad_norm": 0.19907855987548828, + "learning_rate": 2.9384333333333333e-05, + "loss": 0.014, + "step": 11853 + }, + { + "epoch": 9.333989759747933, + "grad_norm": 0.13083359599113464, + "learning_rate": 2.9384000000000002e-05, + "loss": 0.0066, + "step": 11854 + }, + { + "epoch": 9.33477747144545, + "grad_norm": 0.714702844619751, + "learning_rate": 2.9383666666666668e-05, + "loss": 0.0143, + "step": 11855 + }, + { + "epoch": 9.33556518314297, + "grad_norm": 0.1685410737991333, + "learning_rate": 2.9383333333333334e-05, + "loss": 0.0098, + "step": 11856 + }, + { + "epoch": 9.336352894840488, + "grad_norm": 0.28751909732818604, + "learning_rate": 2.9383000000000003e-05, + "loss": 0.0134, + "step": 11857 + }, + { + "epoch": 9.337140606538007, + "grad_norm": 0.29717546701431274, + "learning_rate": 2.9382666666666666e-05, + "loss": 0.0175, + "step": 11858 + }, + { + "epoch": 9.337928318235527, + "grad_norm": 0.23468494415283203, + "learning_rate": 2.9382333333333335e-05, + "loss": 0.0143, + "step": 11859 + }, + { + "epoch": 9.338716029933044, + "grad_norm": 0.2105410099029541, + "learning_rate": 2.9382e-05, + "loss": 0.0114, + "step": 11860 + }, + { + "epoch": 9.339503741630564, + "grad_norm": 0.2932196855545044, + "learning_rate": 2.9381666666666667e-05, + "loss": 0.0172, + "step": 11861 + }, + { + "epoch": 9.340291453328081, + "grad_norm": 0.15232929587364197, + "learning_rate": 2.9381333333333336e-05, + "loss": 0.0072, + "step": 11862 + }, + { + "epoch": 9.3410791650256, + "grad_norm": 0.23737278580665588, + "learning_rate": 2.9381000000000002e-05, + "loss": 0.0109, + "step": 11863 + 
}, + { + "epoch": 9.341866876723119, + "grad_norm": 0.19606873393058777, + "learning_rate": 2.9380666666666668e-05, + "loss": 0.0105, + "step": 11864 + }, + { + "epoch": 9.342654588420638, + "grad_norm": 0.451729953289032, + "learning_rate": 2.9380333333333334e-05, + "loss": 0.0208, + "step": 11865 + }, + { + "epoch": 9.343442300118157, + "grad_norm": 0.2332930564880371, + "learning_rate": 2.938e-05, + "loss": 0.0109, + "step": 11866 + }, + { + "epoch": 9.344230011815675, + "grad_norm": 0.13023526966571808, + "learning_rate": 2.9379666666666666e-05, + "loss": 0.0066, + "step": 11867 + }, + { + "epoch": 9.345017723513195, + "grad_norm": 0.31281670928001404, + "learning_rate": 2.9379333333333335e-05, + "loss": 0.0123, + "step": 11868 + }, + { + "epoch": 9.345805435210712, + "grad_norm": 0.13813410699367523, + "learning_rate": 2.9378999999999998e-05, + "loss": 0.0075, + "step": 11869 + }, + { + "epoch": 9.346593146908232, + "grad_norm": 0.28493577241897583, + "learning_rate": 2.9378666666666667e-05, + "loss": 0.0265, + "step": 11870 + }, + { + "epoch": 9.34738085860575, + "grad_norm": 0.2696811854839325, + "learning_rate": 2.9378333333333336e-05, + "loss": 0.0171, + "step": 11871 + }, + { + "epoch": 9.348168570303269, + "grad_norm": 0.14562633633613586, + "learning_rate": 2.9378e-05, + "loss": 0.0077, + "step": 11872 + }, + { + "epoch": 9.348956282000788, + "grad_norm": 0.14504940807819366, + "learning_rate": 2.9377666666666668e-05, + "loss": 0.0106, + "step": 11873 + }, + { + "epoch": 9.349743993698306, + "grad_norm": 0.3126898407936096, + "learning_rate": 2.9377333333333334e-05, + "loss": 0.0221, + "step": 11874 + }, + { + "epoch": 9.350531705395825, + "grad_norm": 0.27006858587265015, + "learning_rate": 2.9377e-05, + "loss": 0.0134, + "step": 11875 + }, + { + "epoch": 9.351319417093343, + "grad_norm": 0.3883313536643982, + "learning_rate": 2.9376666666666666e-05, + "loss": 0.0142, + "step": 11876 + }, + { + "epoch": 9.352107128790863, + "grad_norm": 0.14253512024879456, + "learning_rate": 2.9376333333333335e-05, + "loss": 0.0063, + "step": 11877 + }, + { + "epoch": 9.352894840488382, + "grad_norm": 0.45201802253723145, + "learning_rate": 2.9375999999999998e-05, + "loss": 0.0144, + "step": 11878 + }, + { + "epoch": 9.3536825521859, + "grad_norm": 0.34942853450775146, + "learning_rate": 2.9375666666666667e-05, + "loss": 0.0134, + "step": 11879 + }, + { + "epoch": 9.35447026388342, + "grad_norm": 0.32985326647758484, + "learning_rate": 2.9375333333333337e-05, + "loss": 0.0211, + "step": 11880 + }, + { + "epoch": 9.355257975580937, + "grad_norm": 0.7649333477020264, + "learning_rate": 2.9375e-05, + "loss": 0.2411, + "step": 11881 + }, + { + "epoch": 9.356045687278456, + "grad_norm": 0.401239812374115, + "learning_rate": 2.937466666666667e-05, + "loss": 0.0994, + "step": 11882 + }, + { + "epoch": 9.356833398975974, + "grad_norm": 0.46971043944358826, + "learning_rate": 2.9374333333333334e-05, + "loss": 0.0978, + "step": 11883 + }, + { + "epoch": 9.357621110673493, + "grad_norm": 0.5698368549346924, + "learning_rate": 2.9374e-05, + "loss": 0.0993, + "step": 11884 + }, + { + "epoch": 9.358408822371013, + "grad_norm": 0.48080042004585266, + "learning_rate": 2.9373666666666666e-05, + "loss": 0.0754, + "step": 11885 + }, + { + "epoch": 9.35919653406853, + "grad_norm": 0.2514106333255768, + "learning_rate": 2.9373333333333336e-05, + "loss": 0.0373, + "step": 11886 + }, + { + "epoch": 9.35998424576605, + "grad_norm": 0.3696558475494385, + "learning_rate": 2.9373e-05, + "loss": 0.032, + "step": 11887 + 
}, + { + "epoch": 9.360771957463568, + "grad_norm": 0.33132827281951904, + "learning_rate": 2.9372666666666667e-05, + "loss": 0.026, + "step": 11888 + }, + { + "epoch": 9.361559669161087, + "grad_norm": 0.257862389087677, + "learning_rate": 2.9372333333333337e-05, + "loss": 0.0199, + "step": 11889 + }, + { + "epoch": 9.362347380858605, + "grad_norm": 0.19093254208564758, + "learning_rate": 2.9372e-05, + "loss": 0.0103, + "step": 11890 + }, + { + "epoch": 9.363135092556124, + "grad_norm": 0.1290038526058197, + "learning_rate": 2.937166666666667e-05, + "loss": 0.0104, + "step": 11891 + }, + { + "epoch": 9.363922804253644, + "grad_norm": 0.210408017039299, + "learning_rate": 2.9371333333333335e-05, + "loss": 0.0176, + "step": 11892 + }, + { + "epoch": 9.364710515951161, + "grad_norm": 0.28988558053970337, + "learning_rate": 2.9371e-05, + "loss": 0.0125, + "step": 11893 + }, + { + "epoch": 9.365498227648681, + "grad_norm": 0.302855908870697, + "learning_rate": 2.9370666666666666e-05, + "loss": 0.0156, + "step": 11894 + }, + { + "epoch": 9.366285939346199, + "grad_norm": 0.32023918628692627, + "learning_rate": 2.9370333333333336e-05, + "loss": 0.0175, + "step": 11895 + }, + { + "epoch": 9.367073651043718, + "grad_norm": 0.45455673336982727, + "learning_rate": 2.9370000000000002e-05, + "loss": 0.032, + "step": 11896 + }, + { + "epoch": 9.367861362741237, + "grad_norm": 0.19085553288459778, + "learning_rate": 2.9369666666666668e-05, + "loss": 0.0098, + "step": 11897 + }, + { + "epoch": 9.368649074438755, + "grad_norm": 0.2865551710128784, + "learning_rate": 2.9369333333333334e-05, + "loss": 0.015, + "step": 11898 + }, + { + "epoch": 9.369436786136275, + "grad_norm": 0.16747687757015228, + "learning_rate": 2.9369e-05, + "loss": 0.0105, + "step": 11899 + }, + { + "epoch": 9.370224497833792, + "grad_norm": 0.69572514295578, + "learning_rate": 2.936866666666667e-05, + "loss": 0.0239, + "step": 11900 + }, + { + "epoch": 9.371012209531312, + "grad_norm": 0.22773964703083038, + "learning_rate": 2.936833333333333e-05, + "loss": 0.0087, + "step": 11901 + }, + { + "epoch": 9.37179992122883, + "grad_norm": 0.22886185348033905, + "learning_rate": 2.9368e-05, + "loss": 0.0169, + "step": 11902 + }, + { + "epoch": 9.372587632926349, + "grad_norm": 0.16025778651237488, + "learning_rate": 2.9367666666666667e-05, + "loss": 0.0103, + "step": 11903 + }, + { + "epoch": 9.373375344623868, + "grad_norm": 0.2316143959760666, + "learning_rate": 2.9367333333333333e-05, + "loss": 0.0125, + "step": 11904 + }, + { + "epoch": 9.374163056321386, + "grad_norm": 0.21288251876831055, + "learning_rate": 2.9367000000000002e-05, + "loss": 0.0103, + "step": 11905 + }, + { + "epoch": 9.374950768018905, + "grad_norm": 0.2267528623342514, + "learning_rate": 2.9366666666666668e-05, + "loss": 0.0566, + "step": 11906 + }, + { + "epoch": 9.375738479716423, + "grad_norm": 0.14974536001682281, + "learning_rate": 2.9366333333333334e-05, + "loss": 0.0079, + "step": 11907 + }, + { + "epoch": 9.376526191413943, + "grad_norm": 0.39561405777931213, + "learning_rate": 2.9366e-05, + "loss": 0.014, + "step": 11908 + }, + { + "epoch": 9.37731390311146, + "grad_norm": 0.47884002327919006, + "learning_rate": 2.936566666666667e-05, + "loss": 0.0118, + "step": 11909 + }, + { + "epoch": 9.37810161480898, + "grad_norm": 0.356246680021286, + "learning_rate": 2.936533333333333e-05, + "loss": 0.0161, + "step": 11910 + }, + { + "epoch": 9.3788893265065, + "grad_norm": 0.21851249039173126, + "learning_rate": 2.9365e-05, + "loss": 0.0147, + "step": 11911 + }, + { 
+ "epoch": 9.379677038204017, + "grad_norm": 0.20387616753578186, + "learning_rate": 2.936466666666667e-05, + "loss": 0.0069, + "step": 11912 + }, + { + "epoch": 9.380464749901536, + "grad_norm": 0.21098172664642334, + "learning_rate": 2.9364333333333333e-05, + "loss": 0.0153, + "step": 11913 + }, + { + "epoch": 9.381252461599054, + "grad_norm": 0.1742251217365265, + "learning_rate": 2.9364000000000002e-05, + "loss": 0.0103, + "step": 11914 + }, + { + "epoch": 9.382040173296573, + "grad_norm": 0.28600090742111206, + "learning_rate": 2.9363666666666668e-05, + "loss": 0.0455, + "step": 11915 + }, + { + "epoch": 9.382827884994093, + "grad_norm": 0.4306120276451111, + "learning_rate": 2.9363333333333334e-05, + "loss": 0.0305, + "step": 11916 + }, + { + "epoch": 9.38361559669161, + "grad_norm": 0.14446376264095306, + "learning_rate": 2.9363e-05, + "loss": 0.0085, + "step": 11917 + }, + { + "epoch": 9.38440330838913, + "grad_norm": 0.3709288239479065, + "learning_rate": 2.936266666666667e-05, + "loss": 0.0229, + "step": 11918 + }, + { + "epoch": 9.385191020086648, + "grad_norm": 0.2369428128004074, + "learning_rate": 2.9362333333333332e-05, + "loss": 0.0135, + "step": 11919 + }, + { + "epoch": 9.385978731784167, + "grad_norm": 0.39664024114608765, + "learning_rate": 2.9362e-05, + "loss": 0.0227, + "step": 11920 + }, + { + "epoch": 9.386766443481685, + "grad_norm": 0.31455326080322266, + "learning_rate": 2.936166666666667e-05, + "loss": 0.0136, + "step": 11921 + }, + { + "epoch": 9.387554155179204, + "grad_norm": 0.1446397602558136, + "learning_rate": 2.9361333333333333e-05, + "loss": 0.0094, + "step": 11922 + }, + { + "epoch": 9.388341866876724, + "grad_norm": 0.19892002642154694, + "learning_rate": 2.9361000000000002e-05, + "loss": 0.0129, + "step": 11923 + }, + { + "epoch": 9.389129578574241, + "grad_norm": 0.3052932024002075, + "learning_rate": 2.9360666666666668e-05, + "loss": 0.0163, + "step": 11924 + }, + { + "epoch": 9.389917290271761, + "grad_norm": 0.24211178719997406, + "learning_rate": 2.9360333333333334e-05, + "loss": 0.0127, + "step": 11925 + }, + { + "epoch": 9.390705001969279, + "grad_norm": 0.772782027721405, + "learning_rate": 2.936e-05, + "loss": 0.0155, + "step": 11926 + }, + { + "epoch": 9.391492713666798, + "grad_norm": 0.2878795564174652, + "learning_rate": 2.9359666666666666e-05, + "loss": 0.0245, + "step": 11927 + }, + { + "epoch": 9.392280425364318, + "grad_norm": 0.45938435196876526, + "learning_rate": 2.9359333333333332e-05, + "loss": 0.0216, + "step": 11928 + }, + { + "epoch": 9.393068137061835, + "grad_norm": 0.3603096306324005, + "learning_rate": 2.9359e-05, + "loss": 0.0167, + "step": 11929 + }, + { + "epoch": 9.393855848759355, + "grad_norm": 0.20985102653503418, + "learning_rate": 2.9358666666666667e-05, + "loss": 0.0108, + "step": 11930 + }, + { + "epoch": 9.394643560456872, + "grad_norm": 0.5653095245361328, + "learning_rate": 2.9358333333333333e-05, + "loss": 0.2349, + "step": 11931 + }, + { + "epoch": 9.395431272154392, + "grad_norm": 0.5223917365074158, + "learning_rate": 2.9358000000000003e-05, + "loss": 0.1468, + "step": 11932 + }, + { + "epoch": 9.39621898385191, + "grad_norm": 0.6283195614814758, + "learning_rate": 2.9357666666666665e-05, + "loss": 0.0997, + "step": 11933 + }, + { + "epoch": 9.397006695549429, + "grad_norm": 0.5201848745346069, + "learning_rate": 2.9357333333333334e-05, + "loss": 0.1334, + "step": 11934 + }, + { + "epoch": 9.397794407246948, + "grad_norm": 0.4920872151851654, + "learning_rate": 2.9357e-05, + "loss": 0.1264, + "step": 
11935 + }, + { + "epoch": 9.398582118944466, + "grad_norm": 0.4188419282436371, + "learning_rate": 2.9356666666666666e-05, + "loss": 0.0401, + "step": 11936 + }, + { + "epoch": 9.399369830641986, + "grad_norm": 0.4041748642921448, + "learning_rate": 2.9356333333333336e-05, + "loss": 0.0451, + "step": 11937 + }, + { + "epoch": 9.400157542339503, + "grad_norm": 0.26650699973106384, + "learning_rate": 2.9356e-05, + "loss": 0.0191, + "step": 11938 + }, + { + "epoch": 9.400945254037023, + "grad_norm": 0.17605087161064148, + "learning_rate": 2.9355666666666667e-05, + "loss": 0.0428, + "step": 11939 + }, + { + "epoch": 9.40173296573454, + "grad_norm": 0.455792635679245, + "learning_rate": 2.9355333333333333e-05, + "loss": 0.0193, + "step": 11940 + }, + { + "epoch": 9.40252067743206, + "grad_norm": 0.29746323823928833, + "learning_rate": 2.9355000000000003e-05, + "loss": 0.0108, + "step": 11941 + }, + { + "epoch": 9.40330838912958, + "grad_norm": 0.4036015570163727, + "learning_rate": 2.9354666666666665e-05, + "loss": 0.0154, + "step": 11942 + }, + { + "epoch": 9.404096100827097, + "grad_norm": 0.1706131547689438, + "learning_rate": 2.9354333333333335e-05, + "loss": 0.0093, + "step": 11943 + }, + { + "epoch": 9.404883812524616, + "grad_norm": 0.15558435022830963, + "learning_rate": 2.9354e-05, + "loss": 0.0106, + "step": 11944 + }, + { + "epoch": 9.405671524222134, + "grad_norm": 0.535973072052002, + "learning_rate": 2.9353666666666666e-05, + "loss": 0.0125, + "step": 11945 + }, + { + "epoch": 9.406459235919653, + "grad_norm": 0.37875932455062866, + "learning_rate": 2.9353333333333336e-05, + "loss": 0.013, + "step": 11946 + }, + { + "epoch": 9.407246947617173, + "grad_norm": 0.3311537802219391, + "learning_rate": 2.9353000000000002e-05, + "loss": 0.0118, + "step": 11947 + }, + { + "epoch": 9.40803465931469, + "grad_norm": 0.315901517868042, + "learning_rate": 2.9352666666666668e-05, + "loss": 0.0172, + "step": 11948 + }, + { + "epoch": 9.40882237101221, + "grad_norm": 0.4825879633426666, + "learning_rate": 2.9352333333333334e-05, + "loss": 0.0163, + "step": 11949 + }, + { + "epoch": 9.409610082709728, + "grad_norm": 0.2574754059314728, + "learning_rate": 2.9352000000000003e-05, + "loss": 0.0147, + "step": 11950 + }, + { + "epoch": 9.410397794407247, + "grad_norm": 0.36195576190948486, + "learning_rate": 2.9351666666666665e-05, + "loss": 0.0117, + "step": 11951 + }, + { + "epoch": 9.411185506104765, + "grad_norm": 0.15525677800178528, + "learning_rate": 2.9351333333333335e-05, + "loss": 0.0099, + "step": 11952 + }, + { + "epoch": 9.411973217802284, + "grad_norm": 0.19568686187267303, + "learning_rate": 2.9351e-05, + "loss": 0.0163, + "step": 11953 + }, + { + "epoch": 9.412760929499804, + "grad_norm": 0.22557362914085388, + "learning_rate": 2.9350666666666667e-05, + "loss": 0.0121, + "step": 11954 + }, + { + "epoch": 9.413548641197321, + "grad_norm": 0.32776933908462524, + "learning_rate": 2.9350333333333336e-05, + "loss": 0.0147, + "step": 11955 + }, + { + "epoch": 9.414336352894841, + "grad_norm": 0.24502260982990265, + "learning_rate": 2.9350000000000002e-05, + "loss": 0.0084, + "step": 11956 + }, + { + "epoch": 9.415124064592359, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.9349666666666668e-05, + "loss": 0.0087, + "step": 11957 + }, + { + "epoch": 9.415911776289878, + "grad_norm": 0.25698593258857727, + "learning_rate": 2.9349333333333334e-05, + "loss": 0.0188, + "step": 11958 + }, + { + "epoch": 9.416699487987396, + "grad_norm": 0.33278989791870117, + "learning_rate": 2.9349e-05, + 
"loss": 0.0146, + "step": 11959 + }, + { + "epoch": 9.417487199684915, + "grad_norm": 0.280178964138031, + "learning_rate": 2.9348666666666666e-05, + "loss": 0.0184, + "step": 11960 + }, + { + "epoch": 9.418274911382435, + "grad_norm": 0.16433072090148926, + "learning_rate": 2.9348333333333335e-05, + "loss": 0.0148, + "step": 11961 + }, + { + "epoch": 9.419062623079952, + "grad_norm": 0.41563233733177185, + "learning_rate": 2.9348e-05, + "loss": 0.0219, + "step": 11962 + }, + { + "epoch": 9.419850334777472, + "grad_norm": 0.31459614634513855, + "learning_rate": 2.9347666666666667e-05, + "loss": 0.0165, + "step": 11963 + }, + { + "epoch": 9.42063804647499, + "grad_norm": 0.46519482135772705, + "learning_rate": 2.9347333333333336e-05, + "loss": 0.0187, + "step": 11964 + }, + { + "epoch": 9.421425758172509, + "grad_norm": 0.23701414465904236, + "learning_rate": 2.9347e-05, + "loss": 0.0275, + "step": 11965 + }, + { + "epoch": 9.422213469870028, + "grad_norm": 0.21538688242435455, + "learning_rate": 2.9346666666666668e-05, + "loss": 0.0146, + "step": 11966 + }, + { + "epoch": 9.423001181567546, + "grad_norm": 0.42138200998306274, + "learning_rate": 2.9346333333333334e-05, + "loss": 0.0159, + "step": 11967 + }, + { + "epoch": 9.423788893265066, + "grad_norm": 0.26664218306541443, + "learning_rate": 2.9346e-05, + "loss": 0.0077, + "step": 11968 + }, + { + "epoch": 9.424576604962583, + "grad_norm": 0.36004936695098877, + "learning_rate": 2.9345666666666666e-05, + "loss": 0.0213, + "step": 11969 + }, + { + "epoch": 9.425364316660103, + "grad_norm": 0.2594984173774719, + "learning_rate": 2.9345333333333335e-05, + "loss": 0.0136, + "step": 11970 + }, + { + "epoch": 9.42615202835762, + "grad_norm": 0.16818563640117645, + "learning_rate": 2.9345e-05, + "loss": 0.0091, + "step": 11971 + }, + { + "epoch": 9.42693974005514, + "grad_norm": 0.3207676410675049, + "learning_rate": 2.9344666666666667e-05, + "loss": 0.0183, + "step": 11972 + }, + { + "epoch": 9.42772745175266, + "grad_norm": 0.23831553757190704, + "learning_rate": 2.9344333333333336e-05, + "loss": 0.013, + "step": 11973 + }, + { + "epoch": 9.428515163450177, + "grad_norm": 0.29101893305778503, + "learning_rate": 2.9344e-05, + "loss": 0.0169, + "step": 11974 + }, + { + "epoch": 9.429302875147696, + "grad_norm": 0.3427157402038574, + "learning_rate": 2.9343666666666668e-05, + "loss": 0.017, + "step": 11975 + }, + { + "epoch": 9.430090586845214, + "grad_norm": 0.6791983246803284, + "learning_rate": 2.9343333333333334e-05, + "loss": 0.0222, + "step": 11976 + }, + { + "epoch": 9.430878298542734, + "grad_norm": 0.6001702547073364, + "learning_rate": 2.9343e-05, + "loss": 0.0234, + "step": 11977 + }, + { + "epoch": 9.431666010240253, + "grad_norm": 0.42323288321495056, + "learning_rate": 2.9342666666666666e-05, + "loss": 0.0113, + "step": 11978 + }, + { + "epoch": 9.43245372193777, + "grad_norm": 0.4139474630355835, + "learning_rate": 2.9342333333333335e-05, + "loss": 0.0142, + "step": 11979 + }, + { + "epoch": 9.43324143363529, + "grad_norm": 0.3669714629650116, + "learning_rate": 2.9342e-05, + "loss": 0.0184, + "step": 11980 + }, + { + "epoch": 9.434029145332808, + "grad_norm": 0.7986551523208618, + "learning_rate": 2.9341666666666667e-05, + "loss": 0.2313, + "step": 11981 + }, + { + "epoch": 9.434816857030327, + "grad_norm": 0.630294919013977, + "learning_rate": 2.9341333333333337e-05, + "loss": 0.1611, + "step": 11982 + }, + { + "epoch": 9.435604568727845, + "grad_norm": 0.36635902523994446, + "learning_rate": 2.9341e-05, + "loss": 0.0963, + 
"step": 11983 + }, + { + "epoch": 9.436392280425364, + "grad_norm": 0.5819231271743774, + "learning_rate": 2.934066666666667e-05, + "loss": 0.1062, + "step": 11984 + }, + { + "epoch": 9.437179992122884, + "grad_norm": 0.3778388798236847, + "learning_rate": 2.9340333333333334e-05, + "loss": 0.0719, + "step": 11985 + }, + { + "epoch": 9.437967703820402, + "grad_norm": 0.3616527020931244, + "learning_rate": 2.934e-05, + "loss": 0.078, + "step": 11986 + }, + { + "epoch": 9.438755415517921, + "grad_norm": 0.2785639762878418, + "learning_rate": 2.933966666666667e-05, + "loss": 0.0307, + "step": 11987 + }, + { + "epoch": 9.439543127215439, + "grad_norm": 0.3261738419532776, + "learning_rate": 2.9339333333333332e-05, + "loss": 0.0305, + "step": 11988 + }, + { + "epoch": 9.440330838912958, + "grad_norm": 0.7529861330986023, + "learning_rate": 2.9339e-05, + "loss": 0.0388, + "step": 11989 + }, + { + "epoch": 9.441118550610476, + "grad_norm": 0.18580986559391022, + "learning_rate": 2.9338666666666667e-05, + "loss": 0.01, + "step": 11990 + }, + { + "epoch": 9.441906262307995, + "grad_norm": 0.207586869597435, + "learning_rate": 2.9338333333333333e-05, + "loss": 0.0172, + "step": 11991 + }, + { + "epoch": 9.442693974005515, + "grad_norm": 0.11757068336009979, + "learning_rate": 2.9338e-05, + "loss": 0.0087, + "step": 11992 + }, + { + "epoch": 9.443481685703032, + "grad_norm": 0.6941796541213989, + "learning_rate": 2.933766666666667e-05, + "loss": 0.0153, + "step": 11993 + }, + { + "epoch": 9.444269397400552, + "grad_norm": 0.22721976041793823, + "learning_rate": 2.933733333333333e-05, + "loss": 0.0165, + "step": 11994 + }, + { + "epoch": 9.44505710909807, + "grad_norm": 0.24363894760608673, + "learning_rate": 2.9337e-05, + "loss": 0.0215, + "step": 11995 + }, + { + "epoch": 9.445844820795589, + "grad_norm": 0.12883563339710236, + "learning_rate": 2.933666666666667e-05, + "loss": 0.0109, + "step": 11996 + }, + { + "epoch": 9.446632532493108, + "grad_norm": 0.17389917373657227, + "learning_rate": 2.9336333333333332e-05, + "loss": 0.0066, + "step": 11997 + }, + { + "epoch": 9.447420244190626, + "grad_norm": 0.231268510222435, + "learning_rate": 2.9336000000000002e-05, + "loss": 0.0137, + "step": 11998 + }, + { + "epoch": 9.448207955888146, + "grad_norm": 0.2812953591346741, + "learning_rate": 2.9335666666666668e-05, + "loss": 0.0191, + "step": 11999 + }, + { + "epoch": 9.448995667585663, + "grad_norm": 0.2826792299747467, + "learning_rate": 2.9335333333333334e-05, + "loss": 0.014, + "step": 12000 + }, + { + "epoch": 9.448995667585663, + "eval_cer": 0.11615729579801436, + "eval_loss": 0.31181466579437256, + "eval_runtime": 16.7832, + "eval_samples_per_second": 18.113, + "eval_steps_per_second": 0.596, + "eval_wer": 0.40483499616270147, + "step": 12000 + }, + { + "epoch": 9.449783379283183, + "grad_norm": 0.45819321274757385, + "learning_rate": 2.9335e-05, + "loss": 0.0218, + "step": 12001 + }, + { + "epoch": 9.4505710909807, + "grad_norm": 0.2518042325973511, + "learning_rate": 2.933466666666667e-05, + "loss": 0.0089, + "step": 12002 + }, + { + "epoch": 9.45135880267822, + "grad_norm": 0.22023923695087433, + "learning_rate": 2.933433333333333e-05, + "loss": 0.0506, + "step": 12003 + }, + { + "epoch": 9.45214651437574, + "grad_norm": 0.32749998569488525, + "learning_rate": 2.9334e-05, + "loss": 0.0138, + "step": 12004 + }, + { + "epoch": 9.452934226073257, + "grad_norm": 0.48048534989356995, + "learning_rate": 2.933366666666667e-05, + "loss": 0.0151, + "step": 12005 + }, + { + "epoch": 9.453721937770776, + 
"grad_norm": 0.25929930806159973, + "learning_rate": 2.9333333333333333e-05, + "loss": 0.0112, + "step": 12006 + }, + { + "epoch": 9.454509649468294, + "grad_norm": 0.3256855010986328, + "learning_rate": 2.9333000000000002e-05, + "loss": 0.0172, + "step": 12007 + }, + { + "epoch": 9.455297361165814, + "grad_norm": 0.22231173515319824, + "learning_rate": 2.9332666666666668e-05, + "loss": 0.0082, + "step": 12008 + }, + { + "epoch": 9.456085072863331, + "grad_norm": 0.35863563418388367, + "learning_rate": 2.9332333333333334e-05, + "loss": 0.0192, + "step": 12009 + }, + { + "epoch": 9.45687278456085, + "grad_norm": 0.2523389458656311, + "learning_rate": 2.9332e-05, + "loss": 0.0129, + "step": 12010 + }, + { + "epoch": 9.45766049625837, + "grad_norm": 0.26931315660476685, + "learning_rate": 2.933166666666667e-05, + "loss": 0.0142, + "step": 12011 + }, + { + "epoch": 9.458448207955888, + "grad_norm": 0.16721111536026, + "learning_rate": 2.9331333333333335e-05, + "loss": 0.0099, + "step": 12012 + }, + { + "epoch": 9.459235919653407, + "grad_norm": 0.2812984883785248, + "learning_rate": 2.9331e-05, + "loss": 0.0221, + "step": 12013 + }, + { + "epoch": 9.460023631350925, + "grad_norm": 0.17031800746917725, + "learning_rate": 2.933066666666667e-05, + "loss": 0.0098, + "step": 12014 + }, + { + "epoch": 9.460811343048444, + "grad_norm": 0.23723848164081573, + "learning_rate": 2.9330333333333333e-05, + "loss": 0.0137, + "step": 12015 + }, + { + "epoch": 9.461599054745964, + "grad_norm": 0.2439577877521515, + "learning_rate": 2.9330000000000002e-05, + "loss": 0.0136, + "step": 12016 + }, + { + "epoch": 9.462386766443482, + "grad_norm": 0.23214063048362732, + "learning_rate": 2.9329666666666668e-05, + "loss": 0.0133, + "step": 12017 + }, + { + "epoch": 9.463174478141001, + "grad_norm": 0.17383547127246857, + "learning_rate": 2.9329333333333334e-05, + "loss": 0.0061, + "step": 12018 + }, + { + "epoch": 9.463962189838519, + "grad_norm": 0.41210970282554626, + "learning_rate": 2.9329e-05, + "loss": 0.0162, + "step": 12019 + }, + { + "epoch": 9.464749901536038, + "grad_norm": 0.23417888581752777, + "learning_rate": 2.9328666666666666e-05, + "loss": 0.0132, + "step": 12020 + }, + { + "epoch": 9.465537613233556, + "grad_norm": 0.4387734830379486, + "learning_rate": 2.9328333333333335e-05, + "loss": 0.0222, + "step": 12021 + }, + { + "epoch": 9.466325324931075, + "grad_norm": 0.3719028830528259, + "learning_rate": 2.9328e-05, + "loss": 0.0105, + "step": 12022 + }, + { + "epoch": 9.467113036628595, + "grad_norm": 0.5186506509780884, + "learning_rate": 2.9327666666666667e-05, + "loss": 0.0217, + "step": 12023 + }, + { + "epoch": 9.467900748326112, + "grad_norm": 0.22828659415245056, + "learning_rate": 2.9327333333333333e-05, + "loss": 0.0163, + "step": 12024 + }, + { + "epoch": 9.468688460023632, + "grad_norm": 0.6194286942481995, + "learning_rate": 2.9327000000000002e-05, + "loss": 0.0265, + "step": 12025 + }, + { + "epoch": 9.46947617172115, + "grad_norm": 0.4731905460357666, + "learning_rate": 2.9326666666666665e-05, + "loss": 0.0166, + "step": 12026 + }, + { + "epoch": 9.470263883418669, + "grad_norm": 0.44451192021369934, + "learning_rate": 2.9326333333333334e-05, + "loss": 0.0128, + "step": 12027 + }, + { + "epoch": 9.471051595116187, + "grad_norm": 0.8105947375297546, + "learning_rate": 2.9326e-05, + "loss": 0.0073, + "step": 12028 + }, + { + "epoch": 9.471839306813706, + "grad_norm": 0.2902764081954956, + "learning_rate": 2.9325666666666666e-05, + "loss": 0.0127, + "step": 12029 + }, + { + "epoch": 
9.472627018511226, + "grad_norm": 0.22775983810424805, + "learning_rate": 2.9325333333333335e-05, + "loss": 0.012, + "step": 12030 + }, + { + "epoch": 9.473414730208743, + "grad_norm": 0.6100355982780457, + "learning_rate": 2.9325e-05, + "loss": 0.2261, + "step": 12031 + }, + { + "epoch": 9.474202441906263, + "grad_norm": 0.5241032838821411, + "learning_rate": 2.9324666666666667e-05, + "loss": 0.1682, + "step": 12032 + }, + { + "epoch": 9.47499015360378, + "grad_norm": 0.4810533821582794, + "learning_rate": 2.9324333333333333e-05, + "loss": 0.1419, + "step": 12033 + }, + { + "epoch": 9.4757778653013, + "grad_norm": 0.935707151889801, + "learning_rate": 2.9324000000000002e-05, + "loss": 0.1177, + "step": 12034 + }, + { + "epoch": 9.47656557699882, + "grad_norm": 0.330248087644577, + "learning_rate": 2.9323666666666665e-05, + "loss": 0.0644, + "step": 12035 + }, + { + "epoch": 9.477353288696337, + "grad_norm": 0.6360933780670166, + "learning_rate": 2.9323333333333334e-05, + "loss": 0.0514, + "step": 12036 + }, + { + "epoch": 9.478141000393856, + "grad_norm": 0.4010324776172638, + "learning_rate": 2.9323000000000004e-05, + "loss": 0.0412, + "step": 12037 + }, + { + "epoch": 9.478928712091374, + "grad_norm": 0.5043799877166748, + "learning_rate": 2.9322666666666666e-05, + "loss": 0.0505, + "step": 12038 + }, + { + "epoch": 9.479716423788894, + "grad_norm": 0.4825480580329895, + "learning_rate": 2.9322333333333336e-05, + "loss": 0.0126, + "step": 12039 + }, + { + "epoch": 9.480504135486411, + "grad_norm": 0.17504334449768066, + "learning_rate": 2.9322e-05, + "loss": 0.0102, + "step": 12040 + }, + { + "epoch": 9.48129184718393, + "grad_norm": 0.2669488489627838, + "learning_rate": 2.9321666666666667e-05, + "loss": 0.017, + "step": 12041 + }, + { + "epoch": 9.48207955888145, + "grad_norm": 0.250510573387146, + "learning_rate": 2.9321333333333333e-05, + "loss": 0.0144, + "step": 12042 + }, + { + "epoch": 9.482867270578968, + "grad_norm": 0.3595871031284332, + "learning_rate": 2.9321000000000003e-05, + "loss": 0.0903, + "step": 12043 + }, + { + "epoch": 9.483654982276487, + "grad_norm": 0.38339418172836304, + "learning_rate": 2.9320666666666665e-05, + "loss": 0.0114, + "step": 12044 + }, + { + "epoch": 9.484442693974005, + "grad_norm": 0.12724915146827698, + "learning_rate": 2.9320333333333335e-05, + "loss": 0.0129, + "step": 12045 + }, + { + "epoch": 9.485230405671524, + "grad_norm": 0.1549227386713028, + "learning_rate": 2.9320000000000004e-05, + "loss": 0.012, + "step": 12046 + }, + { + "epoch": 9.486018117369042, + "grad_norm": 0.3293604850769043, + "learning_rate": 2.9319666666666666e-05, + "loss": 0.0261, + "step": 12047 + }, + { + "epoch": 9.486805829066562, + "grad_norm": 0.2333308309316635, + "learning_rate": 2.9319333333333336e-05, + "loss": 0.013, + "step": 12048 + }, + { + "epoch": 9.487593540764081, + "grad_norm": 0.16406163573265076, + "learning_rate": 2.9318999999999998e-05, + "loss": 0.0098, + "step": 12049 + }, + { + "epoch": 9.488381252461599, + "grad_norm": 0.5630603432655334, + "learning_rate": 2.9318666666666668e-05, + "loss": 0.0232, + "step": 12050 + }, + { + "epoch": 9.489168964159118, + "grad_norm": 0.23447081446647644, + "learning_rate": 2.9318333333333334e-05, + "loss": 0.0121, + "step": 12051 + }, + { + "epoch": 9.489956675856636, + "grad_norm": 0.24240264296531677, + "learning_rate": 2.9318e-05, + "loss": 0.0118, + "step": 12052 + }, + { + "epoch": 9.490744387554155, + "grad_norm": 0.19616511464118958, + "learning_rate": 2.9317666666666665e-05, + "loss": 0.015, + 
"step": 12053 + }, + { + "epoch": 9.491532099251675, + "grad_norm": 0.10206956416368484, + "learning_rate": 2.9317333333333335e-05, + "loss": 0.0068, + "step": 12054 + }, + { + "epoch": 9.492319810949192, + "grad_norm": 0.4304349720478058, + "learning_rate": 2.9317e-05, + "loss": 0.0106, + "step": 12055 + }, + { + "epoch": 9.493107522646712, + "grad_norm": 0.3942588269710541, + "learning_rate": 2.9316666666666667e-05, + "loss": 0.0127, + "step": 12056 + }, + { + "epoch": 9.49389523434423, + "grad_norm": 0.16850601136684418, + "learning_rate": 2.9316333333333336e-05, + "loss": 0.0104, + "step": 12057 + }, + { + "epoch": 9.494682946041749, + "grad_norm": 0.30229368805885315, + "learning_rate": 2.9316e-05, + "loss": 0.0111, + "step": 12058 + }, + { + "epoch": 9.495470657739267, + "grad_norm": 0.4338896572589874, + "learning_rate": 2.9315666666666668e-05, + "loss": 0.0186, + "step": 12059 + }, + { + "epoch": 9.496258369436786, + "grad_norm": 0.1515011042356491, + "learning_rate": 2.9315333333333334e-05, + "loss": 0.009, + "step": 12060 + }, + { + "epoch": 9.497046081134306, + "grad_norm": 0.20993411540985107, + "learning_rate": 2.9315e-05, + "loss": 0.0119, + "step": 12061 + }, + { + "epoch": 9.497833792831823, + "grad_norm": 0.32982155680656433, + "learning_rate": 2.931466666666667e-05, + "loss": 0.0143, + "step": 12062 + }, + { + "epoch": 9.498621504529343, + "grad_norm": 0.1598256528377533, + "learning_rate": 2.9314333333333335e-05, + "loss": 0.0079, + "step": 12063 + }, + { + "epoch": 9.49940921622686, + "grad_norm": 0.43804630637168884, + "learning_rate": 2.9314e-05, + "loss": 0.0201, + "step": 12064 + }, + { + "epoch": 9.50019692792438, + "grad_norm": 0.27376773953437805, + "learning_rate": 2.9313666666666667e-05, + "loss": 0.0182, + "step": 12065 + }, + { + "epoch": 9.500984639621898, + "grad_norm": 0.3020817041397095, + "learning_rate": 2.9313333333333336e-05, + "loss": 0.0069, + "step": 12066 + }, + { + "epoch": 9.501772351319417, + "grad_norm": 0.6359989643096924, + "learning_rate": 2.9313e-05, + "loss": 0.0263, + "step": 12067 + }, + { + "epoch": 9.502560063016936, + "grad_norm": 0.21652987599372864, + "learning_rate": 2.9312666666666668e-05, + "loss": 0.0113, + "step": 12068 + }, + { + "epoch": 9.503347774714454, + "grad_norm": 1.0869355201721191, + "learning_rate": 2.9312333333333334e-05, + "loss": 0.0268, + "step": 12069 + }, + { + "epoch": 9.504135486411974, + "grad_norm": 0.26688894629478455, + "learning_rate": 2.9312e-05, + "loss": 0.0142, + "step": 12070 + }, + { + "epoch": 9.504923198109491, + "grad_norm": 0.3474649488925934, + "learning_rate": 2.931166666666667e-05, + "loss": 0.0231, + "step": 12071 + }, + { + "epoch": 9.50571090980701, + "grad_norm": 0.3193381130695343, + "learning_rate": 2.9311333333333335e-05, + "loss": 0.0154, + "step": 12072 + }, + { + "epoch": 9.50649862150453, + "grad_norm": 0.3253379762172699, + "learning_rate": 2.9311e-05, + "loss": 0.0172, + "step": 12073 + }, + { + "epoch": 9.507286333202048, + "grad_norm": 0.3247009813785553, + "learning_rate": 2.9310666666666667e-05, + "loss": 0.0121, + "step": 12074 + }, + { + "epoch": 9.508074044899567, + "grad_norm": 0.37176069617271423, + "learning_rate": 2.9310333333333336e-05, + "loss": 0.0321, + "step": 12075 + }, + { + "epoch": 9.508861756597085, + "grad_norm": 0.31155407428741455, + "learning_rate": 2.931e-05, + "loss": 0.017, + "step": 12076 + }, + { + "epoch": 9.509649468294604, + "grad_norm": 0.3298332691192627, + "learning_rate": 2.9309666666666668e-05, + "loss": 0.0157, + "step": 12077 + }, + { + 
"epoch": 9.510437179992122, + "grad_norm": 0.42814406752586365, + "learning_rate": 2.930933333333333e-05, + "loss": 0.0215, + "step": 12078 + }, + { + "epoch": 9.511224891689642, + "grad_norm": 0.7237496972084045, + "learning_rate": 2.9309e-05, + "loss": 0.0281, + "step": 12079 + }, + { + "epoch": 9.512012603387161, + "grad_norm": 0.512800931930542, + "learning_rate": 2.930866666666667e-05, + "loss": 0.0165, + "step": 12080 + }, + { + "epoch": 9.512800315084679, + "grad_norm": 0.5404132604598999, + "learning_rate": 2.9308333333333332e-05, + "loss": 0.2265, + "step": 12081 + }, + { + "epoch": 9.513588026782198, + "grad_norm": 0.6846574544906616, + "learning_rate": 2.9308e-05, + "loss": 0.2134, + "step": 12082 + }, + { + "epoch": 9.514375738479716, + "grad_norm": 0.49071553349494934, + "learning_rate": 2.9307666666666667e-05, + "loss": 0.0997, + "step": 12083 + }, + { + "epoch": 9.515163450177235, + "grad_norm": 0.4408634901046753, + "learning_rate": 2.9307333333333333e-05, + "loss": 0.1275, + "step": 12084 + }, + { + "epoch": 9.515951161874753, + "grad_norm": 0.42225387692451477, + "learning_rate": 2.9307e-05, + "loss": 0.0653, + "step": 12085 + }, + { + "epoch": 9.516738873572272, + "grad_norm": 0.2605108320713043, + "learning_rate": 2.930666666666667e-05, + "loss": 0.0252, + "step": 12086 + }, + { + "epoch": 9.517526585269792, + "grad_norm": 0.468802273273468, + "learning_rate": 2.9306333333333334e-05, + "loss": 0.0298, + "step": 12087 + }, + { + "epoch": 9.51831429696731, + "grad_norm": 0.1442011147737503, + "learning_rate": 2.9306e-05, + "loss": 0.0167, + "step": 12088 + }, + { + "epoch": 9.519102008664829, + "grad_norm": 0.2872978746891022, + "learning_rate": 2.930566666666667e-05, + "loss": 0.0726, + "step": 12089 + }, + { + "epoch": 9.519889720362347, + "grad_norm": 0.680636465549469, + "learning_rate": 2.9305333333333332e-05, + "loss": 0.0178, + "step": 12090 + }, + { + "epoch": 9.520677432059866, + "grad_norm": 0.23213158547878265, + "learning_rate": 2.9305e-05, + "loss": 0.0403, + "step": 12091 + }, + { + "epoch": 9.521465143757386, + "grad_norm": 0.25282299518585205, + "learning_rate": 2.9304666666666667e-05, + "loss": 0.0256, + "step": 12092 + }, + { + "epoch": 9.522252855454903, + "grad_norm": 0.287517249584198, + "learning_rate": 2.9304333333333333e-05, + "loss": 0.0511, + "step": 12093 + }, + { + "epoch": 9.523040567152423, + "grad_norm": 0.22958797216415405, + "learning_rate": 2.9304e-05, + "loss": 0.0153, + "step": 12094 + }, + { + "epoch": 9.52382827884994, + "grad_norm": 0.2723681628704071, + "learning_rate": 2.930366666666667e-05, + "loss": 0.0191, + "step": 12095 + }, + { + "epoch": 9.52461599054746, + "grad_norm": 0.2876879870891571, + "learning_rate": 2.9303333333333335e-05, + "loss": 0.0156, + "step": 12096 + }, + { + "epoch": 9.525403702244978, + "grad_norm": 0.1824374943971634, + "learning_rate": 2.9303e-05, + "loss": 0.0092, + "step": 12097 + }, + { + "epoch": 9.526191413942497, + "grad_norm": 0.5923250317573547, + "learning_rate": 2.930266666666667e-05, + "loss": 0.0169, + "step": 12098 + }, + { + "epoch": 9.526979125640016, + "grad_norm": 0.2869500517845154, + "learning_rate": 2.9302333333333332e-05, + "loss": 0.021, + "step": 12099 + }, + { + "epoch": 9.527766837337534, + "grad_norm": 0.6044073700904846, + "learning_rate": 2.9302e-05, + "loss": 0.0113, + "step": 12100 + }, + { + "epoch": 9.528554549035054, + "grad_norm": 0.2112773060798645, + "learning_rate": 2.9301666666666668e-05, + "loss": 0.0148, + "step": 12101 + }, + { + "epoch": 9.529342260732571, + 
"grad_norm": 0.1576167345046997, + "learning_rate": 2.9301333333333334e-05, + "loss": 0.0068, + "step": 12102 + }, + { + "epoch": 9.53012997243009, + "grad_norm": 0.4387015104293823, + "learning_rate": 2.9301e-05, + "loss": 0.0245, + "step": 12103 + }, + { + "epoch": 9.530917684127608, + "grad_norm": 0.2827775180339813, + "learning_rate": 2.930066666666667e-05, + "loss": 0.0185, + "step": 12104 + }, + { + "epoch": 9.531705395825128, + "grad_norm": 0.5152256488800049, + "learning_rate": 2.9300333333333335e-05, + "loss": 0.0192, + "step": 12105 + }, + { + "epoch": 9.532493107522647, + "grad_norm": 0.1954905390739441, + "learning_rate": 2.93e-05, + "loss": 0.0149, + "step": 12106 + }, + { + "epoch": 9.533280819220165, + "grad_norm": 0.29031476378440857, + "learning_rate": 2.929966666666667e-05, + "loss": 0.0156, + "step": 12107 + }, + { + "epoch": 9.534068530917684, + "grad_norm": 0.24294346570968628, + "learning_rate": 2.9299333333333333e-05, + "loss": 0.0128, + "step": 12108 + }, + { + "epoch": 9.534856242615202, + "grad_norm": 0.21390780806541443, + "learning_rate": 2.9299000000000002e-05, + "loss": 0.0105, + "step": 12109 + }, + { + "epoch": 9.535643954312722, + "grad_norm": 0.13422515988349915, + "learning_rate": 2.9298666666666664e-05, + "loss": 0.006, + "step": 12110 + }, + { + "epoch": 9.536431666010241, + "grad_norm": 0.1924867331981659, + "learning_rate": 2.9298333333333334e-05, + "loss": 0.0084, + "step": 12111 + }, + { + "epoch": 9.537219377707759, + "grad_norm": 0.23139594495296478, + "learning_rate": 2.9298000000000003e-05, + "loss": 0.0137, + "step": 12112 + }, + { + "epoch": 9.538007089405278, + "grad_norm": 0.24446842074394226, + "learning_rate": 2.9297666666666666e-05, + "loss": 0.0212, + "step": 12113 + }, + { + "epoch": 9.538794801102796, + "grad_norm": 0.17729803919792175, + "learning_rate": 2.9297333333333335e-05, + "loss": 0.0127, + "step": 12114 + }, + { + "epoch": 9.539582512800315, + "grad_norm": 0.31077879667282104, + "learning_rate": 2.9297e-05, + "loss": 0.0105, + "step": 12115 + }, + { + "epoch": 9.540370224497833, + "grad_norm": 0.43200600147247314, + "learning_rate": 2.9296666666666667e-05, + "loss": 0.0146, + "step": 12116 + }, + { + "epoch": 9.541157936195352, + "grad_norm": 0.3325859308242798, + "learning_rate": 2.9296333333333333e-05, + "loss": 0.013, + "step": 12117 + }, + { + "epoch": 9.541945647892872, + "grad_norm": 0.2009541392326355, + "learning_rate": 2.9296000000000002e-05, + "loss": 0.0105, + "step": 12118 + }, + { + "epoch": 9.54273335959039, + "grad_norm": 0.35058456659317017, + "learning_rate": 2.9295666666666665e-05, + "loss": 0.0194, + "step": 12119 + }, + { + "epoch": 9.543521071287909, + "grad_norm": 0.26407623291015625, + "learning_rate": 2.9295333333333334e-05, + "loss": 0.0141, + "step": 12120 + }, + { + "epoch": 9.544308782985427, + "grad_norm": 0.1580285131931305, + "learning_rate": 2.9295000000000003e-05, + "loss": 0.0131, + "step": 12121 + }, + { + "epoch": 9.545096494682946, + "grad_norm": 0.16787347197532654, + "learning_rate": 2.9294666666666666e-05, + "loss": 0.0086, + "step": 12122 + }, + { + "epoch": 9.545884206380464, + "grad_norm": 0.1318359076976776, + "learning_rate": 2.9294333333333335e-05, + "loss": 0.008, + "step": 12123 + }, + { + "epoch": 9.546671918077983, + "grad_norm": 0.36719802021980286, + "learning_rate": 2.9294e-05, + "loss": 0.0256, + "step": 12124 + }, + { + "epoch": 9.547459629775503, + "grad_norm": 0.18798798322677612, + "learning_rate": 2.9293666666666667e-05, + "loss": 0.0151, + "step": 12125 + }, + { + 
"epoch": 9.54824734147302, + "grad_norm": 0.5775755047798157, + "learning_rate": 2.9293333333333333e-05, + "loss": 0.0168, + "step": 12126 + }, + { + "epoch": 9.54903505317054, + "grad_norm": 0.21172262728214264, + "learning_rate": 2.9293000000000002e-05, + "loss": 0.0079, + "step": 12127 + }, + { + "epoch": 9.549822764868058, + "grad_norm": 0.2727298438549042, + "learning_rate": 2.9292666666666665e-05, + "loss": 0.0167, + "step": 12128 + }, + { + "epoch": 9.550610476565577, + "grad_norm": 0.48805680871009827, + "learning_rate": 2.9292333333333334e-05, + "loss": 0.0213, + "step": 12129 + }, + { + "epoch": 9.551398188263097, + "grad_norm": 0.28254541754722595, + "learning_rate": 2.9292000000000003e-05, + "loss": 0.0197, + "step": 12130 + }, + { + "epoch": 9.552185899960614, + "grad_norm": 0.6953290104866028, + "learning_rate": 2.9291666666666666e-05, + "loss": 0.235, + "step": 12131 + }, + { + "epoch": 9.552973611658134, + "grad_norm": 0.5007844567298889, + "learning_rate": 2.9291333333333335e-05, + "loss": 0.1591, + "step": 12132 + }, + { + "epoch": 9.553761323355651, + "grad_norm": 0.5075289011001587, + "learning_rate": 2.9291e-05, + "loss": 0.1634, + "step": 12133 + }, + { + "epoch": 9.55454903505317, + "grad_norm": 0.3382608890533447, + "learning_rate": 2.9290666666666667e-05, + "loss": 0.0816, + "step": 12134 + }, + { + "epoch": 9.555336746750688, + "grad_norm": 0.6400121450424194, + "learning_rate": 2.9290333333333333e-05, + "loss": 0.0553, + "step": 12135 + }, + { + "epoch": 9.556124458448208, + "grad_norm": 0.23918336629867554, + "learning_rate": 2.9290000000000002e-05, + "loss": 0.034, + "step": 12136 + }, + { + "epoch": 9.556912170145727, + "grad_norm": 0.26208072900772095, + "learning_rate": 2.928966666666667e-05, + "loss": 0.0421, + "step": 12137 + }, + { + "epoch": 9.557699881843245, + "grad_norm": 0.2626838684082031, + "learning_rate": 2.9289333333333334e-05, + "loss": 0.025, + "step": 12138 + }, + { + "epoch": 9.558487593540764, + "grad_norm": 0.13937783241271973, + "learning_rate": 2.9289e-05, + "loss": 0.0145, + "step": 12139 + }, + { + "epoch": 9.559275305238282, + "grad_norm": 0.20974285900592804, + "learning_rate": 2.9288666666666666e-05, + "loss": 0.0114, + "step": 12140 + }, + { + "epoch": 9.560063016935802, + "grad_norm": 0.536199152469635, + "learning_rate": 2.9288333333333336e-05, + "loss": 0.0305, + "step": 12141 + }, + { + "epoch": 9.56085072863332, + "grad_norm": 0.15477052330970764, + "learning_rate": 2.9287999999999998e-05, + "loss": 0.0095, + "step": 12142 + }, + { + "epoch": 9.561638440330839, + "grad_norm": 0.4014012813568115, + "learning_rate": 2.9287666666666667e-05, + "loss": 0.0196, + "step": 12143 + }, + { + "epoch": 9.562426152028358, + "grad_norm": 0.18227951228618622, + "learning_rate": 2.9287333333333333e-05, + "loss": 0.0096, + "step": 12144 + }, + { + "epoch": 9.563213863725876, + "grad_norm": 0.27041178941726685, + "learning_rate": 2.9287e-05, + "loss": 0.0173, + "step": 12145 + }, + { + "epoch": 9.564001575423395, + "grad_norm": 0.19544824957847595, + "learning_rate": 2.928666666666667e-05, + "loss": 0.0111, + "step": 12146 + }, + { + "epoch": 9.564789287120913, + "grad_norm": 0.2327764332294464, + "learning_rate": 2.9286333333333335e-05, + "loss": 0.0143, + "step": 12147 + }, + { + "epoch": 9.565576998818432, + "grad_norm": 0.15305651724338531, + "learning_rate": 2.9286e-05, + "loss": 0.0091, + "step": 12148 + }, + { + "epoch": 9.566364710515952, + "grad_norm": 0.21351124346256256, + "learning_rate": 2.9285666666666666e-05, + "loss": 0.019, + 
"step": 12149 + }, + { + "epoch": 9.56715242221347, + "grad_norm": 0.20528702437877655, + "learning_rate": 2.9285333333333336e-05, + "loss": 0.026, + "step": 12150 + }, + { + "epoch": 9.567940133910989, + "grad_norm": 0.575273871421814, + "learning_rate": 2.9284999999999998e-05, + "loss": 0.0201, + "step": 12151 + }, + { + "epoch": 9.568727845608507, + "grad_norm": 0.26244011521339417, + "learning_rate": 2.9284666666666668e-05, + "loss": 0.0131, + "step": 12152 + }, + { + "epoch": 9.569515557306026, + "grad_norm": 0.11968593299388885, + "learning_rate": 2.9284333333333334e-05, + "loss": 0.0076, + "step": 12153 + }, + { + "epoch": 9.570303269003544, + "grad_norm": 0.25224506855010986, + "learning_rate": 2.9284e-05, + "loss": 0.0179, + "step": 12154 + }, + { + "epoch": 9.571090980701063, + "grad_norm": 0.5046495199203491, + "learning_rate": 2.928366666666667e-05, + "loss": 0.0189, + "step": 12155 + }, + { + "epoch": 9.571878692398583, + "grad_norm": 0.5004394054412842, + "learning_rate": 2.9283333333333335e-05, + "loss": 0.0144, + "step": 12156 + }, + { + "epoch": 9.5726664040961, + "grad_norm": 0.31062135100364685, + "learning_rate": 2.9283e-05, + "loss": 0.0189, + "step": 12157 + }, + { + "epoch": 9.57345411579362, + "grad_norm": 0.1855502426624298, + "learning_rate": 2.9282666666666667e-05, + "loss": 0.0115, + "step": 12158 + }, + { + "epoch": 9.574241827491138, + "grad_norm": 0.3577156066894531, + "learning_rate": 2.9282333333333336e-05, + "loss": 0.0179, + "step": 12159 + }, + { + "epoch": 9.575029539188657, + "grad_norm": 0.1714911013841629, + "learning_rate": 2.9282e-05, + "loss": 0.0064, + "step": 12160 + }, + { + "epoch": 9.575817250886175, + "grad_norm": 0.3390234410762787, + "learning_rate": 2.9281666666666668e-05, + "loss": 0.0175, + "step": 12161 + }, + { + "epoch": 9.576604962583694, + "grad_norm": 0.4082511067390442, + "learning_rate": 2.9281333333333337e-05, + "loss": 0.015, + "step": 12162 + }, + { + "epoch": 9.577392674281214, + "grad_norm": 0.46841055154800415, + "learning_rate": 2.9281e-05, + "loss": 0.0296, + "step": 12163 + }, + { + "epoch": 9.578180385978731, + "grad_norm": 0.24212370812892914, + "learning_rate": 2.928066666666667e-05, + "loss": 0.0079, + "step": 12164 + }, + { + "epoch": 9.57896809767625, + "grad_norm": 0.33576491475105286, + "learning_rate": 2.9280333333333335e-05, + "loss": 0.0125, + "step": 12165 + }, + { + "epoch": 9.579755809373768, + "grad_norm": 0.1388327181339264, + "learning_rate": 2.928e-05, + "loss": 0.0104, + "step": 12166 + }, + { + "epoch": 9.580543521071288, + "grad_norm": 0.8118165731430054, + "learning_rate": 2.9279666666666667e-05, + "loss": 0.0231, + "step": 12167 + }, + { + "epoch": 9.581331232768807, + "grad_norm": 0.3154430091381073, + "learning_rate": 2.9279333333333336e-05, + "loss": 0.0114, + "step": 12168 + }, + { + "epoch": 9.582118944466325, + "grad_norm": 0.2232603281736374, + "learning_rate": 2.9279e-05, + "loss": 0.0105, + "step": 12169 + }, + { + "epoch": 9.582906656163845, + "grad_norm": 0.3124442994594574, + "learning_rate": 2.9278666666666668e-05, + "loss": 0.0157, + "step": 12170 + }, + { + "epoch": 9.583694367861362, + "grad_norm": 0.9867367148399353, + "learning_rate": 2.9278333333333334e-05, + "loss": 0.0238, + "step": 12171 + }, + { + "epoch": 9.584482079558882, + "grad_norm": 0.25460636615753174, + "learning_rate": 2.9278e-05, + "loss": 0.0264, + "step": 12172 + }, + { + "epoch": 9.5852697912564, + "grad_norm": 0.18111281096935272, + "learning_rate": 2.927766666666667e-05, + "loss": 0.0132, + "step": 12173 + }, 
+ { + "epoch": 9.586057502953919, + "grad_norm": 0.5622695684432983, + "learning_rate": 2.9277333333333332e-05, + "loss": 0.0191, + "step": 12174 + }, + { + "epoch": 9.586845214651438, + "grad_norm": 0.169769749045372, + "learning_rate": 2.9277e-05, + "loss": 0.0148, + "step": 12175 + }, + { + "epoch": 9.587632926348956, + "grad_norm": 0.18547695875167847, + "learning_rate": 2.9276666666666667e-05, + "loss": 0.0089, + "step": 12176 + }, + { + "epoch": 9.588420638046475, + "grad_norm": 0.16706740856170654, + "learning_rate": 2.9276333333333333e-05, + "loss": 0.0081, + "step": 12177 + }, + { + "epoch": 9.589208349743993, + "grad_norm": 0.22504793107509613, + "learning_rate": 2.9276e-05, + "loss": 0.0097, + "step": 12178 + }, + { + "epoch": 9.589996061441513, + "grad_norm": 0.3136616349220276, + "learning_rate": 2.9275666666666668e-05, + "loss": 0.0154, + "step": 12179 + }, + { + "epoch": 9.59078377313903, + "grad_norm": 0.35796672105789185, + "learning_rate": 2.9275333333333334e-05, + "loss": 0.0143, + "step": 12180 + }, + { + "epoch": 9.59157148483655, + "grad_norm": 0.8023622632026672, + "learning_rate": 2.9275e-05, + "loss": 0.166, + "step": 12181 + }, + { + "epoch": 9.592359196534069, + "grad_norm": 0.5264546275138855, + "learning_rate": 2.927466666666667e-05, + "loss": 0.1482, + "step": 12182 + }, + { + "epoch": 9.593146908231587, + "grad_norm": 0.4230860769748688, + "learning_rate": 2.9274333333333332e-05, + "loss": 0.1046, + "step": 12183 + }, + { + "epoch": 9.593934619929106, + "grad_norm": 0.5192244648933411, + "learning_rate": 2.9274e-05, + "loss": 0.1201, + "step": 12184 + }, + { + "epoch": 9.594722331626624, + "grad_norm": 0.35483676195144653, + "learning_rate": 2.9273666666666667e-05, + "loss": 0.0692, + "step": 12185 + }, + { + "epoch": 9.595510043324143, + "grad_norm": 0.40817704796791077, + "learning_rate": 2.9273333333333333e-05, + "loss": 0.0493, + "step": 12186 + }, + { + "epoch": 9.596297755021663, + "grad_norm": 0.3894074261188507, + "learning_rate": 2.9273000000000002e-05, + "loss": 0.0599, + "step": 12187 + }, + { + "epoch": 9.59708546671918, + "grad_norm": 0.27729231119155884, + "learning_rate": 2.927266666666667e-05, + "loss": 0.0302, + "step": 12188 + }, + { + "epoch": 9.5978731784167, + "grad_norm": 0.264721155166626, + "learning_rate": 2.9272333333333334e-05, + "loss": 0.0285, + "step": 12189 + }, + { + "epoch": 9.598660890114218, + "grad_norm": 0.3011358082294464, + "learning_rate": 2.9272e-05, + "loss": 0.018, + "step": 12190 + }, + { + "epoch": 9.599448601811737, + "grad_norm": 0.3873874843120575, + "learning_rate": 2.927166666666667e-05, + "loss": 0.0234, + "step": 12191 + }, + { + "epoch": 9.600236313509257, + "grad_norm": 0.4207015037536621, + "learning_rate": 2.9271333333333332e-05, + "loss": 0.0209, + "step": 12192 + }, + { + "epoch": 9.601024025206774, + "grad_norm": 0.12845197319984436, + "learning_rate": 2.9271e-05, + "loss": 0.0079, + "step": 12193 + }, + { + "epoch": 9.601811736904294, + "grad_norm": 0.17236444354057312, + "learning_rate": 2.9270666666666667e-05, + "loss": 0.0127, + "step": 12194 + }, + { + "epoch": 9.602599448601811, + "grad_norm": 0.13430838286876678, + "learning_rate": 2.9270333333333333e-05, + "loss": 0.0112, + "step": 12195 + }, + { + "epoch": 9.60338716029933, + "grad_norm": 0.13509130477905273, + "learning_rate": 2.9270000000000003e-05, + "loss": 0.0107, + "step": 12196 + }, + { + "epoch": 9.604174871996848, + "grad_norm": 0.30099350214004517, + "learning_rate": 2.926966666666667e-05, + "loss": 0.0117, + "step": 12197 + }, + { 
+ "epoch": 9.604962583694368, + "grad_norm": 0.15459062159061432, + "learning_rate": 2.9269333333333335e-05, + "loss": 0.012, + "step": 12198 + }, + { + "epoch": 9.605750295391886, + "grad_norm": 0.20443592965602875, + "learning_rate": 2.9269e-05, + "loss": 0.0105, + "step": 12199 + }, + { + "epoch": 9.606538007089405, + "grad_norm": 0.5995217561721802, + "learning_rate": 2.9268666666666666e-05, + "loss": 0.0226, + "step": 12200 + }, + { + "epoch": 9.607325718786925, + "grad_norm": 0.18808123469352722, + "learning_rate": 2.9268333333333332e-05, + "loss": 0.013, + "step": 12201 + }, + { + "epoch": 9.608113430484442, + "grad_norm": 0.1801539957523346, + "learning_rate": 2.9268e-05, + "loss": 0.0082, + "step": 12202 + }, + { + "epoch": 9.608901142181962, + "grad_norm": 0.14994622766971588, + "learning_rate": 2.9267666666666664e-05, + "loss": 0.011, + "step": 12203 + }, + { + "epoch": 9.60968885387948, + "grad_norm": 0.36444997787475586, + "learning_rate": 2.9267333333333334e-05, + "loss": 0.0179, + "step": 12204 + }, + { + "epoch": 9.610476565576999, + "grad_norm": 0.3024239242076874, + "learning_rate": 2.9267000000000003e-05, + "loss": 0.0191, + "step": 12205 + }, + { + "epoch": 9.611264277274518, + "grad_norm": 0.29982128739356995, + "learning_rate": 2.9266666666666665e-05, + "loss": 0.014, + "step": 12206 + }, + { + "epoch": 9.612051988972036, + "grad_norm": 0.2944229543209076, + "learning_rate": 2.9266333333333335e-05, + "loss": 0.0107, + "step": 12207 + }, + { + "epoch": 9.612839700669555, + "grad_norm": 0.2686828076839447, + "learning_rate": 2.9266e-05, + "loss": 0.0179, + "step": 12208 + }, + { + "epoch": 9.613627412367073, + "grad_norm": 0.40601611137390137, + "learning_rate": 2.9265666666666667e-05, + "loss": 0.0156, + "step": 12209 + }, + { + "epoch": 9.614415124064593, + "grad_norm": 0.45673811435699463, + "learning_rate": 2.9265333333333333e-05, + "loss": 0.0121, + "step": 12210 + }, + { + "epoch": 9.615202835762112, + "grad_norm": 0.2819598615169525, + "learning_rate": 2.9265000000000002e-05, + "loss": 0.0136, + "step": 12211 + }, + { + "epoch": 9.61599054745963, + "grad_norm": 0.22434484958648682, + "learning_rate": 2.9264666666666668e-05, + "loss": 0.0169, + "step": 12212 + }, + { + "epoch": 9.61677825915715, + "grad_norm": 0.2591429054737091, + "learning_rate": 2.9264333333333334e-05, + "loss": 0.0161, + "step": 12213 + }, + { + "epoch": 9.617565970854667, + "grad_norm": 0.2785398066043854, + "learning_rate": 2.9264000000000003e-05, + "loss": 0.0142, + "step": 12214 + }, + { + "epoch": 9.618353682552186, + "grad_norm": 0.2920708954334259, + "learning_rate": 2.9263666666666666e-05, + "loss": 0.0199, + "step": 12215 + }, + { + "epoch": 9.619141394249704, + "grad_norm": 0.35320979356765747, + "learning_rate": 2.9263333333333335e-05, + "loss": 0.0112, + "step": 12216 + }, + { + "epoch": 9.619929105947223, + "grad_norm": 0.37571683526039124, + "learning_rate": 2.9263e-05, + "loss": 0.0133, + "step": 12217 + }, + { + "epoch": 9.620716817644743, + "grad_norm": 0.5022777915000916, + "learning_rate": 2.9262666666666667e-05, + "loss": 0.0155, + "step": 12218 + }, + { + "epoch": 9.62150452934226, + "grad_norm": 0.3510448634624481, + "learning_rate": 2.9262333333333333e-05, + "loss": 0.0156, + "step": 12219 + }, + { + "epoch": 9.62229224103978, + "grad_norm": 0.6021804213523865, + "learning_rate": 2.9262000000000002e-05, + "loss": 0.0176, + "step": 12220 + }, + { + "epoch": 9.623079952737298, + "grad_norm": 0.32911381125450134, + "learning_rate": 2.9261666666666668e-05, + "loss": 0.0103, 
+ "step": 12221 + }, + { + "epoch": 9.623867664434817, + "grad_norm": 0.14373993873596191, + "learning_rate": 2.9261333333333334e-05, + "loss": 0.0073, + "step": 12222 + }, + { + "epoch": 9.624655376132335, + "grad_norm": 0.5962730050086975, + "learning_rate": 2.9261000000000003e-05, + "loss": 0.022, + "step": 12223 + }, + { + "epoch": 9.625443087829854, + "grad_norm": 0.4261768162250519, + "learning_rate": 2.9260666666666666e-05, + "loss": 0.0269, + "step": 12224 + }, + { + "epoch": 9.626230799527374, + "grad_norm": 0.2844716012477875, + "learning_rate": 2.9260333333333335e-05, + "loss": 0.0148, + "step": 12225 + }, + { + "epoch": 9.627018511224891, + "grad_norm": 0.389067143201828, + "learning_rate": 2.926e-05, + "loss": 0.0095, + "step": 12226 + }, + { + "epoch": 9.62780622292241, + "grad_norm": 0.21492300927639008, + "learning_rate": 2.9259666666666667e-05, + "loss": 0.0126, + "step": 12227 + }, + { + "epoch": 9.628593934619929, + "grad_norm": 0.25809234380722046, + "learning_rate": 2.9259333333333333e-05, + "loss": 0.0098, + "step": 12228 + }, + { + "epoch": 9.629381646317448, + "grad_norm": 0.35060015320777893, + "learning_rate": 2.9259e-05, + "loss": 0.0193, + "step": 12229 + }, + { + "epoch": 9.630169358014967, + "grad_norm": 0.1929166615009308, + "learning_rate": 2.9258666666666668e-05, + "loss": 0.0094, + "step": 12230 + }, + { + "epoch": 9.630957069712485, + "grad_norm": 0.6425132155418396, + "learning_rate": 2.9258333333333334e-05, + "loss": 0.2709, + "step": 12231 + }, + { + "epoch": 9.631744781410005, + "grad_norm": 0.5706053376197815, + "learning_rate": 2.9258e-05, + "loss": 0.1588, + "step": 12232 + }, + { + "epoch": 9.632532493107522, + "grad_norm": 0.4442330002784729, + "learning_rate": 2.9257666666666666e-05, + "loss": 0.1064, + "step": 12233 + }, + { + "epoch": 9.633320204805042, + "grad_norm": 0.7463997602462769, + "learning_rate": 2.9257333333333335e-05, + "loss": 0.095, + "step": 12234 + }, + { + "epoch": 9.63410791650256, + "grad_norm": 0.5032381415367126, + "learning_rate": 2.9256999999999998e-05, + "loss": 0.0725, + "step": 12235 + }, + { + "epoch": 9.634895628200079, + "grad_norm": 0.3858823776245117, + "learning_rate": 2.9256666666666667e-05, + "loss": 0.0275, + "step": 12236 + }, + { + "epoch": 9.635683339897598, + "grad_norm": 0.20733432471752167, + "learning_rate": 2.9256333333333337e-05, + "loss": 0.02, + "step": 12237 + }, + { + "epoch": 9.636471051595116, + "grad_norm": 0.23260870575904846, + "learning_rate": 2.9256e-05, + "loss": 0.0269, + "step": 12238 + }, + { + "epoch": 9.637258763292635, + "grad_norm": 0.42458483576774597, + "learning_rate": 2.925566666666667e-05, + "loss": 0.0419, + "step": 12239 + }, + { + "epoch": 9.638046474990153, + "grad_norm": 0.19119787216186523, + "learning_rate": 2.9255333333333334e-05, + "loss": 0.01, + "step": 12240 + }, + { + "epoch": 9.638834186687673, + "grad_norm": 0.20779316127300262, + "learning_rate": 2.9255e-05, + "loss": 0.0153, + "step": 12241 + }, + { + "epoch": 9.63962189838519, + "grad_norm": 0.23215758800506592, + "learning_rate": 2.9254666666666666e-05, + "loss": 0.0121, + "step": 12242 + }, + { + "epoch": 9.64040961008271, + "grad_norm": 0.3406306803226471, + "learning_rate": 2.9254333333333336e-05, + "loss": 0.0108, + "step": 12243 + }, + { + "epoch": 9.64119732178023, + "grad_norm": 0.14855451881885529, + "learning_rate": 2.9253999999999998e-05, + "loss": 0.0131, + "step": 12244 + }, + { + "epoch": 9.641985033477747, + "grad_norm": 0.25099173188209534, + "learning_rate": 2.9253666666666667e-05, + "loss": 
0.0274, + "step": 12245 + }, + { + "epoch": 9.642772745175266, + "grad_norm": 0.16368548572063446, + "learning_rate": 2.9253333333333337e-05, + "loss": 0.0106, + "step": 12246 + }, + { + "epoch": 9.643560456872784, + "grad_norm": 0.2314317375421524, + "learning_rate": 2.9253e-05, + "loss": 0.0129, + "step": 12247 + }, + { + "epoch": 9.644348168570303, + "grad_norm": 0.6323203444480896, + "learning_rate": 2.925266666666667e-05, + "loss": 0.0203, + "step": 12248 + }, + { + "epoch": 9.645135880267823, + "grad_norm": 0.41784152388572693, + "learning_rate": 2.9252333333333335e-05, + "loss": 0.012, + "step": 12249 + }, + { + "epoch": 9.64592359196534, + "grad_norm": 0.2348436713218689, + "learning_rate": 2.9252e-05, + "loss": 0.0115, + "step": 12250 + }, + { + "epoch": 9.64671130366286, + "grad_norm": 0.3971461057662964, + "learning_rate": 2.9251666666666666e-05, + "loss": 0.0146, + "step": 12251 + }, + { + "epoch": 9.647499015360378, + "grad_norm": 0.2956724166870117, + "learning_rate": 2.9251333333333336e-05, + "loss": 0.0092, + "step": 12252 + }, + { + "epoch": 9.648286727057897, + "grad_norm": 0.18524852395057678, + "learning_rate": 2.9250999999999998e-05, + "loss": 0.0087, + "step": 12253 + }, + { + "epoch": 9.649074438755415, + "grad_norm": 0.29835280776023865, + "learning_rate": 2.9250666666666668e-05, + "loss": 0.0232, + "step": 12254 + }, + { + "epoch": 9.649862150452934, + "grad_norm": 0.8830116391181946, + "learning_rate": 2.9250333333333337e-05, + "loss": 0.0119, + "step": 12255 + }, + { + "epoch": 9.650649862150454, + "grad_norm": 0.3706909120082855, + "learning_rate": 2.925e-05, + "loss": 0.0109, + "step": 12256 + }, + { + "epoch": 9.651437573847971, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.924966666666667e-05, + "loss": 0.0183, + "step": 12257 + }, + { + "epoch": 9.65222528554549, + "grad_norm": 0.5965934991836548, + "learning_rate": 2.9249333333333335e-05, + "loss": 0.0123, + "step": 12258 + }, + { + "epoch": 9.653012997243009, + "grad_norm": 0.19937767088413239, + "learning_rate": 2.9249e-05, + "loss": 0.0099, + "step": 12259 + }, + { + "epoch": 9.653800708940528, + "grad_norm": 0.1312050223350525, + "learning_rate": 2.9248666666666667e-05, + "loss": 0.0079, + "step": 12260 + }, + { + "epoch": 9.654588420638046, + "grad_norm": 0.2542969286441803, + "learning_rate": 2.9248333333333333e-05, + "loss": 0.0077, + "step": 12261 + }, + { + "epoch": 9.655376132335565, + "grad_norm": 0.42330217361450195, + "learning_rate": 2.9248000000000002e-05, + "loss": 0.0127, + "step": 12262 + }, + { + "epoch": 9.656163844033085, + "grad_norm": 0.40716975927352905, + "learning_rate": 2.9247666666666668e-05, + "loss": 0.0139, + "step": 12263 + }, + { + "epoch": 9.656951555730602, + "grad_norm": 0.26059019565582275, + "learning_rate": 2.9247333333333334e-05, + "loss": 0.0134, + "step": 12264 + }, + { + "epoch": 9.657739267428122, + "grad_norm": 0.3054046332836151, + "learning_rate": 2.9247e-05, + "loss": 0.0134, + "step": 12265 + }, + { + "epoch": 9.65852697912564, + "grad_norm": 0.2756984829902649, + "learning_rate": 2.924666666666667e-05, + "loss": 0.0136, + "step": 12266 + }, + { + "epoch": 9.659314690823159, + "grad_norm": 0.3477438986301422, + "learning_rate": 2.924633333333333e-05, + "loss": 0.0198, + "step": 12267 + }, + { + "epoch": 9.660102402520678, + "grad_norm": 0.6262338161468506, + "learning_rate": 2.9246e-05, + "loss": 0.0134, + "step": 12268 + }, + { + "epoch": 9.660890114218196, + "grad_norm": 0.25555330514907837, + "learning_rate": 2.9245666666666667e-05, + "loss": 
0.0111, + "step": 12269 + }, + { + "epoch": 9.661677825915715, + "grad_norm": 0.4691862165927887, + "learning_rate": 2.9245333333333333e-05, + "loss": 0.0266, + "step": 12270 + }, + { + "epoch": 9.662465537613233, + "grad_norm": 0.23063166439533234, + "learning_rate": 2.9245000000000002e-05, + "loss": 0.0128, + "step": 12271 + }, + { + "epoch": 9.663253249310753, + "grad_norm": 0.2514406144618988, + "learning_rate": 2.9244666666666668e-05, + "loss": 0.0096, + "step": 12272 + }, + { + "epoch": 9.66404096100827, + "grad_norm": 0.2639479637145996, + "learning_rate": 2.9244333333333334e-05, + "loss": 0.0113, + "step": 12273 + }, + { + "epoch": 9.66482867270579, + "grad_norm": 0.32358261942863464, + "learning_rate": 2.9244e-05, + "loss": 0.0142, + "step": 12274 + }, + { + "epoch": 9.66561638440331, + "grad_norm": 0.2510979175567627, + "learning_rate": 2.924366666666667e-05, + "loss": 0.0078, + "step": 12275 + }, + { + "epoch": 9.666404096100827, + "grad_norm": 0.3870448172092438, + "learning_rate": 2.9243333333333332e-05, + "loss": 0.0229, + "step": 12276 + }, + { + "epoch": 9.667191807798346, + "grad_norm": 0.522141695022583, + "learning_rate": 2.9243e-05, + "loss": 0.0168, + "step": 12277 + }, + { + "epoch": 9.667979519495864, + "grad_norm": 0.35898834466934204, + "learning_rate": 2.9242666666666667e-05, + "loss": 0.0118, + "step": 12278 + }, + { + "epoch": 9.668767231193383, + "grad_norm": 0.831372082233429, + "learning_rate": 2.9242333333333333e-05, + "loss": 0.0208, + "step": 12279 + }, + { + "epoch": 9.669554942890901, + "grad_norm": 0.3913828134536743, + "learning_rate": 2.9242000000000002e-05, + "loss": 0.016, + "step": 12280 + }, + { + "epoch": 9.67034265458842, + "grad_norm": 0.6694963574409485, + "learning_rate": 2.9241666666666668e-05, + "loss": 0.2671, + "step": 12281 + }, + { + "epoch": 9.67113036628594, + "grad_norm": 0.5924591422080994, + "learning_rate": 2.9241333333333334e-05, + "loss": 0.2029, + "step": 12282 + }, + { + "epoch": 9.671918077983458, + "grad_norm": 0.5345156788825989, + "learning_rate": 2.9241e-05, + "loss": 0.1392, + "step": 12283 + }, + { + "epoch": 9.672705789680977, + "grad_norm": 0.7274753451347351, + "learning_rate": 2.924066666666667e-05, + "loss": 0.0893, + "step": 12284 + }, + { + "epoch": 9.673493501378495, + "grad_norm": 0.3747418522834778, + "learning_rate": 2.9240333333333332e-05, + "loss": 0.0756, + "step": 12285 + }, + { + "epoch": 9.674281213076014, + "grad_norm": 0.48826050758361816, + "learning_rate": 2.924e-05, + "loss": 0.0397, + "step": 12286 + }, + { + "epoch": 9.675068924773534, + "grad_norm": 0.37252122163772583, + "learning_rate": 2.923966666666667e-05, + "loss": 0.031, + "step": 12287 + }, + { + "epoch": 9.675856636471051, + "grad_norm": 0.28963249921798706, + "learning_rate": 2.9239333333333333e-05, + "loss": 0.0255, + "step": 12288 + }, + { + "epoch": 9.67664434816857, + "grad_norm": 0.5546546578407288, + "learning_rate": 2.9239000000000002e-05, + "loss": 0.0197, + "step": 12289 + }, + { + "epoch": 9.677432059866089, + "grad_norm": 0.2618027329444885, + "learning_rate": 2.9238666666666665e-05, + "loss": 0.0213, + "step": 12290 + }, + { + "epoch": 9.678219771563608, + "grad_norm": 0.5300785303115845, + "learning_rate": 2.9238333333333334e-05, + "loss": 0.0141, + "step": 12291 + }, + { + "epoch": 9.679007483261126, + "grad_norm": 0.8176157474517822, + "learning_rate": 2.9238e-05, + "loss": 0.0479, + "step": 12292 + }, + { + "epoch": 9.679795194958645, + "grad_norm": 0.3391462564468384, + "learning_rate": 2.9237666666666666e-05, + 
"loss": 0.0184, + "step": 12293 + }, + { + "epoch": 9.680582906656165, + "grad_norm": 0.48216673731803894, + "learning_rate": 2.9237333333333332e-05, + "loss": 0.0191, + "step": 12294 + }, + { + "epoch": 9.681370618353682, + "grad_norm": 0.4453667402267456, + "learning_rate": 2.9237e-05, + "loss": 0.025, + "step": 12295 + }, + { + "epoch": 9.682158330051202, + "grad_norm": 0.3355284035205841, + "learning_rate": 2.9236666666666667e-05, + "loss": 0.0186, + "step": 12296 + }, + { + "epoch": 9.68294604174872, + "grad_norm": 0.24970415234565735, + "learning_rate": 2.9236333333333333e-05, + "loss": 0.012, + "step": 12297 + }, + { + "epoch": 9.683733753446239, + "grad_norm": 0.1448126882314682, + "learning_rate": 2.9236000000000003e-05, + "loss": 0.009, + "step": 12298 + }, + { + "epoch": 9.684521465143757, + "grad_norm": 0.18444736301898956, + "learning_rate": 2.9235666666666665e-05, + "loss": 0.0084, + "step": 12299 + }, + { + "epoch": 9.685309176841276, + "grad_norm": 0.37553900480270386, + "learning_rate": 2.9235333333333335e-05, + "loss": 0.0269, + "step": 12300 + }, + { + "epoch": 9.686096888538795, + "grad_norm": 0.20020811259746552, + "learning_rate": 2.9235e-05, + "loss": 0.0099, + "step": 12301 + }, + { + "epoch": 9.686884600236313, + "grad_norm": 0.6214545369148254, + "learning_rate": 2.9234666666666666e-05, + "loss": 0.0205, + "step": 12302 + }, + { + "epoch": 9.687672311933833, + "grad_norm": 0.24788136780261993, + "learning_rate": 2.9234333333333332e-05, + "loss": 0.0179, + "step": 12303 + }, + { + "epoch": 9.68846002363135, + "grad_norm": 0.8478105664253235, + "learning_rate": 2.9234e-05, + "loss": 0.0194, + "step": 12304 + }, + { + "epoch": 9.68924773532887, + "grad_norm": 0.21415506303310394, + "learning_rate": 2.9233666666666668e-05, + "loss": 0.0094, + "step": 12305 + }, + { + "epoch": 9.69003544702639, + "grad_norm": 0.2719341218471527, + "learning_rate": 2.9233333333333334e-05, + "loss": 0.0482, + "step": 12306 + }, + { + "epoch": 9.690823158723907, + "grad_norm": 0.13699699938297272, + "learning_rate": 2.9233000000000003e-05, + "loss": 0.0064, + "step": 12307 + }, + { + "epoch": 9.691610870421426, + "grad_norm": 0.27368995547294617, + "learning_rate": 2.9232666666666665e-05, + "loss": 0.0158, + "step": 12308 + }, + { + "epoch": 9.692398582118944, + "grad_norm": 0.43531304597854614, + "learning_rate": 2.9232333333333335e-05, + "loss": 0.0227, + "step": 12309 + }, + { + "epoch": 9.693186293816463, + "grad_norm": 0.4147284924983978, + "learning_rate": 2.9232e-05, + "loss": 0.0122, + "step": 12310 + }, + { + "epoch": 9.693974005513981, + "grad_norm": 0.17434582114219666, + "learning_rate": 2.9231666666666667e-05, + "loss": 0.0133, + "step": 12311 + }, + { + "epoch": 9.6947617172115, + "grad_norm": 0.15478381514549255, + "learning_rate": 2.9231333333333336e-05, + "loss": 0.009, + "step": 12312 + }, + { + "epoch": 9.69554942890902, + "grad_norm": 0.27513301372528076, + "learning_rate": 2.9231000000000002e-05, + "loss": 0.02, + "step": 12313 + }, + { + "epoch": 9.696337140606538, + "grad_norm": 0.3000088632106781, + "learning_rate": 2.9230666666666668e-05, + "loss": 0.0112, + "step": 12314 + }, + { + "epoch": 9.697124852304057, + "grad_norm": 0.4554111361503601, + "learning_rate": 2.9230333333333334e-05, + "loss": 0.0116, + "step": 12315 + }, + { + "epoch": 9.697912564001575, + "grad_norm": 0.26652106642723083, + "learning_rate": 2.9230000000000003e-05, + "loss": 0.0149, + "step": 12316 + }, + { + "epoch": 9.698700275699094, + "grad_norm": 0.3131850063800812, + "learning_rate": 
2.9229666666666666e-05, + "loss": 0.0075, + "step": 12317 + }, + { + "epoch": 9.699487987396612, + "grad_norm": 0.5941728353500366, + "learning_rate": 2.9229333333333335e-05, + "loss": 0.0163, + "step": 12318 + }, + { + "epoch": 9.700275699094131, + "grad_norm": 0.2126225382089615, + "learning_rate": 2.9229e-05, + "loss": 0.0052, + "step": 12319 + }, + { + "epoch": 9.701063410791651, + "grad_norm": 0.5269520282745361, + "learning_rate": 2.9228666666666667e-05, + "loss": 0.0209, + "step": 12320 + }, + { + "epoch": 9.701851122489169, + "grad_norm": 0.25299468636512756, + "learning_rate": 2.9228333333333336e-05, + "loss": 0.0217, + "step": 12321 + }, + { + "epoch": 9.702638834186688, + "grad_norm": 0.33093586564064026, + "learning_rate": 2.9228e-05, + "loss": 0.0115, + "step": 12322 + }, + { + "epoch": 9.703426545884206, + "grad_norm": 0.34814268350601196, + "learning_rate": 2.9227666666666668e-05, + "loss": 0.0151, + "step": 12323 + }, + { + "epoch": 9.704214257581725, + "grad_norm": 0.4531005620956421, + "learning_rate": 2.9227333333333334e-05, + "loss": 0.0252, + "step": 12324 + }, + { + "epoch": 9.705001969279245, + "grad_norm": 0.49185144901275635, + "learning_rate": 2.9227e-05, + "loss": 0.0302, + "step": 12325 + }, + { + "epoch": 9.705789680976762, + "grad_norm": 0.43047675490379333, + "learning_rate": 2.9226666666666666e-05, + "loss": 0.0227, + "step": 12326 + }, + { + "epoch": 9.706577392674282, + "grad_norm": 0.36102694272994995, + "learning_rate": 2.9226333333333335e-05, + "loss": 0.0115, + "step": 12327 + }, + { + "epoch": 9.7073651043718, + "grad_norm": 0.17826958000659943, + "learning_rate": 2.9226e-05, + "loss": 0.008, + "step": 12328 + }, + { + "epoch": 9.708152816069319, + "grad_norm": 0.28566688299179077, + "learning_rate": 2.9225666666666667e-05, + "loss": 0.0118, + "step": 12329 + }, + { + "epoch": 9.708940527766837, + "grad_norm": 0.38955971598625183, + "learning_rate": 2.9225333333333336e-05, + "loss": 0.0088, + "step": 12330 + }, + { + "epoch": 9.709728239464356, + "grad_norm": 0.6860036849975586, + "learning_rate": 2.9225e-05, + "loss": 0.2145, + "step": 12331 + }, + { + "epoch": 9.710515951161875, + "grad_norm": 0.5226418375968933, + "learning_rate": 2.9224666666666668e-05, + "loss": 0.1307, + "step": 12332 + }, + { + "epoch": 9.711303662859393, + "grad_norm": 0.6607868075370789, + "learning_rate": 2.9224333333333334e-05, + "loss": 0.1266, + "step": 12333 + }, + { + "epoch": 9.712091374556913, + "grad_norm": 0.4552789032459259, + "learning_rate": 2.9224e-05, + "loss": 0.1044, + "step": 12334 + }, + { + "epoch": 9.71287908625443, + "grad_norm": 0.5827717781066895, + "learning_rate": 2.9223666666666666e-05, + "loss": 0.1253, + "step": 12335 + }, + { + "epoch": 9.71366679795195, + "grad_norm": 0.5120692253112793, + "learning_rate": 2.9223333333333335e-05, + "loss": 0.1005, + "step": 12336 + }, + { + "epoch": 9.714454509649467, + "grad_norm": 0.23071357607841492, + "learning_rate": 2.9223e-05, + "loss": 0.0438, + "step": 12337 + }, + { + "epoch": 9.715242221346987, + "grad_norm": 0.1977076381444931, + "learning_rate": 2.9222666666666667e-05, + "loss": 0.0212, + "step": 12338 + }, + { + "epoch": 9.716029933044506, + "grad_norm": 0.40934237837791443, + "learning_rate": 2.9222333333333337e-05, + "loss": 0.0171, + "step": 12339 + }, + { + "epoch": 9.716817644742024, + "grad_norm": 0.6208634376525879, + "learning_rate": 2.9222e-05, + "loss": 0.0402, + "step": 12340 + }, + { + "epoch": 9.717605356439543, + "grad_norm": 0.26801612973213196, + "learning_rate": 
2.922166666666667e-05, + "loss": 0.0318, + "step": 12341 + }, + { + "epoch": 9.718393068137061, + "grad_norm": 0.4255911409854889, + "learning_rate": 2.9221333333333334e-05, + "loss": 0.0167, + "step": 12342 + }, + { + "epoch": 9.71918077983458, + "grad_norm": 0.29390767216682434, + "learning_rate": 2.9221e-05, + "loss": 0.0133, + "step": 12343 + }, + { + "epoch": 9.7199684915321, + "grad_norm": 0.4177290201187134, + "learning_rate": 2.9220666666666666e-05, + "loss": 0.0384, + "step": 12344 + }, + { + "epoch": 9.720756203229618, + "grad_norm": 0.1355806142091751, + "learning_rate": 2.9220333333333336e-05, + "loss": 0.0079, + "step": 12345 + }, + { + "epoch": 9.721543914927137, + "grad_norm": 0.20087100565433502, + "learning_rate": 2.922e-05, + "loss": 0.0235, + "step": 12346 + }, + { + "epoch": 9.722331626624655, + "grad_norm": 0.5056079030036926, + "learning_rate": 2.9219666666666667e-05, + "loss": 0.0261, + "step": 12347 + }, + { + "epoch": 9.723119338322174, + "grad_norm": 0.3215755820274353, + "learning_rate": 2.9219333333333337e-05, + "loss": 0.0186, + "step": 12348 + }, + { + "epoch": 9.723907050019692, + "grad_norm": 0.22276422381401062, + "learning_rate": 2.9219e-05, + "loss": 0.0177, + "step": 12349 + }, + { + "epoch": 9.724694761717211, + "grad_norm": 0.14812670648097992, + "learning_rate": 2.921866666666667e-05, + "loss": 0.0097, + "step": 12350 + }, + { + "epoch": 9.725482473414731, + "grad_norm": 0.2628515958786011, + "learning_rate": 2.921833333333333e-05, + "loss": 0.0144, + "step": 12351 + }, + { + "epoch": 9.726270185112249, + "grad_norm": 0.3796280324459076, + "learning_rate": 2.9218e-05, + "loss": 0.0134, + "step": 12352 + }, + { + "epoch": 9.727057896809768, + "grad_norm": 0.22711782157421112, + "learning_rate": 2.9217666666666666e-05, + "loss": 0.0097, + "step": 12353 + }, + { + "epoch": 9.727845608507286, + "grad_norm": 0.35924243927001953, + "learning_rate": 2.9217333333333332e-05, + "loss": 0.0137, + "step": 12354 + }, + { + "epoch": 9.728633320204805, + "grad_norm": 0.16632197797298431, + "learning_rate": 2.9217e-05, + "loss": 0.0126, + "step": 12355 + }, + { + "epoch": 9.729421031902323, + "grad_norm": 0.27626585960388184, + "learning_rate": 2.9216666666666668e-05, + "loss": 0.0118, + "step": 12356 + }, + { + "epoch": 9.730208743599842, + "grad_norm": 0.3302980065345764, + "learning_rate": 2.9216333333333334e-05, + "loss": 0.0116, + "step": 12357 + }, + { + "epoch": 9.730996455297362, + "grad_norm": 0.20835690200328827, + "learning_rate": 2.9216e-05, + "loss": 0.0089, + "step": 12358 + }, + { + "epoch": 9.73178416699488, + "grad_norm": 0.2628459632396698, + "learning_rate": 2.921566666666667e-05, + "loss": 0.0121, + "step": 12359 + }, + { + "epoch": 9.732571878692399, + "grad_norm": 0.3034976124763489, + "learning_rate": 2.921533333333333e-05, + "loss": 0.0145, + "step": 12360 + }, + { + "epoch": 9.733359590389917, + "grad_norm": 0.29634422063827515, + "learning_rate": 2.9215e-05, + "loss": 0.0175, + "step": 12361 + }, + { + "epoch": 9.734147302087436, + "grad_norm": 0.2880372107028961, + "learning_rate": 2.921466666666667e-05, + "loss": 0.0115, + "step": 12362 + }, + { + "epoch": 9.734935013784956, + "grad_norm": 0.2918242812156677, + "learning_rate": 2.9214333333333333e-05, + "loss": 0.02, + "step": 12363 + }, + { + "epoch": 9.735722725482473, + "grad_norm": 0.1955975741147995, + "learning_rate": 2.9214000000000002e-05, + "loss": 0.0124, + "step": 12364 + }, + { + "epoch": 9.736510437179993, + "grad_norm": 0.21387967467308044, + "learning_rate": 
2.9213666666666668e-05, + "loss": 0.0195, + "step": 12365 + }, + { + "epoch": 9.73729814887751, + "grad_norm": 0.41127628087997437, + "learning_rate": 2.9213333333333334e-05, + "loss": 0.0143, + "step": 12366 + }, + { + "epoch": 9.73808586057503, + "grad_norm": 0.18376648426055908, + "learning_rate": 2.9213e-05, + "loss": 0.0125, + "step": 12367 + }, + { + "epoch": 9.738873572272547, + "grad_norm": 0.410474568605423, + "learning_rate": 2.921266666666667e-05, + "loss": 0.0153, + "step": 12368 + }, + { + "epoch": 9.739661283970067, + "grad_norm": 0.2822839021682739, + "learning_rate": 2.921233333333333e-05, + "loss": 0.0243, + "step": 12369 + }, + { + "epoch": 9.740448995667586, + "grad_norm": 0.2585303783416748, + "learning_rate": 2.9212e-05, + "loss": 0.0147, + "step": 12370 + }, + { + "epoch": 9.741236707365104, + "grad_norm": 0.319929301738739, + "learning_rate": 2.921166666666667e-05, + "loss": 0.0191, + "step": 12371 + }, + { + "epoch": 9.742024419062624, + "grad_norm": 0.2565779983997345, + "learning_rate": 2.9211333333333333e-05, + "loss": 0.0114, + "step": 12372 + }, + { + "epoch": 9.742812130760141, + "grad_norm": 1.1240613460540771, + "learning_rate": 2.9211000000000002e-05, + "loss": 0.0155, + "step": 12373 + }, + { + "epoch": 9.74359984245766, + "grad_norm": 0.2265288531780243, + "learning_rate": 2.9210666666666668e-05, + "loss": 0.0153, + "step": 12374 + }, + { + "epoch": 9.744387554155178, + "grad_norm": 0.18533553183078766, + "learning_rate": 2.9210333333333334e-05, + "loss": 0.0101, + "step": 12375 + }, + { + "epoch": 9.745175265852698, + "grad_norm": 0.13053005933761597, + "learning_rate": 2.921e-05, + "loss": 0.0084, + "step": 12376 + }, + { + "epoch": 9.745962977550217, + "grad_norm": 0.40044888854026794, + "learning_rate": 2.920966666666667e-05, + "loss": 0.0201, + "step": 12377 + }, + { + "epoch": 9.746750689247735, + "grad_norm": 0.48188814520835876, + "learning_rate": 2.9209333333333335e-05, + "loss": 0.0245, + "step": 12378 + }, + { + "epoch": 9.747538400945254, + "grad_norm": 0.2500670254230499, + "learning_rate": 2.9209e-05, + "loss": 0.0108, + "step": 12379 + }, + { + "epoch": 9.748326112642772, + "grad_norm": 0.1861724555492401, + "learning_rate": 2.9208666666666667e-05, + "loss": 0.0087, + "step": 12380 + }, + { + "epoch": 9.749113824340292, + "grad_norm": 0.6336219906806946, + "learning_rate": 2.9208333333333333e-05, + "loss": 0.1874, + "step": 12381 + }, + { + "epoch": 9.749901536037811, + "grad_norm": 0.5395677089691162, + "learning_rate": 2.9208000000000002e-05, + "loss": 0.1371, + "step": 12382 + }, + { + "epoch": 9.750689247735329, + "grad_norm": 0.5062285661697388, + "learning_rate": 2.9207666666666665e-05, + "loss": 0.1117, + "step": 12383 + }, + { + "epoch": 9.751476959432848, + "grad_norm": 0.4194500744342804, + "learning_rate": 2.9207333333333334e-05, + "loss": 0.1011, + "step": 12384 + }, + { + "epoch": 9.752264671130366, + "grad_norm": 0.36115020513534546, + "learning_rate": 2.9207e-05, + "loss": 0.0828, + "step": 12385 + }, + { + "epoch": 9.753052382827885, + "grad_norm": 0.24176821112632751, + "learning_rate": 2.9206666666666666e-05, + "loss": 0.0262, + "step": 12386 + }, + { + "epoch": 9.753840094525403, + "grad_norm": 0.22294127941131592, + "learning_rate": 2.9206333333333335e-05, + "loss": 0.0444, + "step": 12387 + }, + { + "epoch": 9.754627806222922, + "grad_norm": 0.58278888463974, + "learning_rate": 2.9206e-05, + "loss": 0.0557, + "step": 12388 + }, + { + "epoch": 9.755415517920442, + "grad_norm": 0.16539283096790314, + "learning_rate": 
2.9205666666666667e-05, + "loss": 0.0186, + "step": 12389 + }, + { + "epoch": 9.75620322961796, + "grad_norm": 0.15674299001693726, + "learning_rate": 2.9205333333333333e-05, + "loss": 0.0143, + "step": 12390 + }, + { + "epoch": 9.756990941315479, + "grad_norm": 0.2294137179851532, + "learning_rate": 2.9205000000000002e-05, + "loss": 0.014, + "step": 12391 + }, + { + "epoch": 9.757778653012997, + "grad_norm": 0.3534223735332489, + "learning_rate": 2.9204666666666665e-05, + "loss": 0.0199, + "step": 12392 + }, + { + "epoch": 9.758566364710516, + "grad_norm": 0.3126717507839203, + "learning_rate": 2.9204333333333334e-05, + "loss": 0.0257, + "step": 12393 + }, + { + "epoch": 9.759354076408034, + "grad_norm": 0.19960978627204895, + "learning_rate": 2.9204e-05, + "loss": 0.0135, + "step": 12394 + }, + { + "epoch": 9.760141788105553, + "grad_norm": 0.20286120474338531, + "learning_rate": 2.9203666666666666e-05, + "loss": 0.0103, + "step": 12395 + }, + { + "epoch": 9.760929499803073, + "grad_norm": 0.2558288872241974, + "learning_rate": 2.9203333333333336e-05, + "loss": 0.0098, + "step": 12396 + }, + { + "epoch": 9.76171721150059, + "grad_norm": 0.17334061861038208, + "learning_rate": 2.9203e-05, + "loss": 0.0097, + "step": 12397 + }, + { + "epoch": 9.76250492319811, + "grad_norm": 0.2796182334423065, + "learning_rate": 2.9202666666666667e-05, + "loss": 0.0174, + "step": 12398 + }, + { + "epoch": 9.763292634895627, + "grad_norm": 0.2653733789920807, + "learning_rate": 2.9202333333333333e-05, + "loss": 0.0103, + "step": 12399 + }, + { + "epoch": 9.764080346593147, + "grad_norm": 0.447797954082489, + "learning_rate": 2.9202000000000003e-05, + "loss": 0.0128, + "step": 12400 + }, + { + "epoch": 9.764868058290666, + "grad_norm": 0.2394559383392334, + "learning_rate": 2.9201666666666665e-05, + "loss": 0.0163, + "step": 12401 + }, + { + "epoch": 9.765655769988184, + "grad_norm": 0.2743283808231354, + "learning_rate": 2.9201333333333335e-05, + "loss": 0.0079, + "step": 12402 + }, + { + "epoch": 9.766443481685704, + "grad_norm": 0.17585456371307373, + "learning_rate": 2.9201e-05, + "loss": 0.0125, + "step": 12403 + }, + { + "epoch": 9.767231193383221, + "grad_norm": 0.2403867542743683, + "learning_rate": 2.9200666666666666e-05, + "loss": 0.0202, + "step": 12404 + }, + { + "epoch": 9.76801890508074, + "grad_norm": 0.18045182526111603, + "learning_rate": 2.9200333333333336e-05, + "loss": 0.0091, + "step": 12405 + }, + { + "epoch": 9.768806616778258, + "grad_norm": 0.29274559020996094, + "learning_rate": 2.92e-05, + "loss": 0.0148, + "step": 12406 + }, + { + "epoch": 9.769594328475778, + "grad_norm": 0.6419203281402588, + "learning_rate": 2.9199666666666668e-05, + "loss": 0.0099, + "step": 12407 + }, + { + "epoch": 9.770382040173297, + "grad_norm": 0.18974700570106506, + "learning_rate": 2.9199333333333334e-05, + "loss": 0.008, + "step": 12408 + }, + { + "epoch": 9.771169751870815, + "grad_norm": 0.1881694793701172, + "learning_rate": 2.9199000000000003e-05, + "loss": 0.012, + "step": 12409 + }, + { + "epoch": 9.771957463568334, + "grad_norm": 0.1718652993440628, + "learning_rate": 2.9198666666666665e-05, + "loss": 0.0081, + "step": 12410 + }, + { + "epoch": 9.772745175265852, + "grad_norm": 0.30836933851242065, + "learning_rate": 2.9198333333333335e-05, + "loss": 0.0106, + "step": 12411 + }, + { + "epoch": 9.773532886963372, + "grad_norm": 0.2594447135925293, + "learning_rate": 2.9198e-05, + "loss": 0.0148, + "step": 12412 + }, + { + "epoch": 9.77432059866089, + "grad_norm": 0.10543178021907806, + 
"learning_rate": 2.9197666666666667e-05, + "loss": 0.0078, + "step": 12413 + }, + { + "epoch": 9.775108310358409, + "grad_norm": 0.24818506836891174, + "learning_rate": 2.9197333333333336e-05, + "loss": 0.0194, + "step": 12414 + }, + { + "epoch": 9.775896022055928, + "grad_norm": 0.3662746846675873, + "learning_rate": 2.9197e-05, + "loss": 0.0124, + "step": 12415 + }, + { + "epoch": 9.776683733753446, + "grad_norm": 0.573785662651062, + "learning_rate": 2.9196666666666668e-05, + "loss": 0.0148, + "step": 12416 + }, + { + "epoch": 9.777471445450965, + "grad_norm": 0.29083704948425293, + "learning_rate": 2.9196333333333334e-05, + "loss": 0.0168, + "step": 12417 + }, + { + "epoch": 9.778259157148483, + "grad_norm": 0.25144872069358826, + "learning_rate": 2.9196e-05, + "loss": 0.0138, + "step": 12418 + }, + { + "epoch": 9.779046868846002, + "grad_norm": 0.268844336271286, + "learning_rate": 2.9195666666666666e-05, + "loss": 0.0277, + "step": 12419 + }, + { + "epoch": 9.779834580543522, + "grad_norm": 0.40350064635276794, + "learning_rate": 2.9195333333333335e-05, + "loss": 0.0144, + "step": 12420 + }, + { + "epoch": 9.78062229224104, + "grad_norm": 0.27914655208587646, + "learning_rate": 2.9195e-05, + "loss": 0.0145, + "step": 12421 + }, + { + "epoch": 9.781410003938559, + "grad_norm": 0.24133490025997162, + "learning_rate": 2.9194666666666667e-05, + "loss": 0.0106, + "step": 12422 + }, + { + "epoch": 9.782197715636077, + "grad_norm": 0.6920658946037292, + "learning_rate": 2.9194333333333336e-05, + "loss": 0.0264, + "step": 12423 + }, + { + "epoch": 9.782985427333596, + "grad_norm": 0.4262842535972595, + "learning_rate": 2.9194e-05, + "loss": 0.0158, + "step": 12424 + }, + { + "epoch": 9.783773139031114, + "grad_norm": 0.23762017488479614, + "learning_rate": 2.9193666666666668e-05, + "loss": 0.0154, + "step": 12425 + }, + { + "epoch": 9.784560850728633, + "grad_norm": 0.2686269283294678, + "learning_rate": 2.9193333333333334e-05, + "loss": 0.0131, + "step": 12426 + }, + { + "epoch": 9.785348562426153, + "grad_norm": 0.34429988265037537, + "learning_rate": 2.9193e-05, + "loss": 0.0177, + "step": 12427 + }, + { + "epoch": 9.78613627412367, + "grad_norm": 0.24188558757305145, + "learning_rate": 2.919266666666667e-05, + "loss": 0.0155, + "step": 12428 + }, + { + "epoch": 9.78692398582119, + "grad_norm": 0.2521408498287201, + "learning_rate": 2.9192333333333335e-05, + "loss": 0.0152, + "step": 12429 + }, + { + "epoch": 9.787711697518708, + "grad_norm": 0.310242235660553, + "learning_rate": 2.9192e-05, + "loss": 0.019, + "step": 12430 + }, + { + "epoch": 9.788499409216227, + "grad_norm": 0.7671269178390503, + "learning_rate": 2.9191666666666667e-05, + "loss": 0.2085, + "step": 12431 + }, + { + "epoch": 9.789287120913745, + "grad_norm": 0.5142435431480408, + "learning_rate": 2.9191333333333336e-05, + "loss": 0.1974, + "step": 12432 + }, + { + "epoch": 9.790074832611264, + "grad_norm": 0.3681847155094147, + "learning_rate": 2.9191e-05, + "loss": 0.095, + "step": 12433 + }, + { + "epoch": 9.790862544308784, + "grad_norm": 0.4091109037399292, + "learning_rate": 2.9190666666666668e-05, + "loss": 0.0954, + "step": 12434 + }, + { + "epoch": 9.791650256006301, + "grad_norm": 0.42152947187423706, + "learning_rate": 2.9190333333333334e-05, + "loss": 0.0832, + "step": 12435 + }, + { + "epoch": 9.79243796770382, + "grad_norm": 0.27783626317977905, + "learning_rate": 2.919e-05, + "loss": 0.0331, + "step": 12436 + }, + { + "epoch": 9.793225679401338, + "grad_norm": 0.33385390043258667, + "learning_rate": 
2.918966666666667e-05, + "loss": 0.0285, + "step": 12437 + }, + { + "epoch": 9.794013391098858, + "grad_norm": 0.2726515829563141, + "learning_rate": 2.9189333333333335e-05, + "loss": 0.0294, + "step": 12438 + }, + { + "epoch": 9.794801102796377, + "grad_norm": 0.18922331929206848, + "learning_rate": 2.9189e-05, + "loss": 0.0162, + "step": 12439 + }, + { + "epoch": 9.795588814493895, + "grad_norm": 0.22312958538532257, + "learning_rate": 2.9188666666666667e-05, + "loss": 0.018, + "step": 12440 + }, + { + "epoch": 9.796376526191414, + "grad_norm": 0.23034562170505524, + "learning_rate": 2.9188333333333333e-05, + "loss": 0.0148, + "step": 12441 + }, + { + "epoch": 9.797164237888932, + "grad_norm": 0.266674280166626, + "learning_rate": 2.9188e-05, + "loss": 0.0139, + "step": 12442 + }, + { + "epoch": 9.797951949586452, + "grad_norm": 0.2885451316833496, + "learning_rate": 2.918766666666667e-05, + "loss": 0.0274, + "step": 12443 + }, + { + "epoch": 9.798739661283971, + "grad_norm": 0.18296298384666443, + "learning_rate": 2.918733333333333e-05, + "loss": 0.0083, + "step": 12444 + }, + { + "epoch": 9.799527372981489, + "grad_norm": 0.31052958965301514, + "learning_rate": 2.9187e-05, + "loss": 0.0183, + "step": 12445 + }, + { + "epoch": 9.800315084679008, + "grad_norm": 0.23983143270015717, + "learning_rate": 2.918666666666667e-05, + "loss": 0.0166, + "step": 12446 + }, + { + "epoch": 9.801102796376526, + "grad_norm": 0.24161723256111145, + "learning_rate": 2.9186333333333332e-05, + "loss": 0.0137, + "step": 12447 + }, + { + "epoch": 9.801890508074045, + "grad_norm": 0.14737434685230255, + "learning_rate": 2.9186e-05, + "loss": 0.0071, + "step": 12448 + }, + { + "epoch": 9.802678219771563, + "grad_norm": 0.1585020273923874, + "learning_rate": 2.9185666666666667e-05, + "loss": 0.008, + "step": 12449 + }, + { + "epoch": 9.803465931469082, + "grad_norm": 0.2780205309391022, + "learning_rate": 2.9185333333333333e-05, + "loss": 0.0134, + "step": 12450 + }, + { + "epoch": 9.8042536431666, + "grad_norm": 0.2356192022562027, + "learning_rate": 2.9185e-05, + "loss": 0.0073, + "step": 12451 + }, + { + "epoch": 9.80504135486412, + "grad_norm": 0.3840092122554779, + "learning_rate": 2.918466666666667e-05, + "loss": 0.0295, + "step": 12452 + }, + { + "epoch": 9.805829066561639, + "grad_norm": 0.23255786299705505, + "learning_rate": 2.9184333333333335e-05, + "loss": 0.0066, + "step": 12453 + }, + { + "epoch": 9.806616778259157, + "grad_norm": 0.38331663608551025, + "learning_rate": 2.9184e-05, + "loss": 0.0179, + "step": 12454 + }, + { + "epoch": 9.807404489956676, + "grad_norm": 0.11274369060993195, + "learning_rate": 2.918366666666667e-05, + "loss": 0.0073, + "step": 12455 + }, + { + "epoch": 9.808192201654194, + "grad_norm": 0.29035183787345886, + "learning_rate": 2.9183333333333332e-05, + "loss": 0.0153, + "step": 12456 + }, + { + "epoch": 9.808979913351713, + "grad_norm": 0.25432318449020386, + "learning_rate": 2.9183e-05, + "loss": 0.0149, + "step": 12457 + }, + { + "epoch": 9.809767625049233, + "grad_norm": 0.3982793688774109, + "learning_rate": 2.9182666666666668e-05, + "loss": 0.02, + "step": 12458 + }, + { + "epoch": 9.81055533674675, + "grad_norm": 0.29504525661468506, + "learning_rate": 2.9182333333333334e-05, + "loss": 0.0261, + "step": 12459 + }, + { + "epoch": 9.81134304844427, + "grad_norm": 0.22330988943576813, + "learning_rate": 2.9182e-05, + "loss": 0.0096, + "step": 12460 + }, + { + "epoch": 9.812130760141788, + "grad_norm": 0.5419144630432129, + "learning_rate": 2.918166666666667e-05, + 
"loss": 0.0079, + "step": 12461 + }, + { + "epoch": 9.812918471839307, + "grad_norm": 0.27812302112579346, + "learning_rate": 2.9181333333333335e-05, + "loss": 0.0131, + "step": 12462 + }, + { + "epoch": 9.813706183536826, + "grad_norm": 0.31737881898880005, + "learning_rate": 2.9181e-05, + "loss": 0.0216, + "step": 12463 + }, + { + "epoch": 9.814493895234344, + "grad_norm": 0.21723012626171112, + "learning_rate": 2.918066666666667e-05, + "loss": 0.0149, + "step": 12464 + }, + { + "epoch": 9.815281606931864, + "grad_norm": 0.23969905078411102, + "learning_rate": 2.9180333333333333e-05, + "loss": 0.0105, + "step": 12465 + }, + { + "epoch": 9.816069318629381, + "grad_norm": 0.15593892335891724, + "learning_rate": 2.9180000000000002e-05, + "loss": 0.0069, + "step": 12466 + }, + { + "epoch": 9.8168570303269, + "grad_norm": 0.26937443017959595, + "learning_rate": 2.9179666666666668e-05, + "loss": 0.0229, + "step": 12467 + }, + { + "epoch": 9.817644742024418, + "grad_norm": 0.4496159553527832, + "learning_rate": 2.9179333333333334e-05, + "loss": 0.0284, + "step": 12468 + }, + { + "epoch": 9.818432453721938, + "grad_norm": 0.37924715876579285, + "learning_rate": 2.9179e-05, + "loss": 0.0191, + "step": 12469 + }, + { + "epoch": 9.819220165419457, + "grad_norm": 0.4329739809036255, + "learning_rate": 2.917866666666667e-05, + "loss": 0.0136, + "step": 12470 + }, + { + "epoch": 9.820007877116975, + "grad_norm": 0.24538840353488922, + "learning_rate": 2.9178333333333335e-05, + "loss": 0.018, + "step": 12471 + }, + { + "epoch": 9.820795588814494, + "grad_norm": 0.2031591832637787, + "learning_rate": 2.9178e-05, + "loss": 0.0085, + "step": 12472 + }, + { + "epoch": 9.821583300512012, + "grad_norm": 0.22875376045703888, + "learning_rate": 2.9177666666666667e-05, + "loss": 0.0109, + "step": 12473 + }, + { + "epoch": 9.822371012209532, + "grad_norm": 0.18439678847789764, + "learning_rate": 2.9177333333333333e-05, + "loss": 0.0116, + "step": 12474 + }, + { + "epoch": 9.82315872390705, + "grad_norm": 0.32964062690734863, + "learning_rate": 2.9177000000000002e-05, + "loss": 0.0153, + "step": 12475 + }, + { + "epoch": 9.823946435604569, + "grad_norm": 0.3301542401313782, + "learning_rate": 2.9176666666666665e-05, + "loss": 0.0374, + "step": 12476 + }, + { + "epoch": 9.824734147302088, + "grad_norm": 0.8749275803565979, + "learning_rate": 2.9176333333333334e-05, + "loss": 0.0305, + "step": 12477 + }, + { + "epoch": 9.825521858999606, + "grad_norm": 0.370885968208313, + "learning_rate": 2.9176000000000003e-05, + "loss": 0.0105, + "step": 12478 + }, + { + "epoch": 9.826309570697125, + "grad_norm": 0.38556480407714844, + "learning_rate": 2.9175666666666666e-05, + "loss": 0.0257, + "step": 12479 + }, + { + "epoch": 9.827097282394643, + "grad_norm": 0.4763261675834656, + "learning_rate": 2.9175333333333335e-05, + "loss": 0.0241, + "step": 12480 + }, + { + "epoch": 9.827884994092162, + "grad_norm": 0.7896156311035156, + "learning_rate": 2.9175e-05, + "loss": 0.2759, + "step": 12481 + }, + { + "epoch": 9.828672705789682, + "grad_norm": 0.5028300285339355, + "learning_rate": 2.9174666666666667e-05, + "loss": 0.1476, + "step": 12482 + }, + { + "epoch": 9.8294604174872, + "grad_norm": 0.47812482714653015, + "learning_rate": 2.9174333333333333e-05, + "loss": 0.1362, + "step": 12483 + }, + { + "epoch": 9.830248129184719, + "grad_norm": 0.8633087277412415, + "learning_rate": 2.9174000000000002e-05, + "loss": 0.0824, + "step": 12484 + }, + { + "epoch": 9.831035840882237, + "grad_norm": 0.4068009853363037, + "learning_rate": 
2.9173666666666665e-05, + "loss": 0.042, + "step": 12485 + }, + { + "epoch": 9.831823552579756, + "grad_norm": 0.398829847574234, + "learning_rate": 2.9173333333333334e-05, + "loss": 0.0451, + "step": 12486 + }, + { + "epoch": 9.832611264277274, + "grad_norm": 0.34665796160697937, + "learning_rate": 2.9173000000000003e-05, + "loss": 0.0446, + "step": 12487 + }, + { + "epoch": 9.833398975974793, + "grad_norm": 0.21505406498908997, + "learning_rate": 2.9172666666666666e-05, + "loss": 0.0149, + "step": 12488 + }, + { + "epoch": 9.834186687672313, + "grad_norm": 0.34082621335983276, + "learning_rate": 2.9172333333333335e-05, + "loss": 0.0359, + "step": 12489 + }, + { + "epoch": 9.83497439936983, + "grad_norm": 0.2960125803947449, + "learning_rate": 2.9172e-05, + "loss": 0.0232, + "step": 12490 + }, + { + "epoch": 9.83576211106735, + "grad_norm": 0.14827774465084076, + "learning_rate": 2.9171666666666667e-05, + "loss": 0.0096, + "step": 12491 + }, + { + "epoch": 9.836549822764868, + "grad_norm": 0.26550203561782837, + "learning_rate": 2.9171333333333333e-05, + "loss": 0.0137, + "step": 12492 + }, + { + "epoch": 9.837337534462387, + "grad_norm": 0.24019017815589905, + "learning_rate": 2.9171000000000002e-05, + "loss": 0.0188, + "step": 12493 + }, + { + "epoch": 9.838125246159905, + "grad_norm": 0.16565845906734467, + "learning_rate": 2.9170666666666665e-05, + "loss": 0.0106, + "step": 12494 + }, + { + "epoch": 9.838912957857424, + "grad_norm": 0.12535403668880463, + "learning_rate": 2.9170333333333334e-05, + "loss": 0.0076, + "step": 12495 + }, + { + "epoch": 9.839700669554944, + "grad_norm": 0.28475967049598694, + "learning_rate": 2.9170000000000004e-05, + "loss": 0.0089, + "step": 12496 + }, + { + "epoch": 9.840488381252461, + "grad_norm": 0.27499502897262573, + "learning_rate": 2.9169666666666666e-05, + "loss": 0.0146, + "step": 12497 + }, + { + "epoch": 9.84127609294998, + "grad_norm": 0.22809700667858124, + "learning_rate": 2.9169333333333335e-05, + "loss": 0.0132, + "step": 12498 + }, + { + "epoch": 9.842063804647498, + "grad_norm": 0.2202320247888565, + "learning_rate": 2.9169e-05, + "loss": 0.0142, + "step": 12499 + }, + { + "epoch": 9.842851516345018, + "grad_norm": 0.21959298849105835, + "learning_rate": 2.9168666666666667e-05, + "loss": 0.0075, + "step": 12500 + }, + { + "epoch": 9.843639228042537, + "grad_norm": 0.2665179669857025, + "learning_rate": 2.9168333333333333e-05, + "loss": 0.0102, + "step": 12501 + }, + { + "epoch": 9.844426939740055, + "grad_norm": 0.24749480187892914, + "learning_rate": 2.9168e-05, + "loss": 0.0104, + "step": 12502 + }, + { + "epoch": 9.845214651437574, + "grad_norm": 0.4779033064842224, + "learning_rate": 2.916766666666667e-05, + "loss": 0.0208, + "step": 12503 + }, + { + "epoch": 9.846002363135092, + "grad_norm": 0.33150845766067505, + "learning_rate": 2.9167333333333334e-05, + "loss": 0.016, + "step": 12504 + }, + { + "epoch": 9.846790074832612, + "grad_norm": 1.088761806488037, + "learning_rate": 2.9167e-05, + "loss": 0.0087, + "step": 12505 + }, + { + "epoch": 9.84757778653013, + "grad_norm": 0.4754149615764618, + "learning_rate": 2.9166666666666666e-05, + "loss": 0.0124, + "step": 12506 + }, + { + "epoch": 9.848365498227649, + "grad_norm": 0.2130357027053833, + "learning_rate": 2.9166333333333336e-05, + "loss": 0.0184, + "step": 12507 + }, + { + "epoch": 9.849153209925168, + "grad_norm": 0.30053237080574036, + "learning_rate": 2.9165999999999998e-05, + "loss": 0.012, + "step": 12508 + }, + { + "epoch": 9.849940921622686, + "grad_norm": 
0.19034193456172943, + "learning_rate": 2.9165666666666668e-05, + "loss": 0.0178, + "step": 12509 + }, + { + "epoch": 9.850728633320205, + "grad_norm": 0.22580558061599731, + "learning_rate": 2.9165333333333334e-05, + "loss": 0.0132, + "step": 12510 + }, + { + "epoch": 9.851516345017723, + "grad_norm": 0.49213293194770813, + "learning_rate": 2.9165e-05, + "loss": 0.0194, + "step": 12511 + }, + { + "epoch": 9.852304056715242, + "grad_norm": 0.39307934045791626, + "learning_rate": 2.916466666666667e-05, + "loss": 0.0234, + "step": 12512 + }, + { + "epoch": 9.85309176841276, + "grad_norm": 0.35793063044548035, + "learning_rate": 2.9164333333333335e-05, + "loss": 0.0257, + "step": 12513 + }, + { + "epoch": 9.85387948011028, + "grad_norm": 0.33992713689804077, + "learning_rate": 2.9164e-05, + "loss": 0.0095, + "step": 12514 + }, + { + "epoch": 9.854667191807799, + "grad_norm": 1.307047963142395, + "learning_rate": 2.9163666666666667e-05, + "loss": 0.0192, + "step": 12515 + }, + { + "epoch": 9.855454903505317, + "grad_norm": 0.19137486815452576, + "learning_rate": 2.9163333333333336e-05, + "loss": 0.0089, + "step": 12516 + }, + { + "epoch": 9.856242615202836, + "grad_norm": 0.38512739539146423, + "learning_rate": 2.9163e-05, + "loss": 0.0126, + "step": 12517 + }, + { + "epoch": 9.857030326900354, + "grad_norm": 0.21228042244911194, + "learning_rate": 2.9162666666666668e-05, + "loss": 0.0146, + "step": 12518 + }, + { + "epoch": 9.857818038597873, + "grad_norm": 0.29112565517425537, + "learning_rate": 2.9162333333333334e-05, + "loss": 0.0168, + "step": 12519 + }, + { + "epoch": 9.858605750295393, + "grad_norm": 0.21041397750377655, + "learning_rate": 2.9162e-05, + "loss": 0.0081, + "step": 12520 + }, + { + "epoch": 9.85939346199291, + "grad_norm": 0.33518773317337036, + "learning_rate": 2.916166666666667e-05, + "loss": 0.011, + "step": 12521 + }, + { + "epoch": 9.86018117369043, + "grad_norm": 0.48791956901550293, + "learning_rate": 2.9161333333333335e-05, + "loss": 0.0163, + "step": 12522 + }, + { + "epoch": 9.860968885387948, + "grad_norm": 0.3966834247112274, + "learning_rate": 2.9161e-05, + "loss": 0.0252, + "step": 12523 + }, + { + "epoch": 9.861756597085467, + "grad_norm": 0.3253757953643799, + "learning_rate": 2.9160666666666667e-05, + "loss": 0.0119, + "step": 12524 + }, + { + "epoch": 9.862544308782985, + "grad_norm": 0.706229567527771, + "learning_rate": 2.9160333333333336e-05, + "loss": 0.0223, + "step": 12525 + }, + { + "epoch": 9.863332020480504, + "grad_norm": 0.2338034063577652, + "learning_rate": 2.916e-05, + "loss": 0.0138, + "step": 12526 + }, + { + "epoch": 9.864119732178024, + "grad_norm": 0.3145793080329895, + "learning_rate": 2.9159666666666668e-05, + "loss": 0.0138, + "step": 12527 + }, + { + "epoch": 9.864907443875541, + "grad_norm": 0.36567845940589905, + "learning_rate": 2.9159333333333337e-05, + "loss": 0.0229, + "step": 12528 + }, + { + "epoch": 9.86569515557306, + "grad_norm": 0.44532057642936707, + "learning_rate": 2.9159e-05, + "loss": 0.0183, + "step": 12529 + }, + { + "epoch": 9.866482867270578, + "grad_norm": 0.3382594883441925, + "learning_rate": 2.915866666666667e-05, + "loss": 0.0173, + "step": 12530 + }, + { + "epoch": 9.867270578968098, + "grad_norm": 0.7355940341949463, + "learning_rate": 2.9158333333333335e-05, + "loss": 0.2225, + "step": 12531 + }, + { + "epoch": 9.868058290665616, + "grad_norm": 0.5302306413650513, + "learning_rate": 2.9158e-05, + "loss": 0.1569, + "step": 12532 + }, + { + "epoch": 9.868846002363135, + "grad_norm": 0.43382197618484497, + 
"learning_rate": 2.9157666666666667e-05, + "loss": 0.1306, + "step": 12533 + }, + { + "epoch": 9.869633714060654, + "grad_norm": 0.4085794687271118, + "learning_rate": 2.9157333333333333e-05, + "loss": 0.0998, + "step": 12534 + }, + { + "epoch": 9.870421425758172, + "grad_norm": 0.30278873443603516, + "learning_rate": 2.9157e-05, + "loss": 0.0736, + "step": 12535 + }, + { + "epoch": 9.871209137455692, + "grad_norm": 0.35448187589645386, + "learning_rate": 2.9156666666666668e-05, + "loss": 0.0416, + "step": 12536 + }, + { + "epoch": 9.87199684915321, + "grad_norm": 0.2881854772567749, + "learning_rate": 2.9156333333333334e-05, + "loss": 0.0447, + "step": 12537 + }, + { + "epoch": 9.872784560850729, + "grad_norm": 0.22024467587471008, + "learning_rate": 2.9156e-05, + "loss": 0.0246, + "step": 12538 + }, + { + "epoch": 9.873572272548248, + "grad_norm": 0.8075476884841919, + "learning_rate": 2.915566666666667e-05, + "loss": 0.0235, + "step": 12539 + }, + { + "epoch": 9.874359984245766, + "grad_norm": 0.2640390396118164, + "learning_rate": 2.9155333333333332e-05, + "loss": 0.0123, + "step": 12540 + }, + { + "epoch": 9.875147695943285, + "grad_norm": 0.2325528860092163, + "learning_rate": 2.9155e-05, + "loss": 0.0153, + "step": 12541 + }, + { + "epoch": 9.875935407640803, + "grad_norm": 0.1771036833524704, + "learning_rate": 2.9154666666666667e-05, + "loss": 0.0101, + "step": 12542 + }, + { + "epoch": 9.876723119338322, + "grad_norm": 0.16785196959972382, + "learning_rate": 2.9154333333333333e-05, + "loss": 0.0104, + "step": 12543 + }, + { + "epoch": 9.87751083103584, + "grad_norm": 0.4086218476295471, + "learning_rate": 2.9154e-05, + "loss": 0.0155, + "step": 12544 + }, + { + "epoch": 9.87829854273336, + "grad_norm": 0.1603579819202423, + "learning_rate": 2.915366666666667e-05, + "loss": 0.0145, + "step": 12545 + }, + { + "epoch": 9.879086254430879, + "grad_norm": 0.1965462863445282, + "learning_rate": 2.9153333333333334e-05, + "loss": 0.0095, + "step": 12546 + }, + { + "epoch": 9.879873966128397, + "grad_norm": 0.5817281007766724, + "learning_rate": 2.9153e-05, + "loss": 0.0122, + "step": 12547 + }, + { + "epoch": 9.880661677825916, + "grad_norm": 0.18847936391830444, + "learning_rate": 2.915266666666667e-05, + "loss": 0.0103, + "step": 12548 + }, + { + "epoch": 9.881449389523434, + "grad_norm": 0.1727415770292282, + "learning_rate": 2.9152333333333332e-05, + "loss": 0.0071, + "step": 12549 + }, + { + "epoch": 9.882237101220953, + "grad_norm": 0.3240164518356323, + "learning_rate": 2.9152e-05, + "loss": 0.0166, + "step": 12550 + }, + { + "epoch": 9.883024812918471, + "grad_norm": 0.29627102613449097, + "learning_rate": 2.9151666666666667e-05, + "loss": 0.0169, + "step": 12551 + }, + { + "epoch": 9.88381252461599, + "grad_norm": 0.2304316610097885, + "learning_rate": 2.9151333333333333e-05, + "loss": 0.0107, + "step": 12552 + }, + { + "epoch": 9.88460023631351, + "grad_norm": 0.35811784863471985, + "learning_rate": 2.9151000000000003e-05, + "loss": 0.0229, + "step": 12553 + }, + { + "epoch": 9.885387948011028, + "grad_norm": 0.20320120453834534, + "learning_rate": 2.915066666666667e-05, + "loss": 0.008, + "step": 12554 + }, + { + "epoch": 9.886175659708547, + "grad_norm": 0.6276481747627258, + "learning_rate": 2.9150333333333334e-05, + "loss": 0.0218, + "step": 12555 + }, + { + "epoch": 9.886963371406065, + "grad_norm": 0.3872637450695038, + "learning_rate": 2.915e-05, + "loss": 0.0155, + "step": 12556 + }, + { + "epoch": 9.887751083103584, + "grad_norm": 0.4274654984474182, + "learning_rate": 
2.914966666666667e-05, + "loss": 0.0093, + "step": 12557 + }, + { + "epoch": 9.888538794801104, + "grad_norm": 0.2494245171546936, + "learning_rate": 2.9149333333333332e-05, + "loss": 0.0122, + "step": 12558 + }, + { + "epoch": 9.889326506498621, + "grad_norm": 0.23964160680770874, + "learning_rate": 2.9149e-05, + "loss": 0.0091, + "step": 12559 + }, + { + "epoch": 9.89011421819614, + "grad_norm": 0.3373747169971466, + "learning_rate": 2.9148666666666668e-05, + "loss": 0.0174, + "step": 12560 + }, + { + "epoch": 9.890901929893658, + "grad_norm": 0.19920414686203003, + "learning_rate": 2.9148333333333333e-05, + "loss": 0.0153, + "step": 12561 + }, + { + "epoch": 9.891689641591178, + "grad_norm": 0.2719447612762451, + "learning_rate": 2.9148000000000003e-05, + "loss": 0.0142, + "step": 12562 + }, + { + "epoch": 9.892477353288696, + "grad_norm": 0.24087511003017426, + "learning_rate": 2.9147666666666665e-05, + "loss": 0.0106, + "step": 12563 + }, + { + "epoch": 9.893265064986215, + "grad_norm": 0.34495967626571655, + "learning_rate": 2.9147333333333335e-05, + "loss": 0.02, + "step": 12564 + }, + { + "epoch": 9.894052776683735, + "grad_norm": 0.24109254777431488, + "learning_rate": 2.9147e-05, + "loss": 0.0127, + "step": 12565 + }, + { + "epoch": 9.894840488381252, + "grad_norm": 0.20331266522407532, + "learning_rate": 2.9146666666666667e-05, + "loss": 0.0104, + "step": 12566 + }, + { + "epoch": 9.895628200078772, + "grad_norm": 0.21770760416984558, + "learning_rate": 2.9146333333333332e-05, + "loss": 0.0141, + "step": 12567 + }, + { + "epoch": 9.89641591177629, + "grad_norm": 0.1485525220632553, + "learning_rate": 2.9146000000000002e-05, + "loss": 0.0078, + "step": 12568 + }, + { + "epoch": 9.897203623473809, + "grad_norm": 0.3982635736465454, + "learning_rate": 2.9145666666666664e-05, + "loss": 0.0153, + "step": 12569 + }, + { + "epoch": 9.897991335171326, + "grad_norm": 0.41777175664901733, + "learning_rate": 2.9145333333333334e-05, + "loss": 0.0122, + "step": 12570 + }, + { + "epoch": 9.898779046868846, + "grad_norm": 0.17702248692512512, + "learning_rate": 2.9145000000000003e-05, + "loss": 0.0105, + "step": 12571 + }, + { + "epoch": 9.899566758566365, + "grad_norm": 0.33155012130737305, + "learning_rate": 2.9144666666666666e-05, + "loss": 0.0161, + "step": 12572 + }, + { + "epoch": 9.900354470263883, + "grad_norm": 0.5235569477081299, + "learning_rate": 2.9144333333333335e-05, + "loss": 0.014, + "step": 12573 + }, + { + "epoch": 9.901142181961402, + "grad_norm": 0.21170054376125336, + "learning_rate": 2.9144e-05, + "loss": 0.0119, + "step": 12574 + }, + { + "epoch": 9.90192989365892, + "grad_norm": 0.22334866225719452, + "learning_rate": 2.9143666666666667e-05, + "loss": 0.0142, + "step": 12575 + }, + { + "epoch": 9.90271760535644, + "grad_norm": 0.2960684895515442, + "learning_rate": 2.9143333333333333e-05, + "loss": 0.0118, + "step": 12576 + }, + { + "epoch": 9.903505317053959, + "grad_norm": 0.37590518593788147, + "learning_rate": 2.9143000000000002e-05, + "loss": 0.0237, + "step": 12577 + }, + { + "epoch": 9.904293028751477, + "grad_norm": 0.1901734620332718, + "learning_rate": 2.9142666666666668e-05, + "loss": 0.0101, + "step": 12578 + }, + { + "epoch": 9.905080740448996, + "grad_norm": 0.2893660068511963, + "learning_rate": 2.9142333333333334e-05, + "loss": 0.0147, + "step": 12579 + }, + { + "epoch": 9.905868452146514, + "grad_norm": 0.3286881744861603, + "learning_rate": 2.9142000000000003e-05, + "loss": 0.0333, + "step": 12580 + }, + { + "epoch": 9.906656163844033, + "grad_norm": 
0.99849534034729, + "learning_rate": 2.9141666666666666e-05, + "loss": 0.193, + "step": 12581 + }, + { + "epoch": 9.907443875541551, + "grad_norm": 0.6657039523124695, + "learning_rate": 2.9141333333333335e-05, + "loss": 0.1846, + "step": 12582 + }, + { + "epoch": 9.90823158723907, + "grad_norm": 0.632929801940918, + "learning_rate": 2.9141e-05, + "loss": 0.1227, + "step": 12583 + }, + { + "epoch": 9.90901929893659, + "grad_norm": 0.6811887621879578, + "learning_rate": 2.9140666666666667e-05, + "loss": 0.0873, + "step": 12584 + }, + { + "epoch": 9.909807010634108, + "grad_norm": 0.7365664839744568, + "learning_rate": 2.9140333333333333e-05, + "loss": 0.0791, + "step": 12585 + }, + { + "epoch": 9.910594722331627, + "grad_norm": 0.7605775594711304, + "learning_rate": 2.9140000000000002e-05, + "loss": 0.0599, + "step": 12586 + }, + { + "epoch": 9.911382434029145, + "grad_norm": 0.2735472023487091, + "learning_rate": 2.9139666666666668e-05, + "loss": 0.0212, + "step": 12587 + }, + { + "epoch": 9.912170145726664, + "grad_norm": 0.21296784281730652, + "learning_rate": 2.9139333333333334e-05, + "loss": 0.021, + "step": 12588 + }, + { + "epoch": 9.912957857424182, + "grad_norm": 0.17827185988426208, + "learning_rate": 2.9139000000000003e-05, + "loss": 0.0134, + "step": 12589 + }, + { + "epoch": 9.913745569121701, + "grad_norm": 0.20962710678577423, + "learning_rate": 2.9138666666666666e-05, + "loss": 0.0146, + "step": 12590 + }, + { + "epoch": 9.91453328081922, + "grad_norm": 0.29654040932655334, + "learning_rate": 2.9138333333333335e-05, + "loss": 0.0272, + "step": 12591 + }, + { + "epoch": 9.915320992516738, + "grad_norm": 0.3653016984462738, + "learning_rate": 2.9137999999999998e-05, + "loss": 0.0221, + "step": 12592 + }, + { + "epoch": 9.916108704214258, + "grad_norm": 0.19539286196231842, + "learning_rate": 2.9137666666666667e-05, + "loss": 0.009, + "step": 12593 + }, + { + "epoch": 9.916896415911776, + "grad_norm": 0.3385782539844513, + "learning_rate": 2.9137333333333333e-05, + "loss": 0.0208, + "step": 12594 + }, + { + "epoch": 9.917684127609295, + "grad_norm": 0.22952955961227417, + "learning_rate": 2.9137e-05, + "loss": 0.014, + "step": 12595 + }, + { + "epoch": 9.918471839306815, + "grad_norm": 0.07728399336338043, + "learning_rate": 2.913666666666667e-05, + "loss": 0.0046, + "step": 12596 + }, + { + "epoch": 9.919259551004332, + "grad_norm": 0.33022382855415344, + "learning_rate": 2.9136333333333334e-05, + "loss": 0.0158, + "step": 12597 + }, + { + "epoch": 9.920047262701852, + "grad_norm": 0.2467040717601776, + "learning_rate": 2.9136e-05, + "loss": 0.0188, + "step": 12598 + }, + { + "epoch": 9.92083497439937, + "grad_norm": 0.18984216451644897, + "learning_rate": 2.9135666666666666e-05, + "loss": 0.0198, + "step": 12599 + }, + { + "epoch": 9.921622686096889, + "grad_norm": 0.25874799489974976, + "learning_rate": 2.9135333333333335e-05, + "loss": 0.0132, + "step": 12600 + }, + { + "epoch": 9.922410397794406, + "grad_norm": 0.2560107707977295, + "learning_rate": 2.9134999999999998e-05, + "loss": 0.0152, + "step": 12601 + }, + { + "epoch": 9.923198109491926, + "grad_norm": 0.8741106390953064, + "learning_rate": 2.9134666666666667e-05, + "loss": 0.025, + "step": 12602 + }, + { + "epoch": 9.923985821189445, + "grad_norm": 0.41485434770584106, + "learning_rate": 2.9134333333333337e-05, + "loss": 0.0118, + "step": 12603 + }, + { + "epoch": 9.924773532886963, + "grad_norm": 0.5077798366546631, + "learning_rate": 2.9134e-05, + "loss": 0.021, + "step": 12604 + }, + { + "epoch": 
9.925561244584483, + "grad_norm": 0.27837228775024414, + "learning_rate": 2.913366666666667e-05, + "loss": 0.0168, + "step": 12605 + }, + { + "epoch": 9.926348956282, + "grad_norm": 0.14792318642139435, + "learning_rate": 2.9133333333333334e-05, + "loss": 0.0128, + "step": 12606 + }, + { + "epoch": 9.92713666797952, + "grad_norm": 0.43910902738571167, + "learning_rate": 2.9133e-05, + "loss": 0.0272, + "step": 12607 + }, + { + "epoch": 9.927924379677037, + "grad_norm": 0.3269402086734772, + "learning_rate": 2.9132666666666666e-05, + "loss": 0.0146, + "step": 12608 + }, + { + "epoch": 9.928712091374557, + "grad_norm": 0.5012386441230774, + "learning_rate": 2.9132333333333336e-05, + "loss": 0.0103, + "step": 12609 + }, + { + "epoch": 9.929499803072076, + "grad_norm": 0.28550398349761963, + "learning_rate": 2.9131999999999998e-05, + "loss": 0.0174, + "step": 12610 + }, + { + "epoch": 9.930287514769594, + "grad_norm": 0.34236541390419006, + "learning_rate": 2.9131666666666668e-05, + "loss": 0.0157, + "step": 12611 + }, + { + "epoch": 9.931075226467113, + "grad_norm": 0.24527227878570557, + "learning_rate": 2.9131333333333337e-05, + "loss": 0.0106, + "step": 12612 + }, + { + "epoch": 9.931862938164631, + "grad_norm": 0.3984626829624176, + "learning_rate": 2.9131e-05, + "loss": 0.0179, + "step": 12613 + }, + { + "epoch": 9.93265064986215, + "grad_norm": 0.3865591585636139, + "learning_rate": 2.913066666666667e-05, + "loss": 0.0116, + "step": 12614 + }, + { + "epoch": 9.93343836155967, + "grad_norm": 0.20719154179096222, + "learning_rate": 2.9130333333333335e-05, + "loss": 0.0101, + "step": 12615 + }, + { + "epoch": 9.934226073257188, + "grad_norm": 0.20279015600681305, + "learning_rate": 2.913e-05, + "loss": 0.0185, + "step": 12616 + }, + { + "epoch": 9.935013784954707, + "grad_norm": 0.25420328974723816, + "learning_rate": 2.9129666666666667e-05, + "loss": 0.009, + "step": 12617 + }, + { + "epoch": 9.935801496652225, + "grad_norm": 0.19208301603794098, + "learning_rate": 2.9129333333333336e-05, + "loss": 0.011, + "step": 12618 + }, + { + "epoch": 9.936589208349744, + "grad_norm": 0.2133534997701645, + "learning_rate": 2.9129e-05, + "loss": 0.0125, + "step": 12619 + }, + { + "epoch": 9.937376920047262, + "grad_norm": 0.35991397500038147, + "learning_rate": 2.9128666666666668e-05, + "loss": 0.0195, + "step": 12620 + }, + { + "epoch": 9.938164631744781, + "grad_norm": 0.19436560571193695, + "learning_rate": 2.9128333333333337e-05, + "loss": 0.0094, + "step": 12621 + }, + { + "epoch": 9.9389523434423, + "grad_norm": 0.4170288145542145, + "learning_rate": 2.9128e-05, + "loss": 0.0232, + "step": 12622 + }, + { + "epoch": 9.939740055139819, + "grad_norm": 0.5006246566772461, + "learning_rate": 2.912766666666667e-05, + "loss": 0.0188, + "step": 12623 + }, + { + "epoch": 9.940527766837338, + "grad_norm": 0.28028959035873413, + "learning_rate": 2.912733333333333e-05, + "loss": 0.012, + "step": 12624 + }, + { + "epoch": 9.941315478534856, + "grad_norm": 0.8489769101142883, + "learning_rate": 2.9127e-05, + "loss": 0.0362, + "step": 12625 + }, + { + "epoch": 9.942103190232375, + "grad_norm": 0.27960270643234253, + "learning_rate": 2.9126666666666667e-05, + "loss": 0.0201, + "step": 12626 + }, + { + "epoch": 9.942890901929893, + "grad_norm": 0.284829705953598, + "learning_rate": 2.9126333333333333e-05, + "loss": 0.0119, + "step": 12627 + }, + { + "epoch": 9.943678613627412, + "grad_norm": 0.2593217194080353, + "learning_rate": 2.9126000000000002e-05, + "loss": 0.0102, + "step": 12628 + }, + { + "epoch": 
9.944466325324932, + "grad_norm": 0.21516622602939606, + "learning_rate": 2.9125666666666668e-05, + "loss": 0.0119, + "step": 12629 + }, + { + "epoch": 9.94525403702245, + "grad_norm": 0.2599688470363617, + "learning_rate": 2.9125333333333334e-05, + "loss": 0.0096, + "step": 12630 + }, + { + "epoch": 9.946041748719969, + "grad_norm": 1.1956511735916138, + "learning_rate": 2.9125e-05, + "loss": 0.2445, + "step": 12631 + }, + { + "epoch": 9.946829460417487, + "grad_norm": 0.5666888356208801, + "learning_rate": 2.912466666666667e-05, + "loss": 0.1301, + "step": 12632 + }, + { + "epoch": 9.947617172115006, + "grad_norm": 0.5082970857620239, + "learning_rate": 2.912433333333333e-05, + "loss": 0.1464, + "step": 12633 + }, + { + "epoch": 9.948404883812525, + "grad_norm": 0.6837469935417175, + "learning_rate": 2.9124e-05, + "loss": 0.1047, + "step": 12634 + }, + { + "epoch": 9.949192595510043, + "grad_norm": 0.35130470991134644, + "learning_rate": 2.9123666666666667e-05, + "loss": 0.0595, + "step": 12635 + }, + { + "epoch": 9.949980307207563, + "grad_norm": 0.2528823912143707, + "learning_rate": 2.9123333333333333e-05, + "loss": 0.0288, + "step": 12636 + }, + { + "epoch": 9.95076801890508, + "grad_norm": 0.7148856520652771, + "learning_rate": 2.9123000000000002e-05, + "loss": 0.0578, + "step": 12637 + }, + { + "epoch": 9.9515557306026, + "grad_norm": 0.24341106414794922, + "learning_rate": 2.9122666666666668e-05, + "loss": 0.0181, + "step": 12638 + }, + { + "epoch": 9.952343442300117, + "grad_norm": 0.1451549530029297, + "learning_rate": 2.9122333333333334e-05, + "loss": 0.0101, + "step": 12639 + }, + { + "epoch": 9.953131153997637, + "grad_norm": 0.2642075717449188, + "learning_rate": 2.9122e-05, + "loss": 0.0119, + "step": 12640 + }, + { + "epoch": 9.953918865695156, + "grad_norm": 0.22707153856754303, + "learning_rate": 2.912166666666667e-05, + "loss": 0.0176, + "step": 12641 + }, + { + "epoch": 9.954706577392674, + "grad_norm": 0.3764229714870453, + "learning_rate": 2.9121333333333332e-05, + "loss": 0.011, + "step": 12642 + }, + { + "epoch": 9.955494289090193, + "grad_norm": 0.18841002881526947, + "learning_rate": 2.9121e-05, + "loss": 0.0155, + "step": 12643 + }, + { + "epoch": 9.956282000787711, + "grad_norm": 0.26399436593055725, + "learning_rate": 2.9120666666666667e-05, + "loss": 0.0106, + "step": 12644 + }, + { + "epoch": 9.95706971248523, + "grad_norm": 0.52602219581604, + "learning_rate": 2.9120333333333333e-05, + "loss": 0.0151, + "step": 12645 + }, + { + "epoch": 9.957857424182748, + "grad_norm": 0.23194147646427155, + "learning_rate": 2.9120000000000002e-05, + "loss": 0.0179, + "step": 12646 + }, + { + "epoch": 9.958645135880268, + "grad_norm": 0.18417544662952423, + "learning_rate": 2.911966666666667e-05, + "loss": 0.0182, + "step": 12647 + }, + { + "epoch": 9.959432847577787, + "grad_norm": 0.40168067812919617, + "learning_rate": 2.9119333333333334e-05, + "loss": 0.0166, + "step": 12648 + }, + { + "epoch": 9.960220559275305, + "grad_norm": 0.33814737200737, + "learning_rate": 2.9119e-05, + "loss": 0.0169, + "step": 12649 + }, + { + "epoch": 9.961008270972824, + "grad_norm": 0.19087547063827515, + "learning_rate": 2.911866666666667e-05, + "loss": 0.0074, + "step": 12650 + }, + { + "epoch": 9.961795982670342, + "grad_norm": 0.43471455574035645, + "learning_rate": 2.9118333333333332e-05, + "loss": 0.0193, + "step": 12651 + }, + { + "epoch": 9.962583694367861, + "grad_norm": 0.2510121166706085, + "learning_rate": 2.9118e-05, + "loss": 0.0148, + "step": 12652 + }, + { + "epoch": 
9.96337140606538, + "grad_norm": 0.20830121636390686, + "learning_rate": 2.9117666666666667e-05, + "loss": 0.0103, + "step": 12653 + }, + { + "epoch": 9.964159117762899, + "grad_norm": 0.34038427472114563, + "learning_rate": 2.9117333333333333e-05, + "loss": 0.0185, + "step": 12654 + }, + { + "epoch": 9.964946829460418, + "grad_norm": 0.22739261388778687, + "learning_rate": 2.9117000000000003e-05, + "loss": 0.0094, + "step": 12655 + }, + { + "epoch": 9.965734541157936, + "grad_norm": 0.16226662695407867, + "learning_rate": 2.9116666666666665e-05, + "loss": 0.0132, + "step": 12656 + }, + { + "epoch": 9.966522252855455, + "grad_norm": 0.17220357060432434, + "learning_rate": 2.9116333333333334e-05, + "loss": 0.0138, + "step": 12657 + }, + { + "epoch": 9.967309964552973, + "grad_norm": 0.21633270382881165, + "learning_rate": 2.9116e-05, + "loss": 0.0094, + "step": 12658 + }, + { + "epoch": 9.968097676250492, + "grad_norm": 0.5014261603355408, + "learning_rate": 2.9115666666666666e-05, + "loss": 0.0133, + "step": 12659 + }, + { + "epoch": 9.968885387948012, + "grad_norm": 0.23030292987823486, + "learning_rate": 2.9115333333333332e-05, + "loss": 0.0133, + "step": 12660 + }, + { + "epoch": 9.96967309964553, + "grad_norm": 0.5458848476409912, + "learning_rate": 2.9115e-05, + "loss": 0.0188, + "step": 12661 + }, + { + "epoch": 9.970460811343049, + "grad_norm": 0.26919296383857727, + "learning_rate": 2.9114666666666668e-05, + "loss": 0.008, + "step": 12662 + }, + { + "epoch": 9.971248523040567, + "grad_norm": 0.6519238352775574, + "learning_rate": 2.9114333333333333e-05, + "loss": 0.0142, + "step": 12663 + }, + { + "epoch": 9.972036234738086, + "grad_norm": 0.2968526780605316, + "learning_rate": 2.9114000000000003e-05, + "loss": 0.0148, + "step": 12664 + }, + { + "epoch": 9.972823946435604, + "grad_norm": 0.3516509532928467, + "learning_rate": 2.9113666666666665e-05, + "loss": 0.0156, + "step": 12665 + }, + { + "epoch": 9.973611658133123, + "grad_norm": 0.15486712753772736, + "learning_rate": 2.9113333333333335e-05, + "loss": 0.0161, + "step": 12666 + }, + { + "epoch": 9.974399369830643, + "grad_norm": 0.37758272886276245, + "learning_rate": 2.9113e-05, + "loss": 0.0216, + "step": 12667 + }, + { + "epoch": 9.97518708152816, + "grad_norm": 0.6996884346008301, + "learning_rate": 2.9112666666666667e-05, + "loss": 0.0163, + "step": 12668 + }, + { + "epoch": 9.97597479322568, + "grad_norm": 0.22464801371097565, + "learning_rate": 2.9112333333333332e-05, + "loss": 0.0098, + "step": 12669 + }, + { + "epoch": 9.976762504923197, + "grad_norm": 0.617799699306488, + "learning_rate": 2.9112000000000002e-05, + "loss": 0.0196, + "step": 12670 + }, + { + "epoch": 9.977550216620717, + "grad_norm": 0.3728055953979492, + "learning_rate": 2.9111666666666668e-05, + "loss": 0.0135, + "step": 12671 + }, + { + "epoch": 9.978337928318236, + "grad_norm": 1.2376347780227661, + "learning_rate": 2.9111333333333334e-05, + "loss": 0.0264, + "step": 12672 + }, + { + "epoch": 9.979125640015754, + "grad_norm": 0.4044995605945587, + "learning_rate": 2.9111000000000003e-05, + "loss": 0.0152, + "step": 12673 + }, + { + "epoch": 9.979913351713273, + "grad_norm": 0.510543704032898, + "learning_rate": 2.9110666666666666e-05, + "loss": 0.0191, + "step": 12674 + }, + { + "epoch": 9.980701063410791, + "grad_norm": 0.5644238591194153, + "learning_rate": 2.9110333333333335e-05, + "loss": 0.0215, + "step": 12675 + }, + { + "epoch": 9.98148877510831, + "grad_norm": 0.4570309519767761, + "learning_rate": 2.911e-05, + "loss": 0.0364, + "step": 
12676 + }, + { + "epoch": 9.982276486805828, + "grad_norm": 0.2238026112318039, + "learning_rate": 2.9109666666666667e-05, + "loss": 0.0156, + "step": 12677 + }, + { + "epoch": 9.983064198503348, + "grad_norm": 0.46629196405410767, + "learning_rate": 2.9109333333333336e-05, + "loss": 0.0173, + "step": 12678 + }, + { + "epoch": 9.983851910200867, + "grad_norm": 0.397941529750824, + "learning_rate": 2.9109000000000002e-05, + "loss": 0.0167, + "step": 12679 + }, + { + "epoch": 9.984639621898385, + "grad_norm": 0.4988164007663727, + "learning_rate": 2.9108666666666668e-05, + "loss": 0.0246, + "step": 12680 + }, + { + "epoch": 9.985427333595904, + "grad_norm": 0.6664209365844727, + "learning_rate": 2.9108333333333334e-05, + "loss": 0.1768, + "step": 12681 + }, + { + "epoch": 9.986215045293422, + "grad_norm": 0.5612698197364807, + "learning_rate": 2.9108000000000003e-05, + "loss": 0.0842, + "step": 12682 + }, + { + "epoch": 9.987002756990941, + "grad_norm": 0.2891629636287689, + "learning_rate": 2.9107666666666666e-05, + "loss": 0.036, + "step": 12683 + }, + { + "epoch": 9.987790468688459, + "grad_norm": 0.24913093447685242, + "learning_rate": 2.9107333333333335e-05, + "loss": 0.0129, + "step": 12684 + }, + { + "epoch": 9.988578180385979, + "grad_norm": 0.30110302567481995, + "learning_rate": 2.9106999999999998e-05, + "loss": 0.0145, + "step": 12685 + }, + { + "epoch": 9.989365892083498, + "grad_norm": 0.18724124133586884, + "learning_rate": 2.9106666666666667e-05, + "loss": 0.0132, + "step": 12686 + }, + { + "epoch": 9.990153603781016, + "grad_norm": 0.21264876425266266, + "learning_rate": 2.9106333333333336e-05, + "loss": 0.009, + "step": 12687 + }, + { + "epoch": 9.990941315478535, + "grad_norm": 0.8465186953544617, + "learning_rate": 2.9106e-05, + "loss": 0.0282, + "step": 12688 + }, + { + "epoch": 9.991729027176053, + "grad_norm": 0.27849385142326355, + "learning_rate": 2.9105666666666668e-05, + "loss": 0.0209, + "step": 12689 + }, + { + "epoch": 9.992516738873572, + "grad_norm": 0.2525031864643097, + "learning_rate": 2.9105333333333334e-05, + "loss": 0.0195, + "step": 12690 + }, + { + "epoch": 9.993304450571092, + "grad_norm": 0.16773498058319092, + "learning_rate": 2.9105e-05, + "loss": 0.0164, + "step": 12691 + }, + { + "epoch": 9.99409216226861, + "grad_norm": 0.593271017074585, + "learning_rate": 2.9104666666666666e-05, + "loss": 0.0245, + "step": 12692 + }, + { + "epoch": 9.994879873966129, + "grad_norm": 0.33542099595069885, + "learning_rate": 2.9104333333333335e-05, + "loss": 0.0126, + "step": 12693 + }, + { + "epoch": 9.995667585663647, + "grad_norm": 0.5997739434242249, + "learning_rate": 2.9103999999999998e-05, + "loss": 0.0131, + "step": 12694 + }, + { + "epoch": 9.996455297361166, + "grad_norm": 0.24924901127815247, + "learning_rate": 2.9103666666666667e-05, + "loss": 0.0194, + "step": 12695 + }, + { + "epoch": 9.997243009058685, + "grad_norm": 0.38269394636154175, + "learning_rate": 2.9103333333333336e-05, + "loss": 0.0185, + "step": 12696 + }, + { + "epoch": 9.998030720756203, + "grad_norm": 0.6494938731193542, + "learning_rate": 2.9103e-05, + "loss": 0.0115, + "step": 12697 + }, + { + "epoch": 9.998818432453723, + "grad_norm": 0.7970678806304932, + "learning_rate": 2.910266666666667e-05, + "loss": 0.0173, + "step": 12698 + }, + { + "epoch": 9.99960614415124, + "grad_norm": 0.2537453770637512, + "learning_rate": 2.9102333333333334e-05, + "loss": 0.0203, + "step": 12699 + }, + { + "epoch": 10.0, + "grad_norm": 0.20762528479099274, + "learning_rate": 2.9102e-05, + "loss": 
0.0096, + "step": 12700 + }, + { + "epoch": 10.00078771169752, + "grad_norm": 0.7403138279914856, + "learning_rate": 2.9101666666666666e-05, + "loss": 0.1692, + "step": 12701 + }, + { + "epoch": 10.001575423395037, + "grad_norm": 0.4110041856765747, + "learning_rate": 2.9101333333333335e-05, + "loss": 0.1158, + "step": 12702 + }, + { + "epoch": 10.002363135092557, + "grad_norm": 0.404879093170166, + "learning_rate": 2.9101e-05, + "loss": 0.093, + "step": 12703 + }, + { + "epoch": 10.003150846790074, + "grad_norm": 0.3802742660045624, + "learning_rate": 2.9100666666666667e-05, + "loss": 0.1274, + "step": 12704 + }, + { + "epoch": 10.003938558487594, + "grad_norm": 0.6344476938247681, + "learning_rate": 2.9100333333333337e-05, + "loss": 0.0915, + "step": 12705 + }, + { + "epoch": 10.004726270185111, + "grad_norm": 0.8833972811698914, + "learning_rate": 2.91e-05, + "loss": 0.0587, + "step": 12706 + }, + { + "epoch": 10.00551398188263, + "grad_norm": 1.1599345207214355, + "learning_rate": 2.909966666666667e-05, + "loss": 0.1342, + "step": 12707 + }, + { + "epoch": 10.00630169358015, + "grad_norm": 0.4028572142124176, + "learning_rate": 2.9099333333333334e-05, + "loss": 0.0373, + "step": 12708 + }, + { + "epoch": 10.007089405277668, + "grad_norm": 0.3055399954319, + "learning_rate": 2.9099e-05, + "loss": 0.0209, + "step": 12709 + }, + { + "epoch": 10.007877116975187, + "grad_norm": 0.24373981356620789, + "learning_rate": 2.9098666666666666e-05, + "loss": 0.0175, + "step": 12710 + }, + { + "epoch": 10.008664828672705, + "grad_norm": 0.25229761004447937, + "learning_rate": 2.9098333333333336e-05, + "loss": 0.0249, + "step": 12711 + }, + { + "epoch": 10.009452540370225, + "grad_norm": 0.17062503099441528, + "learning_rate": 2.9098e-05, + "loss": 0.0152, + "step": 12712 + }, + { + "epoch": 10.010240252067744, + "grad_norm": 0.4362850487232208, + "learning_rate": 2.9097666666666668e-05, + "loss": 0.0169, + "step": 12713 + }, + { + "epoch": 10.011027963765262, + "grad_norm": 0.7338756918907166, + "learning_rate": 2.9097333333333333e-05, + "loss": 0.0161, + "step": 12714 + }, + { + "epoch": 10.011815675462781, + "grad_norm": 0.34644782543182373, + "learning_rate": 2.9097e-05, + "loss": 0.0241, + "step": 12715 + }, + { + "epoch": 10.012603387160299, + "grad_norm": 0.27551448345184326, + "learning_rate": 2.909666666666667e-05, + "loss": 0.0238, + "step": 12716 + }, + { + "epoch": 10.013391098857818, + "grad_norm": 0.2863774001598358, + "learning_rate": 2.909633333333333e-05, + "loss": 0.0156, + "step": 12717 + }, + { + "epoch": 10.014178810555336, + "grad_norm": 0.27047625184059143, + "learning_rate": 2.9096e-05, + "loss": 0.0099, + "step": 12718 + }, + { + "epoch": 10.014966522252855, + "grad_norm": 0.4034997820854187, + "learning_rate": 2.9095666666666667e-05, + "loss": 0.0183, + "step": 12719 + }, + { + "epoch": 10.015754233950375, + "grad_norm": 0.19555789232254028, + "learning_rate": 2.9095333333333332e-05, + "loss": 0.0088, + "step": 12720 + }, + { + "epoch": 10.016541945647893, + "grad_norm": 0.13114680349826813, + "learning_rate": 2.9095000000000002e-05, + "loss": 0.0067, + "step": 12721 + }, + { + "epoch": 10.017329657345412, + "grad_norm": 0.3226056396961212, + "learning_rate": 2.9094666666666668e-05, + "loss": 0.0254, + "step": 12722 + }, + { + "epoch": 10.01811736904293, + "grad_norm": 0.2607314884662628, + "learning_rate": 2.9094333333333334e-05, + "loss": 0.0111, + "step": 12723 + }, + { + "epoch": 10.01890508074045, + "grad_norm": 1.1498775482177734, + "learning_rate": 2.9094e-05, + 
"loss": 0.0145, + "step": 12724 + }, + { + "epoch": 10.019692792437967, + "grad_norm": 0.22624358534812927, + "learning_rate": 2.909366666666667e-05, + "loss": 0.0171, + "step": 12725 + }, + { + "epoch": 10.020480504135486, + "grad_norm": 0.2238740175962448, + "learning_rate": 2.909333333333333e-05, + "loss": 0.0079, + "step": 12726 + }, + { + "epoch": 10.021268215833006, + "grad_norm": 0.16769708693027496, + "learning_rate": 2.9093e-05, + "loss": 0.0104, + "step": 12727 + }, + { + "epoch": 10.022055927530523, + "grad_norm": 0.1583753526210785, + "learning_rate": 2.909266666666667e-05, + "loss": 0.0058, + "step": 12728 + }, + { + "epoch": 10.022843639228043, + "grad_norm": 0.22797201573848724, + "learning_rate": 2.9092333333333333e-05, + "loss": 0.0152, + "step": 12729 + }, + { + "epoch": 10.02363135092556, + "grad_norm": 0.23183736205101013, + "learning_rate": 2.9092000000000002e-05, + "loss": 0.0066, + "step": 12730 + }, + { + "epoch": 10.02441906262308, + "grad_norm": 0.27788811922073364, + "learning_rate": 2.9091666666666668e-05, + "loss": 0.013, + "step": 12731 + }, + { + "epoch": 10.0252067743206, + "grad_norm": 0.3627462089061737, + "learning_rate": 2.9091333333333334e-05, + "loss": 0.0155, + "step": 12732 + }, + { + "epoch": 10.025994486018117, + "grad_norm": 0.3092856705188751, + "learning_rate": 2.9091e-05, + "loss": 0.0154, + "step": 12733 + }, + { + "epoch": 10.026782197715637, + "grad_norm": 0.28211671113967896, + "learning_rate": 2.909066666666667e-05, + "loss": 0.0072, + "step": 12734 + }, + { + "epoch": 10.027569909413154, + "grad_norm": 0.2451944351196289, + "learning_rate": 2.909033333333333e-05, + "loss": 0.0127, + "step": 12735 + }, + { + "epoch": 10.028357621110674, + "grad_norm": 0.331767201423645, + "learning_rate": 2.909e-05, + "loss": 0.0168, + "step": 12736 + }, + { + "epoch": 10.029145332808191, + "grad_norm": 0.5664268732070923, + "learning_rate": 2.908966666666667e-05, + "loss": 0.013, + "step": 12737 + }, + { + "epoch": 10.02993304450571, + "grad_norm": 0.3344402015209198, + "learning_rate": 2.9089333333333333e-05, + "loss": 0.0186, + "step": 12738 + }, + { + "epoch": 10.03072075620323, + "grad_norm": 0.3147570490837097, + "learning_rate": 2.9089000000000002e-05, + "loss": 0.0168, + "step": 12739 + }, + { + "epoch": 10.031508467900748, + "grad_norm": 0.8533557057380676, + "learning_rate": 2.9088666666666668e-05, + "loss": 0.0152, + "step": 12740 + }, + { + "epoch": 10.032296179598267, + "grad_norm": 0.505980908870697, + "learning_rate": 2.9088333333333334e-05, + "loss": 0.0086, + "step": 12741 + }, + { + "epoch": 10.033083891295785, + "grad_norm": 0.41763123869895935, + "learning_rate": 2.9088e-05, + "loss": 0.0129, + "step": 12742 + }, + { + "epoch": 10.033871602993305, + "grad_norm": 0.5017719268798828, + "learning_rate": 2.9087666666666666e-05, + "loss": 0.016, + "step": 12743 + }, + { + "epoch": 10.034659314690822, + "grad_norm": 0.41539430618286133, + "learning_rate": 2.9087333333333332e-05, + "loss": 0.0166, + "step": 12744 + }, + { + "epoch": 10.035447026388342, + "grad_norm": 0.14368179440498352, + "learning_rate": 2.9087e-05, + "loss": 0.0105, + "step": 12745 + }, + { + "epoch": 10.036234738085861, + "grad_norm": 0.3186468183994293, + "learning_rate": 2.9086666666666667e-05, + "loss": 0.0123, + "step": 12746 + }, + { + "epoch": 10.037022449783379, + "grad_norm": 0.6821594834327698, + "learning_rate": 2.9086333333333333e-05, + "loss": 0.0302, + "step": 12747 + }, + { + "epoch": 10.037810161480898, + "grad_norm": 0.2535792291164398, + "learning_rate": 
2.9086000000000002e-05, + "loss": 0.0071, + "step": 12748 + }, + { + "epoch": 10.038597873178416, + "grad_norm": 0.403448224067688, + "learning_rate": 2.9085666666666665e-05, + "loss": 0.0196, + "step": 12749 + }, + { + "epoch": 10.039385584875935, + "grad_norm": 0.34623971581459045, + "learning_rate": 2.9085333333333334e-05, + "loss": 0.0185, + "step": 12750 + }, + { + "epoch": 10.040173296573455, + "grad_norm": 0.5263148546218872, + "learning_rate": 2.9085e-05, + "loss": 0.1877, + "step": 12751 + }, + { + "epoch": 10.040961008270973, + "grad_norm": 0.9218083620071411, + "learning_rate": 2.9084666666666666e-05, + "loss": 0.193, + "step": 12752 + }, + { + "epoch": 10.041748719968492, + "grad_norm": 0.6438521146774292, + "learning_rate": 2.9084333333333335e-05, + "loss": 0.1553, + "step": 12753 + }, + { + "epoch": 10.04253643166601, + "grad_norm": 0.6033491492271423, + "learning_rate": 2.9084e-05, + "loss": 0.0801, + "step": 12754 + }, + { + "epoch": 10.04332414336353, + "grad_norm": 0.427813321352005, + "learning_rate": 2.9083666666666667e-05, + "loss": 0.0721, + "step": 12755 + }, + { + "epoch": 10.044111855061047, + "grad_norm": 0.444134384393692, + "learning_rate": 2.9083333333333333e-05, + "loss": 0.0675, + "step": 12756 + }, + { + "epoch": 10.044899566758566, + "grad_norm": 0.21703213453292847, + "learning_rate": 2.9083000000000003e-05, + "loss": 0.0181, + "step": 12757 + }, + { + "epoch": 10.045687278456086, + "grad_norm": 0.31438493728637695, + "learning_rate": 2.9082666666666665e-05, + "loss": 0.0282, + "step": 12758 + }, + { + "epoch": 10.046474990153603, + "grad_norm": 0.2962920367717743, + "learning_rate": 2.9082333333333334e-05, + "loss": 0.0191, + "step": 12759 + }, + { + "epoch": 10.047262701851123, + "grad_norm": 0.3095787763595581, + "learning_rate": 2.9082e-05, + "loss": 0.0169, + "step": 12760 + }, + { + "epoch": 10.04805041354864, + "grad_norm": 0.1374129056930542, + "learning_rate": 2.9081666666666666e-05, + "loss": 0.0065, + "step": 12761 + }, + { + "epoch": 10.04883812524616, + "grad_norm": 0.3133596181869507, + "learning_rate": 2.9081333333333336e-05, + "loss": 0.0184, + "step": 12762 + }, + { + "epoch": 10.04962583694368, + "grad_norm": 0.1586621254682541, + "learning_rate": 2.9081e-05, + "loss": 0.011, + "step": 12763 + }, + { + "epoch": 10.050413548641197, + "grad_norm": 0.4415090084075928, + "learning_rate": 2.9080666666666668e-05, + "loss": 0.0119, + "step": 12764 + }, + { + "epoch": 10.051201260338717, + "grad_norm": 0.24499936401844025, + "learning_rate": 2.9080333333333333e-05, + "loss": 0.0123, + "step": 12765 + }, + { + "epoch": 10.051988972036234, + "grad_norm": 0.2220752239227295, + "learning_rate": 2.9080000000000003e-05, + "loss": 0.0128, + "step": 12766 + }, + { + "epoch": 10.052776683733754, + "grad_norm": 0.13411833345890045, + "learning_rate": 2.9079666666666665e-05, + "loss": 0.0087, + "step": 12767 + }, + { + "epoch": 10.053564395431271, + "grad_norm": 0.16210073232650757, + "learning_rate": 2.9079333333333335e-05, + "loss": 0.0081, + "step": 12768 + }, + { + "epoch": 10.054352107128791, + "grad_norm": 0.31068331003189087, + "learning_rate": 2.9079e-05, + "loss": 0.017, + "step": 12769 + }, + { + "epoch": 10.05513981882631, + "grad_norm": 0.0942550078034401, + "learning_rate": 2.9078666666666667e-05, + "loss": 0.0047, + "step": 12770 + }, + { + "epoch": 10.055927530523828, + "grad_norm": 0.19567838311195374, + "learning_rate": 2.9078333333333336e-05, + "loss": 0.0132, + "step": 12771 + }, + { + "epoch": 10.056715242221347, + "grad_norm": 
0.34354811906814575, + "learning_rate": 2.9078000000000002e-05, + "loss": 0.0156, + "step": 12772 + }, + { + "epoch": 10.057502953918865, + "grad_norm": 0.2993329167366028, + "learning_rate": 2.9077666666666668e-05, + "loss": 0.0223, + "step": 12773 + }, + { + "epoch": 10.058290665616385, + "grad_norm": 0.2829805910587311, + "learning_rate": 2.9077333333333334e-05, + "loss": 0.0192, + "step": 12774 + }, + { + "epoch": 10.059078377313902, + "grad_norm": 0.3087173402309418, + "learning_rate": 2.9077e-05, + "loss": 0.0189, + "step": 12775 + }, + { + "epoch": 10.059866089011422, + "grad_norm": 0.16200973093509674, + "learning_rate": 2.9076666666666666e-05, + "loss": 0.0094, + "step": 12776 + }, + { + "epoch": 10.060653800708941, + "grad_norm": 0.42286503314971924, + "learning_rate": 2.9076333333333335e-05, + "loss": 0.0161, + "step": 12777 + }, + { + "epoch": 10.061441512406459, + "grad_norm": 0.3355909585952759, + "learning_rate": 2.9076e-05, + "loss": 0.0176, + "step": 12778 + }, + { + "epoch": 10.062229224103978, + "grad_norm": 0.2651488780975342, + "learning_rate": 2.9075666666666667e-05, + "loss": 0.0131, + "step": 12779 + }, + { + "epoch": 10.063016935801496, + "grad_norm": 0.4146614670753479, + "learning_rate": 2.9075333333333336e-05, + "loss": 0.0195, + "step": 12780 + }, + { + "epoch": 10.063804647499015, + "grad_norm": 0.3718011975288391, + "learning_rate": 2.9075e-05, + "loss": 0.0158, + "step": 12781 + }, + { + "epoch": 10.064592359196535, + "grad_norm": 0.31501930952072144, + "learning_rate": 2.9074666666666668e-05, + "loss": 0.0158, + "step": 12782 + }, + { + "epoch": 10.065380070894053, + "grad_norm": 0.7594137787818909, + "learning_rate": 2.9074333333333334e-05, + "loss": 0.0142, + "step": 12783 + }, + { + "epoch": 10.066167782591572, + "grad_norm": 0.35471078753471375, + "learning_rate": 2.9074e-05, + "loss": 0.0106, + "step": 12784 + }, + { + "epoch": 10.06695549428909, + "grad_norm": 0.15477430820465088, + "learning_rate": 2.9073666666666666e-05, + "loss": 0.0168, + "step": 12785 + }, + { + "epoch": 10.06774320598661, + "grad_norm": 0.18302886188030243, + "learning_rate": 2.9073333333333335e-05, + "loss": 0.016, + "step": 12786 + }, + { + "epoch": 10.068530917684127, + "grad_norm": 0.26686763763427734, + "learning_rate": 2.9073e-05, + "loss": 0.0075, + "step": 12787 + }, + { + "epoch": 10.069318629381646, + "grad_norm": 0.6729685068130493, + "learning_rate": 2.9072666666666667e-05, + "loss": 0.014, + "step": 12788 + }, + { + "epoch": 10.070106341079166, + "grad_norm": 0.25184860825538635, + "learning_rate": 2.9072333333333336e-05, + "loss": 0.0104, + "step": 12789 + }, + { + "epoch": 10.070894052776683, + "grad_norm": 0.3122164011001587, + "learning_rate": 2.9072e-05, + "loss": 0.0195, + "step": 12790 + }, + { + "epoch": 10.071681764474203, + "grad_norm": 0.21405300498008728, + "learning_rate": 2.9071666666666668e-05, + "loss": 0.008, + "step": 12791 + }, + { + "epoch": 10.07246947617172, + "grad_norm": 0.8823539614677429, + "learning_rate": 2.9071333333333334e-05, + "loss": 0.0248, + "step": 12792 + }, + { + "epoch": 10.07325718786924, + "grad_norm": 0.3548937439918518, + "learning_rate": 2.9071e-05, + "loss": 0.0174, + "step": 12793 + }, + { + "epoch": 10.074044899566758, + "grad_norm": 0.44840556383132935, + "learning_rate": 2.9070666666666666e-05, + "loss": 0.0223, + "step": 12794 + }, + { + "epoch": 10.074832611264277, + "grad_norm": 0.4385424256324768, + "learning_rate": 2.9070333333333335e-05, + "loss": 0.0278, + "step": 12795 + }, + { + "epoch": 10.075620322961797, + 
"grad_norm": 0.21448549628257751, + "learning_rate": 2.907e-05, + "loss": 0.0126, + "step": 12796 + }, + { + "epoch": 10.076408034659314, + "grad_norm": 0.32873475551605225, + "learning_rate": 2.9069666666666667e-05, + "loss": 0.012, + "step": 12797 + }, + { + "epoch": 10.077195746356834, + "grad_norm": 0.48729708790779114, + "learning_rate": 2.9069333333333336e-05, + "loss": 0.0138, + "step": 12798 + }, + { + "epoch": 10.077983458054351, + "grad_norm": 0.2835831642150879, + "learning_rate": 2.9069e-05, + "loss": 0.0148, + "step": 12799 + }, + { + "epoch": 10.078771169751871, + "grad_norm": 0.3924562931060791, + "learning_rate": 2.906866666666667e-05, + "loss": 0.0119, + "step": 12800 + }, + { + "epoch": 10.07955888144939, + "grad_norm": 0.64310622215271, + "learning_rate": 2.9068333333333334e-05, + "loss": 0.1868, + "step": 12801 + }, + { + "epoch": 10.080346593146908, + "grad_norm": 0.6509113907814026, + "learning_rate": 2.9068e-05, + "loss": 0.184, + "step": 12802 + }, + { + "epoch": 10.081134304844428, + "grad_norm": 0.5350961685180664, + "learning_rate": 2.906766666666667e-05, + "loss": 0.1331, + "step": 12803 + }, + { + "epoch": 10.081922016541945, + "grad_norm": 0.5298290848731995, + "learning_rate": 2.9067333333333332e-05, + "loss": 0.098, + "step": 12804 + }, + { + "epoch": 10.082709728239465, + "grad_norm": 0.3687085211277008, + "learning_rate": 2.9067e-05, + "loss": 0.0758, + "step": 12805 + }, + { + "epoch": 10.083497439936982, + "grad_norm": 0.41720083355903625, + "learning_rate": 2.9066666666666667e-05, + "loss": 0.0278, + "step": 12806 + }, + { + "epoch": 10.084285151634502, + "grad_norm": 0.17910180985927582, + "learning_rate": 2.9066333333333333e-05, + "loss": 0.0291, + "step": 12807 + }, + { + "epoch": 10.085072863332021, + "grad_norm": 0.713843584060669, + "learning_rate": 2.9066e-05, + "loss": 0.0262, + "step": 12808 + }, + { + "epoch": 10.085860575029539, + "grad_norm": 0.23804153501987457, + "learning_rate": 2.906566666666667e-05, + "loss": 0.0114, + "step": 12809 + }, + { + "epoch": 10.086648286727058, + "grad_norm": 0.17939209938049316, + "learning_rate": 2.906533333333333e-05, + "loss": 0.012, + "step": 12810 + }, + { + "epoch": 10.087435998424576, + "grad_norm": 0.21687528491020203, + "learning_rate": 2.9065e-05, + "loss": 0.0171, + "step": 12811 + }, + { + "epoch": 10.088223710122096, + "grad_norm": 0.14011810719966888, + "learning_rate": 2.906466666666667e-05, + "loss": 0.0122, + "step": 12812 + }, + { + "epoch": 10.089011421819613, + "grad_norm": 0.1750912070274353, + "learning_rate": 2.9064333333333332e-05, + "loss": 0.0074, + "step": 12813 + }, + { + "epoch": 10.089799133517133, + "grad_norm": 0.3480643630027771, + "learning_rate": 2.9064e-05, + "loss": 0.0222, + "step": 12814 + }, + { + "epoch": 10.090586845214652, + "grad_norm": 0.5376880168914795, + "learning_rate": 2.9063666666666668e-05, + "loss": 0.0214, + "step": 12815 + }, + { + "epoch": 10.09137455691217, + "grad_norm": 0.39819082617759705, + "learning_rate": 2.9063333333333333e-05, + "loss": 0.0103, + "step": 12816 + }, + { + "epoch": 10.09216226860969, + "grad_norm": 0.15112121403217316, + "learning_rate": 2.9063e-05, + "loss": 0.0069, + "step": 12817 + }, + { + "epoch": 10.092949980307207, + "grad_norm": 0.1880967915058136, + "learning_rate": 2.906266666666667e-05, + "loss": 0.0105, + "step": 12818 + }, + { + "epoch": 10.093737692004726, + "grad_norm": 0.20253829658031464, + "learning_rate": 2.9062333333333335e-05, + "loss": 0.0171, + "step": 12819 + }, + { + "epoch": 10.094525403702246, + 
"grad_norm": 0.2007640302181244, + "learning_rate": 2.9062e-05, + "loss": 0.0089, + "step": 12820 + }, + { + "epoch": 10.095313115399764, + "grad_norm": 0.12195482105016708, + "learning_rate": 2.906166666666667e-05, + "loss": 0.0082, + "step": 12821 + }, + { + "epoch": 10.096100827097283, + "grad_norm": 0.3841218948364258, + "learning_rate": 2.9061333333333332e-05, + "loss": 0.0166, + "step": 12822 + }, + { + "epoch": 10.0968885387948, + "grad_norm": 0.42889195680618286, + "learning_rate": 2.9061000000000002e-05, + "loss": 0.0159, + "step": 12823 + }, + { + "epoch": 10.09767625049232, + "grad_norm": 0.167558491230011, + "learning_rate": 2.9060666666666668e-05, + "loss": 0.0142, + "step": 12824 + }, + { + "epoch": 10.098463962189838, + "grad_norm": 0.1566423773765564, + "learning_rate": 2.9060333333333334e-05, + "loss": 0.0114, + "step": 12825 + }, + { + "epoch": 10.099251673887357, + "grad_norm": 0.21297691762447357, + "learning_rate": 2.906e-05, + "loss": 0.0193, + "step": 12826 + }, + { + "epoch": 10.100039385584877, + "grad_norm": 0.3260344862937927, + "learning_rate": 2.905966666666667e-05, + "loss": 0.018, + "step": 12827 + }, + { + "epoch": 10.100827097282394, + "grad_norm": 0.36586421728134155, + "learning_rate": 2.9059333333333335e-05, + "loss": 0.014, + "step": 12828 + }, + { + "epoch": 10.101614808979914, + "grad_norm": 0.21221287548542023, + "learning_rate": 2.9059e-05, + "loss": 0.0071, + "step": 12829 + }, + { + "epoch": 10.102402520677431, + "grad_norm": 0.21770724654197693, + "learning_rate": 2.905866666666667e-05, + "loss": 0.0129, + "step": 12830 + }, + { + "epoch": 10.103190232374951, + "grad_norm": 0.734647810459137, + "learning_rate": 2.9058333333333333e-05, + "loss": 0.0151, + "step": 12831 + }, + { + "epoch": 10.103977944072469, + "grad_norm": 0.29025259613990784, + "learning_rate": 2.9058000000000002e-05, + "loss": 0.0249, + "step": 12832 + }, + { + "epoch": 10.104765655769988, + "grad_norm": 0.17990127205848694, + "learning_rate": 2.9057666666666668e-05, + "loss": 0.0057, + "step": 12833 + }, + { + "epoch": 10.105553367467508, + "grad_norm": 0.278860479593277, + "learning_rate": 2.9057333333333334e-05, + "loss": 0.0099, + "step": 12834 + }, + { + "epoch": 10.106341079165025, + "grad_norm": 0.13389833271503448, + "learning_rate": 2.9057e-05, + "loss": 0.01, + "step": 12835 + }, + { + "epoch": 10.107128790862545, + "grad_norm": 0.14972953498363495, + "learning_rate": 2.9056666666666666e-05, + "loss": 0.0058, + "step": 12836 + }, + { + "epoch": 10.107916502560062, + "grad_norm": 0.1533711552619934, + "learning_rate": 2.9056333333333335e-05, + "loss": 0.0082, + "step": 12837 + }, + { + "epoch": 10.108704214257582, + "grad_norm": 0.2094152569770813, + "learning_rate": 2.9056e-05, + "loss": 0.0127, + "step": 12838 + }, + { + "epoch": 10.109491925955101, + "grad_norm": 0.5457439422607422, + "learning_rate": 2.9055666666666667e-05, + "loss": 0.015, + "step": 12839 + }, + { + "epoch": 10.110279637652619, + "grad_norm": 0.10732632130384445, + "learning_rate": 2.9055333333333333e-05, + "loss": 0.0064, + "step": 12840 + }, + { + "epoch": 10.111067349350138, + "grad_norm": 0.40373221039772034, + "learning_rate": 2.9055000000000002e-05, + "loss": 0.0244, + "step": 12841 + }, + { + "epoch": 10.111855061047656, + "grad_norm": 0.2293357104063034, + "learning_rate": 2.9054666666666665e-05, + "loss": 0.0128, + "step": 12842 + }, + { + "epoch": 10.112642772745176, + "grad_norm": 0.2266974151134491, + "learning_rate": 2.9054333333333334e-05, + "loss": 0.0122, + "step": 12843 + }, + { + 
"epoch": 10.113430484442693, + "grad_norm": 0.44681403040885925, + "learning_rate": 2.9054e-05, + "loss": 0.0142, + "step": 12844 + }, + { + "epoch": 10.114218196140213, + "grad_norm": 0.18255525827407837, + "learning_rate": 2.9053666666666666e-05, + "loss": 0.0109, + "step": 12845 + }, + { + "epoch": 10.115005907837732, + "grad_norm": 0.2624584138393402, + "learning_rate": 2.9053333333333335e-05, + "loss": 0.0148, + "step": 12846 + }, + { + "epoch": 10.11579361953525, + "grad_norm": 0.5673035383224487, + "learning_rate": 2.9053e-05, + "loss": 0.0222, + "step": 12847 + }, + { + "epoch": 10.11658133123277, + "grad_norm": 0.2941087484359741, + "learning_rate": 2.9052666666666667e-05, + "loss": 0.0093, + "step": 12848 + }, + { + "epoch": 10.117369042930287, + "grad_norm": 0.7908488512039185, + "learning_rate": 2.9052333333333333e-05, + "loss": 0.0331, + "step": 12849 + }, + { + "epoch": 10.118156754627806, + "grad_norm": 0.16897517442703247, + "learning_rate": 2.9052000000000002e-05, + "loss": 0.0127, + "step": 12850 + }, + { + "epoch": 10.118944466325324, + "grad_norm": 0.6624463200569153, + "learning_rate": 2.9051666666666665e-05, + "loss": 0.2084, + "step": 12851 + }, + { + "epoch": 10.119732178022844, + "grad_norm": 0.7846646904945374, + "learning_rate": 2.9051333333333334e-05, + "loss": 0.1472, + "step": 12852 + }, + { + "epoch": 10.120519889720363, + "grad_norm": 0.5327421426773071, + "learning_rate": 2.9051000000000004e-05, + "loss": 0.0706, + "step": 12853 + }, + { + "epoch": 10.12130760141788, + "grad_norm": 0.4039718210697174, + "learning_rate": 2.9050666666666666e-05, + "loss": 0.0644, + "step": 12854 + }, + { + "epoch": 10.1220953131154, + "grad_norm": 0.2771168053150177, + "learning_rate": 2.9050333333333335e-05, + "loss": 0.0405, + "step": 12855 + }, + { + "epoch": 10.122883024812918, + "grad_norm": 0.498776912689209, + "learning_rate": 2.905e-05, + "loss": 0.0896, + "step": 12856 + }, + { + "epoch": 10.123670736510437, + "grad_norm": 0.235012024641037, + "learning_rate": 2.9049666666666667e-05, + "loss": 0.0155, + "step": 12857 + }, + { + "epoch": 10.124458448207957, + "grad_norm": 0.1689882129430771, + "learning_rate": 2.9049333333333333e-05, + "loss": 0.0148, + "step": 12858 + }, + { + "epoch": 10.125246159905474, + "grad_norm": 0.238669291138649, + "learning_rate": 2.9049000000000003e-05, + "loss": 0.0091, + "step": 12859 + }, + { + "epoch": 10.126033871602994, + "grad_norm": 0.9987475275993347, + "learning_rate": 2.9048666666666665e-05, + "loss": 0.0171, + "step": 12860 + }, + { + "epoch": 10.126821583300512, + "grad_norm": 0.3513759970664978, + "learning_rate": 2.9048333333333334e-05, + "loss": 0.0631, + "step": 12861 + }, + { + "epoch": 10.127609294998031, + "grad_norm": 0.14673149585723877, + "learning_rate": 2.9048000000000004e-05, + "loss": 0.0114, + "step": 12862 + }, + { + "epoch": 10.128397006695549, + "grad_norm": 0.2144249528646469, + "learning_rate": 2.9047666666666666e-05, + "loss": 0.0088, + "step": 12863 + }, + { + "epoch": 10.129184718393068, + "grad_norm": 0.20834322273731232, + "learning_rate": 2.9047333333333336e-05, + "loss": 0.0088, + "step": 12864 + }, + { + "epoch": 10.129972430090588, + "grad_norm": 0.15634571015834808, + "learning_rate": 2.9046999999999998e-05, + "loss": 0.0125, + "step": 12865 + }, + { + "epoch": 10.130760141788105, + "grad_norm": 0.31854555010795593, + "learning_rate": 2.9046666666666668e-05, + "loss": 0.017, + "step": 12866 + }, + { + "epoch": 10.131547853485625, + "grad_norm": 0.30279019474983215, + "learning_rate": 
2.9046333333333333e-05, + "loss": 0.0146, + "step": 12867 + }, + { + "epoch": 10.132335565183142, + "grad_norm": 0.32719266414642334, + "learning_rate": 2.9046e-05, + "loss": 0.0164, + "step": 12868 + }, + { + "epoch": 10.133123276880662, + "grad_norm": 0.3356369137763977, + "learning_rate": 2.904566666666667e-05, + "loss": 0.0215, + "step": 12869 + }, + { + "epoch": 10.13391098857818, + "grad_norm": 0.3016042113304138, + "learning_rate": 2.9045333333333335e-05, + "loss": 0.0161, + "step": 12870 + }, + { + "epoch": 10.134698700275699, + "grad_norm": 0.3845246136188507, + "learning_rate": 2.9045e-05, + "loss": 0.0155, + "step": 12871 + }, + { + "epoch": 10.135486411973218, + "grad_norm": 0.8783190250396729, + "learning_rate": 2.9044666666666667e-05, + "loss": 0.0157, + "step": 12872 + }, + { + "epoch": 10.136274123670736, + "grad_norm": 0.31541013717651367, + "learning_rate": 2.9044333333333336e-05, + "loss": 0.0103, + "step": 12873 + }, + { + "epoch": 10.137061835368256, + "grad_norm": 0.3425993025302887, + "learning_rate": 2.9044e-05, + "loss": 0.0108, + "step": 12874 + }, + { + "epoch": 10.137849547065773, + "grad_norm": 0.37836533784866333, + "learning_rate": 2.9043666666666668e-05, + "loss": 0.0109, + "step": 12875 + }, + { + "epoch": 10.138637258763293, + "grad_norm": 0.33516427874565125, + "learning_rate": 2.9043333333333334e-05, + "loss": 0.0108, + "step": 12876 + }, + { + "epoch": 10.139424970460812, + "grad_norm": 0.2786388695240021, + "learning_rate": 2.9043e-05, + "loss": 0.0148, + "step": 12877 + }, + { + "epoch": 10.14021268215833, + "grad_norm": 0.34000518918037415, + "learning_rate": 2.904266666666667e-05, + "loss": 0.0156, + "step": 12878 + }, + { + "epoch": 10.14100039385585, + "grad_norm": 0.1245148777961731, + "learning_rate": 2.9042333333333335e-05, + "loss": 0.0115, + "step": 12879 + }, + { + "epoch": 10.141788105553367, + "grad_norm": 0.35796406865119934, + "learning_rate": 2.9042e-05, + "loss": 0.0187, + "step": 12880 + }, + { + "epoch": 10.142575817250886, + "grad_norm": 0.3302769958972931, + "learning_rate": 2.9041666666666667e-05, + "loss": 0.0133, + "step": 12881 + }, + { + "epoch": 10.143363528948404, + "grad_norm": 0.5362419486045837, + "learning_rate": 2.9041333333333336e-05, + "loss": 0.0193, + "step": 12882 + }, + { + "epoch": 10.144151240645924, + "grad_norm": 0.5246534943580627, + "learning_rate": 2.9041e-05, + "loss": 0.0129, + "step": 12883 + }, + { + "epoch": 10.144938952343443, + "grad_norm": 0.22678038477897644, + "learning_rate": 2.9040666666666668e-05, + "loss": 0.0158, + "step": 12884 + }, + { + "epoch": 10.14572666404096, + "grad_norm": 0.23591460287570953, + "learning_rate": 2.9040333333333334e-05, + "loss": 0.0139, + "step": 12885 + }, + { + "epoch": 10.14651437573848, + "grad_norm": 0.31617245078086853, + "learning_rate": 2.904e-05, + "loss": 0.0107, + "step": 12886 + }, + { + "epoch": 10.147302087435998, + "grad_norm": 0.14765308797359467, + "learning_rate": 2.903966666666667e-05, + "loss": 0.0087, + "step": 12887 + }, + { + "epoch": 10.148089799133517, + "grad_norm": 0.22080013155937195, + "learning_rate": 2.9039333333333335e-05, + "loss": 0.0113, + "step": 12888 + }, + { + "epoch": 10.148877510831035, + "grad_norm": 0.5087916254997253, + "learning_rate": 2.9039e-05, + "loss": 0.0166, + "step": 12889 + }, + { + "epoch": 10.149665222528554, + "grad_norm": 0.1936628371477127, + "learning_rate": 2.9038666666666667e-05, + "loss": 0.0128, + "step": 12890 + }, + { + "epoch": 10.150452934226074, + "grad_norm": 0.20994596183300018, + "learning_rate": 
2.9038333333333336e-05, + "loss": 0.0103, + "step": 12891 + }, + { + "epoch": 10.151240645923592, + "grad_norm": 0.36803460121154785, + "learning_rate": 2.9038e-05, + "loss": 0.0226, + "step": 12892 + }, + { + "epoch": 10.152028357621111, + "grad_norm": 0.17596323788166046, + "learning_rate": 2.9037666666666668e-05, + "loss": 0.0097, + "step": 12893 + }, + { + "epoch": 10.152816069318629, + "grad_norm": 0.5533131957054138, + "learning_rate": 2.9037333333333334e-05, + "loss": 0.018, + "step": 12894 + }, + { + "epoch": 10.153603781016148, + "grad_norm": 0.25937795639038086, + "learning_rate": 2.9037e-05, + "loss": 0.0134, + "step": 12895 + }, + { + "epoch": 10.154391492713668, + "grad_norm": 0.21995307505130768, + "learning_rate": 2.903666666666667e-05, + "loss": 0.0095, + "step": 12896 + }, + { + "epoch": 10.155179204411185, + "grad_norm": 0.5512291193008423, + "learning_rate": 2.9036333333333332e-05, + "loss": 0.047, + "step": 12897 + }, + { + "epoch": 10.155966916108705, + "grad_norm": 0.30906373262405396, + "learning_rate": 2.9036e-05, + "loss": 0.01, + "step": 12898 + }, + { + "epoch": 10.156754627806222, + "grad_norm": 0.36029309034347534, + "learning_rate": 2.9035666666666667e-05, + "loss": 0.0129, + "step": 12899 + }, + { + "epoch": 10.157542339503742, + "grad_norm": 0.3023202121257782, + "learning_rate": 2.9035333333333333e-05, + "loss": 0.0088, + "step": 12900 + }, + { + "epoch": 10.15833005120126, + "grad_norm": 0.5579606294631958, + "learning_rate": 2.9035e-05, + "loss": 0.2394, + "step": 12901 + }, + { + "epoch": 10.159117762898779, + "grad_norm": 0.6161529421806335, + "learning_rate": 2.903466666666667e-05, + "loss": 0.2058, + "step": 12902 + }, + { + "epoch": 10.159905474596298, + "grad_norm": 0.5043689608573914, + "learning_rate": 2.9034333333333334e-05, + "loss": 0.0835, + "step": 12903 + }, + { + "epoch": 10.160693186293816, + "grad_norm": 0.5061014890670776, + "learning_rate": 2.9034e-05, + "loss": 0.1239, + "step": 12904 + }, + { + "epoch": 10.161480897991336, + "grad_norm": 0.47935113310813904, + "learning_rate": 2.903366666666667e-05, + "loss": 0.0746, + "step": 12905 + }, + { + "epoch": 10.162268609688853, + "grad_norm": 0.20271709561347961, + "learning_rate": 2.9033333333333332e-05, + "loss": 0.0266, + "step": 12906 + }, + { + "epoch": 10.163056321386373, + "grad_norm": 0.4519413709640503, + "learning_rate": 2.9033e-05, + "loss": 0.0234, + "step": 12907 + }, + { + "epoch": 10.16384403308389, + "grad_norm": 0.2847728431224823, + "learning_rate": 2.9032666666666667e-05, + "loss": 0.0197, + "step": 12908 + }, + { + "epoch": 10.16463174478141, + "grad_norm": 0.422171950340271, + "learning_rate": 2.9032333333333333e-05, + "loss": 0.0691, + "step": 12909 + }, + { + "epoch": 10.16541945647893, + "grad_norm": 0.12332560867071152, + "learning_rate": 2.9032e-05, + "loss": 0.0083, + "step": 12910 + }, + { + "epoch": 10.166207168176447, + "grad_norm": 0.2665262818336487, + "learning_rate": 2.903166666666667e-05, + "loss": 0.0161, + "step": 12911 + }, + { + "epoch": 10.166994879873966, + "grad_norm": 0.20785292983055115, + "learning_rate": 2.9031333333333334e-05, + "loss": 0.0182, + "step": 12912 + }, + { + "epoch": 10.167782591571484, + "grad_norm": 0.31720465421676636, + "learning_rate": 2.9031e-05, + "loss": 0.016, + "step": 12913 + }, + { + "epoch": 10.168570303269004, + "grad_norm": 0.12563449144363403, + "learning_rate": 2.903066666666667e-05, + "loss": 0.0092, + "step": 12914 + }, + { + "epoch": 10.169358014966523, + "grad_norm": 0.3056204915046692, + "learning_rate": 
2.9030333333333332e-05, + "loss": 0.0085, + "step": 12915 + }, + { + "epoch": 10.17014572666404, + "grad_norm": 0.11644399166107178, + "learning_rate": 2.903e-05, + "loss": 0.0074, + "step": 12916 + }, + { + "epoch": 10.17093343836156, + "grad_norm": 0.3046824038028717, + "learning_rate": 2.9029666666666668e-05, + "loss": 0.0152, + "step": 12917 + }, + { + "epoch": 10.171721150059078, + "grad_norm": 0.28871336579322815, + "learning_rate": 2.9029333333333333e-05, + "loss": 0.0122, + "step": 12918 + }, + { + "epoch": 10.172508861756597, + "grad_norm": 0.19001884758472443, + "learning_rate": 2.9029000000000003e-05, + "loss": 0.0091, + "step": 12919 + }, + { + "epoch": 10.173296573454115, + "grad_norm": 0.1294165402650833, + "learning_rate": 2.902866666666667e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 10.174084285151634, + "grad_norm": 0.1527329832315445, + "learning_rate": 2.9028333333333335e-05, + "loss": 0.0086, + "step": 12921 + }, + { + "epoch": 10.174871996849154, + "grad_norm": 0.31844791769981384, + "learning_rate": 2.9028e-05, + "loss": 0.0144, + "step": 12922 + }, + { + "epoch": 10.175659708546672, + "grad_norm": 0.31584620475769043, + "learning_rate": 2.902766666666667e-05, + "loss": 0.0127, + "step": 12923 + }, + { + "epoch": 10.176447420244191, + "grad_norm": 0.18985193967819214, + "learning_rate": 2.9027333333333332e-05, + "loss": 0.0043, + "step": 12924 + }, + { + "epoch": 10.177235131941709, + "grad_norm": 0.18788422644138336, + "learning_rate": 2.9027000000000002e-05, + "loss": 0.0135, + "step": 12925 + }, + { + "epoch": 10.178022843639228, + "grad_norm": 0.13580386340618134, + "learning_rate": 2.9026666666666664e-05, + "loss": 0.0052, + "step": 12926 + }, + { + "epoch": 10.178810555336748, + "grad_norm": 0.2278439998626709, + "learning_rate": 2.9026333333333334e-05, + "loss": 0.0079, + "step": 12927 + }, + { + "epoch": 10.179598267034265, + "grad_norm": 0.3067496418952942, + "learning_rate": 2.9026000000000003e-05, + "loss": 0.0106, + "step": 12928 + }, + { + "epoch": 10.180385978731785, + "grad_norm": 0.17230971157550812, + "learning_rate": 2.9025666666666666e-05, + "loss": 0.0098, + "step": 12929 + }, + { + "epoch": 10.181173690429302, + "grad_norm": 0.3164953291416168, + "learning_rate": 2.9025333333333335e-05, + "loss": 0.0119, + "step": 12930 + }, + { + "epoch": 10.181961402126822, + "grad_norm": 0.28914016485214233, + "learning_rate": 2.9025e-05, + "loss": 0.0054, + "step": 12931 + }, + { + "epoch": 10.18274911382434, + "grad_norm": 0.13598576188087463, + "learning_rate": 2.9024666666666667e-05, + "loss": 0.009, + "step": 12932 + }, + { + "epoch": 10.183536825521859, + "grad_norm": 0.4156563878059387, + "learning_rate": 2.9024333333333333e-05, + "loss": 0.0158, + "step": 12933 + }, + { + "epoch": 10.184324537219378, + "grad_norm": 0.4945215582847595, + "learning_rate": 2.9024000000000002e-05, + "loss": 0.0106, + "step": 12934 + }, + { + "epoch": 10.185112248916896, + "grad_norm": 0.2126588225364685, + "learning_rate": 2.9023666666666665e-05, + "loss": 0.0062, + "step": 12935 + }, + { + "epoch": 10.185899960614416, + "grad_norm": 0.2839297354221344, + "learning_rate": 2.9023333333333334e-05, + "loss": 0.0131, + "step": 12936 + }, + { + "epoch": 10.186687672311933, + "grad_norm": 0.5097809433937073, + "learning_rate": 2.9023000000000003e-05, + "loss": 0.0206, + "step": 12937 + }, + { + "epoch": 10.187475384009453, + "grad_norm": 0.3767991364002228, + "learning_rate": 2.9022666666666666e-05, + "loss": 0.0085, + "step": 12938 + }, + { + "epoch": 
10.18826309570697, + "grad_norm": 0.3735264241695404, + "learning_rate": 2.9022333333333335e-05, + "loss": 0.0137, + "step": 12939 + }, + { + "epoch": 10.18905080740449, + "grad_norm": 0.6954552531242371, + "learning_rate": 2.9022e-05, + "loss": 0.0202, + "step": 12940 + }, + { + "epoch": 10.18983851910201, + "grad_norm": 0.2558833658695221, + "learning_rate": 2.9021666666666667e-05, + "loss": 0.0154, + "step": 12941 + }, + { + "epoch": 10.190626230799527, + "grad_norm": 0.20002779364585876, + "learning_rate": 2.9021333333333333e-05, + "loss": 0.0082, + "step": 12942 + }, + { + "epoch": 10.191413942497046, + "grad_norm": 0.23667654395103455, + "learning_rate": 2.9021000000000002e-05, + "loss": 0.0141, + "step": 12943 + }, + { + "epoch": 10.192201654194564, + "grad_norm": 0.14103573560714722, + "learning_rate": 2.9020666666666668e-05, + "loss": 0.0068, + "step": 12944 + }, + { + "epoch": 10.192989365892084, + "grad_norm": 0.3344731628894806, + "learning_rate": 2.9020333333333334e-05, + "loss": 0.0144, + "step": 12945 + }, + { + "epoch": 10.193777077589603, + "grad_norm": 0.2112157791852951, + "learning_rate": 2.9020000000000003e-05, + "loss": 0.0098, + "step": 12946 + }, + { + "epoch": 10.19456478928712, + "grad_norm": 0.13067838549613953, + "learning_rate": 2.9019666666666666e-05, + "loss": 0.0078, + "step": 12947 + }, + { + "epoch": 10.19535250098464, + "grad_norm": 0.7123518586158752, + "learning_rate": 2.9019333333333335e-05, + "loss": 0.0167, + "step": 12948 + }, + { + "epoch": 10.196140212682158, + "grad_norm": 0.29930785298347473, + "learning_rate": 2.9019e-05, + "loss": 0.0087, + "step": 12949 + }, + { + "epoch": 10.196927924379677, + "grad_norm": 0.3940761089324951, + "learning_rate": 2.9018666666666667e-05, + "loss": 0.014, + "step": 12950 + }, + { + "epoch": 10.197715636077195, + "grad_norm": 0.6898806691169739, + "learning_rate": 2.9018333333333333e-05, + "loss": 0.2006, + "step": 12951 + }, + { + "epoch": 10.198503347774714, + "grad_norm": 0.41849058866500854, + "learning_rate": 2.9018000000000002e-05, + "loss": 0.1127, + "step": 12952 + }, + { + "epoch": 10.199291059472234, + "grad_norm": 0.46131059527397156, + "learning_rate": 2.9017666666666668e-05, + "loss": 0.0814, + "step": 12953 + }, + { + "epoch": 10.200078771169752, + "grad_norm": 0.4608217775821686, + "learning_rate": 2.9017333333333334e-05, + "loss": 0.0977, + "step": 12954 + }, + { + "epoch": 10.200866482867271, + "grad_norm": 0.44487836956977844, + "learning_rate": 2.9017e-05, + "loss": 0.0571, + "step": 12955 + }, + { + "epoch": 10.201654194564789, + "grad_norm": 0.46983346343040466, + "learning_rate": 2.9016666666666666e-05, + "loss": 0.0813, + "step": 12956 + }, + { + "epoch": 10.202441906262308, + "grad_norm": 0.17735396325588226, + "learning_rate": 2.9016333333333335e-05, + "loss": 0.0157, + "step": 12957 + }, + { + "epoch": 10.203229617959826, + "grad_norm": 0.9244132041931152, + "learning_rate": 2.9015999999999998e-05, + "loss": 0.034, + "step": 12958 + }, + { + "epoch": 10.204017329657345, + "grad_norm": 0.6612385511398315, + "learning_rate": 2.9015666666666667e-05, + "loss": 0.0359, + "step": 12959 + }, + { + "epoch": 10.204805041354865, + "grad_norm": 1.2967511415481567, + "learning_rate": 2.9015333333333333e-05, + "loss": 0.0181, + "step": 12960 + }, + { + "epoch": 10.205592753052382, + "grad_norm": 0.4299561679363251, + "learning_rate": 2.9015e-05, + "loss": 0.0142, + "step": 12961 + }, + { + "epoch": 10.206380464749902, + "grad_norm": 0.20174537599086761, + "learning_rate": 2.901466666666667e-05, + 
"loss": 0.0137, + "step": 12962 + }, + { + "epoch": 10.20716817644742, + "grad_norm": 0.16433867812156677, + "learning_rate": 2.9014333333333334e-05, + "loss": 0.0063, + "step": 12963 + }, + { + "epoch": 10.207955888144939, + "grad_norm": 0.20876334607601166, + "learning_rate": 2.9014e-05, + "loss": 0.01, + "step": 12964 + }, + { + "epoch": 10.208743599842458, + "grad_norm": 0.48674455285072327, + "learning_rate": 2.9013666666666666e-05, + "loss": 0.0217, + "step": 12965 + }, + { + "epoch": 10.209531311539976, + "grad_norm": 0.5132799744606018, + "learning_rate": 2.9013333333333336e-05, + "loss": 0.0233, + "step": 12966 + }, + { + "epoch": 10.210319023237496, + "grad_norm": 0.1864643394947052, + "learning_rate": 2.9012999999999998e-05, + "loss": 0.0121, + "step": 12967 + }, + { + "epoch": 10.211106734935013, + "grad_norm": 0.4172002971172333, + "learning_rate": 2.9012666666666668e-05, + "loss": 0.0155, + "step": 12968 + }, + { + "epoch": 10.211894446632533, + "grad_norm": 0.4530961215496063, + "learning_rate": 2.9012333333333337e-05, + "loss": 0.021, + "step": 12969 + }, + { + "epoch": 10.21268215833005, + "grad_norm": 0.2323295772075653, + "learning_rate": 2.9012e-05, + "loss": 0.0201, + "step": 12970 + }, + { + "epoch": 10.21346987002757, + "grad_norm": 0.17530983686447144, + "learning_rate": 2.901166666666667e-05, + "loss": 0.0083, + "step": 12971 + }, + { + "epoch": 10.21425758172509, + "grad_norm": 0.3516336679458618, + "learning_rate": 2.9011333333333335e-05, + "loss": 0.0134, + "step": 12972 + }, + { + "epoch": 10.215045293422607, + "grad_norm": 0.1965133547782898, + "learning_rate": 2.9011e-05, + "loss": 0.014, + "step": 12973 + }, + { + "epoch": 10.215833005120126, + "grad_norm": 0.22708986699581146, + "learning_rate": 2.9010666666666667e-05, + "loss": 0.0127, + "step": 12974 + }, + { + "epoch": 10.216620716817644, + "grad_norm": 0.14988930523395538, + "learning_rate": 2.9010333333333336e-05, + "loss": 0.0081, + "step": 12975 + }, + { + "epoch": 10.217408428515164, + "grad_norm": 0.3028007447719574, + "learning_rate": 2.901e-05, + "loss": 0.0133, + "step": 12976 + }, + { + "epoch": 10.218196140212681, + "grad_norm": 0.3202640116214752, + "learning_rate": 2.9009666666666668e-05, + "loss": 0.0151, + "step": 12977 + }, + { + "epoch": 10.2189838519102, + "grad_norm": 0.22196514904499054, + "learning_rate": 2.9009333333333337e-05, + "loss": 0.0138, + "step": 12978 + }, + { + "epoch": 10.21977156360772, + "grad_norm": 0.21449506282806396, + "learning_rate": 2.9009e-05, + "loss": 0.0144, + "step": 12979 + }, + { + "epoch": 10.220559275305238, + "grad_norm": 0.410322368144989, + "learning_rate": 2.900866666666667e-05, + "loss": 0.0192, + "step": 12980 + }, + { + "epoch": 10.221346987002757, + "grad_norm": 0.37256819009780884, + "learning_rate": 2.9008333333333335e-05, + "loss": 0.0198, + "step": 12981 + }, + { + "epoch": 10.222134698700275, + "grad_norm": 0.1879338026046753, + "learning_rate": 2.9008e-05, + "loss": 0.0067, + "step": 12982 + }, + { + "epoch": 10.222922410397794, + "grad_norm": 0.22249434888362885, + "learning_rate": 2.9007666666666667e-05, + "loss": 0.0089, + "step": 12983 + }, + { + "epoch": 10.223710122095314, + "grad_norm": 0.9453466534614563, + "learning_rate": 2.9007333333333336e-05, + "loss": 0.0155, + "step": 12984 + }, + { + "epoch": 10.224497833792832, + "grad_norm": 0.11539854854345322, + "learning_rate": 2.9007e-05, + "loss": 0.0055, + "step": 12985 + }, + { + "epoch": 10.225285545490351, + "grad_norm": 0.29440146684646606, + "learning_rate": 
2.9006666666666668e-05, + "loss": 0.0136, + "step": 12986 + }, + { + "epoch": 10.226073257187869, + "grad_norm": 0.5698176026344299, + "learning_rate": 2.9006333333333334e-05, + "loss": 0.016, + "step": 12987 + }, + { + "epoch": 10.226860968885388, + "grad_norm": 0.356534481048584, + "learning_rate": 2.9006e-05, + "loss": 0.016, + "step": 12988 + }, + { + "epoch": 10.227648680582906, + "grad_norm": 0.1588473618030548, + "learning_rate": 2.900566666666667e-05, + "loss": 0.0113, + "step": 12989 + }, + { + "epoch": 10.228436392280425, + "grad_norm": 0.330634206533432, + "learning_rate": 2.900533333333333e-05, + "loss": 0.0266, + "step": 12990 + }, + { + "epoch": 10.229224103977945, + "grad_norm": 0.6493995785713196, + "learning_rate": 2.9005e-05, + "loss": 0.0141, + "step": 12991 + }, + { + "epoch": 10.230011815675462, + "grad_norm": 0.3901277482509613, + "learning_rate": 2.9004666666666667e-05, + "loss": 0.0217, + "step": 12992 + }, + { + "epoch": 10.230799527372982, + "grad_norm": 0.5237030982971191, + "learning_rate": 2.9004333333333333e-05, + "loss": 0.0254, + "step": 12993 + }, + { + "epoch": 10.2315872390705, + "grad_norm": 0.4235017001628876, + "learning_rate": 2.9004000000000002e-05, + "loss": 0.015, + "step": 12994 + }, + { + "epoch": 10.232374950768019, + "grad_norm": 0.47698086500167847, + "learning_rate": 2.9003666666666668e-05, + "loss": 0.0183, + "step": 12995 + }, + { + "epoch": 10.233162662465537, + "grad_norm": 0.854978084564209, + "learning_rate": 2.9003333333333334e-05, + "loss": 0.0193, + "step": 12996 + }, + { + "epoch": 10.233950374163056, + "grad_norm": 0.49500802159309387, + "learning_rate": 2.9003e-05, + "loss": 0.0191, + "step": 12997 + }, + { + "epoch": 10.234738085860576, + "grad_norm": 1.0017293691635132, + "learning_rate": 2.900266666666667e-05, + "loss": 0.0128, + "step": 12998 + }, + { + "epoch": 10.235525797558093, + "grad_norm": 0.4975751042366028, + "learning_rate": 2.9002333333333332e-05, + "loss": 0.0247, + "step": 12999 + }, + { + "epoch": 10.236313509255613, + "grad_norm": 0.28335288166999817, + "learning_rate": 2.9002e-05, + "loss": 0.0079, + "step": 13000 + }, + { + "epoch": 10.236313509255613, + "eval_cer": 0.11094693729424268, + "eval_loss": 0.29905760288238525, + "eval_runtime": 15.973, + "eval_samples_per_second": 19.032, + "eval_steps_per_second": 0.626, + "eval_wer": 0.3873752877973906, + "step": 13000 + }, + { + "epoch": 10.23710122095313, + "grad_norm": 0.7043026089668274, + "learning_rate": 2.9001666666666667e-05, + "loss": 0.2228, + "step": 13001 + }, + { + "epoch": 10.23788893265065, + "grad_norm": 0.6445655822753906, + "learning_rate": 2.9001333333333333e-05, + "loss": 0.1604, + "step": 13002 + }, + { + "epoch": 10.23867664434817, + "grad_norm": 0.42598435282707214, + "learning_rate": 2.9001000000000002e-05, + "loss": 0.1289, + "step": 13003 + }, + { + "epoch": 10.239464356045687, + "grad_norm": 0.5368931889533997, + "learning_rate": 2.9000666666666668e-05, + "loss": 0.1177, + "step": 13004 + }, + { + "epoch": 10.240252067743207, + "grad_norm": 0.39074715971946716, + "learning_rate": 2.9000333333333334e-05, + "loss": 0.1238, + "step": 13005 + }, + { + "epoch": 10.241039779440724, + "grad_norm": 0.4353043735027313, + "learning_rate": 2.9e-05, + "loss": 0.036, + "step": 13006 + }, + { + "epoch": 10.241827491138244, + "grad_norm": 0.23465484380722046, + "learning_rate": 2.899966666666667e-05, + "loss": 0.026, + "step": 13007 + }, + { + "epoch": 10.242615202835761, + "grad_norm": 0.54081130027771, + "learning_rate": 2.8999333333333332e-05, + 
"loss": 0.026, + "step": 13008 + }, + { + "epoch": 10.24340291453328, + "grad_norm": 0.09256885200738907, + "learning_rate": 2.8999e-05, + "loss": 0.0079, + "step": 13009 + }, + { + "epoch": 10.2441906262308, + "grad_norm": 0.5084026455879211, + "learning_rate": 2.8998666666666667e-05, + "loss": 0.0182, + "step": 13010 + }, + { + "epoch": 10.244978337928318, + "grad_norm": 0.17304964363574982, + "learning_rate": 2.8998333333333333e-05, + "loss": 0.0093, + "step": 13011 + }, + { + "epoch": 10.245766049625837, + "grad_norm": 0.4763941466808319, + "learning_rate": 2.8998000000000003e-05, + "loss": 0.0158, + "step": 13012 + }, + { + "epoch": 10.246553761323355, + "grad_norm": 0.48683521151542664, + "learning_rate": 2.899766666666667e-05, + "loss": 0.0132, + "step": 13013 + }, + { + "epoch": 10.247341473020875, + "grad_norm": 0.23437681794166565, + "learning_rate": 2.8997333333333334e-05, + "loss": 0.0124, + "step": 13014 + }, + { + "epoch": 10.248129184718394, + "grad_norm": 0.2978419363498688, + "learning_rate": 2.8997e-05, + "loss": 0.0171, + "step": 13015 + }, + { + "epoch": 10.248916896415912, + "grad_norm": 0.20220164954662323, + "learning_rate": 2.8996666666666666e-05, + "loss": 0.0113, + "step": 13016 + }, + { + "epoch": 10.249704608113431, + "grad_norm": 0.5611044764518738, + "learning_rate": 2.8996333333333332e-05, + "loss": 0.0295, + "step": 13017 + }, + { + "epoch": 10.250492319810949, + "grad_norm": 0.32741689682006836, + "learning_rate": 2.8996e-05, + "loss": 0.0127, + "step": 13018 + }, + { + "epoch": 10.251280031508468, + "grad_norm": 0.26688340306282043, + "learning_rate": 2.8995666666666667e-05, + "loss": 0.0187, + "step": 13019 + }, + { + "epoch": 10.252067743205986, + "grad_norm": 0.12704309821128845, + "learning_rate": 2.8995333333333333e-05, + "loss": 0.0058, + "step": 13020 + }, + { + "epoch": 10.252855454903505, + "grad_norm": 0.5063637495040894, + "learning_rate": 2.8995000000000003e-05, + "loss": 0.0142, + "step": 13021 + }, + { + "epoch": 10.253643166601025, + "grad_norm": 0.3622889220714569, + "learning_rate": 2.8994666666666665e-05, + "loss": 0.01, + "step": 13022 + }, + { + "epoch": 10.254430878298542, + "grad_norm": 0.20535971224308014, + "learning_rate": 2.8994333333333335e-05, + "loss": 0.0176, + "step": 13023 + }, + { + "epoch": 10.255218589996062, + "grad_norm": 0.1583230048418045, + "learning_rate": 2.8994e-05, + "loss": 0.0093, + "step": 13024 + }, + { + "epoch": 10.25600630169358, + "grad_norm": 0.2892811596393585, + "learning_rate": 2.8993666666666667e-05, + "loss": 0.0121, + "step": 13025 + }, + { + "epoch": 10.256794013391099, + "grad_norm": 0.19535131752490997, + "learning_rate": 2.8993333333333332e-05, + "loss": 0.0084, + "step": 13026 + }, + { + "epoch": 10.257581725088617, + "grad_norm": 0.3173353970050812, + "learning_rate": 2.8993000000000002e-05, + "loss": 0.0101, + "step": 13027 + }, + { + "epoch": 10.258369436786136, + "grad_norm": 0.14089910686016083, + "learning_rate": 2.8992666666666668e-05, + "loss": 0.0049, + "step": 13028 + }, + { + "epoch": 10.259157148483656, + "grad_norm": 0.3549148142337799, + "learning_rate": 2.8992333333333334e-05, + "loss": 0.0249, + "step": 13029 + }, + { + "epoch": 10.259944860181173, + "grad_norm": 0.21469421684741974, + "learning_rate": 2.8992000000000003e-05, + "loss": 0.0093, + "step": 13030 + }, + { + "epoch": 10.260732571878693, + "grad_norm": 0.22849762439727783, + "learning_rate": 2.8991666666666666e-05, + "loss": 0.0093, + "step": 13031 + }, + { + "epoch": 10.26152028357621, + "grad_norm": 
0.2642950713634491, + "learning_rate": 2.8991333333333335e-05, + "loss": 0.014, + "step": 13032 + }, + { + "epoch": 10.26230799527373, + "grad_norm": 0.20315222442150116, + "learning_rate": 2.8991e-05, + "loss": 0.0127, + "step": 13033 + }, + { + "epoch": 10.26309570697125, + "grad_norm": 0.1867522895336151, + "learning_rate": 2.8990666666666667e-05, + "loss": 0.0119, + "step": 13034 + }, + { + "epoch": 10.263883418668767, + "grad_norm": 0.1978980153799057, + "learning_rate": 2.8990333333333333e-05, + "loss": 0.0154, + "step": 13035 + }, + { + "epoch": 10.264671130366287, + "grad_norm": 0.27536168694496155, + "learning_rate": 2.8990000000000002e-05, + "loss": 0.016, + "step": 13036 + }, + { + "epoch": 10.265458842063804, + "grad_norm": 0.30976080894470215, + "learning_rate": 2.8989666666666668e-05, + "loss": 0.013, + "step": 13037 + }, + { + "epoch": 10.266246553761324, + "grad_norm": 0.267849862575531, + "learning_rate": 2.8989333333333334e-05, + "loss": 0.0149, + "step": 13038 + }, + { + "epoch": 10.267034265458841, + "grad_norm": 0.19757497310638428, + "learning_rate": 2.8989000000000003e-05, + "loss": 0.0091, + "step": 13039 + }, + { + "epoch": 10.26782197715636, + "grad_norm": 0.2167387157678604, + "learning_rate": 2.8988666666666666e-05, + "loss": 0.0119, + "step": 13040 + }, + { + "epoch": 10.26860968885388, + "grad_norm": 0.3025544583797455, + "learning_rate": 2.8988333333333335e-05, + "loss": 0.0119, + "step": 13041 + }, + { + "epoch": 10.269397400551398, + "grad_norm": 0.14545278251171112, + "learning_rate": 2.8988e-05, + "loss": 0.0088, + "step": 13042 + }, + { + "epoch": 10.270185112248917, + "grad_norm": 0.23285038769245148, + "learning_rate": 2.8987666666666667e-05, + "loss": 0.0103, + "step": 13043 + }, + { + "epoch": 10.270972823946435, + "grad_norm": 0.38696348667144775, + "learning_rate": 2.8987333333333336e-05, + "loss": 0.0136, + "step": 13044 + }, + { + "epoch": 10.271760535643955, + "grad_norm": 0.16584666073322296, + "learning_rate": 2.8987000000000002e-05, + "loss": 0.0094, + "step": 13045 + }, + { + "epoch": 10.272548247341472, + "grad_norm": 0.5103957056999207, + "learning_rate": 2.8986666666666668e-05, + "loss": 0.0058, + "step": 13046 + }, + { + "epoch": 10.273335959038992, + "grad_norm": 0.4575819671154022, + "learning_rate": 2.8986333333333334e-05, + "loss": 0.0132, + "step": 13047 + }, + { + "epoch": 10.274123670736511, + "grad_norm": 0.29215919971466064, + "learning_rate": 2.8986e-05, + "loss": 0.0143, + "step": 13048 + }, + { + "epoch": 10.274911382434029, + "grad_norm": 1.1279733180999756, + "learning_rate": 2.8985666666666666e-05, + "loss": 0.0247, + "step": 13049 + }, + { + "epoch": 10.275699094131548, + "grad_norm": 0.3390137851238251, + "learning_rate": 2.8985333333333335e-05, + "loss": 0.0115, + "step": 13050 + }, + { + "epoch": 10.276486805829066, + "grad_norm": 0.6406967043876648, + "learning_rate": 2.8984999999999998e-05, + "loss": 0.1846, + "step": 13051 + }, + { + "epoch": 10.277274517526585, + "grad_norm": 0.6362506747245789, + "learning_rate": 2.8984666666666667e-05, + "loss": 0.2526, + "step": 13052 + }, + { + "epoch": 10.278062229224105, + "grad_norm": 0.6178362369537354, + "learning_rate": 2.8984333333333336e-05, + "loss": 0.139, + "step": 13053 + }, + { + "epoch": 10.278849940921623, + "grad_norm": 0.431528240442276, + "learning_rate": 2.8984e-05, + "loss": 0.0738, + "step": 13054 + }, + { + "epoch": 10.279637652619142, + "grad_norm": 0.40013590455055237, + "learning_rate": 2.8983666666666668e-05, + "loss": 0.0841, + "step": 13055 + }, + { 
+ "epoch": 10.28042536431666, + "grad_norm": 0.37792137265205383, + "learning_rate": 2.8983333333333334e-05, + "loss": 0.0509, + "step": 13056 + }, + { + "epoch": 10.281213076014179, + "grad_norm": 0.3724953234195709, + "learning_rate": 2.8983e-05, + "loss": 0.0355, + "step": 13057 + }, + { + "epoch": 10.282000787711697, + "grad_norm": 0.4440839886665344, + "learning_rate": 2.8982666666666666e-05, + "loss": 0.0353, + "step": 13058 + }, + { + "epoch": 10.282788499409216, + "grad_norm": 0.8032215237617493, + "learning_rate": 2.8982333333333335e-05, + "loss": 0.0269, + "step": 13059 + }, + { + "epoch": 10.283576211106736, + "grad_norm": 0.3802952170372009, + "learning_rate": 2.8981999999999998e-05, + "loss": 0.0246, + "step": 13060 + }, + { + "epoch": 10.284363922804253, + "grad_norm": 0.42407503724098206, + "learning_rate": 2.8981666666666667e-05, + "loss": 0.0243, + "step": 13061 + }, + { + "epoch": 10.285151634501773, + "grad_norm": 0.1428801715373993, + "learning_rate": 2.8981333333333337e-05, + "loss": 0.0121, + "step": 13062 + }, + { + "epoch": 10.28593934619929, + "grad_norm": 0.18103095889091492, + "learning_rate": 2.8981e-05, + "loss": 0.0095, + "step": 13063 + }, + { + "epoch": 10.28672705789681, + "grad_norm": 0.09016869962215424, + "learning_rate": 2.898066666666667e-05, + "loss": 0.0058, + "step": 13064 + }, + { + "epoch": 10.287514769594328, + "grad_norm": 0.38440874218940735, + "learning_rate": 2.8980333333333334e-05, + "loss": 0.0158, + "step": 13065 + }, + { + "epoch": 10.288302481291847, + "grad_norm": 0.30856451392173767, + "learning_rate": 2.898e-05, + "loss": 0.0092, + "step": 13066 + }, + { + "epoch": 10.289090192989367, + "grad_norm": 0.49311378598213196, + "learning_rate": 2.8979666666666666e-05, + "loss": 0.0109, + "step": 13067 + }, + { + "epoch": 10.289877904686884, + "grad_norm": 0.31756654381752014, + "learning_rate": 2.8979333333333336e-05, + "loss": 0.0218, + "step": 13068 + }, + { + "epoch": 10.290665616384404, + "grad_norm": 0.4648553133010864, + "learning_rate": 2.8979e-05, + "loss": 0.0145, + "step": 13069 + }, + { + "epoch": 10.291453328081921, + "grad_norm": 0.3324569761753082, + "learning_rate": 2.8978666666666667e-05, + "loss": 0.0126, + "step": 13070 + }, + { + "epoch": 10.29224103977944, + "grad_norm": 0.22225290536880493, + "learning_rate": 2.8978333333333337e-05, + "loss": 0.0136, + "step": 13071 + }, + { + "epoch": 10.29302875147696, + "grad_norm": 0.22857476770877838, + "learning_rate": 2.8978e-05, + "loss": 0.011, + "step": 13072 + }, + { + "epoch": 10.293816463174478, + "grad_norm": 0.23215101659297943, + "learning_rate": 2.897766666666667e-05, + "loss": 0.0115, + "step": 13073 + }, + { + "epoch": 10.294604174871997, + "grad_norm": 0.1786297857761383, + "learning_rate": 2.8977333333333335e-05, + "loss": 0.0075, + "step": 13074 + }, + { + "epoch": 10.295391886569515, + "grad_norm": 0.28857526183128357, + "learning_rate": 2.8977e-05, + "loss": 0.0111, + "step": 13075 + }, + { + "epoch": 10.296179598267035, + "grad_norm": 0.4276649057865143, + "learning_rate": 2.8976666666666666e-05, + "loss": 0.0094, + "step": 13076 + }, + { + "epoch": 10.296967309964552, + "grad_norm": 0.27117544412612915, + "learning_rate": 2.8976333333333332e-05, + "loss": 0.0191, + "step": 13077 + }, + { + "epoch": 10.297755021662072, + "grad_norm": 0.4139465391635895, + "learning_rate": 2.8976000000000002e-05, + "loss": 0.0167, + "step": 13078 + }, + { + "epoch": 10.298542733359591, + "grad_norm": 0.35963255167007446, + "learning_rate": 2.8975666666666668e-05, + "loss": 
0.0111, + "step": 13079 + }, + { + "epoch": 10.299330445057109, + "grad_norm": 0.31370165944099426, + "learning_rate": 2.8975333333333334e-05, + "loss": 0.014, + "step": 13080 + }, + { + "epoch": 10.300118156754628, + "grad_norm": 0.32816100120544434, + "learning_rate": 2.8975e-05, + "loss": 0.0207, + "step": 13081 + }, + { + "epoch": 10.300905868452146, + "grad_norm": 0.1902482509613037, + "learning_rate": 2.897466666666667e-05, + "loss": 0.0151, + "step": 13082 + }, + { + "epoch": 10.301693580149665, + "grad_norm": 0.295989453792572, + "learning_rate": 2.897433333333333e-05, + "loss": 0.0116, + "step": 13083 + }, + { + "epoch": 10.302481291847183, + "grad_norm": 0.25968417525291443, + "learning_rate": 2.8974e-05, + "loss": 0.0153, + "step": 13084 + }, + { + "epoch": 10.303269003544703, + "grad_norm": 0.23362378776073456, + "learning_rate": 2.8973666666666667e-05, + "loss": 0.0097, + "step": 13085 + }, + { + "epoch": 10.304056715242222, + "grad_norm": 0.2661806046962738, + "learning_rate": 2.8973333333333333e-05, + "loss": 0.0127, + "step": 13086 + }, + { + "epoch": 10.30484442693974, + "grad_norm": 0.23860692977905273, + "learning_rate": 2.8973000000000002e-05, + "loss": 0.0131, + "step": 13087 + }, + { + "epoch": 10.30563213863726, + "grad_norm": 0.1969558745622635, + "learning_rate": 2.8972666666666668e-05, + "loss": 0.0088, + "step": 13088 + }, + { + "epoch": 10.306419850334777, + "grad_norm": 0.12021223455667496, + "learning_rate": 2.8972333333333334e-05, + "loss": 0.0069, + "step": 13089 + }, + { + "epoch": 10.307207562032296, + "grad_norm": 0.5106764435768127, + "learning_rate": 2.8972e-05, + "loss": 0.0143, + "step": 13090 + }, + { + "epoch": 10.307995273729816, + "grad_norm": 0.30182507634162903, + "learning_rate": 2.897166666666667e-05, + "loss": 0.0111, + "step": 13091 + }, + { + "epoch": 10.308782985427333, + "grad_norm": 0.28675761818885803, + "learning_rate": 2.897133333333333e-05, + "loss": 0.0159, + "step": 13092 + }, + { + "epoch": 10.309570697124853, + "grad_norm": 0.4427415132522583, + "learning_rate": 2.8971e-05, + "loss": 0.0121, + "step": 13093 + }, + { + "epoch": 10.31035840882237, + "grad_norm": 0.3662012815475464, + "learning_rate": 2.897066666666667e-05, + "loss": 0.0158, + "step": 13094 + }, + { + "epoch": 10.31114612051989, + "grad_norm": 0.5243609547615051, + "learning_rate": 2.8970333333333333e-05, + "loss": 0.0181, + "step": 13095 + }, + { + "epoch": 10.311933832217408, + "grad_norm": 0.34867578744888306, + "learning_rate": 2.8970000000000002e-05, + "loss": 0.0143, + "step": 13096 + }, + { + "epoch": 10.312721543914927, + "grad_norm": 0.2593687176704407, + "learning_rate": 2.8969666666666668e-05, + "loss": 0.0136, + "step": 13097 + }, + { + "epoch": 10.313509255612447, + "grad_norm": 0.3024132251739502, + "learning_rate": 2.8969333333333334e-05, + "loss": 0.011, + "step": 13098 + }, + { + "epoch": 10.314296967309964, + "grad_norm": 3.369635581970215, + "learning_rate": 2.8969e-05, + "loss": 0.0161, + "step": 13099 + }, + { + "epoch": 10.315084679007484, + "grad_norm": 0.5270916223526001, + "learning_rate": 2.896866666666667e-05, + "loss": 0.0285, + "step": 13100 + }, + { + "epoch": 10.315872390705001, + "grad_norm": 0.9481898546218872, + "learning_rate": 2.8968333333333332e-05, + "loss": 0.2081, + "step": 13101 + }, + { + "epoch": 10.31666010240252, + "grad_norm": 0.4820623993873596, + "learning_rate": 2.8968e-05, + "loss": 0.1264, + "step": 13102 + }, + { + "epoch": 10.317447814100039, + "grad_norm": 0.40716710686683655, + "learning_rate": 
2.896766666666667e-05, + "loss": 0.0915, + "step": 13103 + }, + { + "epoch": 10.318235525797558, + "grad_norm": 0.39938175678253174, + "learning_rate": 2.8967333333333333e-05, + "loss": 0.0561, + "step": 13104 + }, + { + "epoch": 10.319023237495077, + "grad_norm": 0.49387118220329285, + "learning_rate": 2.8967000000000002e-05, + "loss": 0.0393, + "step": 13105 + }, + { + "epoch": 10.319810949192595, + "grad_norm": 0.1862078756093979, + "learning_rate": 2.8966666666666665e-05, + "loss": 0.0284, + "step": 13106 + }, + { + "epoch": 10.320598660890115, + "grad_norm": 0.2557104229927063, + "learning_rate": 2.8966333333333334e-05, + "loss": 0.0144, + "step": 13107 + }, + { + "epoch": 10.321386372587632, + "grad_norm": 0.45201271772384644, + "learning_rate": 2.8966e-05, + "loss": 0.0323, + "step": 13108 + }, + { + "epoch": 10.322174084285152, + "grad_norm": 0.2524372935295105, + "learning_rate": 2.8965666666666666e-05, + "loss": 0.0313, + "step": 13109 + }, + { + "epoch": 10.322961795982671, + "grad_norm": 0.4472155272960663, + "learning_rate": 2.8965333333333332e-05, + "loss": 0.0152, + "step": 13110 + }, + { + "epoch": 10.323749507680189, + "grad_norm": 0.1469065397977829, + "learning_rate": 2.8965e-05, + "loss": 0.009, + "step": 13111 + }, + { + "epoch": 10.324537219377708, + "grad_norm": 0.306243896484375, + "learning_rate": 2.8964666666666667e-05, + "loss": 0.0168, + "step": 13112 + }, + { + "epoch": 10.325324931075226, + "grad_norm": 0.1351892650127411, + "learning_rate": 2.8964333333333333e-05, + "loss": 0.0134, + "step": 13113 + }, + { + "epoch": 10.326112642772745, + "grad_norm": 0.19602443277835846, + "learning_rate": 2.8964000000000003e-05, + "loss": 0.0075, + "step": 13114 + }, + { + "epoch": 10.326900354470263, + "grad_norm": 0.4975096881389618, + "learning_rate": 2.8963666666666665e-05, + "loss": 0.0146, + "step": 13115 + }, + { + "epoch": 10.327688066167783, + "grad_norm": 0.5341570973396301, + "learning_rate": 2.8963333333333334e-05, + "loss": 0.0177, + "step": 13116 + }, + { + "epoch": 10.328475777865302, + "grad_norm": 0.4371277689933777, + "learning_rate": 2.8963e-05, + "loss": 0.0205, + "step": 13117 + }, + { + "epoch": 10.32926348956282, + "grad_norm": 0.41871148347854614, + "learning_rate": 2.8962666666666666e-05, + "loss": 0.0118, + "step": 13118 + }, + { + "epoch": 10.33005120126034, + "grad_norm": 0.37413859367370605, + "learning_rate": 2.8962333333333336e-05, + "loss": 0.0106, + "step": 13119 + }, + { + "epoch": 10.330838912957857, + "grad_norm": 0.19920912384986877, + "learning_rate": 2.8962e-05, + "loss": 0.0083, + "step": 13120 + }, + { + "epoch": 10.331626624655376, + "grad_norm": 0.2118016630411148, + "learning_rate": 2.8961666666666667e-05, + "loss": 0.0113, + "step": 13121 + }, + { + "epoch": 10.332414336352894, + "grad_norm": 0.26586589217185974, + "learning_rate": 2.8961333333333333e-05, + "loss": 0.0099, + "step": 13122 + }, + { + "epoch": 10.333202048050413, + "grad_norm": 0.23644089698791504, + "learning_rate": 2.8961000000000003e-05, + "loss": 0.0093, + "step": 13123 + }, + { + "epoch": 10.333989759747933, + "grad_norm": 0.20247574150562286, + "learning_rate": 2.8960666666666665e-05, + "loss": 0.0121, + "step": 13124 + }, + { + "epoch": 10.33477747144545, + "grad_norm": 0.3554787337779999, + "learning_rate": 2.8960333333333335e-05, + "loss": 0.0182, + "step": 13125 + }, + { + "epoch": 10.33556518314297, + "grad_norm": 0.17812898755073547, + "learning_rate": 2.896e-05, + "loss": 0.0114, + "step": 13126 + }, + { + "epoch": 10.336352894840488, + "grad_norm": 
0.3493358790874481, + "learning_rate": 2.8959666666666666e-05, + "loss": 0.0179, + "step": 13127 + }, + { + "epoch": 10.337140606538007, + "grad_norm": 0.4379335343837738, + "learning_rate": 2.8959333333333336e-05, + "loss": 0.0125, + "step": 13128 + }, + { + "epoch": 10.337928318235527, + "grad_norm": 0.14871005713939667, + "learning_rate": 2.8959000000000002e-05, + "loss": 0.0104, + "step": 13129 + }, + { + "epoch": 10.338716029933044, + "grad_norm": 0.3303496241569519, + "learning_rate": 2.8958666666666668e-05, + "loss": 0.0127, + "step": 13130 + }, + { + "epoch": 10.339503741630564, + "grad_norm": 0.11175008863210678, + "learning_rate": 2.8958333333333334e-05, + "loss": 0.0057, + "step": 13131 + }, + { + "epoch": 10.340291453328081, + "grad_norm": 0.21404576301574707, + "learning_rate": 2.8958000000000003e-05, + "loss": 0.0101, + "step": 13132 + }, + { + "epoch": 10.3410791650256, + "grad_norm": 0.17253005504608154, + "learning_rate": 2.8957666666666665e-05, + "loss": 0.01, + "step": 13133 + }, + { + "epoch": 10.341866876723119, + "grad_norm": 0.7225163578987122, + "learning_rate": 2.8957333333333335e-05, + "loss": 0.0195, + "step": 13134 + }, + { + "epoch": 10.342654588420638, + "grad_norm": 0.19862402975559235, + "learning_rate": 2.8957e-05, + "loss": 0.0084, + "step": 13135 + }, + { + "epoch": 10.343442300118157, + "grad_norm": 0.3573535084724426, + "learning_rate": 2.8956666666666667e-05, + "loss": 0.0156, + "step": 13136 + }, + { + "epoch": 10.344230011815675, + "grad_norm": 0.5282558798789978, + "learning_rate": 2.8956333333333336e-05, + "loss": 0.0137, + "step": 13137 + }, + { + "epoch": 10.345017723513195, + "grad_norm": 0.4829184114933014, + "learning_rate": 2.8956e-05, + "loss": 0.0203, + "step": 13138 + }, + { + "epoch": 10.345805435210712, + "grad_norm": 0.16679908335208893, + "learning_rate": 2.8955666666666668e-05, + "loss": 0.0112, + "step": 13139 + }, + { + "epoch": 10.346593146908232, + "grad_norm": 0.43192610144615173, + "learning_rate": 2.8955333333333334e-05, + "loss": 0.0061, + "step": 13140 + }, + { + "epoch": 10.34738085860575, + "grad_norm": 0.18979611992835999, + "learning_rate": 2.8955e-05, + "loss": 0.0078, + "step": 13141 + }, + { + "epoch": 10.348168570303269, + "grad_norm": 0.17250868678092957, + "learning_rate": 2.8954666666666666e-05, + "loss": 0.0111, + "step": 13142 + }, + { + "epoch": 10.348956282000788, + "grad_norm": 0.31008580327033997, + "learning_rate": 2.8954333333333335e-05, + "loss": 0.0223, + "step": 13143 + }, + { + "epoch": 10.349743993698306, + "grad_norm": 0.3846202492713928, + "learning_rate": 2.8954e-05, + "loss": 0.0201, + "step": 13144 + }, + { + "epoch": 10.350531705395825, + "grad_norm": 0.619420051574707, + "learning_rate": 2.8953666666666667e-05, + "loss": 0.0299, + "step": 13145 + }, + { + "epoch": 10.351319417093343, + "grad_norm": 0.32506999373435974, + "learning_rate": 2.8953333333333336e-05, + "loss": 0.0192, + "step": 13146 + }, + { + "epoch": 10.352107128790863, + "grad_norm": 0.42935317754745483, + "learning_rate": 2.8953e-05, + "loss": 0.012, + "step": 13147 + }, + { + "epoch": 10.352894840488382, + "grad_norm": 0.2838910222053528, + "learning_rate": 2.8952666666666668e-05, + "loss": 0.0152, + "step": 13148 + }, + { + "epoch": 10.3536825521859, + "grad_norm": 0.32889363169670105, + "learning_rate": 2.8952333333333334e-05, + "loss": 0.0129, + "step": 13149 + }, + { + "epoch": 10.35447026388342, + "grad_norm": 0.3273991644382477, + "learning_rate": 2.8952e-05, + "loss": 0.0165, + "step": 13150 + }, + { + "epoch": 
10.355257975580937, + "grad_norm": 1.1170841455459595, + "learning_rate": 2.8951666666666666e-05, + "loss": 0.2842, + "step": 13151 + }, + { + "epoch": 10.356045687278456, + "grad_norm": 0.5958340764045715, + "learning_rate": 2.8951333333333335e-05, + "loss": 0.1562, + "step": 13152 + }, + { + "epoch": 10.356833398975974, + "grad_norm": 0.4255351424217224, + "learning_rate": 2.8951e-05, + "loss": 0.0904, + "step": 13153 + }, + { + "epoch": 10.357621110673493, + "grad_norm": 0.4709649384021759, + "learning_rate": 2.8950666666666667e-05, + "loss": 0.0748, + "step": 13154 + }, + { + "epoch": 10.358408822371013, + "grad_norm": 0.4459827244281769, + "learning_rate": 2.8950333333333336e-05, + "loss": 0.0895, + "step": 13155 + }, + { + "epoch": 10.35919653406853, + "grad_norm": 0.37289750576019287, + "learning_rate": 2.895e-05, + "loss": 0.0852, + "step": 13156 + }, + { + "epoch": 10.35998424576605, + "grad_norm": 0.29726311564445496, + "learning_rate": 2.8949666666666668e-05, + "loss": 0.0192, + "step": 13157 + }, + { + "epoch": 10.360771957463568, + "grad_norm": 0.21455642580986023, + "learning_rate": 2.8949333333333334e-05, + "loss": 0.0105, + "step": 13158 + }, + { + "epoch": 10.361559669161087, + "grad_norm": 0.321188360452652, + "learning_rate": 2.8949e-05, + "loss": 0.0203, + "step": 13159 + }, + { + "epoch": 10.362347380858605, + "grad_norm": 0.20381875336170197, + "learning_rate": 2.8948666666666666e-05, + "loss": 0.0123, + "step": 13160 + }, + { + "epoch": 10.363135092556124, + "grad_norm": 0.31803518533706665, + "learning_rate": 2.8948333333333335e-05, + "loss": 0.0185, + "step": 13161 + }, + { + "epoch": 10.363922804253644, + "grad_norm": 0.3484441936016083, + "learning_rate": 2.8948e-05, + "loss": 0.016, + "step": 13162 + }, + { + "epoch": 10.364710515951161, + "grad_norm": 1.525602102279663, + "learning_rate": 2.8947666666666667e-05, + "loss": 0.0722, + "step": 13163 + }, + { + "epoch": 10.365498227648681, + "grad_norm": 0.3379852771759033, + "learning_rate": 2.8947333333333337e-05, + "loss": 0.0165, + "step": 13164 + }, + { + "epoch": 10.366285939346199, + "grad_norm": 0.1594754010438919, + "learning_rate": 2.8947e-05, + "loss": 0.0065, + "step": 13165 + }, + { + "epoch": 10.367073651043718, + "grad_norm": 0.1307603120803833, + "learning_rate": 2.894666666666667e-05, + "loss": 0.0068, + "step": 13166 + }, + { + "epoch": 10.367861362741237, + "grad_norm": 0.3511560261249542, + "learning_rate": 2.894633333333333e-05, + "loss": 0.0217, + "step": 13167 + }, + { + "epoch": 10.368649074438755, + "grad_norm": 0.3655143976211548, + "learning_rate": 2.8946e-05, + "loss": 0.0122, + "step": 13168 + }, + { + "epoch": 10.369436786136275, + "grad_norm": 0.2637365162372589, + "learning_rate": 2.894566666666667e-05, + "loss": 0.0107, + "step": 13169 + }, + { + "epoch": 10.370224497833792, + "grad_norm": 0.19896893203258514, + "learning_rate": 2.8945333333333332e-05, + "loss": 0.012, + "step": 13170 + }, + { + "epoch": 10.371012209531312, + "grad_norm": 0.32953161001205444, + "learning_rate": 2.8945e-05, + "loss": 0.0195, + "step": 13171 + }, + { + "epoch": 10.37179992122883, + "grad_norm": 0.1751626580953598, + "learning_rate": 2.8944666666666667e-05, + "loss": 0.0082, + "step": 13172 + }, + { + "epoch": 10.372587632926349, + "grad_norm": 0.15775083005428314, + "learning_rate": 2.8944333333333333e-05, + "loss": 0.0052, + "step": 13173 + }, + { + "epoch": 10.373375344623868, + "grad_norm": 0.3306902050971985, + "learning_rate": 2.8944e-05, + "loss": 0.0113, + "step": 13174 + }, + { + "epoch": 
10.374163056321386, + "grad_norm": 0.1573101282119751, + "learning_rate": 2.894366666666667e-05, + "loss": 0.008, + "step": 13175 + }, + { + "epoch": 10.374950768018905, + "grad_norm": 0.16518394649028778, + "learning_rate": 2.894333333333333e-05, + "loss": 0.0131, + "step": 13176 + }, + { + "epoch": 10.375738479716423, + "grad_norm": 0.21626189351081848, + "learning_rate": 2.8943e-05, + "loss": 0.0101, + "step": 13177 + }, + { + "epoch": 10.376526191413943, + "grad_norm": 0.24462473392486572, + "learning_rate": 2.894266666666667e-05, + "loss": 0.0118, + "step": 13178 + }, + { + "epoch": 10.37731390311146, + "grad_norm": 0.8816014528274536, + "learning_rate": 2.8942333333333332e-05, + "loss": 0.0179, + "step": 13179 + }, + { + "epoch": 10.37810161480898, + "grad_norm": 0.07412365078926086, + "learning_rate": 2.8942000000000002e-05, + "loss": 0.0045, + "step": 13180 + }, + { + "epoch": 10.3788893265065, + "grad_norm": 0.21013550460338593, + "learning_rate": 2.8941666666666668e-05, + "loss": 0.0148, + "step": 13181 + }, + { + "epoch": 10.379677038204017, + "grad_norm": 0.25428101420402527, + "learning_rate": 2.8941333333333334e-05, + "loss": 0.0143, + "step": 13182 + }, + { + "epoch": 10.380464749901536, + "grad_norm": 0.4519805610179901, + "learning_rate": 2.8941e-05, + "loss": 0.013, + "step": 13183 + }, + { + "epoch": 10.381252461599054, + "grad_norm": 0.25443318486213684, + "learning_rate": 2.894066666666667e-05, + "loss": 0.0201, + "step": 13184 + }, + { + "epoch": 10.382040173296573, + "grad_norm": 0.21731321513652802, + "learning_rate": 2.894033333333333e-05, + "loss": 0.0131, + "step": 13185 + }, + { + "epoch": 10.382827884994093, + "grad_norm": 0.2873965799808502, + "learning_rate": 2.894e-05, + "loss": 0.0151, + "step": 13186 + }, + { + "epoch": 10.38361559669161, + "grad_norm": 0.1902063637971878, + "learning_rate": 2.893966666666667e-05, + "loss": 0.0133, + "step": 13187 + }, + { + "epoch": 10.38440330838913, + "grad_norm": 0.11444836109876633, + "learning_rate": 2.8939333333333333e-05, + "loss": 0.0111, + "step": 13188 + }, + { + "epoch": 10.385191020086648, + "grad_norm": 0.3923102617263794, + "learning_rate": 2.8939000000000002e-05, + "loss": 0.0186, + "step": 13189 + }, + { + "epoch": 10.385978731784167, + "grad_norm": 0.16634656488895416, + "learning_rate": 2.8938666666666668e-05, + "loss": 0.0105, + "step": 13190 + }, + { + "epoch": 10.386766443481685, + "grad_norm": 0.2967030704021454, + "learning_rate": 2.8938333333333334e-05, + "loss": 0.0098, + "step": 13191 + }, + { + "epoch": 10.387554155179204, + "grad_norm": 0.14287053048610687, + "learning_rate": 2.8938e-05, + "loss": 0.0071, + "step": 13192 + }, + { + "epoch": 10.388341866876724, + "grad_norm": 0.25574466586112976, + "learning_rate": 2.893766666666667e-05, + "loss": 0.0128, + "step": 13193 + }, + { + "epoch": 10.389129578574241, + "grad_norm": 0.4294334352016449, + "learning_rate": 2.8937333333333335e-05, + "loss": 0.0272, + "step": 13194 + }, + { + "epoch": 10.389917290271761, + "grad_norm": 0.2929322421550751, + "learning_rate": 2.8937e-05, + "loss": 0.0087, + "step": 13195 + }, + { + "epoch": 10.390705001969279, + "grad_norm": 0.26340216398239136, + "learning_rate": 2.893666666666667e-05, + "loss": 0.0343, + "step": 13196 + }, + { + "epoch": 10.391492713666798, + "grad_norm": 0.8220221996307373, + "learning_rate": 2.8936333333333333e-05, + "loss": 0.0174, + "step": 13197 + }, + { + "epoch": 10.392280425364318, + "grad_norm": 0.3354601263999939, + "learning_rate": 2.8936000000000002e-05, + "loss": 0.0129, + 
"step": 13198 + }, + { + "epoch": 10.393068137061835, + "grad_norm": 0.45830100774765015, + "learning_rate": 2.8935666666666665e-05, + "loss": 0.0186, + "step": 13199 + }, + { + "epoch": 10.393855848759355, + "grad_norm": 0.4743565022945404, + "learning_rate": 2.8935333333333334e-05, + "loss": 0.0444, + "step": 13200 + }, + { + "epoch": 10.394643560456872, + "grad_norm": 0.8249583840370178, + "learning_rate": 2.8935e-05, + "loss": 0.1773, + "step": 13201 + }, + { + "epoch": 10.395431272154392, + "grad_norm": 0.7087099552154541, + "learning_rate": 2.8934666666666666e-05, + "loss": 0.242, + "step": 13202 + }, + { + "epoch": 10.39621898385191, + "grad_norm": 0.556654691696167, + "learning_rate": 2.8934333333333335e-05, + "loss": 0.1037, + "step": 13203 + }, + { + "epoch": 10.397006695549429, + "grad_norm": 0.398322731256485, + "learning_rate": 2.8934e-05, + "loss": 0.1138, + "step": 13204 + }, + { + "epoch": 10.397794407246948, + "grad_norm": 0.32082197070121765, + "learning_rate": 2.8933666666666667e-05, + "loss": 0.0555, + "step": 13205 + }, + { + "epoch": 10.398582118944466, + "grad_norm": 0.41069459915161133, + "learning_rate": 2.8933333333333333e-05, + "loss": 0.061, + "step": 13206 + }, + { + "epoch": 10.399369830641986, + "grad_norm": 0.2721847891807556, + "learning_rate": 2.8933000000000002e-05, + "loss": 0.0689, + "step": 13207 + }, + { + "epoch": 10.400157542339503, + "grad_norm": 0.2292632907629013, + "learning_rate": 2.8932666666666665e-05, + "loss": 0.0151, + "step": 13208 + }, + { + "epoch": 10.400945254037023, + "grad_norm": 0.6045862436294556, + "learning_rate": 2.8932333333333334e-05, + "loss": 0.0142, + "step": 13209 + }, + { + "epoch": 10.40173296573454, + "grad_norm": 0.27480530738830566, + "learning_rate": 2.8932e-05, + "loss": 0.0232, + "step": 13210 + }, + { + "epoch": 10.40252067743206, + "grad_norm": 0.20577973127365112, + "learning_rate": 2.8931666666666666e-05, + "loss": 0.01, + "step": 13211 + }, + { + "epoch": 10.40330838912958, + "grad_norm": 0.2800405025482178, + "learning_rate": 2.8931333333333335e-05, + "loss": 0.0171, + "step": 13212 + }, + { + "epoch": 10.404096100827097, + "grad_norm": 0.4857857823371887, + "learning_rate": 2.8931e-05, + "loss": 0.0273, + "step": 13213 + }, + { + "epoch": 10.404883812524616, + "grad_norm": 0.20080262422561646, + "learning_rate": 2.8930666666666667e-05, + "loss": 0.0087, + "step": 13214 + }, + { + "epoch": 10.405671524222134, + "grad_norm": 0.22376511991024017, + "learning_rate": 2.8930333333333333e-05, + "loss": 0.0119, + "step": 13215 + }, + { + "epoch": 10.406459235919653, + "grad_norm": 0.10440588742494583, + "learning_rate": 2.8930000000000003e-05, + "loss": 0.0046, + "step": 13216 + }, + { + "epoch": 10.407246947617173, + "grad_norm": 0.19987864792346954, + "learning_rate": 2.8929666666666665e-05, + "loss": 0.0128, + "step": 13217 + }, + { + "epoch": 10.40803465931469, + "grad_norm": 0.10723798722028732, + "learning_rate": 2.8929333333333334e-05, + "loss": 0.0085, + "step": 13218 + }, + { + "epoch": 10.40882237101221, + "grad_norm": 0.262421190738678, + "learning_rate": 2.8929000000000004e-05, + "loss": 0.0191, + "step": 13219 + }, + { + "epoch": 10.409610082709728, + "grad_norm": 0.1778566688299179, + "learning_rate": 2.8928666666666666e-05, + "loss": 0.0073, + "step": 13220 + }, + { + "epoch": 10.410397794407247, + "grad_norm": 0.33788415789604187, + "learning_rate": 2.8928333333333336e-05, + "loss": 0.0138, + "step": 13221 + }, + { + "epoch": 10.411185506104765, + "grad_norm": 0.410131573677063, + "learning_rate": 
2.8928e-05, + "loss": 0.0256, + "step": 13222 + }, + { + "epoch": 10.411973217802284, + "grad_norm": 0.15405020117759705, + "learning_rate": 2.8927666666666667e-05, + "loss": 0.0102, + "step": 13223 + }, + { + "epoch": 10.412760929499804, + "grad_norm": 0.28685784339904785, + "learning_rate": 2.8927333333333333e-05, + "loss": 0.0055, + "step": 13224 + }, + { + "epoch": 10.413548641197321, + "grad_norm": 0.16097237169742584, + "learning_rate": 2.8927000000000003e-05, + "loss": 0.0055, + "step": 13225 + }, + { + "epoch": 10.414336352894841, + "grad_norm": 0.18627536296844482, + "learning_rate": 2.8926666666666665e-05, + "loss": 0.0102, + "step": 13226 + }, + { + "epoch": 10.415124064592359, + "grad_norm": 0.16676053404808044, + "learning_rate": 2.8926333333333335e-05, + "loss": 0.0119, + "step": 13227 + }, + { + "epoch": 10.415911776289878, + "grad_norm": 0.1604301482439041, + "learning_rate": 2.8926e-05, + "loss": 0.0105, + "step": 13228 + }, + { + "epoch": 10.416699487987396, + "grad_norm": 0.7282990217208862, + "learning_rate": 2.8925666666666666e-05, + "loss": 0.0211, + "step": 13229 + }, + { + "epoch": 10.417487199684915, + "grad_norm": 0.22870078682899475, + "learning_rate": 2.8925333333333336e-05, + "loss": 0.0106, + "step": 13230 + }, + { + "epoch": 10.418274911382435, + "grad_norm": 0.34350648522377014, + "learning_rate": 2.8925e-05, + "loss": 0.0065, + "step": 13231 + }, + { + "epoch": 10.419062623079952, + "grad_norm": 0.12171883881092072, + "learning_rate": 2.8924666666666668e-05, + "loss": 0.006, + "step": 13232 + }, + { + "epoch": 10.419850334777472, + "grad_norm": 0.3709452152252197, + "learning_rate": 2.8924333333333334e-05, + "loss": 0.0154, + "step": 13233 + }, + { + "epoch": 10.42063804647499, + "grad_norm": 0.15247981250286102, + "learning_rate": 2.8924e-05, + "loss": 0.0088, + "step": 13234 + }, + { + "epoch": 10.421425758172509, + "grad_norm": 0.43947216868400574, + "learning_rate": 2.8923666666666665e-05, + "loss": 0.0149, + "step": 13235 + }, + { + "epoch": 10.422213469870028, + "grad_norm": 0.2870592176914215, + "learning_rate": 2.8923333333333335e-05, + "loss": 0.0135, + "step": 13236 + }, + { + "epoch": 10.423001181567546, + "grad_norm": 0.37642940878868103, + "learning_rate": 2.8923e-05, + "loss": 0.0191, + "step": 13237 + }, + { + "epoch": 10.423788893265066, + "grad_norm": 0.3412913382053375, + "learning_rate": 2.8922666666666667e-05, + "loss": 0.0052, + "step": 13238 + }, + { + "epoch": 10.424576604962583, + "grad_norm": 0.4132593274116516, + "learning_rate": 2.8922333333333336e-05, + "loss": 0.021, + "step": 13239 + }, + { + "epoch": 10.425364316660103, + "grad_norm": 0.1928900182247162, + "learning_rate": 2.8922e-05, + "loss": 0.0083, + "step": 13240 + }, + { + "epoch": 10.42615202835762, + "grad_norm": 0.14418427646160126, + "learning_rate": 2.8921666666666668e-05, + "loss": 0.0058, + "step": 13241 + }, + { + "epoch": 10.42693974005514, + "grad_norm": 0.16704557836055756, + "learning_rate": 2.8921333333333334e-05, + "loss": 0.0087, + "step": 13242 + }, + { + "epoch": 10.42772745175266, + "grad_norm": 0.1523127555847168, + "learning_rate": 2.8921e-05, + "loss": 0.0101, + "step": 13243 + }, + { + "epoch": 10.428515163450177, + "grad_norm": 0.23914845287799835, + "learning_rate": 2.892066666666667e-05, + "loss": 0.0147, + "step": 13244 + }, + { + "epoch": 10.429302875147696, + "grad_norm": 0.20018400251865387, + "learning_rate": 2.8920333333333335e-05, + "loss": 0.0117, + "step": 13245 + }, + { + "epoch": 10.430090586845214, + "grad_norm": 0.18962976336479187, 
+ "learning_rate": 2.892e-05, + "loss": 0.0076, + "step": 13246 + }, + { + "epoch": 10.430878298542734, + "grad_norm": 0.3228554129600525, + "learning_rate": 2.8919666666666667e-05, + "loss": 0.0115, + "step": 13247 + }, + { + "epoch": 10.431666010240253, + "grad_norm": 0.2958277761936188, + "learning_rate": 2.8919333333333336e-05, + "loss": 0.0187, + "step": 13248 + }, + { + "epoch": 10.43245372193777, + "grad_norm": 0.5691194534301758, + "learning_rate": 2.8919e-05, + "loss": 0.0183, + "step": 13249 + }, + { + "epoch": 10.43324143363529, + "grad_norm": 0.33341270685195923, + "learning_rate": 2.8918666666666668e-05, + "loss": 0.0115, + "step": 13250 + }, + { + "epoch": 10.434029145332808, + "grad_norm": 0.709775984287262, + "learning_rate": 2.8918333333333334e-05, + "loss": 0.2411, + "step": 13251 + }, + { + "epoch": 10.434816857030327, + "grad_norm": 0.6489830613136292, + "learning_rate": 2.8918e-05, + "loss": 0.1421, + "step": 13252 + }, + { + "epoch": 10.435604568727845, + "grad_norm": 0.7088537812232971, + "learning_rate": 2.891766666666667e-05, + "loss": 0.1561, + "step": 13253 + }, + { + "epoch": 10.436392280425364, + "grad_norm": 0.49839159846305847, + "learning_rate": 2.8917333333333335e-05, + "loss": 0.0899, + "step": 13254 + }, + { + "epoch": 10.437179992122884, + "grad_norm": 0.4703514873981476, + "learning_rate": 2.8917e-05, + "loss": 0.0566, + "step": 13255 + }, + { + "epoch": 10.437967703820402, + "grad_norm": 0.25947174429893494, + "learning_rate": 2.8916666666666667e-05, + "loss": 0.0386, + "step": 13256 + }, + { + "epoch": 10.438755415517921, + "grad_norm": 0.2472347468137741, + "learning_rate": 2.8916333333333336e-05, + "loss": 0.0312, + "step": 13257 + }, + { + "epoch": 10.439543127215439, + "grad_norm": 0.14498695731163025, + "learning_rate": 2.8916e-05, + "loss": 0.0224, + "step": 13258 + }, + { + "epoch": 10.440330838912958, + "grad_norm": 0.1922789216041565, + "learning_rate": 2.8915666666666668e-05, + "loss": 0.009, + "step": 13259 + }, + { + "epoch": 10.441118550610476, + "grad_norm": 0.209083691239357, + "learning_rate": 2.8915333333333334e-05, + "loss": 0.0084, + "step": 13260 + }, + { + "epoch": 10.441906262307995, + "grad_norm": 0.17705678939819336, + "learning_rate": 2.8915e-05, + "loss": 0.0152, + "step": 13261 + }, + { + "epoch": 10.442693974005515, + "grad_norm": 0.2091992199420929, + "learning_rate": 2.891466666666667e-05, + "loss": 0.0397, + "step": 13262 + }, + { + "epoch": 10.443481685703032, + "grad_norm": 0.1349962204694748, + "learning_rate": 2.8914333333333332e-05, + "loss": 0.0115, + "step": 13263 + }, + { + "epoch": 10.444269397400552, + "grad_norm": 0.2531183362007141, + "learning_rate": 2.8914e-05, + "loss": 0.0116, + "step": 13264 + }, + { + "epoch": 10.44505710909807, + "grad_norm": 0.20275768637657166, + "learning_rate": 2.8913666666666667e-05, + "loss": 0.014, + "step": 13265 + }, + { + "epoch": 10.445844820795589, + "grad_norm": 0.19203433394432068, + "learning_rate": 2.8913333333333333e-05, + "loss": 0.014, + "step": 13266 + }, + { + "epoch": 10.446632532493108, + "grad_norm": 0.15995220839977264, + "learning_rate": 2.8913e-05, + "loss": 0.0125, + "step": 13267 + }, + { + "epoch": 10.447420244190626, + "grad_norm": 0.10184748470783234, + "learning_rate": 2.891266666666667e-05, + "loss": 0.0099, + "step": 13268 + }, + { + "epoch": 10.448207955888146, + "grad_norm": 0.32692837715148926, + "learning_rate": 2.8912333333333334e-05, + "loss": 0.0201, + "step": 13269 + }, + { + "epoch": 10.448995667585663, + "grad_norm": 0.14093554019927979, + 
"learning_rate": 2.8912e-05, + "loss": 0.0063, + "step": 13270 + }, + { + "epoch": 10.449783379283183, + "grad_norm": 0.2308996617794037, + "learning_rate": 2.891166666666667e-05, + "loss": 0.0181, + "step": 13271 + }, + { + "epoch": 10.4505710909807, + "grad_norm": 0.21564555168151855, + "learning_rate": 2.8911333333333332e-05, + "loss": 0.0173, + "step": 13272 + }, + { + "epoch": 10.45135880267822, + "grad_norm": 0.1898801475763321, + "learning_rate": 2.8911e-05, + "loss": 0.0117, + "step": 13273 + }, + { + "epoch": 10.45214651437574, + "grad_norm": 0.18288958072662354, + "learning_rate": 2.8910666666666667e-05, + "loss": 0.0111, + "step": 13274 + }, + { + "epoch": 10.452934226073257, + "grad_norm": 0.21864882111549377, + "learning_rate": 2.8910333333333333e-05, + "loss": 0.0119, + "step": 13275 + }, + { + "epoch": 10.453721937770776, + "grad_norm": 0.4379083216190338, + "learning_rate": 2.891e-05, + "loss": 0.0108, + "step": 13276 + }, + { + "epoch": 10.454509649468294, + "grad_norm": 0.18320047855377197, + "learning_rate": 2.890966666666667e-05, + "loss": 0.0095, + "step": 13277 + }, + { + "epoch": 10.455297361165814, + "grad_norm": 0.14736878871917725, + "learning_rate": 2.8909333333333335e-05, + "loss": 0.0106, + "step": 13278 + }, + { + "epoch": 10.456085072863331, + "grad_norm": 0.2105065882205963, + "learning_rate": 2.8909e-05, + "loss": 0.0102, + "step": 13279 + }, + { + "epoch": 10.45687278456085, + "grad_norm": 0.20327574014663696, + "learning_rate": 2.890866666666667e-05, + "loss": 0.0106, + "step": 13280 + }, + { + "epoch": 10.45766049625837, + "grad_norm": 0.25858092308044434, + "learning_rate": 2.8908333333333332e-05, + "loss": 0.0134, + "step": 13281 + }, + { + "epoch": 10.458448207955888, + "grad_norm": 0.11011002957820892, + "learning_rate": 2.8908000000000002e-05, + "loss": 0.005, + "step": 13282 + }, + { + "epoch": 10.459235919653407, + "grad_norm": 0.18207278847694397, + "learning_rate": 2.8907666666666668e-05, + "loss": 0.014, + "step": 13283 + }, + { + "epoch": 10.460023631350925, + "grad_norm": 0.43402519822120667, + "learning_rate": 2.8907333333333334e-05, + "loss": 0.0118, + "step": 13284 + }, + { + "epoch": 10.460811343048444, + "grad_norm": 0.1428612619638443, + "learning_rate": 2.8907e-05, + "loss": 0.0063, + "step": 13285 + }, + { + "epoch": 10.461599054745964, + "grad_norm": 0.49122336506843567, + "learning_rate": 2.890666666666667e-05, + "loss": 0.0166, + "step": 13286 + }, + { + "epoch": 10.462386766443482, + "grad_norm": 0.5184640884399414, + "learning_rate": 2.8906333333333335e-05, + "loss": 0.0115, + "step": 13287 + }, + { + "epoch": 10.463174478141001, + "grad_norm": 0.29767128825187683, + "learning_rate": 2.8906e-05, + "loss": 0.018, + "step": 13288 + }, + { + "epoch": 10.463962189838519, + "grad_norm": 0.25195273756980896, + "learning_rate": 2.8905666666666667e-05, + "loss": 0.0123, + "step": 13289 + }, + { + "epoch": 10.464749901536038, + "grad_norm": 1.179945468902588, + "learning_rate": 2.8905333333333333e-05, + "loss": 0.0249, + "step": 13290 + }, + { + "epoch": 10.465537613233556, + "grad_norm": 0.15672364830970764, + "learning_rate": 2.8905000000000002e-05, + "loss": 0.0059, + "step": 13291 + }, + { + "epoch": 10.466325324931075, + "grad_norm": 0.3959991931915283, + "learning_rate": 2.8904666666666664e-05, + "loss": 0.0123, + "step": 13292 + }, + { + "epoch": 10.467113036628595, + "grad_norm": 0.5290806293487549, + "learning_rate": 2.8904333333333334e-05, + "loss": 0.0091, + "step": 13293 + }, + { + "epoch": 10.467900748326112, + "grad_norm": 
0.25351613759994507, + "learning_rate": 2.8904000000000003e-05, + "loss": 0.0102, + "step": 13294 + }, + { + "epoch": 10.468688460023632, + "grad_norm": 0.4481489062309265, + "learning_rate": 2.8903666666666666e-05, + "loss": 0.0122, + "step": 13295 + }, + { + "epoch": 10.46947617172115, + "grad_norm": 0.4265894293785095, + "learning_rate": 2.8903333333333335e-05, + "loss": 0.0156, + "step": 13296 + }, + { + "epoch": 10.470263883418669, + "grad_norm": 0.4780440628528595, + "learning_rate": 2.8903e-05, + "loss": 0.0198, + "step": 13297 + }, + { + "epoch": 10.471051595116187, + "grad_norm": 0.29506927728652954, + "learning_rate": 2.8902666666666667e-05, + "loss": 0.0198, + "step": 13298 + }, + { + "epoch": 10.471839306813706, + "grad_norm": 0.2257353812456131, + "learning_rate": 2.8902333333333333e-05, + "loss": 0.014, + "step": 13299 + }, + { + "epoch": 10.472627018511226, + "grad_norm": 0.1886129528284073, + "learning_rate": 2.8902000000000002e-05, + "loss": 0.0111, + "step": 13300 + }, + { + "epoch": 10.473414730208743, + "grad_norm": 0.4935609996318817, + "learning_rate": 2.8901666666666665e-05, + "loss": 0.1984, + "step": 13301 + }, + { + "epoch": 10.474202441906263, + "grad_norm": 0.4578980803489685, + "learning_rate": 2.8901333333333334e-05, + "loss": 0.1355, + "step": 13302 + }, + { + "epoch": 10.47499015360378, + "grad_norm": 0.4283401072025299, + "learning_rate": 2.8901000000000003e-05, + "loss": 0.1034, + "step": 13303 + }, + { + "epoch": 10.4757778653013, + "grad_norm": 0.4314243793487549, + "learning_rate": 2.8900666666666666e-05, + "loss": 0.0846, + "step": 13304 + }, + { + "epoch": 10.47656557699882, + "grad_norm": 0.49528539180755615, + "learning_rate": 2.8900333333333335e-05, + "loss": 0.1297, + "step": 13305 + }, + { + "epoch": 10.477353288696337, + "grad_norm": 0.6082545518875122, + "learning_rate": 2.89e-05, + "loss": 0.0481, + "step": 13306 + }, + { + "epoch": 10.478141000393856, + "grad_norm": 0.48190054297447205, + "learning_rate": 2.8899666666666667e-05, + "loss": 0.0494, + "step": 13307 + }, + { + "epoch": 10.478928712091374, + "grad_norm": 0.25044262409210205, + "learning_rate": 2.8899333333333333e-05, + "loss": 0.0203, + "step": 13308 + }, + { + "epoch": 10.479716423788894, + "grad_norm": 0.24314972758293152, + "learning_rate": 2.8899000000000002e-05, + "loss": 0.0102, + "step": 13309 + }, + { + "epoch": 10.480504135486411, + "grad_norm": 0.14743292331695557, + "learning_rate": 2.8898666666666668e-05, + "loss": 0.0165, + "step": 13310 + }, + { + "epoch": 10.48129184718393, + "grad_norm": 0.5541433691978455, + "learning_rate": 2.8898333333333334e-05, + "loss": 0.0304, + "step": 13311 + }, + { + "epoch": 10.48207955888145, + "grad_norm": 0.18898603320121765, + "learning_rate": 2.8898000000000004e-05, + "loss": 0.0099, + "step": 13312 + }, + { + "epoch": 10.482867270578968, + "grad_norm": 0.28542715311050415, + "learning_rate": 2.8897666666666666e-05, + "loss": 0.0144, + "step": 13313 + }, + { + "epoch": 10.483654982276487, + "grad_norm": 0.2936382591724396, + "learning_rate": 2.8897333333333335e-05, + "loss": 0.0132, + "step": 13314 + }, + { + "epoch": 10.484442693974005, + "grad_norm": 0.2892419397830963, + "learning_rate": 2.8897e-05, + "loss": 0.0133, + "step": 13315 + }, + { + "epoch": 10.485230405671524, + "grad_norm": 0.32848384976387024, + "learning_rate": 2.8896666666666667e-05, + "loss": 0.0158, + "step": 13316 + }, + { + "epoch": 10.486018117369042, + "grad_norm": 0.3250981271266937, + "learning_rate": 2.8896333333333333e-05, + "loss": 0.02, + "step": 13317 
+ }, + { + "epoch": 10.486805829066562, + "grad_norm": 0.176417276263237, + "learning_rate": 2.8896e-05, + "loss": 0.0089, + "step": 13318 + }, + { + "epoch": 10.487593540764081, + "grad_norm": 0.32611560821533203, + "learning_rate": 2.889566666666667e-05, + "loss": 0.0126, + "step": 13319 + }, + { + "epoch": 10.488381252461599, + "grad_norm": 0.15671846270561218, + "learning_rate": 2.8895333333333334e-05, + "loss": 0.0077, + "step": 13320 + }, + { + "epoch": 10.489168964159118, + "grad_norm": 0.20684324204921722, + "learning_rate": 2.8895e-05, + "loss": 0.0098, + "step": 13321 + }, + { + "epoch": 10.489956675856636, + "grad_norm": 0.13494718074798584, + "learning_rate": 2.8894666666666666e-05, + "loss": 0.0078, + "step": 13322 + }, + { + "epoch": 10.490744387554155, + "grad_norm": 0.2117188274860382, + "learning_rate": 2.8894333333333336e-05, + "loss": 0.0099, + "step": 13323 + }, + { + "epoch": 10.491532099251675, + "grad_norm": 0.1440226137638092, + "learning_rate": 2.8893999999999998e-05, + "loss": 0.0055, + "step": 13324 + }, + { + "epoch": 10.492319810949192, + "grad_norm": 0.3847104609012604, + "learning_rate": 2.8893666666666667e-05, + "loss": 0.0141, + "step": 13325 + }, + { + "epoch": 10.493107522646712, + "grad_norm": 0.35509949922561646, + "learning_rate": 2.8893333333333333e-05, + "loss": 0.0102, + "step": 13326 + }, + { + "epoch": 10.49389523434423, + "grad_norm": 0.5637507438659668, + "learning_rate": 2.8893e-05, + "loss": 0.0266, + "step": 13327 + }, + { + "epoch": 10.494682946041749, + "grad_norm": 0.3688829839229584, + "learning_rate": 2.889266666666667e-05, + "loss": 0.015, + "step": 13328 + }, + { + "epoch": 10.495470657739267, + "grad_norm": 0.291779100894928, + "learning_rate": 2.8892333333333335e-05, + "loss": 0.0156, + "step": 13329 + }, + { + "epoch": 10.496258369436786, + "grad_norm": 0.34204983711242676, + "learning_rate": 2.8892e-05, + "loss": 0.0096, + "step": 13330 + }, + { + "epoch": 10.497046081134306, + "grad_norm": 0.46684256196022034, + "learning_rate": 2.8891666666666666e-05, + "loss": 0.0151, + "step": 13331 + }, + { + "epoch": 10.497833792831823, + "grad_norm": 0.21074095368385315, + "learning_rate": 2.8891333333333336e-05, + "loss": 0.0116, + "step": 13332 + }, + { + "epoch": 10.498621504529343, + "grad_norm": 0.1989026665687561, + "learning_rate": 2.8891e-05, + "loss": 0.0102, + "step": 13333 + }, + { + "epoch": 10.49940921622686, + "grad_norm": 0.13853971660137177, + "learning_rate": 2.8890666666666668e-05, + "loss": 0.0051, + "step": 13334 + }, + { + "epoch": 10.50019692792438, + "grad_norm": 0.5917916893959045, + "learning_rate": 2.8890333333333334e-05, + "loss": 0.0102, + "step": 13335 + }, + { + "epoch": 10.500984639621898, + "grad_norm": 0.20495595037937164, + "learning_rate": 2.889e-05, + "loss": 0.0097, + "step": 13336 + }, + { + "epoch": 10.501772351319417, + "grad_norm": 0.24927818775177002, + "learning_rate": 2.888966666666667e-05, + "loss": 0.0239, + "step": 13337 + }, + { + "epoch": 10.502560063016936, + "grad_norm": 0.506199061870575, + "learning_rate": 2.8889333333333335e-05, + "loss": 0.0083, + "step": 13338 + }, + { + "epoch": 10.503347774714454, + "grad_norm": 0.2530936598777771, + "learning_rate": 2.8889e-05, + "loss": 0.0067, + "step": 13339 + }, + { + "epoch": 10.504135486411974, + "grad_norm": 0.3951898217201233, + "learning_rate": 2.8888666666666667e-05, + "loss": 0.0096, + "step": 13340 + }, + { + "epoch": 10.504923198109491, + "grad_norm": 0.2029803991317749, + "learning_rate": 2.8888333333333336e-05, + "loss": 0.0086, + 
"step": 13341 + }, + { + "epoch": 10.50571090980701, + "grad_norm": 0.3202512264251709, + "learning_rate": 2.8888e-05, + "loss": 0.0147, + "step": 13342 + }, + { + "epoch": 10.50649862150453, + "grad_norm": 0.40948209166526794, + "learning_rate": 2.8887666666666668e-05, + "loss": 0.0175, + "step": 13343 + }, + { + "epoch": 10.507286333202048, + "grad_norm": 0.6058855652809143, + "learning_rate": 2.8887333333333337e-05, + "loss": 0.017, + "step": 13344 + }, + { + "epoch": 10.508074044899567, + "grad_norm": 0.3942953050136566, + "learning_rate": 2.8887e-05, + "loss": 0.0166, + "step": 13345 + }, + { + "epoch": 10.508861756597085, + "grad_norm": 0.1610478162765503, + "learning_rate": 2.888666666666667e-05, + "loss": 0.0098, + "step": 13346 + }, + { + "epoch": 10.509649468294604, + "grad_norm": 0.2949821352958679, + "learning_rate": 2.8886333333333335e-05, + "loss": 0.0165, + "step": 13347 + }, + { + "epoch": 10.510437179992122, + "grad_norm": 0.35793644189834595, + "learning_rate": 2.8886e-05, + "loss": 0.0193, + "step": 13348 + }, + { + "epoch": 10.511224891689642, + "grad_norm": 0.26781824231147766, + "learning_rate": 2.8885666666666667e-05, + "loss": 0.009, + "step": 13349 + }, + { + "epoch": 10.512012603387161, + "grad_norm": 0.2990707457065582, + "learning_rate": 2.8885333333333333e-05, + "loss": 0.0104, + "step": 13350 + }, + { + "epoch": 10.512800315084679, + "grad_norm": 0.5400180220603943, + "learning_rate": 2.8885e-05, + "loss": 0.1754, + "step": 13351 + }, + { + "epoch": 10.513588026782198, + "grad_norm": 0.5674304962158203, + "learning_rate": 2.8884666666666668e-05, + "loss": 0.1648, + "step": 13352 + }, + { + "epoch": 10.514375738479716, + "grad_norm": 0.6064514517784119, + "learning_rate": 2.8884333333333334e-05, + "loss": 0.12, + "step": 13353 + }, + { + "epoch": 10.515163450177235, + "grad_norm": 0.35020560026168823, + "learning_rate": 2.8884e-05, + "loss": 0.1015, + "step": 13354 + }, + { + "epoch": 10.515951161874753, + "grad_norm": 0.516136646270752, + "learning_rate": 2.888366666666667e-05, + "loss": 0.0545, + "step": 13355 + }, + { + "epoch": 10.516738873572272, + "grad_norm": 0.12746430933475494, + "learning_rate": 2.8883333333333332e-05, + "loss": 0.0378, + "step": 13356 + }, + { + "epoch": 10.517526585269792, + "grad_norm": 0.29330146312713623, + "learning_rate": 2.8883e-05, + "loss": 0.0189, + "step": 13357 + }, + { + "epoch": 10.51831429696731, + "grad_norm": 0.3316335678100586, + "learning_rate": 2.8882666666666667e-05, + "loss": 0.0235, + "step": 13358 + }, + { + "epoch": 10.519102008664829, + "grad_norm": 0.19160911440849304, + "learning_rate": 2.8882333333333333e-05, + "loss": 0.0165, + "step": 13359 + }, + { + "epoch": 10.519889720362347, + "grad_norm": 0.2100517898797989, + "learning_rate": 2.8882000000000002e-05, + "loss": 0.0213, + "step": 13360 + }, + { + "epoch": 10.520677432059866, + "grad_norm": 0.2540443539619446, + "learning_rate": 2.8881666666666668e-05, + "loss": 0.011, + "step": 13361 + }, + { + "epoch": 10.521465143757386, + "grad_norm": 0.3577911853790283, + "learning_rate": 2.8881333333333334e-05, + "loss": 0.018, + "step": 13362 + }, + { + "epoch": 10.522252855454903, + "grad_norm": 0.206342875957489, + "learning_rate": 2.8881e-05, + "loss": 0.0101, + "step": 13363 + }, + { + "epoch": 10.523040567152423, + "grad_norm": 0.14157645404338837, + "learning_rate": 2.888066666666667e-05, + "loss": 0.0061, + "step": 13364 + }, + { + "epoch": 10.52382827884994, + "grad_norm": 0.19720950722694397, + "learning_rate": 2.8880333333333332e-05, + "loss": 
0.0095, + "step": 13365 + }, + { + "epoch": 10.52461599054746, + "grad_norm": 0.22968925535678864, + "learning_rate": 2.888e-05, + "loss": 0.0214, + "step": 13366 + }, + { + "epoch": 10.525403702244978, + "grad_norm": 0.14006035029888153, + "learning_rate": 2.8879666666666667e-05, + "loss": 0.0067, + "step": 13367 + }, + { + "epoch": 10.526191413942497, + "grad_norm": 0.34655117988586426, + "learning_rate": 2.8879333333333333e-05, + "loss": 0.0203, + "step": 13368 + }, + { + "epoch": 10.526979125640016, + "grad_norm": 0.0843210518360138, + "learning_rate": 2.8879000000000003e-05, + "loss": 0.0062, + "step": 13369 + }, + { + "epoch": 10.527766837337534, + "grad_norm": 0.1894468069076538, + "learning_rate": 2.887866666666667e-05, + "loss": 0.0122, + "step": 13370 + }, + { + "epoch": 10.528554549035054, + "grad_norm": 0.2084953486919403, + "learning_rate": 2.8878333333333334e-05, + "loss": 0.0066, + "step": 13371 + }, + { + "epoch": 10.529342260732571, + "grad_norm": 0.26187294721603394, + "learning_rate": 2.8878e-05, + "loss": 0.0108, + "step": 13372 + }, + { + "epoch": 10.53012997243009, + "grad_norm": 0.7056725025177002, + "learning_rate": 2.887766666666667e-05, + "loss": 0.0302, + "step": 13373 + }, + { + "epoch": 10.530917684127608, + "grad_norm": 0.33083096146583557, + "learning_rate": 2.8877333333333332e-05, + "loss": 0.0079, + "step": 13374 + }, + { + "epoch": 10.531705395825128, + "grad_norm": 0.23225557804107666, + "learning_rate": 2.8877e-05, + "loss": 0.0118, + "step": 13375 + }, + { + "epoch": 10.532493107522647, + "grad_norm": 0.7843543887138367, + "learning_rate": 2.8876666666666667e-05, + "loss": 0.0162, + "step": 13376 + }, + { + "epoch": 10.533280819220165, + "grad_norm": 0.25202926993370056, + "learning_rate": 2.8876333333333333e-05, + "loss": 0.0445, + "step": 13377 + }, + { + "epoch": 10.534068530917684, + "grad_norm": 0.20953892171382904, + "learning_rate": 2.8876000000000003e-05, + "loss": 0.0116, + "step": 13378 + }, + { + "epoch": 10.534856242615202, + "grad_norm": 0.18517635762691498, + "learning_rate": 2.8875666666666665e-05, + "loss": 0.0095, + "step": 13379 + }, + { + "epoch": 10.535643954312722, + "grad_norm": 0.3414231836795807, + "learning_rate": 2.8875333333333335e-05, + "loss": 0.0204, + "step": 13380 + }, + { + "epoch": 10.536431666010241, + "grad_norm": 0.2572978138923645, + "learning_rate": 2.8875e-05, + "loss": 0.0134, + "step": 13381 + }, + { + "epoch": 10.537219377707759, + "grad_norm": 0.46501708030700684, + "learning_rate": 2.8874666666666666e-05, + "loss": 0.0116, + "step": 13382 + }, + { + "epoch": 10.538007089405278, + "grad_norm": 0.21162104606628418, + "learning_rate": 2.8874333333333332e-05, + "loss": 0.0083, + "step": 13383 + }, + { + "epoch": 10.538794801102796, + "grad_norm": 0.24927924573421478, + "learning_rate": 2.8874000000000002e-05, + "loss": 0.0178, + "step": 13384 + }, + { + "epoch": 10.539582512800315, + "grad_norm": 0.369588702917099, + "learning_rate": 2.8873666666666668e-05, + "loss": 0.0073, + "step": 13385 + }, + { + "epoch": 10.540370224497833, + "grad_norm": 0.4635765254497528, + "learning_rate": 2.8873333333333334e-05, + "loss": 0.0113, + "step": 13386 + }, + { + "epoch": 10.541157936195352, + "grad_norm": 0.17408128082752228, + "learning_rate": 2.8873000000000003e-05, + "loss": 0.0092, + "step": 13387 + }, + { + "epoch": 10.541945647892872, + "grad_norm": 0.3919314742088318, + "learning_rate": 2.8872666666666665e-05, + "loss": 0.0103, + "step": 13388 + }, + { + "epoch": 10.54273335959039, + "grad_norm": 0.6271984577178955, + 
"learning_rate": 2.8872333333333335e-05, + "loss": 0.0141, + "step": 13389 + }, + { + "epoch": 10.543521071287909, + "grad_norm": 0.20100995898246765, + "learning_rate": 2.8872e-05, + "loss": 0.0095, + "step": 13390 + }, + { + "epoch": 10.544308782985427, + "grad_norm": 0.13451597094535828, + "learning_rate": 2.8871666666666667e-05, + "loss": 0.006, + "step": 13391 + }, + { + "epoch": 10.545096494682946, + "grad_norm": 0.24784021079540253, + "learning_rate": 2.8871333333333333e-05, + "loss": 0.0103, + "step": 13392 + }, + { + "epoch": 10.545884206380464, + "grad_norm": 0.3203374743461609, + "learning_rate": 2.8871000000000002e-05, + "loss": 0.0131, + "step": 13393 + }, + { + "epoch": 10.546671918077983, + "grad_norm": 0.3433254361152649, + "learning_rate": 2.8870666666666668e-05, + "loss": 0.0209, + "step": 13394 + }, + { + "epoch": 10.547459629775503, + "grad_norm": 0.26235610246658325, + "learning_rate": 2.8870333333333334e-05, + "loss": 0.0158, + "step": 13395 + }, + { + "epoch": 10.54824734147302, + "grad_norm": 0.17659728229045868, + "learning_rate": 2.8870000000000003e-05, + "loss": 0.0088, + "step": 13396 + }, + { + "epoch": 10.54903505317054, + "grad_norm": 0.4800730049610138, + "learning_rate": 2.8869666666666666e-05, + "loss": 0.0118, + "step": 13397 + }, + { + "epoch": 10.549822764868058, + "grad_norm": 0.1511547714471817, + "learning_rate": 2.8869333333333335e-05, + "loss": 0.0097, + "step": 13398 + }, + { + "epoch": 10.550610476565577, + "grad_norm": 0.22011850774288177, + "learning_rate": 2.8869e-05, + "loss": 0.0101, + "step": 13399 + }, + { + "epoch": 10.551398188263097, + "grad_norm": 0.2937903106212616, + "learning_rate": 2.8868666666666667e-05, + "loss": 0.0136, + "step": 13400 + }, + { + "epoch": 10.552185899960614, + "grad_norm": 0.6408870220184326, + "learning_rate": 2.8868333333333333e-05, + "loss": 0.17, + "step": 13401 + }, + { + "epoch": 10.552973611658134, + "grad_norm": 0.5339019298553467, + "learning_rate": 2.8868000000000002e-05, + "loss": 0.1652, + "step": 13402 + }, + { + "epoch": 10.553761323355651, + "grad_norm": 0.6123151183128357, + "learning_rate": 2.8867666666666668e-05, + "loss": 0.1705, + "step": 13403 + }, + { + "epoch": 10.55454903505317, + "grad_norm": 0.649024486541748, + "learning_rate": 2.8867333333333334e-05, + "loss": 0.0973, + "step": 13404 + }, + { + "epoch": 10.555336746750688, + "grad_norm": 0.35143110156059265, + "learning_rate": 2.8867000000000003e-05, + "loss": 0.0692, + "step": 13405 + }, + { + "epoch": 10.556124458448208, + "grad_norm": 0.42117834091186523, + "learning_rate": 2.8866666666666666e-05, + "loss": 0.0409, + "step": 13406 + }, + { + "epoch": 10.556912170145727, + "grad_norm": 0.3981575667858124, + "learning_rate": 2.8866333333333335e-05, + "loss": 0.051, + "step": 13407 + }, + { + "epoch": 10.557699881843245, + "grad_norm": 0.3390035629272461, + "learning_rate": 2.8866e-05, + "loss": 0.0336, + "step": 13408 + }, + { + "epoch": 10.558487593540764, + "grad_norm": 0.1971249133348465, + "learning_rate": 2.8865666666666667e-05, + "loss": 0.009, + "step": 13409 + }, + { + "epoch": 10.559275305238282, + "grad_norm": 0.4733351171016693, + "learning_rate": 2.8865333333333336e-05, + "loss": 0.0133, + "step": 13410 + }, + { + "epoch": 10.560063016935802, + "grad_norm": 0.6072251796722412, + "learning_rate": 2.8865e-05, + "loss": 0.016, + "step": 13411 + }, + { + "epoch": 10.56085072863332, + "grad_norm": 0.34561818838119507, + "learning_rate": 2.8864666666666668e-05, + "loss": 0.0132, + "step": 13412 + }, + { + "epoch": 
10.561638440330839, + "grad_norm": 0.35817739367485046, + "learning_rate": 2.8864333333333334e-05, + "loss": 0.0461, + "step": 13413 + }, + { + "epoch": 10.562426152028358, + "grad_norm": 0.20502635836601257, + "learning_rate": 2.8864e-05, + "loss": 0.0172, + "step": 13414 + }, + { + "epoch": 10.563213863725876, + "grad_norm": 0.13045257329940796, + "learning_rate": 2.8863666666666666e-05, + "loss": 0.0057, + "step": 13415 + }, + { + "epoch": 10.564001575423395, + "grad_norm": 0.13825735449790955, + "learning_rate": 2.8863333333333335e-05, + "loss": 0.0072, + "step": 13416 + }, + { + "epoch": 10.564789287120913, + "grad_norm": 0.18821793794631958, + "learning_rate": 2.8862999999999998e-05, + "loss": 0.011, + "step": 13417 + }, + { + "epoch": 10.565576998818432, + "grad_norm": 0.2052711397409439, + "learning_rate": 2.8862666666666667e-05, + "loss": 0.0079, + "step": 13418 + }, + { + "epoch": 10.566364710515952, + "grad_norm": 0.17369914054870605, + "learning_rate": 2.8862333333333337e-05, + "loss": 0.0136, + "step": 13419 + }, + { + "epoch": 10.56715242221347, + "grad_norm": 0.5838130712509155, + "learning_rate": 2.8862e-05, + "loss": 0.0182, + "step": 13420 + }, + { + "epoch": 10.567940133910989, + "grad_norm": 0.20892028510570526, + "learning_rate": 2.886166666666667e-05, + "loss": 0.009, + "step": 13421 + }, + { + "epoch": 10.568727845608507, + "grad_norm": 0.24762853980064392, + "learning_rate": 2.8861333333333334e-05, + "loss": 0.0126, + "step": 13422 + }, + { + "epoch": 10.569515557306026, + "grad_norm": 0.3087880611419678, + "learning_rate": 2.8861e-05, + "loss": 0.0095, + "step": 13423 + }, + { + "epoch": 10.570303269003544, + "grad_norm": 0.277890682220459, + "learning_rate": 2.8860666666666666e-05, + "loss": 0.0154, + "step": 13424 + }, + { + "epoch": 10.571090980701063, + "grad_norm": 0.18437287211418152, + "learning_rate": 2.8860333333333336e-05, + "loss": 0.0089, + "step": 13425 + }, + { + "epoch": 10.571878692398583, + "grad_norm": 0.20641353726387024, + "learning_rate": 2.8859999999999998e-05, + "loss": 0.038, + "step": 13426 + }, + { + "epoch": 10.5726664040961, + "grad_norm": 0.21780796349048615, + "learning_rate": 2.8859666666666667e-05, + "loss": 0.0102, + "step": 13427 + }, + { + "epoch": 10.57345411579362, + "grad_norm": 0.29546070098876953, + "learning_rate": 2.8859333333333337e-05, + "loss": 0.0197, + "step": 13428 + }, + { + "epoch": 10.574241827491138, + "grad_norm": 0.22850096225738525, + "learning_rate": 2.8859e-05, + "loss": 0.0133, + "step": 13429 + }, + { + "epoch": 10.575029539188657, + "grad_norm": 0.20787519216537476, + "learning_rate": 2.885866666666667e-05, + "loss": 0.0139, + "step": 13430 + }, + { + "epoch": 10.575817250886175, + "grad_norm": 0.2610778510570526, + "learning_rate": 2.8858333333333335e-05, + "loss": 0.02, + "step": 13431 + }, + { + "epoch": 10.576604962583694, + "grad_norm": 0.17236031591892242, + "learning_rate": 2.8858e-05, + "loss": 0.0114, + "step": 13432 + }, + { + "epoch": 10.577392674281214, + "grad_norm": 0.6051767468452454, + "learning_rate": 2.8857666666666666e-05, + "loss": 0.0202, + "step": 13433 + }, + { + "epoch": 10.578180385978731, + "grad_norm": 0.23573492467403412, + "learning_rate": 2.8857333333333336e-05, + "loss": 0.0146, + "step": 13434 + }, + { + "epoch": 10.57896809767625, + "grad_norm": 0.2684711515903473, + "learning_rate": 2.8857000000000002e-05, + "loss": 0.0155, + "step": 13435 + }, + { + "epoch": 10.579755809373768, + "grad_norm": 0.2563893496990204, + "learning_rate": 2.8856666666666668e-05, + "loss": 0.0166, 
+ "step": 13436 + }, + { + "epoch": 10.580543521071288, + "grad_norm": 0.2868308424949646, + "learning_rate": 2.8856333333333337e-05, + "loss": 0.0118, + "step": 13437 + }, + { + "epoch": 10.581331232768807, + "grad_norm": 0.14532653987407684, + "learning_rate": 2.8856e-05, + "loss": 0.0093, + "step": 13438 + }, + { + "epoch": 10.582118944466325, + "grad_norm": 0.1711723357439041, + "learning_rate": 2.885566666666667e-05, + "loss": 0.0102, + "step": 13439 + }, + { + "epoch": 10.582906656163845, + "grad_norm": 0.14823776483535767, + "learning_rate": 2.885533333333333e-05, + "loss": 0.0145, + "step": 13440 + }, + { + "epoch": 10.583694367861362, + "grad_norm": 0.28840726613998413, + "learning_rate": 2.8855e-05, + "loss": 0.0111, + "step": 13441 + }, + { + "epoch": 10.584482079558882, + "grad_norm": 0.3682619333267212, + "learning_rate": 2.8854666666666667e-05, + "loss": 0.02, + "step": 13442 + }, + { + "epoch": 10.5852697912564, + "grad_norm": 0.2697284519672394, + "learning_rate": 2.8854333333333333e-05, + "loss": 0.01, + "step": 13443 + }, + { + "epoch": 10.586057502953919, + "grad_norm": 0.23200881481170654, + "learning_rate": 2.8854000000000002e-05, + "loss": 0.0156, + "step": 13444 + }, + { + "epoch": 10.586845214651438, + "grad_norm": 0.2967822551727295, + "learning_rate": 2.8853666666666668e-05, + "loss": 0.0123, + "step": 13445 + }, + { + "epoch": 10.587632926348956, + "grad_norm": 0.6680034399032593, + "learning_rate": 2.8853333333333334e-05, + "loss": 0.0131, + "step": 13446 + }, + { + "epoch": 10.588420638046475, + "grad_norm": 0.49699756503105164, + "learning_rate": 2.8853e-05, + "loss": 0.0163, + "step": 13447 + }, + { + "epoch": 10.589208349743993, + "grad_norm": 0.46014219522476196, + "learning_rate": 2.885266666666667e-05, + "loss": 0.0166, + "step": 13448 + }, + { + "epoch": 10.589996061441513, + "grad_norm": 0.43262892961502075, + "learning_rate": 2.885233333333333e-05, + "loss": 0.019, + "step": 13449 + }, + { + "epoch": 10.59078377313903, + "grad_norm": 0.33545175194740295, + "learning_rate": 2.8852e-05, + "loss": 0.011, + "step": 13450 + }, + { + "epoch": 10.59157148483655, + "grad_norm": 0.7077990174293518, + "learning_rate": 2.8851666666666667e-05, + "loss": 0.2032, + "step": 13451 + }, + { + "epoch": 10.592359196534069, + "grad_norm": 0.61416095495224, + "learning_rate": 2.8851333333333333e-05, + "loss": 0.1423, + "step": 13452 + }, + { + "epoch": 10.593146908231587, + "grad_norm": 0.5074251890182495, + "learning_rate": 2.8851000000000002e-05, + "loss": 0.1015, + "step": 13453 + }, + { + "epoch": 10.593934619929106, + "grad_norm": 0.5795175433158875, + "learning_rate": 2.8850666666666668e-05, + "loss": 0.0693, + "step": 13454 + }, + { + "epoch": 10.594722331626624, + "grad_norm": 0.3274850845336914, + "learning_rate": 2.8850333333333334e-05, + "loss": 0.08, + "step": 13455 + }, + { + "epoch": 10.595510043324143, + "grad_norm": 0.23500540852546692, + "learning_rate": 2.885e-05, + "loss": 0.0243, + "step": 13456 + }, + { + "epoch": 10.596297755021663, + "grad_norm": 0.4553017318248749, + "learning_rate": 2.884966666666667e-05, + "loss": 0.0409, + "step": 13457 + }, + { + "epoch": 10.59708546671918, + "grad_norm": 0.19422714412212372, + "learning_rate": 2.8849333333333332e-05, + "loss": 0.0165, + "step": 13458 + }, + { + "epoch": 10.5978731784167, + "grad_norm": 0.22211413085460663, + "learning_rate": 2.8849e-05, + "loss": 0.0162, + "step": 13459 + }, + { + "epoch": 10.598660890114218, + "grad_norm": 0.4832357168197632, + "learning_rate": 2.884866666666667e-05, + "loss": 
0.0258, + "step": 13460 + }, + { + "epoch": 10.599448601811737, + "grad_norm": 0.15323594212532043, + "learning_rate": 2.8848333333333333e-05, + "loss": 0.0121, + "step": 13461 + }, + { + "epoch": 10.600236313509257, + "grad_norm": 0.19212442636489868, + "learning_rate": 2.8848000000000002e-05, + "loss": 0.01, + "step": 13462 + }, + { + "epoch": 10.601024025206774, + "grad_norm": 0.19381755590438843, + "learning_rate": 2.8847666666666668e-05, + "loss": 0.0117, + "step": 13463 + }, + { + "epoch": 10.601811736904294, + "grad_norm": 0.2528533637523651, + "learning_rate": 2.8847333333333334e-05, + "loss": 0.0133, + "step": 13464 + }, + { + "epoch": 10.602599448601811, + "grad_norm": 0.3025796413421631, + "learning_rate": 2.8847e-05, + "loss": 0.0163, + "step": 13465 + }, + { + "epoch": 10.60338716029933, + "grad_norm": 0.469224214553833, + "learning_rate": 2.884666666666667e-05, + "loss": 0.055, + "step": 13466 + }, + { + "epoch": 10.604174871996848, + "grad_norm": 0.14330698549747467, + "learning_rate": 2.8846333333333332e-05, + "loss": 0.01, + "step": 13467 + }, + { + "epoch": 10.604962583694368, + "grad_norm": 0.40491244196891785, + "learning_rate": 2.8846e-05, + "loss": 0.0204, + "step": 13468 + }, + { + "epoch": 10.605750295391886, + "grad_norm": 0.15415363013744354, + "learning_rate": 2.8845666666666667e-05, + "loss": 0.0089, + "step": 13469 + }, + { + "epoch": 10.606538007089405, + "grad_norm": 0.12185779958963394, + "learning_rate": 2.8845333333333333e-05, + "loss": 0.0082, + "step": 13470 + }, + { + "epoch": 10.607325718786925, + "grad_norm": 0.1741432398557663, + "learning_rate": 2.8845000000000003e-05, + "loss": 0.0096, + "step": 13471 + }, + { + "epoch": 10.608113430484442, + "grad_norm": 0.234680637717247, + "learning_rate": 2.8844666666666665e-05, + "loss": 0.0138, + "step": 13472 + }, + { + "epoch": 10.608901142181962, + "grad_norm": 0.21557408571243286, + "learning_rate": 2.8844333333333334e-05, + "loss": 0.0139, + "step": 13473 + }, + { + "epoch": 10.60968885387948, + "grad_norm": 0.23946444690227509, + "learning_rate": 2.8844e-05, + "loss": 0.0142, + "step": 13474 + }, + { + "epoch": 10.610476565576999, + "grad_norm": 0.4525044858455658, + "learning_rate": 2.8843666666666666e-05, + "loss": 0.012, + "step": 13475 + }, + { + "epoch": 10.611264277274518, + "grad_norm": 0.15185174345970154, + "learning_rate": 2.8843333333333332e-05, + "loss": 0.0058, + "step": 13476 + }, + { + "epoch": 10.612051988972036, + "grad_norm": 0.21748465299606323, + "learning_rate": 2.8843e-05, + "loss": 0.0157, + "step": 13477 + }, + { + "epoch": 10.612839700669555, + "grad_norm": 0.34778642654418945, + "learning_rate": 2.8842666666666667e-05, + "loss": 0.0147, + "step": 13478 + }, + { + "epoch": 10.613627412367073, + "grad_norm": 0.09083747863769531, + "learning_rate": 2.8842333333333333e-05, + "loss": 0.0042, + "step": 13479 + }, + { + "epoch": 10.614415124064593, + "grad_norm": 0.22120235860347748, + "learning_rate": 2.8842000000000003e-05, + "loss": 0.0174, + "step": 13480 + }, + { + "epoch": 10.615202835762112, + "grad_norm": 0.20210720598697662, + "learning_rate": 2.8841666666666665e-05, + "loss": 0.0128, + "step": 13481 + }, + { + "epoch": 10.61599054745963, + "grad_norm": 0.1408466249704361, + "learning_rate": 2.8841333333333335e-05, + "loss": 0.0056, + "step": 13482 + }, + { + "epoch": 10.61677825915715, + "grad_norm": 0.5024460554122925, + "learning_rate": 2.8841e-05, + "loss": 0.0221, + "step": 13483 + }, + { + "epoch": 10.617565970854667, + "grad_norm": 0.2126515805721283, + 
"learning_rate": 2.8840666666666666e-05, + "loss": 0.0099, + "step": 13484 + }, + { + "epoch": 10.618353682552186, + "grad_norm": 0.43680545687675476, + "learning_rate": 2.8840333333333336e-05, + "loss": 0.0247, + "step": 13485 + }, + { + "epoch": 10.619141394249704, + "grad_norm": 0.1730051338672638, + "learning_rate": 2.8840000000000002e-05, + "loss": 0.0083, + "step": 13486 + }, + { + "epoch": 10.619929105947223, + "grad_norm": 0.2594952881336212, + "learning_rate": 2.8839666666666668e-05, + "loss": 0.0109, + "step": 13487 + }, + { + "epoch": 10.620716817644743, + "grad_norm": 0.1855374127626419, + "learning_rate": 2.8839333333333334e-05, + "loss": 0.0106, + "step": 13488 + }, + { + "epoch": 10.62150452934226, + "grad_norm": 0.24333958327770233, + "learning_rate": 2.8839000000000003e-05, + "loss": 0.01, + "step": 13489 + }, + { + "epoch": 10.62229224103978, + "grad_norm": 0.2412107139825821, + "learning_rate": 2.8838666666666665e-05, + "loss": 0.0094, + "step": 13490 + }, + { + "epoch": 10.623079952737298, + "grad_norm": 0.5599671006202698, + "learning_rate": 2.8838333333333335e-05, + "loss": 0.0115, + "step": 13491 + }, + { + "epoch": 10.623867664434817, + "grad_norm": 0.1542225033044815, + "learning_rate": 2.8838e-05, + "loss": 0.0111, + "step": 13492 + }, + { + "epoch": 10.624655376132335, + "grad_norm": 0.21384547650814056, + "learning_rate": 2.8837666666666667e-05, + "loss": 0.0118, + "step": 13493 + }, + { + "epoch": 10.625443087829854, + "grad_norm": 0.15079903602600098, + "learning_rate": 2.8837333333333336e-05, + "loss": 0.0097, + "step": 13494 + }, + { + "epoch": 10.626230799527374, + "grad_norm": 0.1996588259935379, + "learning_rate": 2.8837000000000002e-05, + "loss": 0.014, + "step": 13495 + }, + { + "epoch": 10.627018511224891, + "grad_norm": 0.4020625650882721, + "learning_rate": 2.8836666666666668e-05, + "loss": 0.0098, + "step": 13496 + }, + { + "epoch": 10.62780622292241, + "grad_norm": 0.45407989621162415, + "learning_rate": 2.8836333333333334e-05, + "loss": 0.0149, + "step": 13497 + }, + { + "epoch": 10.628593934619929, + "grad_norm": 0.2697799801826477, + "learning_rate": 2.8836000000000003e-05, + "loss": 0.007, + "step": 13498 + }, + { + "epoch": 10.629381646317448, + "grad_norm": 0.37282201647758484, + "learning_rate": 2.8835666666666666e-05, + "loss": 0.0183, + "step": 13499 + }, + { + "epoch": 10.630169358014967, + "grad_norm": 0.18525151908397675, + "learning_rate": 2.8835333333333335e-05, + "loss": 0.0055, + "step": 13500 + }, + { + "epoch": 10.630957069712485, + "grad_norm": 0.6605966687202454, + "learning_rate": 2.8834999999999998e-05, + "loss": 0.2528, + "step": 13501 + }, + { + "epoch": 10.631744781410005, + "grad_norm": 0.3906901478767395, + "learning_rate": 2.8834666666666667e-05, + "loss": 0.1152, + "step": 13502 + }, + { + "epoch": 10.632532493107522, + "grad_norm": 0.4050426781177521, + "learning_rate": 2.8834333333333336e-05, + "loss": 0.1158, + "step": 13503 + }, + { + "epoch": 10.633320204805042, + "grad_norm": 0.30762797594070435, + "learning_rate": 2.8834e-05, + "loss": 0.0613, + "step": 13504 + }, + { + "epoch": 10.63410791650256, + "grad_norm": 0.2857702970504761, + "learning_rate": 2.8833666666666668e-05, + "loss": 0.0316, + "step": 13505 + }, + { + "epoch": 10.634895628200079, + "grad_norm": 0.25924113392829895, + "learning_rate": 2.8833333333333334e-05, + "loss": 0.0325, + "step": 13506 + }, + { + "epoch": 10.635683339897598, + "grad_norm": 0.3119805157184601, + "learning_rate": 2.8833e-05, + "loss": 0.0225, + "step": 13507 + }, + { + 
"epoch": 10.636471051595116, + "grad_norm": 0.16036587953567505, + "learning_rate": 2.8832666666666666e-05, + "loss": 0.0156, + "step": 13508 + }, + { + "epoch": 10.637258763292635, + "grad_norm": 0.13563118875026703, + "learning_rate": 2.8832333333333335e-05, + "loss": 0.0359, + "step": 13509 + }, + { + "epoch": 10.638046474990153, + "grad_norm": 0.23232625424861908, + "learning_rate": 2.8832e-05, + "loss": 0.0127, + "step": 13510 + }, + { + "epoch": 10.638834186687673, + "grad_norm": 0.43976831436157227, + "learning_rate": 2.8831666666666667e-05, + "loss": 0.0156, + "step": 13511 + }, + { + "epoch": 10.63962189838519, + "grad_norm": 0.20112109184265137, + "learning_rate": 2.8831333333333336e-05, + "loss": 0.0097, + "step": 13512 + }, + { + "epoch": 10.64040961008271, + "grad_norm": 0.12559229135513306, + "learning_rate": 2.8831e-05, + "loss": 0.0059, + "step": 13513 + }, + { + "epoch": 10.64119732178023, + "grad_norm": 0.12788806855678558, + "learning_rate": 2.8830666666666668e-05, + "loss": 0.0078, + "step": 13514 + }, + { + "epoch": 10.641985033477747, + "grad_norm": 0.15892599523067474, + "learning_rate": 2.8830333333333334e-05, + "loss": 0.0092, + "step": 13515 + }, + { + "epoch": 10.642772745175266, + "grad_norm": 0.24116109311580658, + "learning_rate": 2.883e-05, + "loss": 0.0154, + "step": 13516 + }, + { + "epoch": 10.643560456872784, + "grad_norm": 0.13525621592998505, + "learning_rate": 2.8829666666666666e-05, + "loss": 0.0084, + "step": 13517 + }, + { + "epoch": 10.644348168570303, + "grad_norm": 0.3181772232055664, + "learning_rate": 2.8829333333333335e-05, + "loss": 0.007, + "step": 13518 + }, + { + "epoch": 10.645135880267823, + "grad_norm": 0.19009679555892944, + "learning_rate": 2.8829e-05, + "loss": 0.0073, + "step": 13519 + }, + { + "epoch": 10.64592359196534, + "grad_norm": 0.1972496509552002, + "learning_rate": 2.8828666666666667e-05, + "loss": 0.0115, + "step": 13520 + }, + { + "epoch": 10.64671130366286, + "grad_norm": 0.7245154976844788, + "learning_rate": 2.8828333333333337e-05, + "loss": 0.0156, + "step": 13521 + }, + { + "epoch": 10.647499015360378, + "grad_norm": 0.14837788045406342, + "learning_rate": 2.8828e-05, + "loss": 0.0097, + "step": 13522 + }, + { + "epoch": 10.648286727057897, + "grad_norm": 0.27221500873565674, + "learning_rate": 2.882766666666667e-05, + "loss": 0.0106, + "step": 13523 + }, + { + "epoch": 10.649074438755415, + "grad_norm": 0.19018328189849854, + "learning_rate": 2.8827333333333334e-05, + "loss": 0.0222, + "step": 13524 + }, + { + "epoch": 10.649862150452934, + "grad_norm": 0.6933301091194153, + "learning_rate": 2.8827e-05, + "loss": 0.06, + "step": 13525 + }, + { + "epoch": 10.650649862150454, + "grad_norm": 0.20899918675422668, + "learning_rate": 2.8826666666666666e-05, + "loss": 0.0129, + "step": 13526 + }, + { + "epoch": 10.651437573847971, + "grad_norm": 0.14893072843551636, + "learning_rate": 2.8826333333333336e-05, + "loss": 0.0086, + "step": 13527 + }, + { + "epoch": 10.65222528554549, + "grad_norm": 0.1434585601091385, + "learning_rate": 2.8826e-05, + "loss": 0.0064, + "step": 13528 + }, + { + "epoch": 10.653012997243009, + "grad_norm": 0.32330089807510376, + "learning_rate": 2.8825666666666667e-05, + "loss": 0.0149, + "step": 13529 + }, + { + "epoch": 10.653800708940528, + "grad_norm": 0.272360622882843, + "learning_rate": 2.8825333333333333e-05, + "loss": 0.0162, + "step": 13530 + }, + { + "epoch": 10.654588420638046, + "grad_norm": 0.46583259105682373, + "learning_rate": 2.8825e-05, + "loss": 0.0106, + "step": 13531 + }, + 
{ + "epoch": 10.655376132335565, + "grad_norm": 0.15206561982631683, + "learning_rate": 2.882466666666667e-05, + "loss": 0.0148, + "step": 13532 + }, + { + "epoch": 10.656163844033085, + "grad_norm": 0.26988619565963745, + "learning_rate": 2.882433333333333e-05, + "loss": 0.0159, + "step": 13533 + }, + { + "epoch": 10.656951555730602, + "grad_norm": 0.18064671754837036, + "learning_rate": 2.8824e-05, + "loss": 0.0086, + "step": 13534 + }, + { + "epoch": 10.657739267428122, + "grad_norm": 0.2578853666782379, + "learning_rate": 2.882366666666667e-05, + "loss": 0.0127, + "step": 13535 + }, + { + "epoch": 10.65852697912564, + "grad_norm": 0.2784597873687744, + "learning_rate": 2.8823333333333332e-05, + "loss": 0.007, + "step": 13536 + }, + { + "epoch": 10.659314690823159, + "grad_norm": 0.24492987990379333, + "learning_rate": 2.8823000000000002e-05, + "loss": 0.0112, + "step": 13537 + }, + { + "epoch": 10.660102402520678, + "grad_norm": 0.1478525549173355, + "learning_rate": 2.8822666666666668e-05, + "loss": 0.0068, + "step": 13538 + }, + { + "epoch": 10.660890114218196, + "grad_norm": 0.23506315052509308, + "learning_rate": 2.8822333333333334e-05, + "loss": 0.0079, + "step": 13539 + }, + { + "epoch": 10.661677825915715, + "grad_norm": 0.33473891019821167, + "learning_rate": 2.8822e-05, + "loss": 0.0093, + "step": 13540 + }, + { + "epoch": 10.662465537613233, + "grad_norm": 0.21829362213611603, + "learning_rate": 2.882166666666667e-05, + "loss": 0.0118, + "step": 13541 + }, + { + "epoch": 10.663253249310753, + "grad_norm": 0.3891061544418335, + "learning_rate": 2.882133333333333e-05, + "loss": 0.0123, + "step": 13542 + }, + { + "epoch": 10.66404096100827, + "grad_norm": 0.1751141995191574, + "learning_rate": 2.8821e-05, + "loss": 0.0064, + "step": 13543 + }, + { + "epoch": 10.66482867270579, + "grad_norm": 0.29380717873573303, + "learning_rate": 2.882066666666667e-05, + "loss": 0.015, + "step": 13544 + }, + { + "epoch": 10.66561638440331, + "grad_norm": 0.39579150080680847, + "learning_rate": 2.8820333333333333e-05, + "loss": 0.0122, + "step": 13545 + }, + { + "epoch": 10.666404096100827, + "grad_norm": 0.44427263736724854, + "learning_rate": 2.8820000000000002e-05, + "loss": 0.016, + "step": 13546 + }, + { + "epoch": 10.667191807798346, + "grad_norm": 0.23695023357868195, + "learning_rate": 2.8819666666666668e-05, + "loss": 0.0098, + "step": 13547 + }, + { + "epoch": 10.667979519495864, + "grad_norm": 0.33463215827941895, + "learning_rate": 2.8819333333333334e-05, + "loss": 0.0142, + "step": 13548 + }, + { + "epoch": 10.668767231193383, + "grad_norm": 0.2556186318397522, + "learning_rate": 2.8819e-05, + "loss": 0.0127, + "step": 13549 + }, + { + "epoch": 10.669554942890901, + "grad_norm": 0.4359557330608368, + "learning_rate": 2.881866666666667e-05, + "loss": 0.0167, + "step": 13550 + }, + { + "epoch": 10.67034265458842, + "grad_norm": 0.48299068212509155, + "learning_rate": 2.881833333333333e-05, + "loss": 0.1479, + "step": 13551 + }, + { + "epoch": 10.67113036628594, + "grad_norm": 0.5499733090400696, + "learning_rate": 2.8818e-05, + "loss": 0.1304, + "step": 13552 + }, + { + "epoch": 10.671918077983458, + "grad_norm": 0.8779734969139099, + "learning_rate": 2.881766666666667e-05, + "loss": 0.2149, + "step": 13553 + }, + { + "epoch": 10.672705789680977, + "grad_norm": 0.35697832703590393, + "learning_rate": 2.8817333333333333e-05, + "loss": 0.0793, + "step": 13554 + }, + { + "epoch": 10.673493501378495, + "grad_norm": 0.3048725426197052, + "learning_rate": 2.8817000000000002e-05, + "loss": 
0.0458, + "step": 13555 + }, + { + "epoch": 10.674281213076014, + "grad_norm": 0.195083886384964, + "learning_rate": 2.8816666666666668e-05, + "loss": 0.0265, + "step": 13556 + }, + { + "epoch": 10.675068924773534, + "grad_norm": 0.5742843747138977, + "learning_rate": 2.8816333333333334e-05, + "loss": 0.0333, + "step": 13557 + }, + { + "epoch": 10.675856636471051, + "grad_norm": 0.17816200852394104, + "learning_rate": 2.8816e-05, + "loss": 0.0189, + "step": 13558 + }, + { + "epoch": 10.67664434816857, + "grad_norm": 0.17495784163475037, + "learning_rate": 2.881566666666667e-05, + "loss": 0.0148, + "step": 13559 + }, + { + "epoch": 10.677432059866089, + "grad_norm": 0.3344147205352783, + "learning_rate": 2.8815333333333335e-05, + "loss": 0.0268, + "step": 13560 + }, + { + "epoch": 10.678219771563608, + "grad_norm": 0.34884732961654663, + "learning_rate": 2.8815e-05, + "loss": 0.0115, + "step": 13561 + }, + { + "epoch": 10.679007483261126, + "grad_norm": 0.2535802721977234, + "learning_rate": 2.8814666666666667e-05, + "loss": 0.0173, + "step": 13562 + }, + { + "epoch": 10.679795194958645, + "grad_norm": 0.15035925805568695, + "learning_rate": 2.8814333333333333e-05, + "loss": 0.0131, + "step": 13563 + }, + { + "epoch": 10.680582906656165, + "grad_norm": 0.19471608102321625, + "learning_rate": 2.8814000000000002e-05, + "loss": 0.0138, + "step": 13564 + }, + { + "epoch": 10.681370618353682, + "grad_norm": 0.20663496851921082, + "learning_rate": 2.8813666666666665e-05, + "loss": 0.0139, + "step": 13565 + }, + { + "epoch": 10.682158330051202, + "grad_norm": 0.17198051512241364, + "learning_rate": 2.8813333333333334e-05, + "loss": 0.0087, + "step": 13566 + }, + { + "epoch": 10.68294604174872, + "grad_norm": 0.3478901982307434, + "learning_rate": 2.8813e-05, + "loss": 0.0163, + "step": 13567 + }, + { + "epoch": 10.683733753446239, + "grad_norm": 0.20873185992240906, + "learning_rate": 2.8812666666666666e-05, + "loss": 0.014, + "step": 13568 + }, + { + "epoch": 10.684521465143757, + "grad_norm": 0.36492592096328735, + "learning_rate": 2.8812333333333335e-05, + "loss": 0.0161, + "step": 13569 + }, + { + "epoch": 10.685309176841276, + "grad_norm": 0.22961559891700745, + "learning_rate": 2.8812e-05, + "loss": 0.0137, + "step": 13570 + }, + { + "epoch": 10.686096888538795, + "grad_norm": 2.0204803943634033, + "learning_rate": 2.8811666666666667e-05, + "loss": 0.0131, + "step": 13571 + }, + { + "epoch": 10.686884600236313, + "grad_norm": 0.1741936206817627, + "learning_rate": 2.8811333333333333e-05, + "loss": 0.0129, + "step": 13572 + }, + { + "epoch": 10.687672311933833, + "grad_norm": 0.1808317005634308, + "learning_rate": 2.8811000000000002e-05, + "loss": 0.0477, + "step": 13573 + }, + { + "epoch": 10.68846002363135, + "grad_norm": 0.6869224309921265, + "learning_rate": 2.8810666666666665e-05, + "loss": 0.0104, + "step": 13574 + }, + { + "epoch": 10.68924773532887, + "grad_norm": 0.8080973029136658, + "learning_rate": 2.8810333333333334e-05, + "loss": 0.0146, + "step": 13575 + }, + { + "epoch": 10.69003544702639, + "grad_norm": 0.42427533864974976, + "learning_rate": 2.881e-05, + "loss": 0.0085, + "step": 13576 + }, + { + "epoch": 10.690823158723907, + "grad_norm": 0.26461002230644226, + "learning_rate": 2.8809666666666666e-05, + "loss": 0.0151, + "step": 13577 + }, + { + "epoch": 10.691610870421426, + "grad_norm": 0.27182331681251526, + "learning_rate": 2.8809333333333336e-05, + "loss": 0.01, + "step": 13578 + }, + { + "epoch": 10.692398582118944, + "grad_norm": 0.3464599847793579, + 
"learning_rate": 2.8809e-05, + "loss": 0.0126, + "step": 13579 + }, + { + "epoch": 10.693186293816463, + "grad_norm": 0.26569753885269165, + "learning_rate": 2.8808666666666667e-05, + "loss": 0.0133, + "step": 13580 + }, + { + "epoch": 10.693974005513981, + "grad_norm": 0.18255749344825745, + "learning_rate": 2.8808333333333333e-05, + "loss": 0.0129, + "step": 13581 + }, + { + "epoch": 10.6947617172115, + "grad_norm": 0.44058725237846375, + "learning_rate": 2.8808000000000003e-05, + "loss": 0.0163, + "step": 13582 + }, + { + "epoch": 10.69554942890902, + "grad_norm": 0.30032461881637573, + "learning_rate": 2.8807666666666665e-05, + "loss": 0.0229, + "step": 13583 + }, + { + "epoch": 10.696337140606538, + "grad_norm": 0.18301209807395935, + "learning_rate": 2.8807333333333335e-05, + "loss": 0.0207, + "step": 13584 + }, + { + "epoch": 10.697124852304057, + "grad_norm": 0.21439006924629211, + "learning_rate": 2.8807000000000004e-05, + "loss": 0.013, + "step": 13585 + }, + { + "epoch": 10.697912564001575, + "grad_norm": 0.33285364508628845, + "learning_rate": 2.8806666666666666e-05, + "loss": 0.0276, + "step": 13586 + }, + { + "epoch": 10.698700275699094, + "grad_norm": 0.27699559926986694, + "learning_rate": 2.8806333333333336e-05, + "loss": 0.0114, + "step": 13587 + }, + { + "epoch": 10.699487987396612, + "grad_norm": 0.25681111216545105, + "learning_rate": 2.8806e-05, + "loss": 0.0154, + "step": 13588 + }, + { + "epoch": 10.700275699094131, + "grad_norm": 0.15110234916210175, + "learning_rate": 2.8805666666666668e-05, + "loss": 0.0101, + "step": 13589 + }, + { + "epoch": 10.701063410791651, + "grad_norm": 0.6802740693092346, + "learning_rate": 2.8805333333333334e-05, + "loss": 0.0216, + "step": 13590 + }, + { + "epoch": 10.701851122489169, + "grad_norm": 0.23100274801254272, + "learning_rate": 2.8805e-05, + "loss": 0.0096, + "step": 13591 + }, + { + "epoch": 10.702638834186688, + "grad_norm": 0.23526141047477722, + "learning_rate": 2.8804666666666665e-05, + "loss": 0.0115, + "step": 13592 + }, + { + "epoch": 10.703426545884206, + "grad_norm": 0.3379276990890503, + "learning_rate": 2.8804333333333335e-05, + "loss": 0.0192, + "step": 13593 + }, + { + "epoch": 10.704214257581725, + "grad_norm": 0.29817989468574524, + "learning_rate": 2.8804e-05, + "loss": 0.015, + "step": 13594 + }, + { + "epoch": 10.705001969279245, + "grad_norm": 0.2782810628414154, + "learning_rate": 2.8803666666666667e-05, + "loss": 0.0141, + "step": 13595 + }, + { + "epoch": 10.705789680976762, + "grad_norm": 0.16790829598903656, + "learning_rate": 2.8803333333333336e-05, + "loss": 0.0112, + "step": 13596 + }, + { + "epoch": 10.706577392674282, + "grad_norm": 0.35231950879096985, + "learning_rate": 2.8803e-05, + "loss": 0.0171, + "step": 13597 + }, + { + "epoch": 10.7073651043718, + "grad_norm": 0.41250962018966675, + "learning_rate": 2.8802666666666668e-05, + "loss": 0.0142, + "step": 13598 + }, + { + "epoch": 10.708152816069319, + "grad_norm": 0.45086610317230225, + "learning_rate": 2.8802333333333334e-05, + "loss": 0.0144, + "step": 13599 + }, + { + "epoch": 10.708940527766837, + "grad_norm": 0.16439978778362274, + "learning_rate": 2.8802e-05, + "loss": 0.0102, + "step": 13600 + }, + { + "epoch": 10.709728239464356, + "grad_norm": 0.5443260669708252, + "learning_rate": 2.8801666666666666e-05, + "loss": 0.153, + "step": 13601 + }, + { + "epoch": 10.710515951161875, + "grad_norm": 0.43665432929992676, + "learning_rate": 2.8801333333333335e-05, + "loss": 0.1224, + "step": 13602 + }, + { + "epoch": 10.711303662859393, + 
"grad_norm": 1.1837661266326904, + "learning_rate": 2.8801e-05, + "loss": 0.1042, + "step": 13603 + }, + { + "epoch": 10.712091374556913, + "grad_norm": 0.6846598386764526, + "learning_rate": 2.8800666666666667e-05, + "loss": 0.129, + "step": 13604 + }, + { + "epoch": 10.71287908625443, + "grad_norm": 0.49994105100631714, + "learning_rate": 2.8800333333333336e-05, + "loss": 0.0489, + "step": 13605 + }, + { + "epoch": 10.71366679795195, + "grad_norm": 0.4305044412612915, + "learning_rate": 2.88e-05, + "loss": 0.0357, + "step": 13606 + }, + { + "epoch": 10.714454509649467, + "grad_norm": 0.24847577512264252, + "learning_rate": 2.8799666666666668e-05, + "loss": 0.0233, + "step": 13607 + }, + { + "epoch": 10.715242221346987, + "grad_norm": 0.20707890391349792, + "learning_rate": 2.8799333333333334e-05, + "loss": 0.0093, + "step": 13608 + }, + { + "epoch": 10.716029933044506, + "grad_norm": 0.2029675394296646, + "learning_rate": 2.8799e-05, + "loss": 0.016, + "step": 13609 + }, + { + "epoch": 10.716817644742024, + "grad_norm": 0.23277504742145538, + "learning_rate": 2.879866666666667e-05, + "loss": 0.0176, + "step": 13610 + }, + { + "epoch": 10.717605356439543, + "grad_norm": 0.32212984561920166, + "learning_rate": 2.8798333333333335e-05, + "loss": 0.014, + "step": 13611 + }, + { + "epoch": 10.718393068137061, + "grad_norm": 0.2281959354877472, + "learning_rate": 2.8798e-05, + "loss": 0.016, + "step": 13612 + }, + { + "epoch": 10.71918077983458, + "grad_norm": 0.23302379250526428, + "learning_rate": 2.8797666666666667e-05, + "loss": 0.0095, + "step": 13613 + }, + { + "epoch": 10.7199684915321, + "grad_norm": 0.20475836098194122, + "learning_rate": 2.8797333333333336e-05, + "loss": 0.0099, + "step": 13614 + }, + { + "epoch": 10.720756203229618, + "grad_norm": 0.09204825758934021, + "learning_rate": 2.8797e-05, + "loss": 0.0121, + "step": 13615 + }, + { + "epoch": 10.721543914927137, + "grad_norm": 0.21388845145702362, + "learning_rate": 2.8796666666666668e-05, + "loss": 0.008, + "step": 13616 + }, + { + "epoch": 10.722331626624655, + "grad_norm": 0.15288791060447693, + "learning_rate": 2.8796333333333334e-05, + "loss": 0.0099, + "step": 13617 + }, + { + "epoch": 10.723119338322174, + "grad_norm": 0.13904263079166412, + "learning_rate": 2.8796e-05, + "loss": 0.0086, + "step": 13618 + }, + { + "epoch": 10.723907050019692, + "grad_norm": 0.07742742449045181, + "learning_rate": 2.879566666666667e-05, + "loss": 0.0055, + "step": 13619 + }, + { + "epoch": 10.724694761717211, + "grad_norm": 0.17104004323482513, + "learning_rate": 2.8795333333333332e-05, + "loss": 0.0128, + "step": 13620 + }, + { + "epoch": 10.725482473414731, + "grad_norm": 0.3792170286178589, + "learning_rate": 2.8795e-05, + "loss": 0.0163, + "step": 13621 + }, + { + "epoch": 10.726270185112249, + "grad_norm": 0.26220229268074036, + "learning_rate": 2.8794666666666667e-05, + "loss": 0.014, + "step": 13622 + }, + { + "epoch": 10.727057896809768, + "grad_norm": 0.14785443246364594, + "learning_rate": 2.8794333333333333e-05, + "loss": 0.0096, + "step": 13623 + }, + { + "epoch": 10.727845608507286, + "grad_norm": 0.2852683663368225, + "learning_rate": 2.8794e-05, + "loss": 0.0122, + "step": 13624 + }, + { + "epoch": 10.728633320204805, + "grad_norm": 0.22840102016925812, + "learning_rate": 2.879366666666667e-05, + "loss": 0.0107, + "step": 13625 + }, + { + "epoch": 10.729421031902323, + "grad_norm": 0.17708656191825867, + "learning_rate": 2.879333333333333e-05, + "loss": 0.0132, + "step": 13626 + }, + { + "epoch": 10.730208743599842, + 
"grad_norm": 0.1368529200553894, + "learning_rate": 2.8793e-05, + "loss": 0.0092, + "step": 13627 + }, + { + "epoch": 10.730996455297362, + "grad_norm": 0.5441668033599854, + "learning_rate": 2.879266666666667e-05, + "loss": 0.0156, + "step": 13628 + }, + { + "epoch": 10.73178416699488, + "grad_norm": 0.2871921956539154, + "learning_rate": 2.8792333333333332e-05, + "loss": 0.0104, + "step": 13629 + }, + { + "epoch": 10.732571878692399, + "grad_norm": 0.34787899255752563, + "learning_rate": 2.8792e-05, + "loss": 0.0108, + "step": 13630 + }, + { + "epoch": 10.733359590389917, + "grad_norm": 0.2055133879184723, + "learning_rate": 2.8791666666666667e-05, + "loss": 0.0087, + "step": 13631 + }, + { + "epoch": 10.734147302087436, + "grad_norm": 0.2872163653373718, + "learning_rate": 2.8791333333333333e-05, + "loss": 0.0106, + "step": 13632 + }, + { + "epoch": 10.734935013784956, + "grad_norm": 0.14668874442577362, + "learning_rate": 2.8791e-05, + "loss": 0.0136, + "step": 13633 + }, + { + "epoch": 10.735722725482473, + "grad_norm": 0.11169102042913437, + "learning_rate": 2.879066666666667e-05, + "loss": 0.0072, + "step": 13634 + }, + { + "epoch": 10.736510437179993, + "grad_norm": 0.11035527288913727, + "learning_rate": 2.8790333333333335e-05, + "loss": 0.0074, + "step": 13635 + }, + { + "epoch": 10.73729814887751, + "grad_norm": 0.2046915739774704, + "learning_rate": 2.879e-05, + "loss": 0.0068, + "step": 13636 + }, + { + "epoch": 10.73808586057503, + "grad_norm": 0.15739060938358307, + "learning_rate": 2.878966666666667e-05, + "loss": 0.0063, + "step": 13637 + }, + { + "epoch": 10.738873572272547, + "grad_norm": 1.304919958114624, + "learning_rate": 2.8789333333333332e-05, + "loss": 0.0185, + "step": 13638 + }, + { + "epoch": 10.739661283970067, + "grad_norm": 0.27550601959228516, + "learning_rate": 2.8789e-05, + "loss": 0.0098, + "step": 13639 + }, + { + "epoch": 10.740448995667586, + "grad_norm": 0.12911774218082428, + "learning_rate": 2.8788666666666668e-05, + "loss": 0.0092, + "step": 13640 + }, + { + "epoch": 10.741236707365104, + "grad_norm": 0.22269639372825623, + "learning_rate": 2.8788333333333334e-05, + "loss": 0.0187, + "step": 13641 + }, + { + "epoch": 10.742024419062624, + "grad_norm": 0.21520234644412994, + "learning_rate": 2.8788e-05, + "loss": 0.009, + "step": 13642 + }, + { + "epoch": 10.742812130760141, + "grad_norm": 0.513831377029419, + "learning_rate": 2.878766666666667e-05, + "loss": 0.0124, + "step": 13643 + }, + { + "epoch": 10.74359984245766, + "grad_norm": 0.33044102787971497, + "learning_rate": 2.8787333333333335e-05, + "loss": 0.0133, + "step": 13644 + }, + { + "epoch": 10.744387554155178, + "grad_norm": 1.5818477869033813, + "learning_rate": 2.8787e-05, + "loss": 0.0408, + "step": 13645 + }, + { + "epoch": 10.745175265852698, + "grad_norm": 0.5983198881149292, + "learning_rate": 2.878666666666667e-05, + "loss": 0.0198, + "step": 13646 + }, + { + "epoch": 10.745962977550217, + "grad_norm": 1.2456741333007812, + "learning_rate": 2.8786333333333333e-05, + "loss": 0.0244, + "step": 13647 + }, + { + "epoch": 10.746750689247735, + "grad_norm": 0.2400367259979248, + "learning_rate": 2.8786000000000002e-05, + "loss": 0.0153, + "step": 13648 + }, + { + "epoch": 10.747538400945254, + "grad_norm": 0.22710707783699036, + "learning_rate": 2.8785666666666668e-05, + "loss": 0.0105, + "step": 13649 + }, + { + "epoch": 10.748326112642772, + "grad_norm": 0.2919447124004364, + "learning_rate": 2.8785333333333334e-05, + "loss": 0.0148, + "step": 13650 + }, + { + "epoch": 
10.749113824340292, + "grad_norm": 0.4858127236366272, + "learning_rate": 2.8785e-05, + "loss": 0.2127, + "step": 13651 + }, + { + "epoch": 10.749901536037811, + "grad_norm": 0.5770577788352966, + "learning_rate": 2.8784666666666666e-05, + "loss": 0.1489, + "step": 13652 + }, + { + "epoch": 10.750689247735329, + "grad_norm": 0.3661413788795471, + "learning_rate": 2.8784333333333335e-05, + "loss": 0.097, + "step": 13653 + }, + { + "epoch": 10.751476959432848, + "grad_norm": 0.42873725295066833, + "learning_rate": 2.8784e-05, + "loss": 0.1097, + "step": 13654 + }, + { + "epoch": 10.752264671130366, + "grad_norm": 0.36269623041152954, + "learning_rate": 2.8783666666666667e-05, + "loss": 0.0664, + "step": 13655 + }, + { + "epoch": 10.753052382827885, + "grad_norm": 0.49413737654685974, + "learning_rate": 2.8783333333333333e-05, + "loss": 0.0257, + "step": 13656 + }, + { + "epoch": 10.753840094525403, + "grad_norm": 0.2814118564128876, + "learning_rate": 2.8783000000000002e-05, + "loss": 0.0129, + "step": 13657 + }, + { + "epoch": 10.754627806222922, + "grad_norm": 0.3184499442577362, + "learning_rate": 2.8782666666666665e-05, + "loss": 0.024, + "step": 13658 + }, + { + "epoch": 10.755415517920442, + "grad_norm": 0.2978212535381317, + "learning_rate": 2.8782333333333334e-05, + "loss": 0.0189, + "step": 13659 + }, + { + "epoch": 10.75620322961796, + "grad_norm": 0.3703259229660034, + "learning_rate": 2.8782000000000003e-05, + "loss": 0.0209, + "step": 13660 + }, + { + "epoch": 10.756990941315479, + "grad_norm": 0.4423844516277313, + "learning_rate": 2.8781666666666666e-05, + "loss": 0.0186, + "step": 13661 + }, + { + "epoch": 10.757778653012997, + "grad_norm": 0.3046698570251465, + "learning_rate": 2.8781333333333335e-05, + "loss": 0.0229, + "step": 13662 + }, + { + "epoch": 10.758566364710516, + "grad_norm": 0.18408474326133728, + "learning_rate": 2.8781e-05, + "loss": 0.0109, + "step": 13663 + }, + { + "epoch": 10.759354076408034, + "grad_norm": 0.13168026506900787, + "learning_rate": 2.8780666666666667e-05, + "loss": 0.008, + "step": 13664 + }, + { + "epoch": 10.760141788105553, + "grad_norm": 0.7484548687934875, + "learning_rate": 2.8780333333333333e-05, + "loss": 0.016, + "step": 13665 + }, + { + "epoch": 10.760929499803073, + "grad_norm": 0.12163759022951126, + "learning_rate": 2.8780000000000002e-05, + "loss": 0.0068, + "step": 13666 + }, + { + "epoch": 10.76171721150059, + "grad_norm": 0.17638476192951202, + "learning_rate": 2.8779666666666665e-05, + "loss": 0.0157, + "step": 13667 + }, + { + "epoch": 10.76250492319811, + "grad_norm": 0.1254734992980957, + "learning_rate": 2.8779333333333334e-05, + "loss": 0.0144, + "step": 13668 + }, + { + "epoch": 10.763292634895627, + "grad_norm": 0.2103087455034256, + "learning_rate": 2.8779000000000003e-05, + "loss": 0.0134, + "step": 13669 + }, + { + "epoch": 10.764080346593147, + "grad_norm": 0.2012401521205902, + "learning_rate": 2.8778666666666666e-05, + "loss": 0.0116, + "step": 13670 + }, + { + "epoch": 10.764868058290666, + "grad_norm": 0.48299136757850647, + "learning_rate": 2.8778333333333335e-05, + "loss": 0.0181, + "step": 13671 + }, + { + "epoch": 10.765655769988184, + "grad_norm": 0.1998378187417984, + "learning_rate": 2.8778e-05, + "loss": 0.0084, + "step": 13672 + }, + { + "epoch": 10.766443481685704, + "grad_norm": 0.16655313968658447, + "learning_rate": 2.8777666666666667e-05, + "loss": 0.0126, + "step": 13673 + }, + { + "epoch": 10.767231193383221, + "grad_norm": 0.28011560440063477, + "learning_rate": 2.8777333333333333e-05, + 
"loss": 0.0189, + "step": 13674 + }, + { + "epoch": 10.76801890508074, + "grad_norm": 0.4393375813961029, + "learning_rate": 2.8777000000000002e-05, + "loss": 0.0151, + "step": 13675 + }, + { + "epoch": 10.768806616778258, + "grad_norm": 0.3768768310546875, + "learning_rate": 2.8776666666666665e-05, + "loss": 0.0126, + "step": 13676 + }, + { + "epoch": 10.769594328475778, + "grad_norm": 0.25794506072998047, + "learning_rate": 2.8776333333333334e-05, + "loss": 0.0122, + "step": 13677 + }, + { + "epoch": 10.770382040173297, + "grad_norm": 0.12333236634731293, + "learning_rate": 2.8776000000000004e-05, + "loss": 0.0081, + "step": 13678 + }, + { + "epoch": 10.771169751870815, + "grad_norm": 0.3944425880908966, + "learning_rate": 2.8775666666666666e-05, + "loss": 0.0222, + "step": 13679 + }, + { + "epoch": 10.771957463568334, + "grad_norm": 0.966789722442627, + "learning_rate": 2.8775333333333336e-05, + "loss": 0.0124, + "step": 13680 + }, + { + "epoch": 10.772745175265852, + "grad_norm": 0.24699103832244873, + "learning_rate": 2.8774999999999998e-05, + "loss": 0.0152, + "step": 13681 + }, + { + "epoch": 10.773532886963372, + "grad_norm": 0.15102548897266388, + "learning_rate": 2.8774666666666667e-05, + "loss": 0.0118, + "step": 13682 + }, + { + "epoch": 10.77432059866089, + "grad_norm": 0.19923311471939087, + "learning_rate": 2.8774333333333333e-05, + "loss": 0.0111, + "step": 13683 + }, + { + "epoch": 10.775108310358409, + "grad_norm": 0.36277490854263306, + "learning_rate": 2.8774e-05, + "loss": 0.0514, + "step": 13684 + }, + { + "epoch": 10.775896022055928, + "grad_norm": 0.1586446315050125, + "learning_rate": 2.877366666666667e-05, + "loss": 0.0064, + "step": 13685 + }, + { + "epoch": 10.776683733753446, + "grad_norm": 0.176683247089386, + "learning_rate": 2.8773333333333335e-05, + "loss": 0.0124, + "step": 13686 + }, + { + "epoch": 10.777471445450965, + "grad_norm": 0.42623037099838257, + "learning_rate": 2.8773e-05, + "loss": 0.018, + "step": 13687 + }, + { + "epoch": 10.778259157148483, + "grad_norm": 0.2195279896259308, + "learning_rate": 2.8772666666666666e-05, + "loss": 0.01, + "step": 13688 + }, + { + "epoch": 10.779046868846002, + "grad_norm": 0.12882021069526672, + "learning_rate": 2.8772333333333336e-05, + "loss": 0.0091, + "step": 13689 + }, + { + "epoch": 10.779834580543522, + "grad_norm": 0.2936634421348572, + "learning_rate": 2.8771999999999998e-05, + "loss": 0.0213, + "step": 13690 + }, + { + "epoch": 10.78062229224104, + "grad_norm": 0.385553777217865, + "learning_rate": 2.8771666666666668e-05, + "loss": 0.0157, + "step": 13691 + }, + { + "epoch": 10.781410003938559, + "grad_norm": 0.2602934241294861, + "learning_rate": 2.8771333333333334e-05, + "loss": 0.0147, + "step": 13692 + }, + { + "epoch": 10.782197715636077, + "grad_norm": 0.2872522473335266, + "learning_rate": 2.8771e-05, + "loss": 0.0214, + "step": 13693 + }, + { + "epoch": 10.782985427333596, + "grad_norm": 0.3818444013595581, + "learning_rate": 2.877066666666667e-05, + "loss": 0.0252, + "step": 13694 + }, + { + "epoch": 10.783773139031114, + "grad_norm": 0.3674823045730591, + "learning_rate": 2.8770333333333335e-05, + "loss": 0.0144, + "step": 13695 + }, + { + "epoch": 10.784560850728633, + "grad_norm": 0.5141420960426331, + "learning_rate": 2.877e-05, + "loss": 0.0148, + "step": 13696 + }, + { + "epoch": 10.785348562426153, + "grad_norm": 0.6456286311149597, + "learning_rate": 2.8769666666666667e-05, + "loss": 0.0141, + "step": 13697 + }, + { + "epoch": 10.78613627412367, + "grad_norm": 0.27272191643714905, + 
"learning_rate": 2.8769333333333336e-05, + "loss": 0.0151, + "step": 13698 + }, + { + "epoch": 10.78692398582119, + "grad_norm": 0.27464595437049866, + "learning_rate": 2.8769e-05, + "loss": 0.016, + "step": 13699 + }, + { + "epoch": 10.787711697518708, + "grad_norm": 0.3477989435195923, + "learning_rate": 2.8768666666666668e-05, + "loss": 0.0127, + "step": 13700 + }, + { + "epoch": 10.788499409216227, + "grad_norm": 0.7308120131492615, + "learning_rate": 2.8768333333333334e-05, + "loss": 0.2282, + "step": 13701 + }, + { + "epoch": 10.789287120913745, + "grad_norm": 0.684813916683197, + "learning_rate": 2.8768e-05, + "loss": 0.1726, + "step": 13702 + }, + { + "epoch": 10.790074832611264, + "grad_norm": 0.5183923244476318, + "learning_rate": 2.876766666666667e-05, + "loss": 0.1336, + "step": 13703 + }, + { + "epoch": 10.790862544308784, + "grad_norm": 0.4961523115634918, + "learning_rate": 2.8767333333333335e-05, + "loss": 0.0968, + "step": 13704 + }, + { + "epoch": 10.791650256006301, + "grad_norm": 0.30525079369544983, + "learning_rate": 2.8767e-05, + "loss": 0.0482, + "step": 13705 + }, + { + "epoch": 10.79243796770382, + "grad_norm": 0.23988506197929382, + "learning_rate": 2.8766666666666667e-05, + "loss": 0.0332, + "step": 13706 + }, + { + "epoch": 10.793225679401338, + "grad_norm": 0.6688709259033203, + "learning_rate": 2.8766333333333336e-05, + "loss": 0.0225, + "step": 13707 + }, + { + "epoch": 10.794013391098858, + "grad_norm": 0.2619762718677521, + "learning_rate": 2.8766e-05, + "loss": 0.0287, + "step": 13708 + }, + { + "epoch": 10.794801102796377, + "grad_norm": 0.43515390157699585, + "learning_rate": 2.8765666666666668e-05, + "loss": 0.0174, + "step": 13709 + }, + { + "epoch": 10.795588814493895, + "grad_norm": 1.119633436203003, + "learning_rate": 2.8765333333333337e-05, + "loss": 0.0251, + "step": 13710 + }, + { + "epoch": 10.796376526191414, + "grad_norm": 0.12954477965831757, + "learning_rate": 2.8765e-05, + "loss": 0.0087, + "step": 13711 + }, + { + "epoch": 10.797164237888932, + "grad_norm": 0.49787637591362, + "learning_rate": 2.876466666666667e-05, + "loss": 0.0135, + "step": 13712 + }, + { + "epoch": 10.797951949586452, + "grad_norm": 0.35339605808258057, + "learning_rate": 2.8764333333333332e-05, + "loss": 0.0645, + "step": 13713 + }, + { + "epoch": 10.798739661283971, + "grad_norm": 0.10248593240976334, + "learning_rate": 2.8764e-05, + "loss": 0.0076, + "step": 13714 + }, + { + "epoch": 10.799527372981489, + "grad_norm": 0.2363772690296173, + "learning_rate": 2.8763666666666667e-05, + "loss": 0.0147, + "step": 13715 + }, + { + "epoch": 10.800315084679008, + "grad_norm": 0.23772288858890533, + "learning_rate": 2.8763333333333333e-05, + "loss": 0.0077, + "step": 13716 + }, + { + "epoch": 10.801102796376526, + "grad_norm": 0.22201190888881683, + "learning_rate": 2.8763e-05, + "loss": 0.0209, + "step": 13717 + }, + { + "epoch": 10.801890508074045, + "grad_norm": 0.3449631929397583, + "learning_rate": 2.8762666666666668e-05, + "loss": 0.0088, + "step": 13718 + }, + { + "epoch": 10.802678219771563, + "grad_norm": 0.33810868859291077, + "learning_rate": 2.8762333333333334e-05, + "loss": 0.0148, + "step": 13719 + }, + { + "epoch": 10.803465931469082, + "grad_norm": 0.23656955361366272, + "learning_rate": 2.8762e-05, + "loss": 0.0113, + "step": 13720 + }, + { + "epoch": 10.8042536431666, + "grad_norm": 0.2876949608325958, + "learning_rate": 2.876166666666667e-05, + "loss": 0.0123, + "step": 13721 + }, + { + "epoch": 10.80504135486412, + "grad_norm": 0.2446364015340805, + 
"learning_rate": 2.8761333333333332e-05, + "loss": 0.0133, + "step": 13722 + }, + { + "epoch": 10.805829066561639, + "grad_norm": 0.2595805525779724, + "learning_rate": 2.8761e-05, + "loss": 0.0138, + "step": 13723 + }, + { + "epoch": 10.806616778259157, + "grad_norm": 1.1063498258590698, + "learning_rate": 2.8760666666666667e-05, + "loss": 0.0086, + "step": 13724 + }, + { + "epoch": 10.807404489956676, + "grad_norm": 0.39989718794822693, + "learning_rate": 2.8760333333333333e-05, + "loss": 0.0154, + "step": 13725 + }, + { + "epoch": 10.808192201654194, + "grad_norm": 0.3657818138599396, + "learning_rate": 2.876e-05, + "loss": 0.0273, + "step": 13726 + }, + { + "epoch": 10.808979913351713, + "grad_norm": 0.231062114238739, + "learning_rate": 2.875966666666667e-05, + "loss": 0.013, + "step": 13727 + }, + { + "epoch": 10.809767625049233, + "grad_norm": 0.24454563856124878, + "learning_rate": 2.8759333333333334e-05, + "loss": 0.0203, + "step": 13728 + }, + { + "epoch": 10.81055533674675, + "grad_norm": 0.27229493856430054, + "learning_rate": 2.8759e-05, + "loss": 0.0091, + "step": 13729 + }, + { + "epoch": 10.81134304844427, + "grad_norm": 0.2569252550601959, + "learning_rate": 2.875866666666667e-05, + "loss": 0.0204, + "step": 13730 + }, + { + "epoch": 10.812130760141788, + "grad_norm": 0.3242294192314148, + "learning_rate": 2.8758333333333332e-05, + "loss": 0.0186, + "step": 13731 + }, + { + "epoch": 10.812918471839307, + "grad_norm": 0.6924504041671753, + "learning_rate": 2.8758e-05, + "loss": 0.013, + "step": 13732 + }, + { + "epoch": 10.813706183536826, + "grad_norm": 0.19855132699012756, + "learning_rate": 2.8757666666666667e-05, + "loss": 0.0138, + "step": 13733 + }, + { + "epoch": 10.814493895234344, + "grad_norm": 0.16833238303661346, + "learning_rate": 2.8757333333333333e-05, + "loss": 0.0115, + "step": 13734 + }, + { + "epoch": 10.815281606931864, + "grad_norm": 0.12783373892307281, + "learning_rate": 2.8757000000000003e-05, + "loss": 0.0083, + "step": 13735 + }, + { + "epoch": 10.816069318629381, + "grad_norm": 0.22073589265346527, + "learning_rate": 2.875666666666667e-05, + "loss": 0.0091, + "step": 13736 + }, + { + "epoch": 10.8168570303269, + "grad_norm": 0.1881382018327713, + "learning_rate": 2.8756333333333335e-05, + "loss": 0.0103, + "step": 13737 + }, + { + "epoch": 10.817644742024418, + "grad_norm": 0.2363552749156952, + "learning_rate": 2.8756e-05, + "loss": 0.0111, + "step": 13738 + }, + { + "epoch": 10.818432453721938, + "grad_norm": 0.2634110152721405, + "learning_rate": 2.875566666666667e-05, + "loss": 0.0102, + "step": 13739 + }, + { + "epoch": 10.819220165419457, + "grad_norm": 0.23869751393795013, + "learning_rate": 2.8755333333333332e-05, + "loss": 0.0184, + "step": 13740 + }, + { + "epoch": 10.820007877116975, + "grad_norm": 0.8074941039085388, + "learning_rate": 2.8755e-05, + "loss": 0.019, + "step": 13741 + }, + { + "epoch": 10.820795588814494, + "grad_norm": 0.5459920167922974, + "learning_rate": 2.8754666666666664e-05, + "loss": 0.0201, + "step": 13742 + }, + { + "epoch": 10.821583300512012, + "grad_norm": 0.339531809091568, + "learning_rate": 2.8754333333333334e-05, + "loss": 0.0147, + "step": 13743 + }, + { + "epoch": 10.822371012209532, + "grad_norm": 0.22506141662597656, + "learning_rate": 2.8754000000000003e-05, + "loss": 0.0067, + "step": 13744 + }, + { + "epoch": 10.82315872390705, + "grad_norm": 0.367082804441452, + "learning_rate": 2.8753666666666665e-05, + "loss": 0.0091, + "step": 13745 + }, + { + "epoch": 10.823946435604569, + "grad_norm": 
0.35540422797203064, + "learning_rate": 2.8753333333333335e-05, + "loss": 0.0292, + "step": 13746 + }, + { + "epoch": 10.824734147302088, + "grad_norm": 0.22575679421424866, + "learning_rate": 2.8753e-05, + "loss": 0.0108, + "step": 13747 + }, + { + "epoch": 10.825521858999606, + "grad_norm": 0.35289549827575684, + "learning_rate": 2.8752666666666667e-05, + "loss": 0.0149, + "step": 13748 + }, + { + "epoch": 10.826309570697125, + "grad_norm": 0.3068714141845703, + "learning_rate": 2.8752333333333333e-05, + "loss": 0.0136, + "step": 13749 + }, + { + "epoch": 10.827097282394643, + "grad_norm": 0.6774144768714905, + "learning_rate": 2.8752000000000002e-05, + "loss": 0.0189, + "step": 13750 + }, + { + "epoch": 10.827884994092162, + "grad_norm": 1.0914289951324463, + "learning_rate": 2.8751666666666664e-05, + "loss": 0.2597, + "step": 13751 + }, + { + "epoch": 10.828672705789682, + "grad_norm": 0.6009402871131897, + "learning_rate": 2.8751333333333334e-05, + "loss": 0.1558, + "step": 13752 + }, + { + "epoch": 10.8294604174872, + "grad_norm": 0.4930923283100128, + "learning_rate": 2.8751000000000003e-05, + "loss": 0.1086, + "step": 13753 + }, + { + "epoch": 10.830248129184719, + "grad_norm": 0.5035916566848755, + "learning_rate": 2.8750666666666666e-05, + "loss": 0.0942, + "step": 13754 + }, + { + "epoch": 10.831035840882237, + "grad_norm": 0.34068453311920166, + "learning_rate": 2.8750333333333335e-05, + "loss": 0.0519, + "step": 13755 + }, + { + "epoch": 10.831823552579756, + "grad_norm": 0.25137224793434143, + "learning_rate": 2.875e-05, + "loss": 0.0172, + "step": 13756 + }, + { + "epoch": 10.832611264277274, + "grad_norm": 0.34521883726119995, + "learning_rate": 2.8749666666666667e-05, + "loss": 0.0358, + "step": 13757 + }, + { + "epoch": 10.833398975974793, + "grad_norm": 0.46658751368522644, + "learning_rate": 2.8749333333333333e-05, + "loss": 0.0814, + "step": 13758 + }, + { + "epoch": 10.834186687672313, + "grad_norm": 0.24274718761444092, + "learning_rate": 2.8749000000000002e-05, + "loss": 0.0151, + "step": 13759 + }, + { + "epoch": 10.83497439936983, + "grad_norm": 0.36076852679252625, + "learning_rate": 2.8748666666666668e-05, + "loss": 0.0199, + "step": 13760 + }, + { + "epoch": 10.83576211106735, + "grad_norm": 0.24315990507602692, + "learning_rate": 2.8748333333333334e-05, + "loss": 0.0188, + "step": 13761 + }, + { + "epoch": 10.836549822764868, + "grad_norm": 0.26177141070365906, + "learning_rate": 2.8748000000000003e-05, + "loss": 0.0076, + "step": 13762 + }, + { + "epoch": 10.837337534462387, + "grad_norm": 0.2984359860420227, + "learning_rate": 2.8747666666666666e-05, + "loss": 0.0138, + "step": 13763 + }, + { + "epoch": 10.838125246159905, + "grad_norm": 0.37425339221954346, + "learning_rate": 2.8747333333333335e-05, + "loss": 0.0097, + "step": 13764 + }, + { + "epoch": 10.838912957857424, + "grad_norm": 0.6297921538352966, + "learning_rate": 2.8747e-05, + "loss": 0.0172, + "step": 13765 + }, + { + "epoch": 10.839700669554944, + "grad_norm": 0.31958508491516113, + "learning_rate": 2.8746666666666667e-05, + "loss": 0.0169, + "step": 13766 + }, + { + "epoch": 10.840488381252461, + "grad_norm": 0.2759864628314972, + "learning_rate": 2.8746333333333333e-05, + "loss": 0.0137, + "step": 13767 + }, + { + "epoch": 10.84127609294998, + "grad_norm": 0.26230695843696594, + "learning_rate": 2.8746000000000002e-05, + "loss": 0.0092, + "step": 13768 + }, + { + "epoch": 10.842063804647498, + "grad_norm": 0.22191818058490753, + "learning_rate": 2.8745666666666668e-05, + "loss": 0.0131, + 
"step": 13769 + }, + { + "epoch": 10.842851516345018, + "grad_norm": 0.2714749574661255, + "learning_rate": 2.8745333333333334e-05, + "loss": 0.0264, + "step": 13770 + }, + { + "epoch": 10.843639228042537, + "grad_norm": 0.26745399832725525, + "learning_rate": 2.8745000000000003e-05, + "loss": 0.0134, + "step": 13771 + }, + { + "epoch": 10.844426939740055, + "grad_norm": 0.59205162525177, + "learning_rate": 2.8744666666666666e-05, + "loss": 0.024, + "step": 13772 + }, + { + "epoch": 10.845214651437574, + "grad_norm": 0.29593318700790405, + "learning_rate": 2.8744333333333335e-05, + "loss": 0.0151, + "step": 13773 + }, + { + "epoch": 10.846002363135092, + "grad_norm": 0.16357171535491943, + "learning_rate": 2.8743999999999998e-05, + "loss": 0.0074, + "step": 13774 + }, + { + "epoch": 10.846790074832612, + "grad_norm": 0.17293468117713928, + "learning_rate": 2.8743666666666667e-05, + "loss": 0.0109, + "step": 13775 + }, + { + "epoch": 10.84757778653013, + "grad_norm": 0.41243821382522583, + "learning_rate": 2.8743333333333333e-05, + "loss": 0.0097, + "step": 13776 + }, + { + "epoch": 10.848365498227649, + "grad_norm": 0.22737900912761688, + "learning_rate": 2.8743e-05, + "loss": 0.0133, + "step": 13777 + }, + { + "epoch": 10.849153209925168, + "grad_norm": 0.12649282813072205, + "learning_rate": 2.874266666666667e-05, + "loss": 0.0068, + "step": 13778 + }, + { + "epoch": 10.849940921622686, + "grad_norm": 0.2222665399312973, + "learning_rate": 2.8742333333333334e-05, + "loss": 0.0074, + "step": 13779 + }, + { + "epoch": 10.850728633320205, + "grad_norm": 0.2747924327850342, + "learning_rate": 2.8742e-05, + "loss": 0.0087, + "step": 13780 + }, + { + "epoch": 10.851516345017723, + "grad_norm": 0.43684718012809753, + "learning_rate": 2.8741666666666666e-05, + "loss": 0.0166, + "step": 13781 + }, + { + "epoch": 10.852304056715242, + "grad_norm": 0.14158284664154053, + "learning_rate": 2.8741333333333336e-05, + "loss": 0.0062, + "step": 13782 + }, + { + "epoch": 10.85309176841276, + "grad_norm": 0.16299836337566376, + "learning_rate": 2.8740999999999998e-05, + "loss": 0.0137, + "step": 13783 + }, + { + "epoch": 10.85387948011028, + "grad_norm": 0.21727126836776733, + "learning_rate": 2.8740666666666667e-05, + "loss": 0.0129, + "step": 13784 + }, + { + "epoch": 10.854667191807799, + "grad_norm": 0.265054851770401, + "learning_rate": 2.8740333333333337e-05, + "loss": 0.0149, + "step": 13785 + }, + { + "epoch": 10.855454903505317, + "grad_norm": 0.23571117222309113, + "learning_rate": 2.874e-05, + "loss": 0.016, + "step": 13786 + }, + { + "epoch": 10.856242615202836, + "grad_norm": 0.3903847634792328, + "learning_rate": 2.873966666666667e-05, + "loss": 0.0205, + "step": 13787 + }, + { + "epoch": 10.857030326900354, + "grad_norm": 0.12097601592540741, + "learning_rate": 2.8739333333333335e-05, + "loss": 0.0064, + "step": 13788 + }, + { + "epoch": 10.857818038597873, + "grad_norm": 0.4076891243457794, + "learning_rate": 2.8739e-05, + "loss": 0.0117, + "step": 13789 + }, + { + "epoch": 10.858605750295393, + "grad_norm": 0.22241957485675812, + "learning_rate": 2.8738666666666666e-05, + "loss": 0.0093, + "step": 13790 + }, + { + "epoch": 10.85939346199291, + "grad_norm": 0.6715143918991089, + "learning_rate": 2.8738333333333336e-05, + "loss": 0.0157, + "step": 13791 + }, + { + "epoch": 10.86018117369043, + "grad_norm": 0.2329006940126419, + "learning_rate": 2.8737999999999998e-05, + "loss": 0.0119, + "step": 13792 + }, + { + "epoch": 10.860968885387948, + "grad_norm": 0.22361138463020325, + 
"learning_rate": 2.8737666666666668e-05, + "loss": 0.0133, + "step": 13793 + }, + { + "epoch": 10.861756597085467, + "grad_norm": 0.26690807938575745, + "learning_rate": 2.8737333333333337e-05, + "loss": 0.0202, + "step": 13794 + }, + { + "epoch": 10.862544308782985, + "grad_norm": 0.36875736713409424, + "learning_rate": 2.8737e-05, + "loss": 0.021, + "step": 13795 + }, + { + "epoch": 10.863332020480504, + "grad_norm": 0.2575679421424866, + "learning_rate": 2.873666666666667e-05, + "loss": 0.0147, + "step": 13796 + }, + { + "epoch": 10.864119732178024, + "grad_norm": 0.300359845161438, + "learning_rate": 2.8736333333333335e-05, + "loss": 0.024, + "step": 13797 + }, + { + "epoch": 10.864907443875541, + "grad_norm": 0.24862679839134216, + "learning_rate": 2.8736e-05, + "loss": 0.0115, + "step": 13798 + }, + { + "epoch": 10.86569515557306, + "grad_norm": 0.5695555210113525, + "learning_rate": 2.8735666666666667e-05, + "loss": 0.0157, + "step": 13799 + }, + { + "epoch": 10.866482867270578, + "grad_norm": 0.7786515951156616, + "learning_rate": 2.8735333333333336e-05, + "loss": 0.028, + "step": 13800 + }, + { + "epoch": 10.867270578968098, + "grad_norm": 0.8043920993804932, + "learning_rate": 2.8735e-05, + "loss": 0.1833, + "step": 13801 + }, + { + "epoch": 10.868058290665616, + "grad_norm": 0.6437150835990906, + "learning_rate": 2.8734666666666668e-05, + "loss": 0.1183, + "step": 13802 + }, + { + "epoch": 10.868846002363135, + "grad_norm": 0.5128699541091919, + "learning_rate": 2.8734333333333334e-05, + "loss": 0.1472, + "step": 13803 + }, + { + "epoch": 10.869633714060654, + "grad_norm": 0.468067467212677, + "learning_rate": 2.8734e-05, + "loss": 0.114, + "step": 13804 + }, + { + "epoch": 10.870421425758172, + "grad_norm": 0.4812185764312744, + "learning_rate": 2.873366666666667e-05, + "loss": 0.0823, + "step": 13805 + }, + { + "epoch": 10.871209137455692, + "grad_norm": 0.33585211634635925, + "learning_rate": 2.873333333333333e-05, + "loss": 0.0219, + "step": 13806 + }, + { + "epoch": 10.87199684915321, + "grad_norm": 0.3539287745952606, + "learning_rate": 2.8733e-05, + "loss": 0.0251, + "step": 13807 + }, + { + "epoch": 10.872784560850729, + "grad_norm": 0.21588361263275146, + "learning_rate": 2.8732666666666667e-05, + "loss": 0.0172, + "step": 13808 + }, + { + "epoch": 10.873572272548248, + "grad_norm": 0.2011120319366455, + "learning_rate": 2.8732333333333333e-05, + "loss": 0.0197, + "step": 13809 + }, + { + "epoch": 10.874359984245766, + "grad_norm": 0.2638900876045227, + "learning_rate": 2.8732000000000002e-05, + "loss": 0.0128, + "step": 13810 + }, + { + "epoch": 10.875147695943285, + "grad_norm": 0.33556488156318665, + "learning_rate": 2.8731666666666668e-05, + "loss": 0.0321, + "step": 13811 + }, + { + "epoch": 10.875935407640803, + "grad_norm": 0.2812095284461975, + "learning_rate": 2.8731333333333334e-05, + "loss": 0.0094, + "step": 13812 + }, + { + "epoch": 10.876723119338322, + "grad_norm": 0.2121199369430542, + "learning_rate": 2.8731e-05, + "loss": 0.0127, + "step": 13813 + }, + { + "epoch": 10.87751083103584, + "grad_norm": 1.1522341966629028, + "learning_rate": 2.873066666666667e-05, + "loss": 0.0538, + "step": 13814 + }, + { + "epoch": 10.87829854273336, + "grad_norm": 0.15989354252815247, + "learning_rate": 2.8730333333333332e-05, + "loss": 0.0076, + "step": 13815 + }, + { + "epoch": 10.879086254430879, + "grad_norm": 0.1316148191690445, + "learning_rate": 2.873e-05, + "loss": 0.0113, + "step": 13816 + }, + { + "epoch": 10.879873966128397, + "grad_norm": 0.23435112833976746, 
+ "learning_rate": 2.8729666666666667e-05, + "loss": 0.0215, + "step": 13817 + }, + { + "epoch": 10.880661677825916, + "grad_norm": 0.3489934802055359, + "learning_rate": 2.8729333333333333e-05, + "loss": 0.0091, + "step": 13818 + }, + { + "epoch": 10.881449389523434, + "grad_norm": 0.2214091420173645, + "learning_rate": 2.8729000000000002e-05, + "loss": 0.0082, + "step": 13819 + }, + { + "epoch": 10.882237101220953, + "grad_norm": 0.13002383708953857, + "learning_rate": 2.8728666666666668e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 10.883024812918471, + "grad_norm": 0.22172492742538452, + "learning_rate": 2.8728333333333334e-05, + "loss": 0.0196, + "step": 13821 + }, + { + "epoch": 10.88381252461599, + "grad_norm": 0.16687095165252686, + "learning_rate": 2.8728e-05, + "loss": 0.0112, + "step": 13822 + }, + { + "epoch": 10.88460023631351, + "grad_norm": 0.20814506709575653, + "learning_rate": 2.872766666666667e-05, + "loss": 0.0111, + "step": 13823 + }, + { + "epoch": 10.885387948011028, + "grad_norm": 0.288944810628891, + "learning_rate": 2.8727333333333332e-05, + "loss": 0.0151, + "step": 13824 + }, + { + "epoch": 10.886175659708547, + "grad_norm": 0.1708415299654007, + "learning_rate": 2.8727e-05, + "loss": 0.0082, + "step": 13825 + }, + { + "epoch": 10.886963371406065, + "grad_norm": 0.2311953753232956, + "learning_rate": 2.8726666666666667e-05, + "loss": 0.0103, + "step": 13826 + }, + { + "epoch": 10.887751083103584, + "grad_norm": 0.2904912829399109, + "learning_rate": 2.8726333333333333e-05, + "loss": 0.0168, + "step": 13827 + }, + { + "epoch": 10.888538794801104, + "grad_norm": 0.5429815053939819, + "learning_rate": 2.8726000000000002e-05, + "loss": 0.0186, + "step": 13828 + }, + { + "epoch": 10.889326506498621, + "grad_norm": 0.20886074006557465, + "learning_rate": 2.872566666666667e-05, + "loss": 0.0075, + "step": 13829 + }, + { + "epoch": 10.89011421819614, + "grad_norm": 0.22391775250434875, + "learning_rate": 2.8725333333333334e-05, + "loss": 0.0138, + "step": 13830 + }, + { + "epoch": 10.890901929893658, + "grad_norm": 0.679882287979126, + "learning_rate": 2.8725e-05, + "loss": 0.0212, + "step": 13831 + }, + { + "epoch": 10.891689641591178, + "grad_norm": 0.1517365425825119, + "learning_rate": 2.8724666666666666e-05, + "loss": 0.0078, + "step": 13832 + }, + { + "epoch": 10.892477353288696, + "grad_norm": 0.12407803535461426, + "learning_rate": 2.8724333333333332e-05, + "loss": 0.008, + "step": 13833 + }, + { + "epoch": 10.893265064986215, + "grad_norm": 0.25233322381973267, + "learning_rate": 2.8724e-05, + "loss": 0.0154, + "step": 13834 + }, + { + "epoch": 10.894052776683735, + "grad_norm": 0.1639600694179535, + "learning_rate": 2.8723666666666667e-05, + "loss": 0.0138, + "step": 13835 + }, + { + "epoch": 10.894840488381252, + "grad_norm": 0.11667880415916443, + "learning_rate": 2.8723333333333333e-05, + "loss": 0.0065, + "step": 13836 + }, + { + "epoch": 10.895628200078772, + "grad_norm": 0.3250577449798584, + "learning_rate": 2.8723000000000003e-05, + "loss": 0.0157, + "step": 13837 + }, + { + "epoch": 10.89641591177629, + "grad_norm": 0.45062097907066345, + "learning_rate": 2.8722666666666665e-05, + "loss": 0.0233, + "step": 13838 + }, + { + "epoch": 10.897203623473809, + "grad_norm": 0.31845125555992126, + "learning_rate": 2.8722333333333335e-05, + "loss": 0.0122, + "step": 13839 + }, + { + "epoch": 10.897991335171326, + "grad_norm": 0.9421100616455078, + "learning_rate": 2.8722e-05, + "loss": 0.0191, + "step": 13840 + }, + { + "epoch": 10.898779046868846, 
+ "grad_norm": 0.3355535864830017, + "learning_rate": 2.8721666666666666e-05, + "loss": 0.0162, + "step": 13841 + }, + { + "epoch": 10.899566758566365, + "grad_norm": 0.4023464620113373, + "learning_rate": 2.8721333333333332e-05, + "loss": 0.0172, + "step": 13842 + }, + { + "epoch": 10.900354470263883, + "grad_norm": 0.27445390820503235, + "learning_rate": 2.8721e-05, + "loss": 0.0147, + "step": 13843 + }, + { + "epoch": 10.901142181961402, + "grad_norm": 0.4170805513858795, + "learning_rate": 2.8720666666666668e-05, + "loss": 0.0176, + "step": 13844 + }, + { + "epoch": 10.90192989365892, + "grad_norm": 0.35776349902153015, + "learning_rate": 2.8720333333333334e-05, + "loss": 0.009, + "step": 13845 + }, + { + "epoch": 10.90271760535644, + "grad_norm": 0.25878652930259705, + "learning_rate": 2.8720000000000003e-05, + "loss": 0.0134, + "step": 13846 + }, + { + "epoch": 10.903505317053959, + "grad_norm": 0.4690752923488617, + "learning_rate": 2.8719666666666665e-05, + "loss": 0.0145, + "step": 13847 + }, + { + "epoch": 10.904293028751477, + "grad_norm": 0.43439093232154846, + "learning_rate": 2.8719333333333335e-05, + "loss": 0.025, + "step": 13848 + }, + { + "epoch": 10.905080740448996, + "grad_norm": 0.677558422088623, + "learning_rate": 2.8719e-05, + "loss": 0.0197, + "step": 13849 + }, + { + "epoch": 10.905868452146514, + "grad_norm": 0.5690310001373291, + "learning_rate": 2.8718666666666667e-05, + "loss": 0.0429, + "step": 13850 + }, + { + "epoch": 10.906656163844033, + "grad_norm": 0.8160239458084106, + "learning_rate": 2.8718333333333333e-05, + "loss": 0.2795, + "step": 13851 + }, + { + "epoch": 10.907443875541551, + "grad_norm": 0.6992602944374084, + "learning_rate": 2.8718000000000002e-05, + "loss": 0.1906, + "step": 13852 + }, + { + "epoch": 10.90823158723907, + "grad_norm": 0.8453605771064758, + "learning_rate": 2.8717666666666668e-05, + "loss": 0.1561, + "step": 13853 + }, + { + "epoch": 10.90901929893659, + "grad_norm": 1.0777145624160767, + "learning_rate": 2.8717333333333334e-05, + "loss": 0.0937, + "step": 13854 + }, + { + "epoch": 10.909807010634108, + "grad_norm": 0.43293309211730957, + "learning_rate": 2.8717000000000003e-05, + "loss": 0.0739, + "step": 13855 + }, + { + "epoch": 10.910594722331627, + "grad_norm": 0.618341863155365, + "learning_rate": 2.8716666666666666e-05, + "loss": 0.06, + "step": 13856 + }, + { + "epoch": 10.911382434029145, + "grad_norm": 0.32704710960388184, + "learning_rate": 2.8716333333333335e-05, + "loss": 0.0712, + "step": 13857 + }, + { + "epoch": 10.912170145726664, + "grad_norm": 0.36422309279441833, + "learning_rate": 2.8716e-05, + "loss": 0.032, + "step": 13858 + }, + { + "epoch": 10.912957857424182, + "grad_norm": 0.34260520339012146, + "learning_rate": 2.8715666666666667e-05, + "loss": 0.0236, + "step": 13859 + }, + { + "epoch": 10.913745569121701, + "grad_norm": 0.34244075417518616, + "learning_rate": 2.8715333333333336e-05, + "loss": 0.0122, + "step": 13860 + }, + { + "epoch": 10.91453328081922, + "grad_norm": 0.49599045515060425, + "learning_rate": 2.8715000000000002e-05, + "loss": 0.0166, + "step": 13861 + }, + { + "epoch": 10.915320992516738, + "grad_norm": 0.5157963633537292, + "learning_rate": 2.8714666666666668e-05, + "loss": 0.0125, + "step": 13862 + }, + { + "epoch": 10.916108704214258, + "grad_norm": 0.1845521479845047, + "learning_rate": 2.8714333333333334e-05, + "loss": 0.0102, + "step": 13863 + }, + { + "epoch": 10.916896415911776, + "grad_norm": 0.3765588700771332, + "learning_rate": 2.8714e-05, + "loss": 0.0173, + "step": 
13864 + }, + { + "epoch": 10.917684127609295, + "grad_norm": 0.28254008293151855, + "learning_rate": 2.8713666666666666e-05, + "loss": 0.0141, + "step": 13865 + }, + { + "epoch": 10.918471839306815, + "grad_norm": 0.2650289535522461, + "learning_rate": 2.8713333333333335e-05, + "loss": 0.0171, + "step": 13866 + }, + { + "epoch": 10.919259551004332, + "grad_norm": 0.3445851802825928, + "learning_rate": 2.8712999999999998e-05, + "loss": 0.0144, + "step": 13867 + }, + { + "epoch": 10.920047262701852, + "grad_norm": 0.18825505673885345, + "learning_rate": 2.8712666666666667e-05, + "loss": 0.0161, + "step": 13868 + }, + { + "epoch": 10.92083497439937, + "grad_norm": 0.18042491376399994, + "learning_rate": 2.8712333333333336e-05, + "loss": 0.0119, + "step": 13869 + }, + { + "epoch": 10.921622686096889, + "grad_norm": 0.3007895052433014, + "learning_rate": 2.8712e-05, + "loss": 0.0127, + "step": 13870 + }, + { + "epoch": 10.922410397794406, + "grad_norm": 0.158944234251976, + "learning_rate": 2.8711666666666668e-05, + "loss": 0.0066, + "step": 13871 + }, + { + "epoch": 10.923198109491926, + "grad_norm": 0.5861099362373352, + "learning_rate": 2.8711333333333334e-05, + "loss": 0.0125, + "step": 13872 + }, + { + "epoch": 10.923985821189445, + "grad_norm": 0.1048574149608612, + "learning_rate": 2.8711e-05, + "loss": 0.0046, + "step": 13873 + }, + { + "epoch": 10.924773532886963, + "grad_norm": 0.25610148906707764, + "learning_rate": 2.8710666666666666e-05, + "loss": 0.0125, + "step": 13874 + }, + { + "epoch": 10.925561244584483, + "grad_norm": 0.20075024664402008, + "learning_rate": 2.8710333333333335e-05, + "loss": 0.0124, + "step": 13875 + }, + { + "epoch": 10.926348956282, + "grad_norm": 0.1914220154285431, + "learning_rate": 2.871e-05, + "loss": 0.0174, + "step": 13876 + }, + { + "epoch": 10.92713666797952, + "grad_norm": 0.20796775817871094, + "learning_rate": 2.8709666666666667e-05, + "loss": 0.0075, + "step": 13877 + }, + { + "epoch": 10.927924379677037, + "grad_norm": 0.3074047863483429, + "learning_rate": 2.8709333333333337e-05, + "loss": 0.0176, + "step": 13878 + }, + { + "epoch": 10.928712091374557, + "grad_norm": 0.2644895911216736, + "learning_rate": 2.8709e-05, + "loss": 0.0097, + "step": 13879 + }, + { + "epoch": 10.929499803072076, + "grad_norm": 0.1358409970998764, + "learning_rate": 2.870866666666667e-05, + "loss": 0.0056, + "step": 13880 + }, + { + "epoch": 10.930287514769594, + "grad_norm": 0.1782509833574295, + "learning_rate": 2.8708333333333334e-05, + "loss": 0.0099, + "step": 13881 + }, + { + "epoch": 10.931075226467113, + "grad_norm": 0.2546755373477936, + "learning_rate": 2.8708e-05, + "loss": 0.0178, + "step": 13882 + }, + { + "epoch": 10.931862938164631, + "grad_norm": 0.29655036330223083, + "learning_rate": 2.8707666666666666e-05, + "loss": 0.0095, + "step": 13883 + }, + { + "epoch": 10.93265064986215, + "grad_norm": 0.12123045325279236, + "learning_rate": 2.8707333333333336e-05, + "loss": 0.0064, + "step": 13884 + }, + { + "epoch": 10.93343836155967, + "grad_norm": 0.2721448540687561, + "learning_rate": 2.8707e-05, + "loss": 0.0101, + "step": 13885 + }, + { + "epoch": 10.934226073257188, + "grad_norm": 1.0785032510757446, + "learning_rate": 2.8706666666666667e-05, + "loss": 0.0213, + "step": 13886 + }, + { + "epoch": 10.935013784954707, + "grad_norm": 0.8088166117668152, + "learning_rate": 2.8706333333333337e-05, + "loss": 0.0259, + "step": 13887 + }, + { + "epoch": 10.935801496652225, + "grad_norm": 0.13952413201332092, + "learning_rate": 2.8706e-05, + "loss": 0.0074, + 
"step": 13888 + }, + { + "epoch": 10.936589208349744, + "grad_norm": 0.2212621569633484, + "learning_rate": 2.870566666666667e-05, + "loss": 0.0137, + "step": 13889 + }, + { + "epoch": 10.937376920047262, + "grad_norm": 0.3947744369506836, + "learning_rate": 2.8705333333333335e-05, + "loss": 0.0171, + "step": 13890 + }, + { + "epoch": 10.938164631744781, + "grad_norm": 0.340593546628952, + "learning_rate": 2.8705e-05, + "loss": 0.0146, + "step": 13891 + }, + { + "epoch": 10.9389523434423, + "grad_norm": 0.2645505666732788, + "learning_rate": 2.8704666666666666e-05, + "loss": 0.0081, + "step": 13892 + }, + { + "epoch": 10.939740055139819, + "grad_norm": 0.1688774824142456, + "learning_rate": 2.8704333333333332e-05, + "loss": 0.0106, + "step": 13893 + }, + { + "epoch": 10.940527766837338, + "grad_norm": 0.44975265860557556, + "learning_rate": 2.8704e-05, + "loss": 0.0141, + "step": 13894 + }, + { + "epoch": 10.941315478534856, + "grad_norm": 0.4664083421230316, + "learning_rate": 2.8703666666666668e-05, + "loss": 0.0204, + "step": 13895 + }, + { + "epoch": 10.942103190232375, + "grad_norm": 0.5801264643669128, + "learning_rate": 2.8703333333333334e-05, + "loss": 0.0093, + "step": 13896 + }, + { + "epoch": 10.942890901929893, + "grad_norm": 0.32144424319267273, + "learning_rate": 2.8703e-05, + "loss": 0.0108, + "step": 13897 + }, + { + "epoch": 10.943678613627412, + "grad_norm": 0.2482347935438156, + "learning_rate": 2.870266666666667e-05, + "loss": 0.0121, + "step": 13898 + }, + { + "epoch": 10.944466325324932, + "grad_norm": 0.33926308155059814, + "learning_rate": 2.870233333333333e-05, + "loss": 0.0146, + "step": 13899 + }, + { + "epoch": 10.94525403702245, + "grad_norm": 0.39254915714263916, + "learning_rate": 2.8702e-05, + "loss": 0.0141, + "step": 13900 + }, + { + "epoch": 10.946041748719969, + "grad_norm": 0.8728234171867371, + "learning_rate": 2.8701666666666667e-05, + "loss": 0.2701, + "step": 13901 + }, + { + "epoch": 10.946829460417487, + "grad_norm": 0.6596069931983948, + "learning_rate": 2.8701333333333333e-05, + "loss": 0.1514, + "step": 13902 + }, + { + "epoch": 10.947617172115006, + "grad_norm": 0.6535088419914246, + "learning_rate": 2.8701000000000002e-05, + "loss": 0.122, + "step": 13903 + }, + { + "epoch": 10.948404883812525, + "grad_norm": 0.5372586250305176, + "learning_rate": 2.8700666666666668e-05, + "loss": 0.1026, + "step": 13904 + }, + { + "epoch": 10.949192595510043, + "grad_norm": 0.4202340245246887, + "learning_rate": 2.8700333333333334e-05, + "loss": 0.0696, + "step": 13905 + }, + { + "epoch": 10.949980307207563, + "grad_norm": 0.6770884990692139, + "learning_rate": 2.87e-05, + "loss": 0.0857, + "step": 13906 + }, + { + "epoch": 10.95076801890508, + "grad_norm": 0.7076398134231567, + "learning_rate": 2.869966666666667e-05, + "loss": 0.0525, + "step": 13907 + }, + { + "epoch": 10.9515557306026, + "grad_norm": 0.2889617085456848, + "learning_rate": 2.869933333333333e-05, + "loss": 0.023, + "step": 13908 + }, + { + "epoch": 10.952343442300117, + "grad_norm": 0.19615872204303741, + "learning_rate": 2.8699e-05, + "loss": 0.0316, + "step": 13909 + }, + { + "epoch": 10.953131153997637, + "grad_norm": 0.24742291867733002, + "learning_rate": 2.869866666666667e-05, + "loss": 0.022, + "step": 13910 + }, + { + "epoch": 10.953918865695156, + "grad_norm": 0.2661895751953125, + "learning_rate": 2.8698333333333333e-05, + "loss": 0.024, + "step": 13911 + }, + { + "epoch": 10.954706577392674, + "grad_norm": 0.2665334939956665, + "learning_rate": 2.8698000000000002e-05, + "loss": 
0.0251, + "step": 13912 + }, + { + "epoch": 10.955494289090193, + "grad_norm": 0.17901454865932465, + "learning_rate": 2.8697666666666668e-05, + "loss": 0.0158, + "step": 13913 + }, + { + "epoch": 10.956282000787711, + "grad_norm": 0.14760716259479523, + "learning_rate": 2.8697333333333334e-05, + "loss": 0.0095, + "step": 13914 + }, + { + "epoch": 10.95706971248523, + "grad_norm": 0.1887665092945099, + "learning_rate": 2.8697e-05, + "loss": 0.0108, + "step": 13915 + }, + { + "epoch": 10.957857424182748, + "grad_norm": 0.39099282026290894, + "learning_rate": 2.869666666666667e-05, + "loss": 0.0323, + "step": 13916 + }, + { + "epoch": 10.958645135880268, + "grad_norm": 0.28078562021255493, + "learning_rate": 2.8696333333333332e-05, + "loss": 0.0166, + "step": 13917 + }, + { + "epoch": 10.959432847577787, + "grad_norm": 0.5457625985145569, + "learning_rate": 2.8696e-05, + "loss": 0.0111, + "step": 13918 + }, + { + "epoch": 10.960220559275305, + "grad_norm": 0.2237992286682129, + "learning_rate": 2.869566666666667e-05, + "loss": 0.0178, + "step": 13919 + }, + { + "epoch": 10.961008270972824, + "grad_norm": 0.3432466983795166, + "learning_rate": 2.8695333333333333e-05, + "loss": 0.013, + "step": 13920 + }, + { + "epoch": 10.961795982670342, + "grad_norm": 0.298064261674881, + "learning_rate": 2.8695000000000002e-05, + "loss": 0.0172, + "step": 13921 + }, + { + "epoch": 10.962583694367861, + "grad_norm": 0.09582393616437912, + "learning_rate": 2.8694666666666668e-05, + "loss": 0.0057, + "step": 13922 + }, + { + "epoch": 10.96337140606538, + "grad_norm": 0.09508490562438965, + "learning_rate": 2.8694333333333334e-05, + "loss": 0.0065, + "step": 13923 + }, + { + "epoch": 10.964159117762899, + "grad_norm": 0.31923943758010864, + "learning_rate": 2.8694e-05, + "loss": 0.0173, + "step": 13924 + }, + { + "epoch": 10.964946829460418, + "grad_norm": 0.23670397698879242, + "learning_rate": 2.8693666666666666e-05, + "loss": 0.0112, + "step": 13925 + }, + { + "epoch": 10.965734541157936, + "grad_norm": 0.18236325681209564, + "learning_rate": 2.8693333333333335e-05, + "loss": 0.012, + "step": 13926 + }, + { + "epoch": 10.966522252855455, + "grad_norm": 0.11936553567647934, + "learning_rate": 2.8693e-05, + "loss": 0.009, + "step": 13927 + }, + { + "epoch": 10.967309964552973, + "grad_norm": 0.15308693051338196, + "learning_rate": 2.8692666666666667e-05, + "loss": 0.0114, + "step": 13928 + }, + { + "epoch": 10.968097676250492, + "grad_norm": 0.2999100387096405, + "learning_rate": 2.8692333333333333e-05, + "loss": 0.0123, + "step": 13929 + }, + { + "epoch": 10.968885387948012, + "grad_norm": 0.3631832003593445, + "learning_rate": 2.8692000000000002e-05, + "loss": 0.0145, + "step": 13930 + }, + { + "epoch": 10.96967309964553, + "grad_norm": 0.2355148047208786, + "learning_rate": 2.8691666666666665e-05, + "loss": 0.0166, + "step": 13931 + }, + { + "epoch": 10.970460811343049, + "grad_norm": 0.23610471189022064, + "learning_rate": 2.8691333333333334e-05, + "loss": 0.0072, + "step": 13932 + }, + { + "epoch": 10.971248523040567, + "grad_norm": 0.18581053614616394, + "learning_rate": 2.8691e-05, + "loss": 0.0086, + "step": 13933 + }, + { + "epoch": 10.972036234738086, + "grad_norm": 0.09411370754241943, + "learning_rate": 2.8690666666666666e-05, + "loss": 0.0037, + "step": 13934 + }, + { + "epoch": 10.972823946435604, + "grad_norm": 0.1545705646276474, + "learning_rate": 2.8690333333333336e-05, + "loss": 0.0117, + "step": 13935 + }, + { + "epoch": 10.973611658133123, + "grad_norm": 0.4188360571861267, + 
"learning_rate": 2.869e-05, + "loss": 0.0142, + "step": 13936 + }, + { + "epoch": 10.974399369830643, + "grad_norm": 0.2874288260936737, + "learning_rate": 2.8689666666666667e-05, + "loss": 0.0123, + "step": 13937 + }, + { + "epoch": 10.97518708152816, + "grad_norm": 0.17611075937747955, + "learning_rate": 2.8689333333333333e-05, + "loss": 0.014, + "step": 13938 + }, + { + "epoch": 10.97597479322568, + "grad_norm": 0.1419665664434433, + "learning_rate": 2.8689000000000003e-05, + "loss": 0.0068, + "step": 13939 + }, + { + "epoch": 10.976762504923197, + "grad_norm": 0.2251962423324585, + "learning_rate": 2.8688666666666665e-05, + "loss": 0.0104, + "step": 13940 + }, + { + "epoch": 10.977550216620717, + "grad_norm": 0.22436009347438812, + "learning_rate": 2.8688333333333335e-05, + "loss": 0.0132, + "step": 13941 + }, + { + "epoch": 10.978337928318236, + "grad_norm": 0.2508102357387543, + "learning_rate": 2.8688e-05, + "loss": 0.0131, + "step": 13942 + }, + { + "epoch": 10.979125640015754, + "grad_norm": 0.335642546415329, + "learning_rate": 2.8687666666666666e-05, + "loss": 0.0099, + "step": 13943 + }, + { + "epoch": 10.979913351713273, + "grad_norm": 0.14339810609817505, + "learning_rate": 2.8687333333333336e-05, + "loss": 0.0102, + "step": 13944 + }, + { + "epoch": 10.980701063410791, + "grad_norm": 0.28838253021240234, + "learning_rate": 2.8687e-05, + "loss": 0.0228, + "step": 13945 + }, + { + "epoch": 10.98148877510831, + "grad_norm": 0.36430689692497253, + "learning_rate": 2.8686666666666668e-05, + "loss": 0.0184, + "step": 13946 + }, + { + "epoch": 10.982276486805828, + "grad_norm": 0.29425713419914246, + "learning_rate": 2.8686333333333334e-05, + "loss": 0.0107, + "step": 13947 + }, + { + "epoch": 10.983064198503348, + "grad_norm": 0.29167330265045166, + "learning_rate": 2.8686000000000003e-05, + "loss": 0.0119, + "step": 13948 + }, + { + "epoch": 10.983851910200867, + "grad_norm": 0.35057589411735535, + "learning_rate": 2.8685666666666665e-05, + "loss": 0.0128, + "step": 13949 + }, + { + "epoch": 10.984639621898385, + "grad_norm": 1.4299322366714478, + "learning_rate": 2.8685333333333335e-05, + "loss": 0.0066, + "step": 13950 + }, + { + "epoch": 10.985427333595904, + "grad_norm": 0.5558947920799255, + "learning_rate": 2.8685e-05, + "loss": 0.1506, + "step": 13951 + }, + { + "epoch": 10.986215045293422, + "grad_norm": 0.4704981744289398, + "learning_rate": 2.8684666666666667e-05, + "loss": 0.0754, + "step": 13952 + }, + { + "epoch": 10.987002756990941, + "grad_norm": 0.3057076036930084, + "learning_rate": 2.8684333333333336e-05, + "loss": 0.0374, + "step": 13953 + }, + { + "epoch": 10.987790468688459, + "grad_norm": 0.22335238754749298, + "learning_rate": 2.8684e-05, + "loss": 0.0091, + "step": 13954 + }, + { + "epoch": 10.988578180385979, + "grad_norm": 0.44436657428741455, + "learning_rate": 2.8683666666666668e-05, + "loss": 0.0132, + "step": 13955 + }, + { + "epoch": 10.989365892083498, + "grad_norm": 0.3147038221359253, + "learning_rate": 2.8683333333333334e-05, + "loss": 0.0226, + "step": 13956 + }, + { + "epoch": 10.990153603781016, + "grad_norm": 0.22481398284435272, + "learning_rate": 2.8683e-05, + "loss": 0.0144, + "step": 13957 + }, + { + "epoch": 10.990941315478535, + "grad_norm": 0.3295281231403351, + "learning_rate": 2.8682666666666666e-05, + "loss": 0.0179, + "step": 13958 + }, + { + "epoch": 10.991729027176053, + "grad_norm": 0.23235060274600983, + "learning_rate": 2.8682333333333335e-05, + "loss": 0.0124, + "step": 13959 + }, + { + "epoch": 10.992516738873572, + 
"grad_norm": 0.46345725655555725, + "learning_rate": 2.8682e-05, + "loss": 0.0094, + "step": 13960 + }, + { + "epoch": 10.993304450571092, + "grad_norm": 0.48991554975509644, + "learning_rate": 2.8681666666666667e-05, + "loss": 0.0186, + "step": 13961 + }, + { + "epoch": 10.99409216226861, + "grad_norm": 0.33965742588043213, + "learning_rate": 2.8681333333333336e-05, + "loss": 0.0174, + "step": 13962 + }, + { + "epoch": 10.994879873966129, + "grad_norm": 0.31906938552856445, + "learning_rate": 2.8681e-05, + "loss": 0.0063, + "step": 13963 + }, + { + "epoch": 10.995667585663647, + "grad_norm": 0.3659922182559967, + "learning_rate": 2.8680666666666668e-05, + "loss": 0.0136, + "step": 13964 + }, + { + "epoch": 10.996455297361166, + "grad_norm": 0.2650274336338043, + "learning_rate": 2.8680333333333334e-05, + "loss": 0.008, + "step": 13965 + }, + { + "epoch": 10.997243009058685, + "grad_norm": 0.18190829455852509, + "learning_rate": 2.868e-05, + "loss": 0.0094, + "step": 13966 + }, + { + "epoch": 10.998030720756203, + "grad_norm": 0.3937542736530304, + "learning_rate": 2.8679666666666666e-05, + "loss": 0.0166, + "step": 13967 + }, + { + "epoch": 10.998818432453723, + "grad_norm": 0.25003674626350403, + "learning_rate": 2.8679333333333335e-05, + "loss": 0.0098, + "step": 13968 + }, + { + "epoch": 10.99960614415124, + "grad_norm": 0.3805529773235321, + "learning_rate": 2.8679e-05, + "loss": 0.0151, + "step": 13969 + }, + { + "epoch": 11.0, + "grad_norm": 0.23219355940818787, + "learning_rate": 2.8678666666666667e-05, + "loss": 0.007, + "step": 13970 + }, + { + "epoch": 11.00078771169752, + "grad_norm": 0.730473518371582, + "learning_rate": 2.8678333333333336e-05, + "loss": 0.2088, + "step": 13971 + }, + { + "epoch": 11.001575423395037, + "grad_norm": 0.7600783109664917, + "learning_rate": 2.8678e-05, + "loss": 0.1249, + "step": 13972 + }, + { + "epoch": 11.002363135092557, + "grad_norm": 0.673755407333374, + "learning_rate": 2.8677666666666668e-05, + "loss": 0.1038, + "step": 13973 + }, + { + "epoch": 11.003150846790074, + "grad_norm": 0.3604744076728821, + "learning_rate": 2.8677333333333334e-05, + "loss": 0.1146, + "step": 13974 + }, + { + "epoch": 11.003938558487594, + "grad_norm": 0.5287726521492004, + "learning_rate": 2.8677e-05, + "loss": 0.0365, + "step": 13975 + }, + { + "epoch": 11.004726270185111, + "grad_norm": 0.5921444296836853, + "learning_rate": 2.867666666666667e-05, + "loss": 0.0329, + "step": 13976 + }, + { + "epoch": 11.00551398188263, + "grad_norm": 0.19735005497932434, + "learning_rate": 2.8676333333333335e-05, + "loss": 0.0407, + "step": 13977 + }, + { + "epoch": 11.00630169358015, + "grad_norm": 0.22214242815971375, + "learning_rate": 2.8676e-05, + "loss": 0.0097, + "step": 13978 + }, + { + "epoch": 11.007089405277668, + "grad_norm": 0.19297458231449127, + "learning_rate": 2.8675666666666667e-05, + "loss": 0.0093, + "step": 13979 + }, + { + "epoch": 11.007877116975187, + "grad_norm": 0.19990302622318268, + "learning_rate": 2.8675333333333336e-05, + "loss": 0.0081, + "step": 13980 + }, + { + "epoch": 11.008664828672705, + "grad_norm": 0.1092466190457344, + "learning_rate": 2.8675e-05, + "loss": 0.0093, + "step": 13981 + }, + { + "epoch": 11.009452540370225, + "grad_norm": 0.2904384434223175, + "learning_rate": 2.867466666666667e-05, + "loss": 0.0133, + "step": 13982 + }, + { + "epoch": 11.010240252067744, + "grad_norm": 0.2095930427312851, + "learning_rate": 2.867433333333333e-05, + "loss": 0.0103, + "step": 13983 + }, + { + "epoch": 11.011027963765262, + "grad_norm": 
0.15484291315078735, + "learning_rate": 2.8674e-05, + "loss": 0.0086, + "step": 13984 + }, + { + "epoch": 11.011815675462781, + "grad_norm": 0.2117304801940918, + "learning_rate": 2.867366666666667e-05, + "loss": 0.01, + "step": 13985 + }, + { + "epoch": 11.012603387160299, + "grad_norm": 0.1503506898880005, + "learning_rate": 2.8673333333333332e-05, + "loss": 0.0101, + "step": 13986 + }, + { + "epoch": 11.013391098857818, + "grad_norm": 0.11960307508707047, + "learning_rate": 2.8673e-05, + "loss": 0.0054, + "step": 13987 + }, + { + "epoch": 11.014178810555336, + "grad_norm": 0.35629454255104065, + "learning_rate": 2.8672666666666667e-05, + "loss": 0.012, + "step": 13988 + }, + { + "epoch": 11.014966522252855, + "grad_norm": 0.27681925892829895, + "learning_rate": 2.8672333333333333e-05, + "loss": 0.0123, + "step": 13989 + }, + { + "epoch": 11.015754233950375, + "grad_norm": 0.1535557359457016, + "learning_rate": 2.8672e-05, + "loss": 0.0054, + "step": 13990 + }, + { + "epoch": 11.016541945647893, + "grad_norm": 0.16647978127002716, + "learning_rate": 2.867166666666667e-05, + "loss": 0.0125, + "step": 13991 + }, + { + "epoch": 11.017329657345412, + "grad_norm": 0.22370576858520508, + "learning_rate": 2.867133333333333e-05, + "loss": 0.0083, + "step": 13992 + }, + { + "epoch": 11.01811736904293, + "grad_norm": 0.13090237975120544, + "learning_rate": 2.8671e-05, + "loss": 0.0092, + "step": 13993 + }, + { + "epoch": 11.01890508074045, + "grad_norm": 0.32701578736305237, + "learning_rate": 2.867066666666667e-05, + "loss": 0.0118, + "step": 13994 + }, + { + "epoch": 11.019692792437967, + "grad_norm": 0.18800024688243866, + "learning_rate": 2.8670333333333332e-05, + "loss": 0.0098, + "step": 13995 + }, + { + "epoch": 11.020480504135486, + "grad_norm": 0.19527128338813782, + "learning_rate": 2.867e-05, + "loss": 0.0079, + "step": 13996 + }, + { + "epoch": 11.021268215833006, + "grad_norm": 0.23104463517665863, + "learning_rate": 2.8669666666666668e-05, + "loss": 0.0067, + "step": 13997 + }, + { + "epoch": 11.022055927530523, + "grad_norm": 0.3216626048088074, + "learning_rate": 2.8669333333333334e-05, + "loss": 0.0105, + "step": 13998 + }, + { + "epoch": 11.022843639228043, + "grad_norm": 0.345224529504776, + "learning_rate": 2.8669e-05, + "loss": 0.0124, + "step": 13999 + }, + { + "epoch": 11.02363135092556, + "grad_norm": 0.3424503803253174, + "learning_rate": 2.866866666666667e-05, + "loss": 0.0106, + "step": 14000 + }, + { + "epoch": 11.02363135092556, + "eval_cer": 0.11929387977292169, + "eval_loss": 0.3454759120941162, + "eval_runtime": 16.7326, + "eval_samples_per_second": 18.168, + "eval_steps_per_second": 0.598, + "eval_wer": 0.40080583269378356, + "step": 14000 + }, + { + "epoch": 11.02441906262308, + "grad_norm": 0.16542096436023712, + "learning_rate": 2.8668333333333335e-05, + "loss": 0.0094, + "step": 14001 + }, + { + "epoch": 11.0252067743206, + "grad_norm": 0.32223886251449585, + "learning_rate": 2.8668e-05, + "loss": 0.012, + "step": 14002 + }, + { + "epoch": 11.025994486018117, + "grad_norm": 0.5840124487876892, + "learning_rate": 2.866766666666667e-05, + "loss": 0.0202, + "step": 14003 + }, + { + "epoch": 11.026782197715637, + "grad_norm": 0.24592037498950958, + "learning_rate": 2.8667333333333333e-05, + "loss": 0.0108, + "step": 14004 + }, + { + "epoch": 11.027569909413154, + "grad_norm": 0.11278470605611801, + "learning_rate": 2.8667000000000002e-05, + "loss": 0.0051, + "step": 14005 + }, + { + "epoch": 11.028357621110674, + "grad_norm": 0.1496819257736206, + "learning_rate": 
2.8666666666666668e-05, + "loss": 0.0072, + "step": 14006 + }, + { + "epoch": 11.029145332808191, + "grad_norm": 0.16698817908763885, + "learning_rate": 2.8666333333333334e-05, + "loss": 0.0074, + "step": 14007 + }, + { + "epoch": 11.02993304450571, + "grad_norm": 0.7166847586631775, + "learning_rate": 2.8666e-05, + "loss": 0.0082, + "step": 14008 + }, + { + "epoch": 11.03072075620323, + "grad_norm": 0.21946103870868683, + "learning_rate": 2.866566666666667e-05, + "loss": 0.0103, + "step": 14009 + }, + { + "epoch": 11.031508467900748, + "grad_norm": 0.14431887865066528, + "learning_rate": 2.8665333333333335e-05, + "loss": 0.0076, + "step": 14010 + }, + { + "epoch": 11.032296179598267, + "grad_norm": 0.2842077910900116, + "learning_rate": 2.8665e-05, + "loss": 0.0211, + "step": 14011 + }, + { + "epoch": 11.033083891295785, + "grad_norm": 0.667517364025116, + "learning_rate": 2.866466666666667e-05, + "loss": 0.0187, + "step": 14012 + }, + { + "epoch": 11.033871602993305, + "grad_norm": 0.5938502550125122, + "learning_rate": 2.8664333333333333e-05, + "loss": 0.0125, + "step": 14013 + }, + { + "epoch": 11.034659314690822, + "grad_norm": 0.12816724181175232, + "learning_rate": 2.8664000000000002e-05, + "loss": 0.0083, + "step": 14014 + }, + { + "epoch": 11.035447026388342, + "grad_norm": 0.16119664907455444, + "learning_rate": 2.8663666666666665e-05, + "loss": 0.0073, + "step": 14015 + }, + { + "epoch": 11.036234738085861, + "grad_norm": 0.44569912552833557, + "learning_rate": 2.8663333333333334e-05, + "loss": 0.0139, + "step": 14016 + }, + { + "epoch": 11.037022449783379, + "grad_norm": 0.29063406586647034, + "learning_rate": 2.8663e-05, + "loss": 0.0136, + "step": 14017 + }, + { + "epoch": 11.037810161480898, + "grad_norm": 0.44827231764793396, + "learning_rate": 2.8662666666666666e-05, + "loss": 0.0176, + "step": 14018 + }, + { + "epoch": 11.038597873178416, + "grad_norm": 0.19542920589447021, + "learning_rate": 2.8662333333333335e-05, + "loss": 0.0086, + "step": 14019 + }, + { + "epoch": 11.039385584875935, + "grad_norm": 0.25085437297821045, + "learning_rate": 2.8662e-05, + "loss": 0.0123, + "step": 14020 + }, + { + "epoch": 11.040173296573455, + "grad_norm": 1.321478009223938, + "learning_rate": 2.8661666666666667e-05, + "loss": 0.2392, + "step": 14021 + }, + { + "epoch": 11.040961008270973, + "grad_norm": 0.5305292010307312, + "learning_rate": 2.8661333333333333e-05, + "loss": 0.1245, + "step": 14022 + }, + { + "epoch": 11.041748719968492, + "grad_norm": 0.4940291941165924, + "learning_rate": 2.8661000000000002e-05, + "loss": 0.1133, + "step": 14023 + }, + { + "epoch": 11.04253643166601, + "grad_norm": 0.47257688641548157, + "learning_rate": 2.8660666666666665e-05, + "loss": 0.067, + "step": 14024 + }, + { + "epoch": 11.04332414336353, + "grad_norm": 0.6416634321212769, + "learning_rate": 2.8660333333333334e-05, + "loss": 0.0499, + "step": 14025 + }, + { + "epoch": 11.044111855061047, + "grad_norm": 0.7140263319015503, + "learning_rate": 2.8660000000000003e-05, + "loss": 0.0293, + "step": 14026 + }, + { + "epoch": 11.044899566758566, + "grad_norm": 0.1796698421239853, + "learning_rate": 2.8659666666666666e-05, + "loss": 0.0262, + "step": 14027 + }, + { + "epoch": 11.045687278456086, + "grad_norm": 0.42299678921699524, + "learning_rate": 2.8659333333333335e-05, + "loss": 0.0259, + "step": 14028 + }, + { + "epoch": 11.046474990153603, + "grad_norm": 0.2086622714996338, + "learning_rate": 2.8659e-05, + "loss": 0.0136, + "step": 14029 + }, + { + "epoch": 11.047262701851123, + "grad_norm": 
0.2671079635620117, + "learning_rate": 2.8658666666666667e-05, + "loss": 0.0161, + "step": 14030 + }, + { + "epoch": 11.04805041354864, + "grad_norm": 0.3714587390422821, + "learning_rate": 2.8658333333333333e-05, + "loss": 0.0513, + "step": 14031 + }, + { + "epoch": 11.04883812524616, + "grad_norm": 0.15329863131046295, + "learning_rate": 2.8658000000000002e-05, + "loss": 0.0101, + "step": 14032 + }, + { + "epoch": 11.04962583694368, + "grad_norm": 0.2452220916748047, + "learning_rate": 2.8657666666666665e-05, + "loss": 0.011, + "step": 14033 + }, + { + "epoch": 11.050413548641197, + "grad_norm": 0.3542845845222473, + "learning_rate": 2.8657333333333334e-05, + "loss": 0.0221, + "step": 14034 + }, + { + "epoch": 11.051201260338717, + "grad_norm": 0.19017502665519714, + "learning_rate": 2.8657000000000004e-05, + "loss": 0.0114, + "step": 14035 + }, + { + "epoch": 11.051988972036234, + "grad_norm": 0.3573198914527893, + "learning_rate": 2.8656666666666666e-05, + "loss": 0.0129, + "step": 14036 + }, + { + "epoch": 11.052776683733754, + "grad_norm": 0.16851703822612762, + "learning_rate": 2.8656333333333335e-05, + "loss": 0.0129, + "step": 14037 + }, + { + "epoch": 11.053564395431271, + "grad_norm": 0.2325402796268463, + "learning_rate": 2.8656e-05, + "loss": 0.0097, + "step": 14038 + }, + { + "epoch": 11.054352107128791, + "grad_norm": 0.15303608775138855, + "learning_rate": 2.8655666666666667e-05, + "loss": 0.008, + "step": 14039 + }, + { + "epoch": 11.05513981882631, + "grad_norm": 0.3444187343120575, + "learning_rate": 2.8655333333333333e-05, + "loss": 0.0098, + "step": 14040 + }, + { + "epoch": 11.055927530523828, + "grad_norm": 0.6073065996170044, + "learning_rate": 2.8655000000000003e-05, + "loss": 0.0106, + "step": 14041 + }, + { + "epoch": 11.056715242221347, + "grad_norm": 0.8934282660484314, + "learning_rate": 2.8654666666666665e-05, + "loss": 0.0132, + "step": 14042 + }, + { + "epoch": 11.057502953918865, + "grad_norm": 0.1805800348520279, + "learning_rate": 2.8654333333333334e-05, + "loss": 0.01, + "step": 14043 + }, + { + "epoch": 11.058290665616385, + "grad_norm": 0.10261703282594681, + "learning_rate": 2.8654e-05, + "loss": 0.0057, + "step": 14044 + }, + { + "epoch": 11.059078377313902, + "grad_norm": 0.347678542137146, + "learning_rate": 2.8653666666666666e-05, + "loss": 0.0131, + "step": 14045 + }, + { + "epoch": 11.059866089011422, + "grad_norm": 0.24151991307735443, + "learning_rate": 2.8653333333333336e-05, + "loss": 0.0172, + "step": 14046 + }, + { + "epoch": 11.060653800708941, + "grad_norm": 0.2637197971343994, + "learning_rate": 2.8652999999999998e-05, + "loss": 0.0149, + "step": 14047 + }, + { + "epoch": 11.061441512406459, + "grad_norm": 0.23408663272857666, + "learning_rate": 2.8652666666666668e-05, + "loss": 0.0138, + "step": 14048 + }, + { + "epoch": 11.062229224103978, + "grad_norm": 0.3758713901042938, + "learning_rate": 2.8652333333333334e-05, + "loss": 0.0074, + "step": 14049 + }, + { + "epoch": 11.063016935801496, + "grad_norm": 0.13137589395046234, + "learning_rate": 2.8652e-05, + "loss": 0.0063, + "step": 14050 + }, + { + "epoch": 11.063804647499015, + "grad_norm": 0.16038250923156738, + "learning_rate": 2.865166666666667e-05, + "loss": 0.0111, + "step": 14051 + }, + { + "epoch": 11.064592359196535, + "grad_norm": 0.143564835190773, + "learning_rate": 2.8651333333333335e-05, + "loss": 0.0045, + "step": 14052 + }, + { + "epoch": 11.065380070894053, + "grad_norm": 0.39375197887420654, + "learning_rate": 2.8651e-05, + "loss": 0.0184, + "step": 14053 + }, + { + 
"epoch": 11.066167782591572, + "grad_norm": 0.5383285880088806, + "learning_rate": 2.8650666666666667e-05, + "loss": 0.0116, + "step": 14054 + }, + { + "epoch": 11.06695549428909, + "grad_norm": 0.16058038175106049, + "learning_rate": 2.8650333333333336e-05, + "loss": 0.0076, + "step": 14055 + }, + { + "epoch": 11.06774320598661, + "grad_norm": 0.40006494522094727, + "learning_rate": 2.865e-05, + "loss": 0.0095, + "step": 14056 + }, + { + "epoch": 11.068530917684127, + "grad_norm": 0.22995947301387787, + "learning_rate": 2.8649666666666668e-05, + "loss": 0.0088, + "step": 14057 + }, + { + "epoch": 11.069318629381646, + "grad_norm": 0.19590117037296295, + "learning_rate": 2.8649333333333334e-05, + "loss": 0.0122, + "step": 14058 + }, + { + "epoch": 11.070106341079166, + "grad_norm": 0.5024939775466919, + "learning_rate": 2.8649e-05, + "loss": 0.0211, + "step": 14059 + }, + { + "epoch": 11.070894052776683, + "grad_norm": 0.19615840911865234, + "learning_rate": 2.864866666666667e-05, + "loss": 0.0166, + "step": 14060 + }, + { + "epoch": 11.071681764474203, + "grad_norm": 0.21999028325080872, + "learning_rate": 2.8648333333333335e-05, + "loss": 0.0109, + "step": 14061 + }, + { + "epoch": 11.07246947617172, + "grad_norm": 0.2890532314777374, + "learning_rate": 2.8648e-05, + "loss": 0.0137, + "step": 14062 + }, + { + "epoch": 11.07325718786924, + "grad_norm": 0.13524067401885986, + "learning_rate": 2.8647666666666667e-05, + "loss": 0.0042, + "step": 14063 + }, + { + "epoch": 11.074044899566758, + "grad_norm": 0.3144354522228241, + "learning_rate": 2.8647333333333336e-05, + "loss": 0.0128, + "step": 14064 + }, + { + "epoch": 11.074832611264277, + "grad_norm": 0.42904284596443176, + "learning_rate": 2.8647e-05, + "loss": 0.0247, + "step": 14065 + }, + { + "epoch": 11.075620322961797, + "grad_norm": 0.22910241782665253, + "learning_rate": 2.8646666666666668e-05, + "loss": 0.0085, + "step": 14066 + }, + { + "epoch": 11.076408034659314, + "grad_norm": 0.39422935247421265, + "learning_rate": 2.8646333333333334e-05, + "loss": 0.0108, + "step": 14067 + }, + { + "epoch": 11.077195746356834, + "grad_norm": 0.18152369558811188, + "learning_rate": 2.8646e-05, + "loss": 0.0065, + "step": 14068 + }, + { + "epoch": 11.077983458054351, + "grad_norm": 0.3256361186504364, + "learning_rate": 2.864566666666667e-05, + "loss": 0.0105, + "step": 14069 + }, + { + "epoch": 11.078771169751871, + "grad_norm": 0.2548161447048187, + "learning_rate": 2.8645333333333335e-05, + "loss": 0.0083, + "step": 14070 + }, + { + "epoch": 11.07955888144939, + "grad_norm": 1.4075360298156738, + "learning_rate": 2.8645e-05, + "loss": 0.2943, + "step": 14071 + }, + { + "epoch": 11.080346593146908, + "grad_norm": 0.6945061683654785, + "learning_rate": 2.8644666666666667e-05, + "loss": 0.1672, + "step": 14072 + }, + { + "epoch": 11.081134304844428, + "grad_norm": 0.5783379077911377, + "learning_rate": 2.8644333333333336e-05, + "loss": 0.1072, + "step": 14073 + }, + { + "epoch": 11.081922016541945, + "grad_norm": 0.42255350947380066, + "learning_rate": 2.8644e-05, + "loss": 0.0609, + "step": 14074 + }, + { + "epoch": 11.082709728239465, + "grad_norm": 0.6031323671340942, + "learning_rate": 2.8643666666666668e-05, + "loss": 0.0474, + "step": 14075 + }, + { + "epoch": 11.083497439936982, + "grad_norm": 0.2934197783470154, + "learning_rate": 2.8643333333333334e-05, + "loss": 0.0656, + "step": 14076 + }, + { + "epoch": 11.084285151634502, + "grad_norm": 0.2980518043041229, + "learning_rate": 2.8643e-05, + "loss": 0.0466, + "step": 14077 + }, + { 
+ "epoch": 11.085072863332021, + "grad_norm": 0.17279799282550812, + "learning_rate": 2.864266666666667e-05, + "loss": 0.0137, + "step": 14078 + }, + { + "epoch": 11.085860575029539, + "grad_norm": 0.25575724244117737, + "learning_rate": 2.8642333333333332e-05, + "loss": 0.0197, + "step": 14079 + }, + { + "epoch": 11.086648286727058, + "grad_norm": 0.33772221207618713, + "learning_rate": 2.8642e-05, + "loss": 0.0166, + "step": 14080 + }, + { + "epoch": 11.087435998424576, + "grad_norm": 0.19929420948028564, + "learning_rate": 2.8641666666666667e-05, + "loss": 0.0159, + "step": 14081 + }, + { + "epoch": 11.088223710122096, + "grad_norm": 0.20278893411159515, + "learning_rate": 2.8641333333333333e-05, + "loss": 0.0137, + "step": 14082 + }, + { + "epoch": 11.089011421819613, + "grad_norm": 0.1272744983434677, + "learning_rate": 2.8641e-05, + "loss": 0.0071, + "step": 14083 + }, + { + "epoch": 11.089799133517133, + "grad_norm": 1.7799413204193115, + "learning_rate": 2.864066666666667e-05, + "loss": 0.0168, + "step": 14084 + }, + { + "epoch": 11.090586845214652, + "grad_norm": 0.18367192149162292, + "learning_rate": 2.8640333333333334e-05, + "loss": 0.0117, + "step": 14085 + }, + { + "epoch": 11.09137455691217, + "grad_norm": 0.15510542690753937, + "learning_rate": 2.864e-05, + "loss": 0.0212, + "step": 14086 + }, + { + "epoch": 11.09216226860969, + "grad_norm": 0.17706415057182312, + "learning_rate": 2.863966666666667e-05, + "loss": 0.0098, + "step": 14087 + }, + { + "epoch": 11.092949980307207, + "grad_norm": 0.23779058456420898, + "learning_rate": 2.8639333333333332e-05, + "loss": 0.0111, + "step": 14088 + }, + { + "epoch": 11.093737692004726, + "grad_norm": 0.2956327795982361, + "learning_rate": 2.8639e-05, + "loss": 0.0128, + "step": 14089 + }, + { + "epoch": 11.094525403702246, + "grad_norm": 0.3005991280078888, + "learning_rate": 2.8638666666666667e-05, + "loss": 0.0151, + "step": 14090 + }, + { + "epoch": 11.095313115399764, + "grad_norm": 0.16317453980445862, + "learning_rate": 2.8638333333333333e-05, + "loss": 0.0119, + "step": 14091 + }, + { + "epoch": 11.096100827097283, + "grad_norm": 0.1256873458623886, + "learning_rate": 2.8638e-05, + "loss": 0.0081, + "step": 14092 + }, + { + "epoch": 11.0968885387948, + "grad_norm": 0.16533790528774261, + "learning_rate": 2.863766666666667e-05, + "loss": 0.0107, + "step": 14093 + }, + { + "epoch": 11.09767625049232, + "grad_norm": 0.28221797943115234, + "learning_rate": 2.8637333333333334e-05, + "loss": 0.0102, + "step": 14094 + }, + { + "epoch": 11.098463962189838, + "grad_norm": 0.1871919333934784, + "learning_rate": 2.8637e-05, + "loss": 0.0104, + "step": 14095 + }, + { + "epoch": 11.099251673887357, + "grad_norm": 0.20913170278072357, + "learning_rate": 2.863666666666667e-05, + "loss": 0.0053, + "step": 14096 + }, + { + "epoch": 11.100039385584877, + "grad_norm": 0.3854799270629883, + "learning_rate": 2.8636333333333332e-05, + "loss": 0.0458, + "step": 14097 + }, + { + "epoch": 11.100827097282394, + "grad_norm": 0.5836619138717651, + "learning_rate": 2.8636e-05, + "loss": 0.0163, + "step": 14098 + }, + { + "epoch": 11.101614808979914, + "grad_norm": 0.12173948436975479, + "learning_rate": 2.8635666666666668e-05, + "loss": 0.0048, + "step": 14099 + }, + { + "epoch": 11.102402520677431, + "grad_norm": 0.12218933552503586, + "learning_rate": 2.8635333333333333e-05, + "loss": 0.0076, + "step": 14100 + }, + { + "epoch": 11.103190232374951, + "grad_norm": 0.22124874591827393, + "learning_rate": 2.8635000000000003e-05, + "loss": 0.006, + "step": 
14101 + }, + { + "epoch": 11.103977944072469, + "grad_norm": 0.22507306933403015, + "learning_rate": 2.863466666666667e-05, + "loss": 0.0062, + "step": 14102 + }, + { + "epoch": 11.104765655769988, + "grad_norm": 0.2736895680427551, + "learning_rate": 2.8634333333333335e-05, + "loss": 0.0086, + "step": 14103 + }, + { + "epoch": 11.105553367467508, + "grad_norm": 0.23095473647117615, + "learning_rate": 2.8634e-05, + "loss": 0.0135, + "step": 14104 + }, + { + "epoch": 11.106341079165025, + "grad_norm": 0.4660671353340149, + "learning_rate": 2.8633666666666667e-05, + "loss": 0.0083, + "step": 14105 + }, + { + "epoch": 11.107128790862545, + "grad_norm": 0.4896681010723114, + "learning_rate": 2.8633333333333332e-05, + "loss": 0.0145, + "step": 14106 + }, + { + "epoch": 11.107916502560062, + "grad_norm": 0.08904425799846649, + "learning_rate": 2.8633000000000002e-05, + "loss": 0.0046, + "step": 14107 + }, + { + "epoch": 11.108704214257582, + "grad_norm": 0.320347398519516, + "learning_rate": 2.8632666666666664e-05, + "loss": 0.0206, + "step": 14108 + }, + { + "epoch": 11.109491925955101, + "grad_norm": 0.7917614579200745, + "learning_rate": 2.8632333333333334e-05, + "loss": 0.0173, + "step": 14109 + }, + { + "epoch": 11.110279637652619, + "grad_norm": 0.20313677191734314, + "learning_rate": 2.8632000000000003e-05, + "loss": 0.0081, + "step": 14110 + }, + { + "epoch": 11.111067349350138, + "grad_norm": 0.08063093572854996, + "learning_rate": 2.8631666666666666e-05, + "loss": 0.0027, + "step": 14111 + }, + { + "epoch": 11.111855061047656, + "grad_norm": 0.5306657552719116, + "learning_rate": 2.8631333333333335e-05, + "loss": 0.011, + "step": 14112 + }, + { + "epoch": 11.112642772745176, + "grad_norm": 0.2257840484380722, + "learning_rate": 2.8631e-05, + "loss": 0.012, + "step": 14113 + }, + { + "epoch": 11.113430484442693, + "grad_norm": 0.5220324993133545, + "learning_rate": 2.8630666666666667e-05, + "loss": 0.0187, + "step": 14114 + }, + { + "epoch": 11.114218196140213, + "grad_norm": 1.230670690536499, + "learning_rate": 2.8630333333333333e-05, + "loss": 0.0217, + "step": 14115 + }, + { + "epoch": 11.115005907837732, + "grad_norm": 0.40555793046951294, + "learning_rate": 2.8630000000000002e-05, + "loss": 0.0172, + "step": 14116 + }, + { + "epoch": 11.11579361953525, + "grad_norm": 0.2148723006248474, + "learning_rate": 2.8629666666666665e-05, + "loss": 0.0066, + "step": 14117 + }, + { + "epoch": 11.11658133123277, + "grad_norm": 0.3201863169670105, + "learning_rate": 2.8629333333333334e-05, + "loss": 0.0068, + "step": 14118 + }, + { + "epoch": 11.117369042930287, + "grad_norm": 0.3371534049510956, + "learning_rate": 2.8629000000000003e-05, + "loss": 0.0113, + "step": 14119 + }, + { + "epoch": 11.118156754627806, + "grad_norm": 0.2708929777145386, + "learning_rate": 2.8628666666666666e-05, + "loss": 0.0092, + "step": 14120 + }, + { + "epoch": 11.118944466325324, + "grad_norm": 1.1354563236236572, + "learning_rate": 2.8628333333333335e-05, + "loss": 0.281, + "step": 14121 + }, + { + "epoch": 11.119732178022844, + "grad_norm": 0.5822620987892151, + "learning_rate": 2.8628e-05, + "loss": 0.1719, + "step": 14122 + }, + { + "epoch": 11.120519889720363, + "grad_norm": 0.7961868047714233, + "learning_rate": 2.8627666666666667e-05, + "loss": 0.181, + "step": 14123 + }, + { + "epoch": 11.12130760141788, + "grad_norm": 0.6224073767662048, + "learning_rate": 2.8627333333333333e-05, + "loss": 0.1107, + "step": 14124 + }, + { + "epoch": 11.1220953131154, + "grad_norm": 0.49160850048065186, + "learning_rate": 
2.8627000000000002e-05, + "loss": 0.066, + "step": 14125 + }, + { + "epoch": 11.122883024812918, + "grad_norm": 0.22342675924301147, + "learning_rate": 2.8626666666666668e-05, + "loss": 0.0531, + "step": 14126 + }, + { + "epoch": 11.123670736510437, + "grad_norm": 0.2626640796661377, + "learning_rate": 2.8626333333333334e-05, + "loss": 0.0318, + "step": 14127 + }, + { + "epoch": 11.124458448207957, + "grad_norm": 0.25718772411346436, + "learning_rate": 2.8626000000000003e-05, + "loss": 0.0335, + "step": 14128 + }, + { + "epoch": 11.125246159905474, + "grad_norm": 0.26527655124664307, + "learning_rate": 2.8625666666666666e-05, + "loss": 0.0171, + "step": 14129 + }, + { + "epoch": 11.126033871602994, + "grad_norm": 0.32777512073516846, + "learning_rate": 2.8625333333333335e-05, + "loss": 0.0173, + "step": 14130 + }, + { + "epoch": 11.126821583300512, + "grad_norm": 0.38514500856399536, + "learning_rate": 2.8625e-05, + "loss": 0.0238, + "step": 14131 + }, + { + "epoch": 11.127609294998031, + "grad_norm": 0.31848230957984924, + "learning_rate": 2.8624666666666667e-05, + "loss": 0.0309, + "step": 14132 + }, + { + "epoch": 11.128397006695549, + "grad_norm": 0.4326275885105133, + "learning_rate": 2.8624333333333333e-05, + "loss": 0.0142, + "step": 14133 + }, + { + "epoch": 11.129184718393068, + "grad_norm": 0.22574284672737122, + "learning_rate": 2.8624000000000002e-05, + "loss": 0.0116, + "step": 14134 + }, + { + "epoch": 11.129972430090588, + "grad_norm": 0.49219560623168945, + "learning_rate": 2.862366666666667e-05, + "loss": 0.0307, + "step": 14135 + }, + { + "epoch": 11.130760141788105, + "grad_norm": 0.40254127979278564, + "learning_rate": 2.8623333333333334e-05, + "loss": 0.017, + "step": 14136 + }, + { + "epoch": 11.131547853485625, + "grad_norm": 0.15722256898880005, + "learning_rate": 2.8623e-05, + "loss": 0.01, + "step": 14137 + }, + { + "epoch": 11.132335565183142, + "grad_norm": 0.2940341532230377, + "learning_rate": 2.8622666666666666e-05, + "loss": 0.0149, + "step": 14138 + }, + { + "epoch": 11.133123276880662, + "grad_norm": 0.16997729241847992, + "learning_rate": 2.8622333333333335e-05, + "loss": 0.0099, + "step": 14139 + }, + { + "epoch": 11.13391098857818, + "grad_norm": 0.14358526468276978, + "learning_rate": 2.8621999999999998e-05, + "loss": 0.0113, + "step": 14140 + }, + { + "epoch": 11.134698700275699, + "grad_norm": 0.25951772928237915, + "learning_rate": 2.8621666666666667e-05, + "loss": 0.0129, + "step": 14141 + }, + { + "epoch": 11.135486411973218, + "grad_norm": 0.10541409254074097, + "learning_rate": 2.8621333333333333e-05, + "loss": 0.0111, + "step": 14142 + }, + { + "epoch": 11.136274123670736, + "grad_norm": 0.1531326025724411, + "learning_rate": 2.8621e-05, + "loss": 0.0079, + "step": 14143 + }, + { + "epoch": 11.137061835368256, + "grad_norm": 0.1549311727285385, + "learning_rate": 2.862066666666667e-05, + "loss": 0.0054, + "step": 14144 + }, + { + "epoch": 11.137849547065773, + "grad_norm": 0.27021196484565735, + "learning_rate": 2.8620333333333334e-05, + "loss": 0.0139, + "step": 14145 + }, + { + "epoch": 11.138637258763293, + "grad_norm": 0.2298886924982071, + "learning_rate": 2.862e-05, + "loss": 0.0591, + "step": 14146 + }, + { + "epoch": 11.139424970460812, + "grad_norm": 0.38603290915489197, + "learning_rate": 2.8619666666666666e-05, + "loss": 0.0132, + "step": 14147 + }, + { + "epoch": 11.14021268215833, + "grad_norm": 0.4146521985530853, + "learning_rate": 2.8619333333333336e-05, + "loss": 0.0094, + "step": 14148 + }, + { + "epoch": 11.14100039385585, + 
"grad_norm": 0.2674506604671478, + "learning_rate": 2.8618999999999998e-05, + "loss": 0.0123, + "step": 14149 + }, + { + "epoch": 11.141788105553367, + "grad_norm": 0.23972737789154053, + "learning_rate": 2.8618666666666668e-05, + "loss": 0.0097, + "step": 14150 + }, + { + "epoch": 11.142575817250886, + "grad_norm": 0.18727312982082367, + "learning_rate": 2.8618333333333337e-05, + "loss": 0.0103, + "step": 14151 + }, + { + "epoch": 11.143363528948404, + "grad_norm": 0.2449464648962021, + "learning_rate": 2.8618e-05, + "loss": 0.016, + "step": 14152 + }, + { + "epoch": 11.144151240645924, + "grad_norm": 0.5493233799934387, + "learning_rate": 2.861766666666667e-05, + "loss": 0.009, + "step": 14153 + }, + { + "epoch": 11.144938952343443, + "grad_norm": 0.1988309770822525, + "learning_rate": 2.8617333333333335e-05, + "loss": 0.0067, + "step": 14154 + }, + { + "epoch": 11.14572666404096, + "grad_norm": 0.47690877318382263, + "learning_rate": 2.8617e-05, + "loss": 0.0264, + "step": 14155 + }, + { + "epoch": 11.14651437573848, + "grad_norm": 0.4455273151397705, + "learning_rate": 2.8616666666666667e-05, + "loss": 0.0154, + "step": 14156 + }, + { + "epoch": 11.147302087435998, + "grad_norm": 0.2969372570514679, + "learning_rate": 2.8616333333333336e-05, + "loss": 0.0124, + "step": 14157 + }, + { + "epoch": 11.148089799133517, + "grad_norm": 0.12604625523090363, + "learning_rate": 2.8616e-05, + "loss": 0.0117, + "step": 14158 + }, + { + "epoch": 11.148877510831035, + "grad_norm": 0.29277530312538147, + "learning_rate": 2.8615666666666668e-05, + "loss": 0.0109, + "step": 14159 + }, + { + "epoch": 11.149665222528554, + "grad_norm": 0.29394203424453735, + "learning_rate": 2.8615333333333337e-05, + "loss": 0.0076, + "step": 14160 + }, + { + "epoch": 11.150452934226074, + "grad_norm": 0.41290727257728577, + "learning_rate": 2.8615e-05, + "loss": 0.017, + "step": 14161 + }, + { + "epoch": 11.151240645923592, + "grad_norm": 0.2760210931301117, + "learning_rate": 2.861466666666667e-05, + "loss": 0.0107, + "step": 14162 + }, + { + "epoch": 11.152028357621111, + "grad_norm": 0.2654697597026825, + "learning_rate": 2.8614333333333335e-05, + "loss": 0.0065, + "step": 14163 + }, + { + "epoch": 11.152816069318629, + "grad_norm": 0.2505892217159271, + "learning_rate": 2.8614e-05, + "loss": 0.0105, + "step": 14164 + }, + { + "epoch": 11.153603781016148, + "grad_norm": 0.29686251282691956, + "learning_rate": 2.8613666666666667e-05, + "loss": 0.0176, + "step": 14165 + }, + { + "epoch": 11.154391492713668, + "grad_norm": 0.23030349612236023, + "learning_rate": 2.8613333333333333e-05, + "loss": 0.0067, + "step": 14166 + }, + { + "epoch": 11.155179204411185, + "grad_norm": 0.31357553601264954, + "learning_rate": 2.8613e-05, + "loss": 0.0122, + "step": 14167 + }, + { + "epoch": 11.155966916108705, + "grad_norm": 0.322862833738327, + "learning_rate": 2.8612666666666668e-05, + "loss": 0.0115, + "step": 14168 + }, + { + "epoch": 11.156754627806222, + "grad_norm": 0.23047703504562378, + "learning_rate": 2.8612333333333334e-05, + "loss": 0.0074, + "step": 14169 + }, + { + "epoch": 11.157542339503742, + "grad_norm": 0.6706954836845398, + "learning_rate": 2.8612e-05, + "loss": 0.0249, + "step": 14170 + }, + { + "epoch": 11.15833005120126, + "grad_norm": 0.9295711517333984, + "learning_rate": 2.861166666666667e-05, + "loss": 0.1712, + "step": 14171 + }, + { + "epoch": 11.159117762898779, + "grad_norm": 0.5931227803230286, + "learning_rate": 2.861133333333333e-05, + "loss": 0.1526, + "step": 14172 + }, + { + "epoch": 
11.159905474596298, + "grad_norm": 1.2272827625274658, + "learning_rate": 2.8611e-05, + "loss": 0.1379, + "step": 14173 + }, + { + "epoch": 11.160693186293816, + "grad_norm": 0.393498957157135, + "learning_rate": 2.8610666666666667e-05, + "loss": 0.0957, + "step": 14174 + }, + { + "epoch": 11.161480897991336, + "grad_norm": 0.3179488480091095, + "learning_rate": 2.8610333333333333e-05, + "loss": 0.0442, + "step": 14175 + }, + { + "epoch": 11.162268609688853, + "grad_norm": 0.2508774995803833, + "learning_rate": 2.8610000000000002e-05, + "loss": 0.0312, + "step": 14176 + }, + { + "epoch": 11.163056321386373, + "grad_norm": 0.2506919801235199, + "learning_rate": 2.8609666666666668e-05, + "loss": 0.0259, + "step": 14177 + }, + { + "epoch": 11.16384403308389, + "grad_norm": 0.214347705245018, + "learning_rate": 2.8609333333333334e-05, + "loss": 0.0118, + "step": 14178 + }, + { + "epoch": 11.16463174478141, + "grad_norm": 0.17522312700748444, + "learning_rate": 2.8609e-05, + "loss": 0.0096, + "step": 14179 + }, + { + "epoch": 11.16541945647893, + "grad_norm": 0.2705625891685486, + "learning_rate": 2.860866666666667e-05, + "loss": 0.0147, + "step": 14180 + }, + { + "epoch": 11.166207168176447, + "grad_norm": 0.22757644951343536, + "learning_rate": 2.8608333333333332e-05, + "loss": 0.0141, + "step": 14181 + }, + { + "epoch": 11.166994879873966, + "grad_norm": 0.17312957346439362, + "learning_rate": 2.8608e-05, + "loss": 0.0108, + "step": 14182 + }, + { + "epoch": 11.167782591571484, + "grad_norm": 0.11921897530555725, + "learning_rate": 2.8607666666666667e-05, + "loss": 0.0059, + "step": 14183 + }, + { + "epoch": 11.168570303269004, + "grad_norm": 0.149479940533638, + "learning_rate": 2.8607333333333333e-05, + "loss": 0.0099, + "step": 14184 + }, + { + "epoch": 11.169358014966523, + "grad_norm": 0.16795717179775238, + "learning_rate": 2.8607000000000002e-05, + "loss": 0.0156, + "step": 14185 + }, + { + "epoch": 11.17014572666404, + "grad_norm": 0.1502688080072403, + "learning_rate": 2.860666666666667e-05, + "loss": 0.0072, + "step": 14186 + }, + { + "epoch": 11.17093343836156, + "grad_norm": 0.16639454662799835, + "learning_rate": 2.8606333333333334e-05, + "loss": 0.0091, + "step": 14187 + }, + { + "epoch": 11.171721150059078, + "grad_norm": 0.12870186567306519, + "learning_rate": 2.8606e-05, + "loss": 0.0074, + "step": 14188 + }, + { + "epoch": 11.172508861756597, + "grad_norm": 0.13736365735530853, + "learning_rate": 2.860566666666667e-05, + "loss": 0.0086, + "step": 14189 + }, + { + "epoch": 11.173296573454115, + "grad_norm": 0.22357790172100067, + "learning_rate": 2.8605333333333332e-05, + "loss": 0.0113, + "step": 14190 + }, + { + "epoch": 11.174084285151634, + "grad_norm": 0.1644410789012909, + "learning_rate": 2.8605e-05, + "loss": 0.0096, + "step": 14191 + }, + { + "epoch": 11.174871996849154, + "grad_norm": 0.10031171888113022, + "learning_rate": 2.8604666666666667e-05, + "loss": 0.0083, + "step": 14192 + }, + { + "epoch": 11.175659708546672, + "grad_norm": 0.20401358604431152, + "learning_rate": 2.8604333333333333e-05, + "loss": 0.009, + "step": 14193 + }, + { + "epoch": 11.176447420244191, + "grad_norm": 0.15358580648899078, + "learning_rate": 2.8604000000000003e-05, + "loss": 0.0066, + "step": 14194 + }, + { + "epoch": 11.177235131941709, + "grad_norm": 0.11031727492809296, + "learning_rate": 2.8603666666666665e-05, + "loss": 0.0069, + "step": 14195 + }, + { + "epoch": 11.178022843639228, + "grad_norm": 0.22111958265304565, + "learning_rate": 2.8603333333333334e-05, + "loss": 0.0145, 
+ "step": 14196 + }, + { + "epoch": 11.178810555336748, + "grad_norm": 0.47246086597442627, + "learning_rate": 2.8603e-05, + "loss": 0.0125, + "step": 14197 + }, + { + "epoch": 11.179598267034265, + "grad_norm": 0.20357614755630493, + "learning_rate": 2.8602666666666666e-05, + "loss": 0.0116, + "step": 14198 + }, + { + "epoch": 11.180385978731785, + "grad_norm": 0.20143860578536987, + "learning_rate": 2.8602333333333332e-05, + "loss": 0.0125, + "step": 14199 + }, + { + "epoch": 11.181173690429302, + "grad_norm": 0.31985458731651306, + "learning_rate": 2.8602e-05, + "loss": 0.0108, + "step": 14200 + }, + { + "epoch": 11.181961402126822, + "grad_norm": 0.3492068946361542, + "learning_rate": 2.8601666666666668e-05, + "loss": 0.0095, + "step": 14201 + }, + { + "epoch": 11.18274911382434, + "grad_norm": 0.21218909323215485, + "learning_rate": 2.8601333333333333e-05, + "loss": 0.0077, + "step": 14202 + }, + { + "epoch": 11.183536825521859, + "grad_norm": 0.18386530876159668, + "learning_rate": 2.8601000000000003e-05, + "loss": 0.0103, + "step": 14203 + }, + { + "epoch": 11.184324537219378, + "grad_norm": 0.16799862682819366, + "learning_rate": 2.8600666666666665e-05, + "loss": 0.02, + "step": 14204 + }, + { + "epoch": 11.185112248916896, + "grad_norm": 0.17614276707172394, + "learning_rate": 2.8600333333333335e-05, + "loss": 0.007, + "step": 14205 + }, + { + "epoch": 11.185899960614416, + "grad_norm": 0.18029999732971191, + "learning_rate": 2.86e-05, + "loss": 0.0068, + "step": 14206 + }, + { + "epoch": 11.186687672311933, + "grad_norm": 0.13322605192661285, + "learning_rate": 2.8599666666666667e-05, + "loss": 0.0063, + "step": 14207 + }, + { + "epoch": 11.187475384009453, + "grad_norm": 0.2234618216753006, + "learning_rate": 2.8599333333333332e-05, + "loss": 0.0101, + "step": 14208 + }, + { + "epoch": 11.18826309570697, + "grad_norm": 0.4032365083694458, + "learning_rate": 2.8599000000000002e-05, + "loss": 0.0146, + "step": 14209 + }, + { + "epoch": 11.18905080740449, + "grad_norm": 0.18788523972034454, + "learning_rate": 2.8598666666666668e-05, + "loss": 0.0104, + "step": 14210 + }, + { + "epoch": 11.18983851910201, + "grad_norm": 0.1163051426410675, + "learning_rate": 2.8598333333333334e-05, + "loss": 0.0053, + "step": 14211 + }, + { + "epoch": 11.190626230799527, + "grad_norm": 0.6125841736793518, + "learning_rate": 2.8598000000000003e-05, + "loss": 0.0128, + "step": 14212 + }, + { + "epoch": 11.191413942497046, + "grad_norm": 0.3198662996292114, + "learning_rate": 2.8597666666666666e-05, + "loss": 0.0066, + "step": 14213 + }, + { + "epoch": 11.192201654194564, + "grad_norm": 0.49338358640670776, + "learning_rate": 2.8597333333333335e-05, + "loss": 0.0178, + "step": 14214 + }, + { + "epoch": 11.192989365892084, + "grad_norm": 0.25439685583114624, + "learning_rate": 2.8597e-05, + "loss": 0.0086, + "step": 14215 + }, + { + "epoch": 11.193777077589603, + "grad_norm": 0.45466986298561096, + "learning_rate": 2.8596666666666667e-05, + "loss": 0.0133, + "step": 14216 + }, + { + "epoch": 11.19456478928712, + "grad_norm": 0.3318633735179901, + "learning_rate": 2.8596333333333333e-05, + "loss": 0.0106, + "step": 14217 + }, + { + "epoch": 11.19535250098464, + "grad_norm": 0.37736037373542786, + "learning_rate": 2.8596000000000002e-05, + "loss": 0.0076, + "step": 14218 + }, + { + "epoch": 11.196140212682158, + "grad_norm": 0.23223355412483215, + "learning_rate": 2.8595666666666668e-05, + "loss": 0.0079, + "step": 14219 + }, + { + "epoch": 11.196927924379677, + "grad_norm": 0.28500068187713623, + 
"learning_rate": 2.8595333333333334e-05, + "loss": 0.0092, + "step": 14220 + }, + { + "epoch": 11.197715636077195, + "grad_norm": 0.6408125162124634, + "learning_rate": 2.8595000000000003e-05, + "loss": 0.1584, + "step": 14221 + }, + { + "epoch": 11.198503347774714, + "grad_norm": 0.5394743084907532, + "learning_rate": 2.8594666666666666e-05, + "loss": 0.1779, + "step": 14222 + }, + { + "epoch": 11.199291059472234, + "grad_norm": 0.5571290850639343, + "learning_rate": 2.8594333333333335e-05, + "loss": 0.091, + "step": 14223 + }, + { + "epoch": 11.200078771169752, + "grad_norm": 0.6334397792816162, + "learning_rate": 2.8594e-05, + "loss": 0.0973, + "step": 14224 + }, + { + "epoch": 11.200866482867271, + "grad_norm": 0.5073045492172241, + "learning_rate": 2.8593666666666667e-05, + "loss": 0.0664, + "step": 14225 + }, + { + "epoch": 11.201654194564789, + "grad_norm": 0.22302843630313873, + "learning_rate": 2.8593333333333336e-05, + "loss": 0.0251, + "step": 14226 + }, + { + "epoch": 11.202441906262308, + "grad_norm": 0.20548129081726074, + "learning_rate": 2.8593e-05, + "loss": 0.0169, + "step": 14227 + }, + { + "epoch": 11.203229617959826, + "grad_norm": 0.3388785719871521, + "learning_rate": 2.8592666666666668e-05, + "loss": 0.0262, + "step": 14228 + }, + { + "epoch": 11.204017329657345, + "grad_norm": 0.2773512303829193, + "learning_rate": 2.8592333333333334e-05, + "loss": 0.0138, + "step": 14229 + }, + { + "epoch": 11.204805041354865, + "grad_norm": 0.18687596917152405, + "learning_rate": 2.8592e-05, + "loss": 0.0136, + "step": 14230 + }, + { + "epoch": 11.205592753052382, + "grad_norm": 0.2364366352558136, + "learning_rate": 2.8591666666666666e-05, + "loss": 0.0179, + "step": 14231 + }, + { + "epoch": 11.206380464749902, + "grad_norm": 0.4771938920021057, + "learning_rate": 2.8591333333333335e-05, + "loss": 0.0588, + "step": 14232 + }, + { + "epoch": 11.20716817644742, + "grad_norm": 0.637908399105072, + "learning_rate": 2.8590999999999998e-05, + "loss": 0.0087, + "step": 14233 + }, + { + "epoch": 11.207955888144939, + "grad_norm": 0.21944989264011383, + "learning_rate": 2.8590666666666667e-05, + "loss": 0.0264, + "step": 14234 + }, + { + "epoch": 11.208743599842458, + "grad_norm": 0.40832313895225525, + "learning_rate": 2.8590333333333336e-05, + "loss": 0.0107, + "step": 14235 + }, + { + "epoch": 11.209531311539976, + "grad_norm": 0.35114067792892456, + "learning_rate": 2.859e-05, + "loss": 0.018, + "step": 14236 + }, + { + "epoch": 11.210319023237496, + "grad_norm": 0.3305507302284241, + "learning_rate": 2.858966666666667e-05, + "loss": 0.0104, + "step": 14237 + }, + { + "epoch": 11.211106734935013, + "grad_norm": 0.19578471779823303, + "learning_rate": 2.8589333333333334e-05, + "loss": 0.0117, + "step": 14238 + }, + { + "epoch": 11.211894446632533, + "grad_norm": 0.3596404194831848, + "learning_rate": 2.8589e-05, + "loss": 0.0115, + "step": 14239 + }, + { + "epoch": 11.21268215833005, + "grad_norm": 0.1787419468164444, + "learning_rate": 2.8588666666666666e-05, + "loss": 0.0079, + "step": 14240 + }, + { + "epoch": 11.21346987002757, + "grad_norm": 0.09285455197095871, + "learning_rate": 2.8588333333333335e-05, + "loss": 0.0086, + "step": 14241 + }, + { + "epoch": 11.21425758172509, + "grad_norm": 0.17252416908740997, + "learning_rate": 2.8587999999999998e-05, + "loss": 0.0095, + "step": 14242 + }, + { + "epoch": 11.215045293422607, + "grad_norm": 0.11557433009147644, + "learning_rate": 2.8587666666666667e-05, + "loss": 0.0049, + "step": 14243 + }, + { + "epoch": 11.215833005120126, + 
"grad_norm": 0.16428741812705994, + "learning_rate": 2.8587333333333337e-05, + "loss": 0.0097, + "step": 14244 + }, + { + "epoch": 11.216620716817644, + "grad_norm": 0.20927459001541138, + "learning_rate": 2.8587e-05, + "loss": 0.0111, + "step": 14245 + }, + { + "epoch": 11.217408428515164, + "grad_norm": 0.2469504177570343, + "learning_rate": 2.858666666666667e-05, + "loss": 0.0118, + "step": 14246 + }, + { + "epoch": 11.218196140212681, + "grad_norm": 0.08996894955635071, + "learning_rate": 2.8586333333333334e-05, + "loss": 0.0047, + "step": 14247 + }, + { + "epoch": 11.2189838519102, + "grad_norm": 0.25391724705696106, + "learning_rate": 2.8586e-05, + "loss": 0.014, + "step": 14248 + }, + { + "epoch": 11.21977156360772, + "grad_norm": 0.17735594511032104, + "learning_rate": 2.8585666666666666e-05, + "loss": 0.0087, + "step": 14249 + }, + { + "epoch": 11.220559275305238, + "grad_norm": 0.49652212858200073, + "learning_rate": 2.8585333333333336e-05, + "loss": 0.0235, + "step": 14250 + }, + { + "epoch": 11.221346987002757, + "grad_norm": 0.3088524639606476, + "learning_rate": 2.8585e-05, + "loss": 0.0106, + "step": 14251 + }, + { + "epoch": 11.222134698700275, + "grad_norm": 0.19831517338752747, + "learning_rate": 2.8584666666666668e-05, + "loss": 0.0108, + "step": 14252 + }, + { + "epoch": 11.222922410397794, + "grad_norm": 0.31733423471450806, + "learning_rate": 2.8584333333333337e-05, + "loss": 0.0099, + "step": 14253 + }, + { + "epoch": 11.223710122095314, + "grad_norm": 0.4123774468898773, + "learning_rate": 2.8584e-05, + "loss": 0.0107, + "step": 14254 + }, + { + "epoch": 11.224497833792832, + "grad_norm": 0.21112437546253204, + "learning_rate": 2.858366666666667e-05, + "loss": 0.0418, + "step": 14255 + }, + { + "epoch": 11.225285545490351, + "grad_norm": 0.13228756189346313, + "learning_rate": 2.858333333333333e-05, + "loss": 0.0079, + "step": 14256 + }, + { + "epoch": 11.226073257187869, + "grad_norm": 0.1387420892715454, + "learning_rate": 2.8583e-05, + "loss": 0.0093, + "step": 14257 + }, + { + "epoch": 11.226860968885388, + "grad_norm": 0.6092212200164795, + "learning_rate": 2.8582666666666667e-05, + "loss": 0.0106, + "step": 14258 + }, + { + "epoch": 11.227648680582906, + "grad_norm": 0.2520490884780884, + "learning_rate": 2.8582333333333332e-05, + "loss": 0.007, + "step": 14259 + }, + { + "epoch": 11.228436392280425, + "grad_norm": 0.2501658499240875, + "learning_rate": 2.8582000000000002e-05, + "loss": 0.0087, + "step": 14260 + }, + { + "epoch": 11.229224103977945, + "grad_norm": 0.23180356621742249, + "learning_rate": 2.8581666666666668e-05, + "loss": 0.0081, + "step": 14261 + }, + { + "epoch": 11.230011815675462, + "grad_norm": 0.18270716071128845, + "learning_rate": 2.8581333333333334e-05, + "loss": 0.007, + "step": 14262 + }, + { + "epoch": 11.230799527372982, + "grad_norm": 0.22941266000270844, + "learning_rate": 2.8581e-05, + "loss": 0.0096, + "step": 14263 + }, + { + "epoch": 11.2315872390705, + "grad_norm": 0.3926289975643158, + "learning_rate": 2.858066666666667e-05, + "loss": 0.0147, + "step": 14264 + }, + { + "epoch": 11.232374950768019, + "grad_norm": 0.18297156691551208, + "learning_rate": 2.858033333333333e-05, + "loss": 0.0113, + "step": 14265 + }, + { + "epoch": 11.233162662465537, + "grad_norm": 0.46629369258880615, + "learning_rate": 2.858e-05, + "loss": 0.0276, + "step": 14266 + }, + { + "epoch": 11.233950374163056, + "grad_norm": 0.35125860571861267, + "learning_rate": 2.8579666666666667e-05, + "loss": 0.0144, + "step": 14267 + }, + { + "epoch": 
11.234738085860576, + "grad_norm": 0.17451344430446625, + "learning_rate": 2.8579333333333333e-05, + "loss": 0.0053, + "step": 14268 + }, + { + "epoch": 11.235525797558093, + "grad_norm": 0.7165666222572327, + "learning_rate": 2.8579000000000002e-05, + "loss": 0.019, + "step": 14269 + }, + { + "epoch": 11.236313509255613, + "grad_norm": 0.31201082468032837, + "learning_rate": 2.8578666666666668e-05, + "loss": 0.0091, + "step": 14270 + }, + { + "epoch": 11.23710122095313, + "grad_norm": 0.7914271354675293, + "learning_rate": 2.8578333333333334e-05, + "loss": 0.1887, + "step": 14271 + }, + { + "epoch": 11.23788893265065, + "grad_norm": 0.6183032989501953, + "learning_rate": 2.8578e-05, + "loss": 0.2026, + "step": 14272 + }, + { + "epoch": 11.23867664434817, + "grad_norm": 0.5454406142234802, + "learning_rate": 2.857766666666667e-05, + "loss": 0.1122, + "step": 14273 + }, + { + "epoch": 11.239464356045687, + "grad_norm": 0.47323572635650635, + "learning_rate": 2.857733333333333e-05, + "loss": 0.1017, + "step": 14274 + }, + { + "epoch": 11.240252067743207, + "grad_norm": 0.9226886630058289, + "learning_rate": 2.8577e-05, + "loss": 0.0607, + "step": 14275 + }, + { + "epoch": 11.241039779440724, + "grad_norm": 0.21605129539966583, + "learning_rate": 2.857666666666667e-05, + "loss": 0.0301, + "step": 14276 + }, + { + "epoch": 11.241827491138244, + "grad_norm": 0.2824801802635193, + "learning_rate": 2.8576333333333333e-05, + "loss": 0.0158, + "step": 14277 + }, + { + "epoch": 11.242615202835761, + "grad_norm": 0.308024525642395, + "learning_rate": 2.8576000000000002e-05, + "loss": 0.021, + "step": 14278 + }, + { + "epoch": 11.24340291453328, + "grad_norm": 0.47753578424453735, + "learning_rate": 2.8575666666666668e-05, + "loss": 0.0164, + "step": 14279 + }, + { + "epoch": 11.2441906262308, + "grad_norm": 0.17495666444301605, + "learning_rate": 2.8575333333333334e-05, + "loss": 0.0137, + "step": 14280 + }, + { + "epoch": 11.244978337928318, + "grad_norm": 0.3870325982570648, + "learning_rate": 2.8575e-05, + "loss": 0.0192, + "step": 14281 + }, + { + "epoch": 11.245766049625837, + "grad_norm": 0.21071627736091614, + "learning_rate": 2.857466666666667e-05, + "loss": 0.0092, + "step": 14282 + }, + { + "epoch": 11.246553761323355, + "grad_norm": 0.24555929005146027, + "learning_rate": 2.8574333333333332e-05, + "loss": 0.0123, + "step": 14283 + }, + { + "epoch": 11.247341473020875, + "grad_norm": 0.12566202878952026, + "learning_rate": 2.8574e-05, + "loss": 0.0055, + "step": 14284 + }, + { + "epoch": 11.248129184718394, + "grad_norm": 0.2594166398048401, + "learning_rate": 2.857366666666667e-05, + "loss": 0.0082, + "step": 14285 + }, + { + "epoch": 11.248916896415912, + "grad_norm": 0.23233649134635925, + "learning_rate": 2.8573333333333333e-05, + "loss": 0.0085, + "step": 14286 + }, + { + "epoch": 11.249704608113431, + "grad_norm": 0.1913710981607437, + "learning_rate": 2.8573000000000002e-05, + "loss": 0.0083, + "step": 14287 + }, + { + "epoch": 11.250492319810949, + "grad_norm": 0.2534051239490509, + "learning_rate": 2.8572666666666665e-05, + "loss": 0.0147, + "step": 14288 + }, + { + "epoch": 11.251280031508468, + "grad_norm": 0.361331582069397, + "learning_rate": 2.8572333333333334e-05, + "loss": 0.0153, + "step": 14289 + }, + { + "epoch": 11.252067743205986, + "grad_norm": 0.20709875226020813, + "learning_rate": 2.8572e-05, + "loss": 0.0094, + "step": 14290 + }, + { + "epoch": 11.252855454903505, + "grad_norm": 0.4266195297241211, + "learning_rate": 2.8571666666666666e-05, + "loss": 0.0097, + 
"step": 14291 + }, + { + "epoch": 11.253643166601025, + "grad_norm": 0.4036201536655426, + "learning_rate": 2.8571333333333332e-05, + "loss": 0.0095, + "step": 14292 + }, + { + "epoch": 11.254430878298542, + "grad_norm": 0.20770257711410522, + "learning_rate": 2.8571e-05, + "loss": 0.0094, + "step": 14293 + }, + { + "epoch": 11.255218589996062, + "grad_norm": 0.8257591724395752, + "learning_rate": 2.8570666666666667e-05, + "loss": 0.0129, + "step": 14294 + }, + { + "epoch": 11.25600630169358, + "grad_norm": 0.3800719380378723, + "learning_rate": 2.8570333333333333e-05, + "loss": 0.0131, + "step": 14295 + }, + { + "epoch": 11.256794013391099, + "grad_norm": 0.09436656534671783, + "learning_rate": 2.8570000000000003e-05, + "loss": 0.0048, + "step": 14296 + }, + { + "epoch": 11.257581725088617, + "grad_norm": 0.23005716502666473, + "learning_rate": 2.8569666666666665e-05, + "loss": 0.0113, + "step": 14297 + }, + { + "epoch": 11.258369436786136, + "grad_norm": 0.22776281833648682, + "learning_rate": 2.8569333333333334e-05, + "loss": 0.0099, + "step": 14298 + }, + { + "epoch": 11.259157148483656, + "grad_norm": 0.157121941447258, + "learning_rate": 2.8569e-05, + "loss": 0.0077, + "step": 14299 + }, + { + "epoch": 11.259944860181173, + "grad_norm": 0.5935895442962646, + "learning_rate": 2.8568666666666666e-05, + "loss": 0.0117, + "step": 14300 + }, + { + "epoch": 11.260732571878693, + "grad_norm": 0.2886374890804291, + "learning_rate": 2.8568333333333336e-05, + "loss": 0.0113, + "step": 14301 + }, + { + "epoch": 11.26152028357621, + "grad_norm": 0.2615852355957031, + "learning_rate": 2.8568e-05, + "loss": 0.0089, + "step": 14302 + }, + { + "epoch": 11.26230799527373, + "grad_norm": 0.15452642738819122, + "learning_rate": 2.8567666666666668e-05, + "loss": 0.0092, + "step": 14303 + }, + { + "epoch": 11.26309570697125, + "grad_norm": 0.289958655834198, + "learning_rate": 2.8567333333333333e-05, + "loss": 0.0105, + "step": 14304 + }, + { + "epoch": 11.263883418668767, + "grad_norm": 0.1876179277896881, + "learning_rate": 2.8567000000000003e-05, + "loss": 0.0068, + "step": 14305 + }, + { + "epoch": 11.264671130366287, + "grad_norm": 0.4528250992298126, + "learning_rate": 2.8566666666666665e-05, + "loss": 0.0155, + "step": 14306 + }, + { + "epoch": 11.265458842063804, + "grad_norm": 0.1718541383743286, + "learning_rate": 2.8566333333333335e-05, + "loss": 0.0064, + "step": 14307 + }, + { + "epoch": 11.266246553761324, + "grad_norm": 0.3211843967437744, + "learning_rate": 2.8566e-05, + "loss": 0.0136, + "step": 14308 + }, + { + "epoch": 11.267034265458841, + "grad_norm": 0.4528243839740753, + "learning_rate": 2.8565666666666667e-05, + "loss": 0.0129, + "step": 14309 + }, + { + "epoch": 11.26782197715636, + "grad_norm": 0.2186477929353714, + "learning_rate": 2.8565333333333336e-05, + "loss": 0.0141, + "step": 14310 + }, + { + "epoch": 11.26860968885388, + "grad_norm": 0.18949539959430695, + "learning_rate": 2.8565000000000002e-05, + "loss": 0.0057, + "step": 14311 + }, + { + "epoch": 11.269397400551398, + "grad_norm": 0.21370451152324677, + "learning_rate": 2.8564666666666668e-05, + "loss": 0.0073, + "step": 14312 + }, + { + "epoch": 11.270185112248917, + "grad_norm": 0.4576645791530609, + "learning_rate": 2.8564333333333334e-05, + "loss": 0.0205, + "step": 14313 + }, + { + "epoch": 11.270972823946435, + "grad_norm": 0.2338627427816391, + "learning_rate": 2.8564000000000003e-05, + "loss": 0.0147, + "step": 14314 + }, + { + "epoch": 11.271760535643955, + "grad_norm": 0.31769827008247375, + 
"learning_rate": 2.8563666666666666e-05, + "loss": 0.0148, + "step": 14315 + }, + { + "epoch": 11.272548247341472, + "grad_norm": 0.5896501541137695, + "learning_rate": 2.8563333333333335e-05, + "loss": 0.0241, + "step": 14316 + }, + { + "epoch": 11.273335959038992, + "grad_norm": 0.2778405547142029, + "learning_rate": 2.8563e-05, + "loss": 0.0399, + "step": 14317 + }, + { + "epoch": 11.274123670736511, + "grad_norm": 0.22213001549243927, + "learning_rate": 2.8562666666666667e-05, + "loss": 0.0065, + "step": 14318 + }, + { + "epoch": 11.274911382434029, + "grad_norm": 0.2263304442167282, + "learning_rate": 2.8562333333333336e-05, + "loss": 0.0148, + "step": 14319 + }, + { + "epoch": 11.275699094131548, + "grad_norm": 0.3689291179180145, + "learning_rate": 2.8562e-05, + "loss": 0.0176, + "step": 14320 + }, + { + "epoch": 11.276486805829066, + "grad_norm": 0.4953918159008026, + "learning_rate": 2.8561666666666668e-05, + "loss": 0.1652, + "step": 14321 + }, + { + "epoch": 11.277274517526585, + "grad_norm": 0.41121435165405273, + "learning_rate": 2.8561333333333334e-05, + "loss": 0.1294, + "step": 14322 + }, + { + "epoch": 11.278062229224105, + "grad_norm": 0.5396425724029541, + "learning_rate": 2.8561e-05, + "loss": 0.1333, + "step": 14323 + }, + { + "epoch": 11.278849940921623, + "grad_norm": 0.5572988390922546, + "learning_rate": 2.8560666666666666e-05, + "loss": 0.0904, + "step": 14324 + }, + { + "epoch": 11.279637652619142, + "grad_norm": 0.3580038547515869, + "learning_rate": 2.8560333333333335e-05, + "loss": 0.0781, + "step": 14325 + }, + { + "epoch": 11.28042536431666, + "grad_norm": 0.6555833220481873, + "learning_rate": 2.856e-05, + "loss": 0.0683, + "step": 14326 + }, + { + "epoch": 11.281213076014179, + "grad_norm": 0.1513603925704956, + "learning_rate": 2.8559666666666667e-05, + "loss": 0.0319, + "step": 14327 + }, + { + "epoch": 11.282000787711697, + "grad_norm": 0.2840062379837036, + "learning_rate": 2.8559333333333336e-05, + "loss": 0.0148, + "step": 14328 + }, + { + "epoch": 11.282788499409216, + "grad_norm": 0.3320543169975281, + "learning_rate": 2.8559e-05, + "loss": 0.0472, + "step": 14329 + }, + { + "epoch": 11.283576211106736, + "grad_norm": 0.37251320481300354, + "learning_rate": 2.8558666666666668e-05, + "loss": 0.01, + "step": 14330 + }, + { + "epoch": 11.284363922804253, + "grad_norm": 0.23920516669750214, + "learning_rate": 2.8558333333333334e-05, + "loss": 0.016, + "step": 14331 + }, + { + "epoch": 11.285151634501773, + "grad_norm": 0.18945060670375824, + "learning_rate": 2.8558e-05, + "loss": 0.0139, + "step": 14332 + }, + { + "epoch": 11.28593934619929, + "grad_norm": 0.19617067277431488, + "learning_rate": 2.8557666666666666e-05, + "loss": 0.0086, + "step": 14333 + }, + { + "epoch": 11.28672705789681, + "grad_norm": 0.1552172303199768, + "learning_rate": 2.8557333333333335e-05, + "loss": 0.0086, + "step": 14334 + }, + { + "epoch": 11.287514769594328, + "grad_norm": 0.2480096071958542, + "learning_rate": 2.8557e-05, + "loss": 0.0131, + "step": 14335 + }, + { + "epoch": 11.288302481291847, + "grad_norm": 0.3989331126213074, + "learning_rate": 2.8556666666666667e-05, + "loss": 0.0111, + "step": 14336 + }, + { + "epoch": 11.289090192989367, + "grad_norm": 0.18955671787261963, + "learning_rate": 2.8556333333333336e-05, + "loss": 0.0103, + "step": 14337 + }, + { + "epoch": 11.289877904686884, + "grad_norm": 0.1815975457429886, + "learning_rate": 2.8556e-05, + "loss": 0.0072, + "step": 14338 + }, + { + "epoch": 11.290665616384404, + "grad_norm": 0.2823576033115387, + 
"learning_rate": 2.855566666666667e-05, + "loss": 0.0154, + "step": 14339 + }, + { + "epoch": 11.291453328081921, + "grad_norm": 0.17547695338726044, + "learning_rate": 2.8555333333333334e-05, + "loss": 0.0227, + "step": 14340 + }, + { + "epoch": 11.29224103977944, + "grad_norm": 0.12731246650218964, + "learning_rate": 2.8555e-05, + "loss": 0.0089, + "step": 14341 + }, + { + "epoch": 11.29302875147696, + "grad_norm": 0.12145666778087616, + "learning_rate": 2.8554666666666666e-05, + "loss": 0.0111, + "step": 14342 + }, + { + "epoch": 11.293816463174478, + "grad_norm": 0.22178997099399567, + "learning_rate": 2.8554333333333335e-05, + "loss": 0.011, + "step": 14343 + }, + { + "epoch": 11.294604174871997, + "grad_norm": 0.11539990454912186, + "learning_rate": 2.8554e-05, + "loss": 0.008, + "step": 14344 + }, + { + "epoch": 11.295391886569515, + "grad_norm": 0.3376290798187256, + "learning_rate": 2.8553666666666667e-05, + "loss": 0.0079, + "step": 14345 + }, + { + "epoch": 11.296179598267035, + "grad_norm": 0.25130006670951843, + "learning_rate": 2.8553333333333333e-05, + "loss": 0.0169, + "step": 14346 + }, + { + "epoch": 11.296967309964552, + "grad_norm": 0.24104218184947968, + "learning_rate": 2.8553e-05, + "loss": 0.0094, + "step": 14347 + }, + { + "epoch": 11.297755021662072, + "grad_norm": 0.2076735943555832, + "learning_rate": 2.855266666666667e-05, + "loss": 0.0127, + "step": 14348 + }, + { + "epoch": 11.298542733359591, + "grad_norm": 0.23176950216293335, + "learning_rate": 2.855233333333333e-05, + "loss": 0.0086, + "step": 14349 + }, + { + "epoch": 11.299330445057109, + "grad_norm": 0.37988075613975525, + "learning_rate": 2.8552e-05, + "loss": 0.014, + "step": 14350 + }, + { + "epoch": 11.300118156754628, + "grad_norm": 0.2330717295408249, + "learning_rate": 2.855166666666667e-05, + "loss": 0.0177, + "step": 14351 + }, + { + "epoch": 11.300905868452146, + "grad_norm": 0.3082941472530365, + "learning_rate": 2.8551333333333332e-05, + "loss": 0.0123, + "step": 14352 + }, + { + "epoch": 11.301693580149665, + "grad_norm": 0.16510875523090363, + "learning_rate": 2.8551e-05, + "loss": 0.0096, + "step": 14353 + }, + { + "epoch": 11.302481291847183, + "grad_norm": 0.23772896826267242, + "learning_rate": 2.8550666666666668e-05, + "loss": 0.0092, + "step": 14354 + }, + { + "epoch": 11.303269003544703, + "grad_norm": 0.1857222318649292, + "learning_rate": 2.8550333333333333e-05, + "loss": 0.0112, + "step": 14355 + }, + { + "epoch": 11.304056715242222, + "grad_norm": 0.13417500257492065, + "learning_rate": 2.855e-05, + "loss": 0.0086, + "step": 14356 + }, + { + "epoch": 11.30484442693974, + "grad_norm": 0.5063596367835999, + "learning_rate": 2.854966666666667e-05, + "loss": 0.0113, + "step": 14357 + }, + { + "epoch": 11.30563213863726, + "grad_norm": 0.25165677070617676, + "learning_rate": 2.854933333333333e-05, + "loss": 0.0138, + "step": 14358 + }, + { + "epoch": 11.306419850334777, + "grad_norm": 0.1397862732410431, + "learning_rate": 2.8549e-05, + "loss": 0.0068, + "step": 14359 + }, + { + "epoch": 11.307207562032296, + "grad_norm": 0.20455963909626007, + "learning_rate": 2.854866666666667e-05, + "loss": 0.0133, + "step": 14360 + }, + { + "epoch": 11.307995273729816, + "grad_norm": 0.21856334805488586, + "learning_rate": 2.8548333333333332e-05, + "loss": 0.0101, + "step": 14361 + }, + { + "epoch": 11.308782985427333, + "grad_norm": 0.2504900097846985, + "learning_rate": 2.8548000000000002e-05, + "loss": 0.0167, + "step": 14362 + }, + { + "epoch": 11.309570697124853, + "grad_norm": 
0.1882075071334839, + "learning_rate": 2.8547666666666668e-05, + "loss": 0.0078, + "step": 14363 + }, + { + "epoch": 11.31035840882237, + "grad_norm": 0.23857997357845306, + "learning_rate": 2.8547333333333334e-05, + "loss": 0.0126, + "step": 14364 + }, + { + "epoch": 11.31114612051989, + "grad_norm": 0.28369930386543274, + "learning_rate": 2.8547e-05, + "loss": 0.0085, + "step": 14365 + }, + { + "epoch": 11.311933832217408, + "grad_norm": 0.2880667448043823, + "learning_rate": 2.854666666666667e-05, + "loss": 0.0104, + "step": 14366 + }, + { + "epoch": 11.312721543914927, + "grad_norm": 0.29123666882514954, + "learning_rate": 2.8546333333333335e-05, + "loss": 0.0071, + "step": 14367 + }, + { + "epoch": 11.313509255612447, + "grad_norm": 0.7189651727676392, + "learning_rate": 2.8546e-05, + "loss": 0.0092, + "step": 14368 + }, + { + "epoch": 11.314296967309964, + "grad_norm": 0.20296360552310944, + "learning_rate": 2.854566666666667e-05, + "loss": 0.0102, + "step": 14369 + }, + { + "epoch": 11.315084679007484, + "grad_norm": 0.3671424984931946, + "learning_rate": 2.8545333333333333e-05, + "loss": 0.0059, + "step": 14370 + }, + { + "epoch": 11.315872390705001, + "grad_norm": 0.4439687132835388, + "learning_rate": 2.8545000000000002e-05, + "loss": 0.2114, + "step": 14371 + }, + { + "epoch": 11.31666010240252, + "grad_norm": 0.8109253644943237, + "learning_rate": 2.8544666666666668e-05, + "loss": 0.1763, + "step": 14372 + }, + { + "epoch": 11.317447814100039, + "grad_norm": 0.48964953422546387, + "learning_rate": 2.8544333333333334e-05, + "loss": 0.1237, + "step": 14373 + }, + { + "epoch": 11.318235525797558, + "grad_norm": 0.49554771184921265, + "learning_rate": 2.8544e-05, + "loss": 0.0735, + "step": 14374 + }, + { + "epoch": 11.319023237495077, + "grad_norm": 0.5457239151000977, + "learning_rate": 2.854366666666667e-05, + "loss": 0.0442, + "step": 14375 + }, + { + "epoch": 11.319810949192595, + "grad_norm": 0.38402578234672546, + "learning_rate": 2.8543333333333335e-05, + "loss": 0.0325, + "step": 14376 + }, + { + "epoch": 11.320598660890115, + "grad_norm": 0.24754764139652252, + "learning_rate": 2.8543e-05, + "loss": 0.0151, + "step": 14377 + }, + { + "epoch": 11.321386372587632, + "grad_norm": 0.32870057225227356, + "learning_rate": 2.8542666666666667e-05, + "loss": 0.015, + "step": 14378 + }, + { + "epoch": 11.322174084285152, + "grad_norm": 0.26632577180862427, + "learning_rate": 2.8542333333333333e-05, + "loss": 0.0157, + "step": 14379 + }, + { + "epoch": 11.322961795982671, + "grad_norm": 0.21226349472999573, + "learning_rate": 2.8542000000000002e-05, + "loss": 0.0139, + "step": 14380 + }, + { + "epoch": 11.323749507680189, + "grad_norm": 0.29886114597320557, + "learning_rate": 2.8541666666666665e-05, + "loss": 0.0189, + "step": 14381 + }, + { + "epoch": 11.324537219377708, + "grad_norm": 0.5202065706253052, + "learning_rate": 2.8541333333333334e-05, + "loss": 0.0416, + "step": 14382 + }, + { + "epoch": 11.325324931075226, + "grad_norm": 0.0870431438088417, + "learning_rate": 2.8541e-05, + "loss": 0.0039, + "step": 14383 + }, + { + "epoch": 11.326112642772745, + "grad_norm": 0.19283384084701538, + "learning_rate": 2.8540666666666666e-05, + "loss": 0.0099, + "step": 14384 + }, + { + "epoch": 11.326900354470263, + "grad_norm": 0.2855110466480255, + "learning_rate": 2.8540333333333335e-05, + "loss": 0.023, + "step": 14385 + }, + { + "epoch": 11.327688066167783, + "grad_norm": 0.5439264178276062, + "learning_rate": 2.854e-05, + "loss": 0.0083, + "step": 14386 + }, + { + "epoch": 
11.328475777865302, + "grad_norm": 0.1954534500837326, + "learning_rate": 2.8539666666666667e-05, + "loss": 0.0094, + "step": 14387 + }, + { + "epoch": 11.32926348956282, + "grad_norm": 0.20363637804985046, + "learning_rate": 2.8539333333333333e-05, + "loss": 0.0118, + "step": 14388 + }, + { + "epoch": 11.33005120126034, + "grad_norm": 0.12316751480102539, + "learning_rate": 2.8539000000000002e-05, + "loss": 0.0081, + "step": 14389 + }, + { + "epoch": 11.330838912957857, + "grad_norm": 0.1674344688653946, + "learning_rate": 2.8538666666666665e-05, + "loss": 0.0095, + "step": 14390 + }, + { + "epoch": 11.331626624655376, + "grad_norm": 0.14339371025562286, + "learning_rate": 2.8538333333333334e-05, + "loss": 0.0069, + "step": 14391 + }, + { + "epoch": 11.332414336352894, + "grad_norm": 0.1824079304933548, + "learning_rate": 2.8538e-05, + "loss": 0.0119, + "step": 14392 + }, + { + "epoch": 11.333202048050413, + "grad_norm": 0.3437281847000122, + "learning_rate": 2.8537666666666666e-05, + "loss": 0.0095, + "step": 14393 + }, + { + "epoch": 11.333989759747933, + "grad_norm": 0.23008638620376587, + "learning_rate": 2.8537333333333335e-05, + "loss": 0.0101, + "step": 14394 + }, + { + "epoch": 11.33477747144545, + "grad_norm": 0.34480562806129456, + "learning_rate": 2.8537e-05, + "loss": 0.013, + "step": 14395 + }, + { + "epoch": 11.33556518314297, + "grad_norm": 0.5802191495895386, + "learning_rate": 2.8536666666666667e-05, + "loss": 0.0215, + "step": 14396 + }, + { + "epoch": 11.336352894840488, + "grad_norm": 0.070172980427742, + "learning_rate": 2.8536333333333333e-05, + "loss": 0.0037, + "step": 14397 + }, + { + "epoch": 11.337140606538007, + "grad_norm": 0.4844681918621063, + "learning_rate": 2.8536000000000003e-05, + "loss": 0.0091, + "step": 14398 + }, + { + "epoch": 11.337928318235527, + "grad_norm": 0.21146683394908905, + "learning_rate": 2.8535666666666665e-05, + "loss": 0.007, + "step": 14399 + }, + { + "epoch": 11.338716029933044, + "grad_norm": 0.19194281101226807, + "learning_rate": 2.8535333333333334e-05, + "loss": 0.0124, + "step": 14400 + }, + { + "epoch": 11.339503741630564, + "grad_norm": 0.2747430205345154, + "learning_rate": 2.8535000000000004e-05, + "loss": 0.0101, + "step": 14401 + }, + { + "epoch": 11.340291453328081, + "grad_norm": 0.16126500070095062, + "learning_rate": 2.8534666666666666e-05, + "loss": 0.0069, + "step": 14402 + }, + { + "epoch": 11.3410791650256, + "grad_norm": 0.17727793753147125, + "learning_rate": 2.8534333333333336e-05, + "loss": 0.0091, + "step": 14403 + }, + { + "epoch": 11.341866876723119, + "grad_norm": 0.32063978910446167, + "learning_rate": 2.8534e-05, + "loss": 0.0085, + "step": 14404 + }, + { + "epoch": 11.342654588420638, + "grad_norm": 0.3837624192237854, + "learning_rate": 2.8533666666666668e-05, + "loss": 0.01, + "step": 14405 + }, + { + "epoch": 11.343442300118157, + "grad_norm": 0.23862576484680176, + "learning_rate": 2.8533333333333333e-05, + "loss": 0.007, + "step": 14406 + }, + { + "epoch": 11.344230011815675, + "grad_norm": 0.31017884612083435, + "learning_rate": 2.8533e-05, + "loss": 0.0114, + "step": 14407 + }, + { + "epoch": 11.345017723513195, + "grad_norm": 0.15332022309303284, + "learning_rate": 2.8532666666666665e-05, + "loss": 0.0082, + "step": 14408 + }, + { + "epoch": 11.345805435210712, + "grad_norm": 0.12716473639011383, + "learning_rate": 2.8532333333333335e-05, + "loss": 0.0045, + "step": 14409 + }, + { + "epoch": 11.346593146908232, + "grad_norm": 0.3135543465614319, + "learning_rate": 2.8532e-05, + "loss": 0.0167, 
+ "step": 14410 + }, + { + "epoch": 11.34738085860575, + "grad_norm": 0.3156236410140991, + "learning_rate": 2.8531666666666667e-05, + "loss": 0.0087, + "step": 14411 + }, + { + "epoch": 11.348168570303269, + "grad_norm": 0.9133021235466003, + "learning_rate": 2.8531333333333336e-05, + "loss": 0.0243, + "step": 14412 + }, + { + "epoch": 11.348956282000788, + "grad_norm": 0.25575143098831177, + "learning_rate": 2.8531e-05, + "loss": 0.0081, + "step": 14413 + }, + { + "epoch": 11.349743993698306, + "grad_norm": 0.41143399477005005, + "learning_rate": 2.8530666666666668e-05, + "loss": 0.0122, + "step": 14414 + }, + { + "epoch": 11.350531705395825, + "grad_norm": 0.3240196704864502, + "learning_rate": 2.8530333333333334e-05, + "loss": 0.0156, + "step": 14415 + }, + { + "epoch": 11.351319417093343, + "grad_norm": 0.20920409262180328, + "learning_rate": 2.853e-05, + "loss": 0.0098, + "step": 14416 + }, + { + "epoch": 11.352107128790863, + "grad_norm": 0.5341922640800476, + "learning_rate": 2.852966666666667e-05, + "loss": 0.0117, + "step": 14417 + }, + { + "epoch": 11.352894840488382, + "grad_norm": 0.25566405057907104, + "learning_rate": 2.8529333333333335e-05, + "loss": 0.011, + "step": 14418 + }, + { + "epoch": 11.3536825521859, + "grad_norm": 0.39027634263038635, + "learning_rate": 2.8529e-05, + "loss": 0.0103, + "step": 14419 + }, + { + "epoch": 11.35447026388342, + "grad_norm": 0.4786667227745056, + "learning_rate": 2.8528666666666667e-05, + "loss": 0.0098, + "step": 14420 + }, + { + "epoch": 11.355257975580937, + "grad_norm": 0.5227695107460022, + "learning_rate": 2.8528333333333336e-05, + "loss": 0.197, + "step": 14421 + }, + { + "epoch": 11.356045687278456, + "grad_norm": 0.4230790436267853, + "learning_rate": 2.8528e-05, + "loss": 0.1092, + "step": 14422 + }, + { + "epoch": 11.356833398975974, + "grad_norm": 0.40283751487731934, + "learning_rate": 2.8527666666666668e-05, + "loss": 0.0641, + "step": 14423 + }, + { + "epoch": 11.357621110673493, + "grad_norm": 0.9546533823013306, + "learning_rate": 2.8527333333333334e-05, + "loss": 0.0811, + "step": 14424 + }, + { + "epoch": 11.358408822371013, + "grad_norm": 0.4092230498790741, + "learning_rate": 2.8527e-05, + "loss": 0.0508, + "step": 14425 + }, + { + "epoch": 11.35919653406853, + "grad_norm": 0.4446810781955719, + "learning_rate": 2.852666666666667e-05, + "loss": 0.123, + "step": 14426 + }, + { + "epoch": 11.35998424576605, + "grad_norm": 0.3574763238430023, + "learning_rate": 2.8526333333333335e-05, + "loss": 0.0292, + "step": 14427 + }, + { + "epoch": 11.360771957463568, + "grad_norm": 0.284525066614151, + "learning_rate": 2.8526e-05, + "loss": 0.0238, + "step": 14428 + }, + { + "epoch": 11.361559669161087, + "grad_norm": 0.1960337907075882, + "learning_rate": 2.8525666666666667e-05, + "loss": 0.0098, + "step": 14429 + }, + { + "epoch": 11.362347380858605, + "grad_norm": 0.2505640387535095, + "learning_rate": 2.8525333333333336e-05, + "loss": 0.0075, + "step": 14430 + }, + { + "epoch": 11.363135092556124, + "grad_norm": 0.16918610036373138, + "learning_rate": 2.8525e-05, + "loss": 0.0166, + "step": 14431 + }, + { + "epoch": 11.363922804253644, + "grad_norm": 0.34241798520088196, + "learning_rate": 2.8524666666666668e-05, + "loss": 0.0456, + "step": 14432 + }, + { + "epoch": 11.364710515951161, + "grad_norm": 0.154208242893219, + "learning_rate": 2.8524333333333334e-05, + "loss": 0.0118, + "step": 14433 + }, + { + "epoch": 11.365498227648681, + "grad_norm": 0.18980512022972107, + "learning_rate": 2.8524e-05, + "loss": 0.0079, + 
"step": 14434 + }, + { + "epoch": 11.366285939346199, + "grad_norm": 0.16709518432617188, + "learning_rate": 2.852366666666667e-05, + "loss": 0.0105, + "step": 14435 + }, + { + "epoch": 11.367073651043718, + "grad_norm": 0.1491747796535492, + "learning_rate": 2.8523333333333335e-05, + "loss": 0.0244, + "step": 14436 + }, + { + "epoch": 11.367861362741237, + "grad_norm": 0.26943835616111755, + "learning_rate": 2.8523e-05, + "loss": 0.0128, + "step": 14437 + }, + { + "epoch": 11.368649074438755, + "grad_norm": 0.4567219018936157, + "learning_rate": 2.8522666666666667e-05, + "loss": 0.0153, + "step": 14438 + }, + { + "epoch": 11.369436786136275, + "grad_norm": 0.39484721422195435, + "learning_rate": 2.8522333333333333e-05, + "loss": 0.0164, + "step": 14439 + }, + { + "epoch": 11.370224497833792, + "grad_norm": 0.19912782311439514, + "learning_rate": 2.8522e-05, + "loss": 0.0159, + "step": 14440 + }, + { + "epoch": 11.371012209531312, + "grad_norm": 0.21003322303295135, + "learning_rate": 2.852166666666667e-05, + "loss": 0.0072, + "step": 14441 + }, + { + "epoch": 11.37179992122883, + "grad_norm": 0.11208685487508774, + "learning_rate": 2.8521333333333334e-05, + "loss": 0.009, + "step": 14442 + }, + { + "epoch": 11.372587632926349, + "grad_norm": 0.16050998866558075, + "learning_rate": 2.8521e-05, + "loss": 0.0086, + "step": 14443 + }, + { + "epoch": 11.373375344623868, + "grad_norm": 0.45974037051200867, + "learning_rate": 2.852066666666667e-05, + "loss": 0.0114, + "step": 14444 + }, + { + "epoch": 11.374163056321386, + "grad_norm": 0.13543836772441864, + "learning_rate": 2.8520333333333332e-05, + "loss": 0.0091, + "step": 14445 + }, + { + "epoch": 11.374950768018905, + "grad_norm": 0.419052392244339, + "learning_rate": 2.852e-05, + "loss": 0.0131, + "step": 14446 + }, + { + "epoch": 11.375738479716423, + "grad_norm": 0.3435143530368805, + "learning_rate": 2.8519666666666667e-05, + "loss": 0.0156, + "step": 14447 + }, + { + "epoch": 11.376526191413943, + "grad_norm": 0.15969575941562653, + "learning_rate": 2.8519333333333333e-05, + "loss": 0.0075, + "step": 14448 + }, + { + "epoch": 11.37731390311146, + "grad_norm": 0.3012976348400116, + "learning_rate": 2.8519e-05, + "loss": 0.0134, + "step": 14449 + }, + { + "epoch": 11.37810161480898, + "grad_norm": 0.4780780076980591, + "learning_rate": 2.851866666666667e-05, + "loss": 0.0108, + "step": 14450 + }, + { + "epoch": 11.3788893265065, + "grad_norm": 0.352085679769516, + "learning_rate": 2.8518333333333334e-05, + "loss": 0.0138, + "step": 14451 + }, + { + "epoch": 11.379677038204017, + "grad_norm": 0.8505268692970276, + "learning_rate": 2.8518e-05, + "loss": 0.0184, + "step": 14452 + }, + { + "epoch": 11.380464749901536, + "grad_norm": 0.4674205780029297, + "learning_rate": 2.851766666666667e-05, + "loss": 0.0165, + "step": 14453 + }, + { + "epoch": 11.381252461599054, + "grad_norm": 0.47546133399009705, + "learning_rate": 2.8517333333333332e-05, + "loss": 0.0177, + "step": 14454 + }, + { + "epoch": 11.382040173296573, + "grad_norm": 0.1435307413339615, + "learning_rate": 2.8517e-05, + "loss": 0.0099, + "step": 14455 + }, + { + "epoch": 11.382827884994093, + "grad_norm": 0.1620595008134842, + "learning_rate": 2.8516666666666668e-05, + "loss": 0.0085, + "step": 14456 + }, + { + "epoch": 11.38361559669161, + "grad_norm": 0.21687908470630646, + "learning_rate": 2.8516333333333333e-05, + "loss": 0.0126, + "step": 14457 + }, + { + "epoch": 11.38440330838913, + "grad_norm": 0.12545090913772583, + "learning_rate": 2.8516e-05, + "loss": 0.006, + 
"step": 14458 + }, + { + "epoch": 11.385191020086648, + "grad_norm": 0.10706383734941483, + "learning_rate": 2.851566666666667e-05, + "loss": 0.0061, + "step": 14459 + }, + { + "epoch": 11.385978731784167, + "grad_norm": 0.20320209860801697, + "learning_rate": 2.8515333333333335e-05, + "loss": 0.0073, + "step": 14460 + }, + { + "epoch": 11.386766443481685, + "grad_norm": 0.096477210521698, + "learning_rate": 2.8515e-05, + "loss": 0.0037, + "step": 14461 + }, + { + "epoch": 11.387554155179204, + "grad_norm": 0.2727310359477997, + "learning_rate": 2.851466666666667e-05, + "loss": 0.0194, + "step": 14462 + }, + { + "epoch": 11.388341866876724, + "grad_norm": 0.18970240652561188, + "learning_rate": 2.8514333333333332e-05, + "loss": 0.0062, + "step": 14463 + }, + { + "epoch": 11.389129578574241, + "grad_norm": 0.5901272892951965, + "learning_rate": 2.8514000000000002e-05, + "loss": 0.0223, + "step": 14464 + }, + { + "epoch": 11.389917290271761, + "grad_norm": 0.2773374915122986, + "learning_rate": 2.8513666666666668e-05, + "loss": 0.0122, + "step": 14465 + }, + { + "epoch": 11.390705001969279, + "grad_norm": 0.25706198811531067, + "learning_rate": 2.8513333333333334e-05, + "loss": 0.0106, + "step": 14466 + }, + { + "epoch": 11.391492713666798, + "grad_norm": 0.2959279417991638, + "learning_rate": 2.8513000000000003e-05, + "loss": 0.008, + "step": 14467 + }, + { + "epoch": 11.392280425364318, + "grad_norm": 0.6635178923606873, + "learning_rate": 2.8512666666666666e-05, + "loss": 0.0099, + "step": 14468 + }, + { + "epoch": 11.393068137061835, + "grad_norm": 0.24437092244625092, + "learning_rate": 2.8512333333333335e-05, + "loss": 0.0103, + "step": 14469 + }, + { + "epoch": 11.393855848759355, + "grad_norm": 0.2550608813762665, + "learning_rate": 2.8512e-05, + "loss": 0.0276, + "step": 14470 + }, + { + "epoch": 11.394643560456872, + "grad_norm": 0.5663403272628784, + "learning_rate": 2.8511666666666667e-05, + "loss": 0.1496, + "step": 14471 + }, + { + "epoch": 11.395431272154392, + "grad_norm": 0.4576718211174011, + "learning_rate": 2.8511333333333333e-05, + "loss": 0.1471, + "step": 14472 + }, + { + "epoch": 11.39621898385191, + "grad_norm": 0.4081750810146332, + "learning_rate": 2.8511000000000002e-05, + "loss": 0.0948, + "step": 14473 + }, + { + "epoch": 11.397006695549429, + "grad_norm": 0.5188582539558411, + "learning_rate": 2.8510666666666665e-05, + "loss": 0.0943, + "step": 14474 + }, + { + "epoch": 11.397794407246948, + "grad_norm": 0.3345058560371399, + "learning_rate": 2.8510333333333334e-05, + "loss": 0.0552, + "step": 14475 + }, + { + "epoch": 11.398582118944466, + "grad_norm": 0.49240198731422424, + "learning_rate": 2.8510000000000003e-05, + "loss": 0.0967, + "step": 14476 + }, + { + "epoch": 11.399369830641986, + "grad_norm": 0.18022246658802032, + "learning_rate": 2.8509666666666666e-05, + "loss": 0.0356, + "step": 14477 + }, + { + "epoch": 11.400157542339503, + "grad_norm": 0.49403294920921326, + "learning_rate": 2.8509333333333335e-05, + "loss": 0.0227, + "step": 14478 + }, + { + "epoch": 11.400945254037023, + "grad_norm": 0.1950070559978485, + "learning_rate": 2.8509e-05, + "loss": 0.0145, + "step": 14479 + }, + { + "epoch": 11.40173296573454, + "grad_norm": 0.19889649748802185, + "learning_rate": 2.8508666666666667e-05, + "loss": 0.0155, + "step": 14480 + }, + { + "epoch": 11.40252067743206, + "grad_norm": 0.15735021233558655, + "learning_rate": 2.8508333333333333e-05, + "loss": 0.0117, + "step": 14481 + }, + { + "epoch": 11.40330838912958, + "grad_norm": 0.2075282335281372, + 
"learning_rate": 2.8508000000000002e-05, + "loss": 0.0137, + "step": 14482 + }, + { + "epoch": 11.404096100827097, + "grad_norm": 0.27483829855918884, + "learning_rate": 2.8507666666666665e-05, + "loss": 0.0143, + "step": 14483 + }, + { + "epoch": 11.404883812524616, + "grad_norm": 0.12621429562568665, + "learning_rate": 2.8507333333333334e-05, + "loss": 0.0093, + "step": 14484 + }, + { + "epoch": 11.405671524222134, + "grad_norm": 0.2819642126560211, + "learning_rate": 2.8507000000000003e-05, + "loss": 0.0123, + "step": 14485 + }, + { + "epoch": 11.406459235919653, + "grad_norm": 0.13990670442581177, + "learning_rate": 2.8506666666666666e-05, + "loss": 0.0048, + "step": 14486 + }, + { + "epoch": 11.407246947617173, + "grad_norm": 0.14328061044216156, + "learning_rate": 2.8506333333333335e-05, + "loss": 0.0123, + "step": 14487 + }, + { + "epoch": 11.40803465931469, + "grad_norm": 0.16078469157218933, + "learning_rate": 2.8506e-05, + "loss": 0.0062, + "step": 14488 + }, + { + "epoch": 11.40882237101221, + "grad_norm": 0.2666710615158081, + "learning_rate": 2.8505666666666667e-05, + "loss": 0.0148, + "step": 14489 + }, + { + "epoch": 11.409610082709728, + "grad_norm": 0.11644233018159866, + "learning_rate": 2.8505333333333333e-05, + "loss": 0.0058, + "step": 14490 + }, + { + "epoch": 11.410397794407247, + "grad_norm": 0.20084866881370544, + "learning_rate": 2.8505000000000002e-05, + "loss": 0.0132, + "step": 14491 + }, + { + "epoch": 11.411185506104765, + "grad_norm": 0.1466815024614334, + "learning_rate": 2.8504666666666668e-05, + "loss": 0.0121, + "step": 14492 + }, + { + "epoch": 11.411973217802284, + "grad_norm": 0.26495036482810974, + "learning_rate": 2.8504333333333334e-05, + "loss": 0.0118, + "step": 14493 + }, + { + "epoch": 11.412760929499804, + "grad_norm": 0.3313753008842468, + "learning_rate": 2.8504000000000004e-05, + "loss": 0.0182, + "step": 14494 + }, + { + "epoch": 11.413548641197321, + "grad_norm": 0.2836146950721741, + "learning_rate": 2.8503666666666666e-05, + "loss": 0.0109, + "step": 14495 + }, + { + "epoch": 11.414336352894841, + "grad_norm": 0.16796204447746277, + "learning_rate": 2.8503333333333335e-05, + "loss": 0.008, + "step": 14496 + }, + { + "epoch": 11.415124064592359, + "grad_norm": 0.09367893636226654, + "learning_rate": 2.8502999999999998e-05, + "loss": 0.0073, + "step": 14497 + }, + { + "epoch": 11.415911776289878, + "grad_norm": 0.1140766590833664, + "learning_rate": 2.8502666666666667e-05, + "loss": 0.0103, + "step": 14498 + }, + { + "epoch": 11.416699487987396, + "grad_norm": 0.12559084594249725, + "learning_rate": 2.8502333333333333e-05, + "loss": 0.0126, + "step": 14499 + }, + { + "epoch": 11.417487199684915, + "grad_norm": 0.0739603340625763, + "learning_rate": 2.8502e-05, + "loss": 0.0045, + "step": 14500 + }, + { + "epoch": 11.418274911382435, + "grad_norm": 0.2649839520454407, + "learning_rate": 2.850166666666667e-05, + "loss": 0.0158, + "step": 14501 + }, + { + "epoch": 11.419062623079952, + "grad_norm": 0.1284419149160385, + "learning_rate": 2.8501333333333334e-05, + "loss": 0.0063, + "step": 14502 + }, + { + "epoch": 11.419850334777472, + "grad_norm": 0.662781298160553, + "learning_rate": 2.8501e-05, + "loss": 0.0138, + "step": 14503 + }, + { + "epoch": 11.42063804647499, + "grad_norm": 0.18382896482944489, + "learning_rate": 2.8500666666666666e-05, + "loss": 0.0052, + "step": 14504 + }, + { + "epoch": 11.421425758172509, + "grad_norm": 0.36642986536026, + "learning_rate": 2.8500333333333336e-05, + "loss": 0.0086, + "step": 14505 + }, + { + 
"epoch": 11.422213469870028, + "grad_norm": 0.18107420206069946, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.0097, + "step": 14506 + }, + { + "epoch": 11.423001181567546, + "grad_norm": 0.38207852840423584, + "learning_rate": 2.8499666666666668e-05, + "loss": 0.0163, + "step": 14507 + }, + { + "epoch": 11.423788893265066, + "grad_norm": 0.30881625413894653, + "learning_rate": 2.8499333333333333e-05, + "loss": 0.0142, + "step": 14508 + }, + { + "epoch": 11.424576604962583, + "grad_norm": 0.2122786045074463, + "learning_rate": 2.8499e-05, + "loss": 0.011, + "step": 14509 + }, + { + "epoch": 11.425364316660103, + "grad_norm": 0.09898892045021057, + "learning_rate": 2.849866666666667e-05, + "loss": 0.0076, + "step": 14510 + }, + { + "epoch": 11.42615202835762, + "grad_norm": 0.13193081319332123, + "learning_rate": 2.8498333333333335e-05, + "loss": 0.0058, + "step": 14511 + }, + { + "epoch": 11.42693974005514, + "grad_norm": 0.2943066358566284, + "learning_rate": 2.8498e-05, + "loss": 0.0228, + "step": 14512 + }, + { + "epoch": 11.42772745175266, + "grad_norm": 0.13103686273097992, + "learning_rate": 2.8497666666666667e-05, + "loss": 0.0087, + "step": 14513 + }, + { + "epoch": 11.428515163450177, + "grad_norm": 0.1805799901485443, + "learning_rate": 2.8497333333333336e-05, + "loss": 0.0102, + "step": 14514 + }, + { + "epoch": 11.429302875147696, + "grad_norm": 0.4852811396121979, + "learning_rate": 2.8497e-05, + "loss": 0.0176, + "step": 14515 + }, + { + "epoch": 11.430090586845214, + "grad_norm": 2.022942543029785, + "learning_rate": 2.8496666666666668e-05, + "loss": 0.0188, + "step": 14516 + }, + { + "epoch": 11.430878298542734, + "grad_norm": 0.1907595545053482, + "learning_rate": 2.8496333333333337e-05, + "loss": 0.011, + "step": 14517 + }, + { + "epoch": 11.431666010240253, + "grad_norm": 0.22833722829818726, + "learning_rate": 2.8496e-05, + "loss": 0.0082, + "step": 14518 + }, + { + "epoch": 11.43245372193777, + "grad_norm": 0.24607200920581818, + "learning_rate": 2.849566666666667e-05, + "loss": 0.0129, + "step": 14519 + }, + { + "epoch": 11.43324143363529, + "grad_norm": 0.5282124280929565, + "learning_rate": 2.8495333333333335e-05, + "loss": 0.0333, + "step": 14520 + }, + { + "epoch": 11.434029145332808, + "grad_norm": 0.5524836778640747, + "learning_rate": 2.8495e-05, + "loss": 0.1674, + "step": 14521 + }, + { + "epoch": 11.434816857030327, + "grad_norm": 0.4717161953449249, + "learning_rate": 2.8494666666666667e-05, + "loss": 0.1163, + "step": 14522 + }, + { + "epoch": 11.435604568727845, + "grad_norm": 0.34784024953842163, + "learning_rate": 2.8494333333333336e-05, + "loss": 0.0926, + "step": 14523 + }, + { + "epoch": 11.436392280425364, + "grad_norm": 0.6313905119895935, + "learning_rate": 2.8494e-05, + "loss": 0.0993, + "step": 14524 + }, + { + "epoch": 11.437179992122884, + "grad_norm": 0.3736996650695801, + "learning_rate": 2.8493666666666668e-05, + "loss": 0.0693, + "step": 14525 + }, + { + "epoch": 11.437967703820402, + "grad_norm": 0.31601667404174805, + "learning_rate": 2.8493333333333337e-05, + "loss": 0.0293, + "step": 14526 + }, + { + "epoch": 11.438755415517921, + "grad_norm": 0.3440849184989929, + "learning_rate": 2.8493e-05, + "loss": 0.0274, + "step": 14527 + }, + { + "epoch": 11.439543127215439, + "grad_norm": 0.12657815217971802, + "learning_rate": 2.849266666666667e-05, + "loss": 0.0111, + "step": 14528 + }, + { + "epoch": 11.440330838912958, + "grad_norm": 0.25914427638053894, + "learning_rate": 2.849233333333333e-05, + "loss": 0.0118, + "step": 14529 + 
}, + { + "epoch": 11.441118550610476, + "grad_norm": 0.22817716002464294, + "learning_rate": 2.8492e-05, + "loss": 0.0183, + "step": 14530 + }, + { + "epoch": 11.441906262307995, + "grad_norm": 0.16389939188957214, + "learning_rate": 2.8491666666666667e-05, + "loss": 0.0125, + "step": 14531 + }, + { + "epoch": 11.442693974005515, + "grad_norm": 0.24731631577014923, + "learning_rate": 2.8491333333333333e-05, + "loss": 0.0131, + "step": 14532 + }, + { + "epoch": 11.443481685703032, + "grad_norm": 0.4274970293045044, + "learning_rate": 2.8491e-05, + "loss": 0.0129, + "step": 14533 + }, + { + "epoch": 11.444269397400552, + "grad_norm": 0.7646434307098389, + "learning_rate": 2.8490666666666668e-05, + "loss": 0.0089, + "step": 14534 + }, + { + "epoch": 11.44505710909807, + "grad_norm": 0.15133486688137054, + "learning_rate": 2.8490333333333334e-05, + "loss": 0.0077, + "step": 14535 + }, + { + "epoch": 11.445844820795589, + "grad_norm": 0.6509023308753967, + "learning_rate": 2.849e-05, + "loss": 0.0095, + "step": 14536 + }, + { + "epoch": 11.446632532493108, + "grad_norm": 0.2602762281894684, + "learning_rate": 2.848966666666667e-05, + "loss": 0.0142, + "step": 14537 + }, + { + "epoch": 11.447420244190626, + "grad_norm": 0.33743008971214294, + "learning_rate": 2.8489333333333332e-05, + "loss": 0.0156, + "step": 14538 + }, + { + "epoch": 11.448207955888146, + "grad_norm": 0.39832308888435364, + "learning_rate": 2.8489e-05, + "loss": 0.0174, + "step": 14539 + }, + { + "epoch": 11.448995667585663, + "grad_norm": 0.19266866147518158, + "learning_rate": 2.8488666666666667e-05, + "loss": 0.0066, + "step": 14540 + }, + { + "epoch": 11.449783379283183, + "grad_norm": 0.3413390815258026, + "learning_rate": 2.8488333333333333e-05, + "loss": 0.0303, + "step": 14541 + }, + { + "epoch": 11.4505710909807, + "grad_norm": 0.5920853018760681, + "learning_rate": 2.8488000000000002e-05, + "loss": 0.0167, + "step": 14542 + }, + { + "epoch": 11.45135880267822, + "grad_norm": 0.2464379370212555, + "learning_rate": 2.8487666666666668e-05, + "loss": 0.0102, + "step": 14543 + }, + { + "epoch": 11.45214651437574, + "grad_norm": 0.19856442511081696, + "learning_rate": 2.8487333333333334e-05, + "loss": 0.0118, + "step": 14544 + }, + { + "epoch": 11.452934226073257, + "grad_norm": 0.3516031503677368, + "learning_rate": 2.8487e-05, + "loss": 0.0127, + "step": 14545 + }, + { + "epoch": 11.453721937770776, + "grad_norm": 0.18692968785762787, + "learning_rate": 2.848666666666667e-05, + "loss": 0.013, + "step": 14546 + }, + { + "epoch": 11.454509649468294, + "grad_norm": 0.2885766625404358, + "learning_rate": 2.8486333333333332e-05, + "loss": 0.014, + "step": 14547 + }, + { + "epoch": 11.455297361165814, + "grad_norm": 0.2272956669330597, + "learning_rate": 2.8486e-05, + "loss": 0.0073, + "step": 14548 + }, + { + "epoch": 11.456085072863331, + "grad_norm": 0.2179996818304062, + "learning_rate": 2.8485666666666667e-05, + "loss": 0.012, + "step": 14549 + }, + { + "epoch": 11.45687278456085, + "grad_norm": 0.24280007183551788, + "learning_rate": 2.8485333333333333e-05, + "loss": 0.0126, + "step": 14550 + }, + { + "epoch": 11.45766049625837, + "grad_norm": 0.2259848266839981, + "learning_rate": 2.8485000000000003e-05, + "loss": 0.0079, + "step": 14551 + }, + { + "epoch": 11.458448207955888, + "grad_norm": 0.21402963995933533, + "learning_rate": 2.848466666666667e-05, + "loss": 0.009, + "step": 14552 + }, + { + "epoch": 11.459235919653407, + "grad_norm": 0.19460982084274292, + "learning_rate": 2.8484333333333334e-05, + "loss": 0.0082, 
+ "step": 14553 + }, + { + "epoch": 11.460023631350925, + "grad_norm": 0.17245076596736908, + "learning_rate": 2.8484e-05, + "loss": 0.0091, + "step": 14554 + }, + { + "epoch": 11.460811343048444, + "grad_norm": 0.1633191555738449, + "learning_rate": 2.848366666666667e-05, + "loss": 0.006, + "step": 14555 + }, + { + "epoch": 11.461599054745964, + "grad_norm": 0.4049217104911804, + "learning_rate": 2.8483333333333332e-05, + "loss": 0.0131, + "step": 14556 + }, + { + "epoch": 11.462386766443482, + "grad_norm": 0.5329495668411255, + "learning_rate": 2.8483e-05, + "loss": 0.0154, + "step": 14557 + }, + { + "epoch": 11.463174478141001, + "grad_norm": 0.23661315441131592, + "learning_rate": 2.8482666666666664e-05, + "loss": 0.0141, + "step": 14558 + }, + { + "epoch": 11.463962189838519, + "grad_norm": 0.39937955141067505, + "learning_rate": 2.8482333333333333e-05, + "loss": 0.0069, + "step": 14559 + }, + { + "epoch": 11.464749901536038, + "grad_norm": 0.20103497803211212, + "learning_rate": 2.8482000000000003e-05, + "loss": 0.0106, + "step": 14560 + }, + { + "epoch": 11.465537613233556, + "grad_norm": 0.48808789253234863, + "learning_rate": 2.8481666666666665e-05, + "loss": 0.0153, + "step": 14561 + }, + { + "epoch": 11.466325324931075, + "grad_norm": 0.5255494713783264, + "learning_rate": 2.8481333333333335e-05, + "loss": 0.0189, + "step": 14562 + }, + { + "epoch": 11.467113036628595, + "grad_norm": 0.3202952742576599, + "learning_rate": 2.8481e-05, + "loss": 0.0124, + "step": 14563 + }, + { + "epoch": 11.467900748326112, + "grad_norm": 0.21891170740127563, + "learning_rate": 2.8480666666666667e-05, + "loss": 0.0084, + "step": 14564 + }, + { + "epoch": 11.468688460023632, + "grad_norm": 0.2592688202857971, + "learning_rate": 2.8480333333333332e-05, + "loss": 0.0176, + "step": 14565 + }, + { + "epoch": 11.46947617172115, + "grad_norm": 0.15304209291934967, + "learning_rate": 2.8480000000000002e-05, + "loss": 0.0083, + "step": 14566 + }, + { + "epoch": 11.470263883418669, + "grad_norm": 0.3827771842479706, + "learning_rate": 2.8479666666666668e-05, + "loss": 0.0252, + "step": 14567 + }, + { + "epoch": 11.471051595116187, + "grad_norm": 0.33233657479286194, + "learning_rate": 2.8479333333333334e-05, + "loss": 0.02, + "step": 14568 + }, + { + "epoch": 11.471839306813706, + "grad_norm": 0.5487658977508545, + "learning_rate": 2.8479000000000003e-05, + "loss": 0.02, + "step": 14569 + }, + { + "epoch": 11.472627018511226, + "grad_norm": 0.21220558881759644, + "learning_rate": 2.8478666666666666e-05, + "loss": 0.0069, + "step": 14570 + }, + { + "epoch": 11.473414730208743, + "grad_norm": 0.6399697065353394, + "learning_rate": 2.8478333333333335e-05, + "loss": 0.2082, + "step": 14571 + }, + { + "epoch": 11.474202441906263, + "grad_norm": 0.5071942806243896, + "learning_rate": 2.8478e-05, + "loss": 0.1172, + "step": 14572 + }, + { + "epoch": 11.47499015360378, + "grad_norm": 0.5746302604675293, + "learning_rate": 2.8477666666666667e-05, + "loss": 0.1095, + "step": 14573 + }, + { + "epoch": 11.4757778653013, + "grad_norm": 0.31797516345977783, + "learning_rate": 2.8477333333333333e-05, + "loss": 0.0616, + "step": 14574 + }, + { + "epoch": 11.47656557699882, + "grad_norm": 0.4295033812522888, + "learning_rate": 2.8477000000000002e-05, + "loss": 0.0956, + "step": 14575 + }, + { + "epoch": 11.477353288696337, + "grad_norm": 0.29775819182395935, + "learning_rate": 2.8476666666666668e-05, + "loss": 0.0457, + "step": 14576 + }, + { + "epoch": 11.478141000393856, + "grad_norm": 0.22148820757865906, + 
"learning_rate": 2.8476333333333334e-05, + "loss": 0.0302, + "step": 14577 + }, + { + "epoch": 11.478928712091374, + "grad_norm": 0.3656988739967346, + "learning_rate": 2.8476000000000003e-05, + "loss": 0.0343, + "step": 14578 + }, + { + "epoch": 11.479716423788894, + "grad_norm": 0.4027988016605377, + "learning_rate": 2.8475666666666666e-05, + "loss": 0.0172, + "step": 14579 + }, + { + "epoch": 11.480504135486411, + "grad_norm": 0.23635825514793396, + "learning_rate": 2.8475333333333335e-05, + "loss": 0.037, + "step": 14580 + }, + { + "epoch": 11.48129184718393, + "grad_norm": 0.19018562138080597, + "learning_rate": 2.8475e-05, + "loss": 0.0081, + "step": 14581 + }, + { + "epoch": 11.48207955888145, + "grad_norm": 0.1160113662481308, + "learning_rate": 2.8474666666666667e-05, + "loss": 0.0071, + "step": 14582 + }, + { + "epoch": 11.482867270578968, + "grad_norm": 1.0700931549072266, + "learning_rate": 2.8474333333333333e-05, + "loss": 0.0128, + "step": 14583 + }, + { + "epoch": 11.483654982276487, + "grad_norm": 0.20216509699821472, + "learning_rate": 2.8474000000000002e-05, + "loss": 0.0337, + "step": 14584 + }, + { + "epoch": 11.484442693974005, + "grad_norm": 0.39879071712493896, + "learning_rate": 2.8473666666666668e-05, + "loss": 0.0081, + "step": 14585 + }, + { + "epoch": 11.485230405671524, + "grad_norm": 0.17121604084968567, + "learning_rate": 2.8473333333333334e-05, + "loss": 0.0043, + "step": 14586 + }, + { + "epoch": 11.486018117369042, + "grad_norm": 0.19077469408512115, + "learning_rate": 2.8473000000000003e-05, + "loss": 0.016, + "step": 14587 + }, + { + "epoch": 11.486805829066562, + "grad_norm": 0.10525874048471451, + "learning_rate": 2.8472666666666666e-05, + "loss": 0.0068, + "step": 14588 + }, + { + "epoch": 11.487593540764081, + "grad_norm": 0.5520076155662537, + "learning_rate": 2.8472333333333335e-05, + "loss": 0.0121, + "step": 14589 + }, + { + "epoch": 11.488381252461599, + "grad_norm": 0.31841129064559937, + "learning_rate": 2.8471999999999998e-05, + "loss": 0.0116, + "step": 14590 + }, + { + "epoch": 11.489168964159118, + "grad_norm": 0.509495735168457, + "learning_rate": 2.8471666666666667e-05, + "loss": 0.0249, + "step": 14591 + }, + { + "epoch": 11.489956675856636, + "grad_norm": 0.10248325020074844, + "learning_rate": 2.8471333333333336e-05, + "loss": 0.0042, + "step": 14592 + }, + { + "epoch": 11.490744387554155, + "grad_norm": 0.2988523840904236, + "learning_rate": 2.8471e-05, + "loss": 0.019, + "step": 14593 + }, + { + "epoch": 11.491532099251675, + "grad_norm": 0.44561946392059326, + "learning_rate": 2.8470666666666668e-05, + "loss": 0.0143, + "step": 14594 + }, + { + "epoch": 11.492319810949192, + "grad_norm": 0.45134446024894714, + "learning_rate": 2.8470333333333334e-05, + "loss": 0.0151, + "step": 14595 + }, + { + "epoch": 11.493107522646712, + "grad_norm": 0.6303297281265259, + "learning_rate": 2.847e-05, + "loss": 0.0149, + "step": 14596 + }, + { + "epoch": 11.49389523434423, + "grad_norm": 0.13686122000217438, + "learning_rate": 2.8469666666666666e-05, + "loss": 0.0073, + "step": 14597 + }, + { + "epoch": 11.494682946041749, + "grad_norm": 0.24317459762096405, + "learning_rate": 2.8469333333333335e-05, + "loss": 0.01, + "step": 14598 + }, + { + "epoch": 11.495470657739267, + "grad_norm": 0.20808067917823792, + "learning_rate": 2.8468999999999998e-05, + "loss": 0.0083, + "step": 14599 + }, + { + "epoch": 11.496258369436786, + "grad_norm": 0.30247506499290466, + "learning_rate": 2.8468666666666667e-05, + "loss": 0.0089, + "step": 14600 + }, + { + 
"epoch": 11.497046081134306, + "grad_norm": 0.15883095562458038, + "learning_rate": 2.8468333333333337e-05, + "loss": 0.0075, + "step": 14601 + }, + { + "epoch": 11.497833792831823, + "grad_norm": 0.2823314666748047, + "learning_rate": 2.8468e-05, + "loss": 0.0075, + "step": 14602 + }, + { + "epoch": 11.498621504529343, + "grad_norm": 0.2575574517250061, + "learning_rate": 2.846766666666667e-05, + "loss": 0.0117, + "step": 14603 + }, + { + "epoch": 11.49940921622686, + "grad_norm": 0.1880328208208084, + "learning_rate": 2.8467333333333334e-05, + "loss": 0.0051, + "step": 14604 + }, + { + "epoch": 11.50019692792438, + "grad_norm": 0.39847952127456665, + "learning_rate": 2.8467e-05, + "loss": 0.0221, + "step": 14605 + }, + { + "epoch": 11.500984639621898, + "grad_norm": 0.31626227498054504, + "learning_rate": 2.8466666666666666e-05, + "loss": 0.0083, + "step": 14606 + }, + { + "epoch": 11.501772351319417, + "grad_norm": 0.22845645248889923, + "learning_rate": 2.8466333333333336e-05, + "loss": 0.0123, + "step": 14607 + }, + { + "epoch": 11.502560063016936, + "grad_norm": 0.2487887740135193, + "learning_rate": 2.8465999999999998e-05, + "loss": 0.0077, + "step": 14608 + }, + { + "epoch": 11.503347774714454, + "grad_norm": 0.18647891283035278, + "learning_rate": 2.8465666666666667e-05, + "loss": 0.0067, + "step": 14609 + }, + { + "epoch": 11.504135486411974, + "grad_norm": 0.25310277938842773, + "learning_rate": 2.8465333333333337e-05, + "loss": 0.009, + "step": 14610 + }, + { + "epoch": 11.504923198109491, + "grad_norm": 0.20339232683181763, + "learning_rate": 2.8465e-05, + "loss": 0.0077, + "step": 14611 + }, + { + "epoch": 11.50571090980701, + "grad_norm": 0.14589901268482208, + "learning_rate": 2.846466666666667e-05, + "loss": 0.0097, + "step": 14612 + }, + { + "epoch": 11.50649862150453, + "grad_norm": 0.31400421261787415, + "learning_rate": 2.8464333333333335e-05, + "loss": 0.0122, + "step": 14613 + }, + { + "epoch": 11.507286333202048, + "grad_norm": 0.33367347717285156, + "learning_rate": 2.8464e-05, + "loss": 0.013, + "step": 14614 + }, + { + "epoch": 11.508074044899567, + "grad_norm": 0.26545384526252747, + "learning_rate": 2.8463666666666666e-05, + "loss": 0.0165, + "step": 14615 + }, + { + "epoch": 11.508861756597085, + "grad_norm": 0.19239018857479095, + "learning_rate": 2.8463333333333336e-05, + "loss": 0.01, + "step": 14616 + }, + { + "epoch": 11.509649468294604, + "grad_norm": 0.4351326525211334, + "learning_rate": 2.8463000000000002e-05, + "loss": 0.011, + "step": 14617 + }, + { + "epoch": 11.510437179992122, + "grad_norm": 0.5063092708587646, + "learning_rate": 2.8462666666666668e-05, + "loss": 0.0209, + "step": 14618 + }, + { + "epoch": 11.511224891689642, + "grad_norm": 1.635683298110962, + "learning_rate": 2.8462333333333334e-05, + "loss": 0.0194, + "step": 14619 + }, + { + "epoch": 11.512012603387161, + "grad_norm": 0.31312960386276245, + "learning_rate": 2.8462e-05, + "loss": 0.0104, + "step": 14620 + }, + { + "epoch": 11.512800315084679, + "grad_norm": 0.5751544833183289, + "learning_rate": 2.846166666666667e-05, + "loss": 0.1753, + "step": 14621 + }, + { + "epoch": 11.513588026782198, + "grad_norm": 0.5512446165084839, + "learning_rate": 2.846133333333333e-05, + "loss": 0.1723, + "step": 14622 + }, + { + "epoch": 11.514375738479716, + "grad_norm": 0.41427814960479736, + "learning_rate": 2.8461e-05, + "loss": 0.1023, + "step": 14623 + }, + { + "epoch": 11.515163450177235, + "grad_norm": 0.3098989725112915, + "learning_rate": 2.8460666666666667e-05, + "loss": 0.0866, + 
"step": 14624 + }, + { + "epoch": 11.515951161874753, + "grad_norm": 0.2146068960428238, + "learning_rate": 2.8460333333333333e-05, + "loss": 0.0443, + "step": 14625 + }, + { + "epoch": 11.516738873572272, + "grad_norm": 0.15065912902355194, + "learning_rate": 2.8460000000000002e-05, + "loss": 0.0185, + "step": 14626 + }, + { + "epoch": 11.517526585269792, + "grad_norm": 0.42987167835235596, + "learning_rate": 2.8459666666666668e-05, + "loss": 0.023, + "step": 14627 + }, + { + "epoch": 11.51831429696731, + "grad_norm": 0.8375104069709778, + "learning_rate": 2.8459333333333334e-05, + "loss": 0.0147, + "step": 14628 + }, + { + "epoch": 11.519102008664829, + "grad_norm": 0.1255207657814026, + "learning_rate": 2.8459e-05, + "loss": 0.0068, + "step": 14629 + }, + { + "epoch": 11.519889720362347, + "grad_norm": 0.23998139798641205, + "learning_rate": 2.845866666666667e-05, + "loss": 0.0166, + "step": 14630 + }, + { + "epoch": 11.520677432059866, + "grad_norm": 0.20899997651576996, + "learning_rate": 2.845833333333333e-05, + "loss": 0.0124, + "step": 14631 + }, + { + "epoch": 11.521465143757386, + "grad_norm": 0.1830083131790161, + "learning_rate": 2.8458e-05, + "loss": 0.0083, + "step": 14632 + }, + { + "epoch": 11.522252855454903, + "grad_norm": 0.1419101506471634, + "learning_rate": 2.8457666666666667e-05, + "loss": 0.0142, + "step": 14633 + }, + { + "epoch": 11.523040567152423, + "grad_norm": 0.1429111361503601, + "learning_rate": 2.8457333333333333e-05, + "loss": 0.009, + "step": 14634 + }, + { + "epoch": 11.52382827884994, + "grad_norm": 0.23649901151657104, + "learning_rate": 2.8457000000000002e-05, + "loss": 0.0159, + "step": 14635 + }, + { + "epoch": 11.52461599054746, + "grad_norm": 0.8263759016990662, + "learning_rate": 2.8456666666666668e-05, + "loss": 0.0121, + "step": 14636 + }, + { + "epoch": 11.525403702244978, + "grad_norm": 0.18504300713539124, + "learning_rate": 2.8456333333333334e-05, + "loss": 0.0189, + "step": 14637 + }, + { + "epoch": 11.526191413942497, + "grad_norm": 0.3553701341152191, + "learning_rate": 2.8456e-05, + "loss": 0.023, + "step": 14638 + }, + { + "epoch": 11.526979125640016, + "grad_norm": 0.20449696481227875, + "learning_rate": 2.845566666666667e-05, + "loss": 0.0115, + "step": 14639 + }, + { + "epoch": 11.527766837337534, + "grad_norm": 0.2435765564441681, + "learning_rate": 2.8455333333333332e-05, + "loss": 0.0083, + "step": 14640 + }, + { + "epoch": 11.528554549035054, + "grad_norm": 0.13080038130283356, + "learning_rate": 2.8455e-05, + "loss": 0.0078, + "step": 14641 + }, + { + "epoch": 11.529342260732571, + "grad_norm": 0.3518836498260498, + "learning_rate": 2.845466666666667e-05, + "loss": 0.0071, + "step": 14642 + }, + { + "epoch": 11.53012997243009, + "grad_norm": 0.22404374182224274, + "learning_rate": 2.8454333333333333e-05, + "loss": 0.0129, + "step": 14643 + }, + { + "epoch": 11.530917684127608, + "grad_norm": 0.21068306267261505, + "learning_rate": 2.8454000000000002e-05, + "loss": 0.0106, + "step": 14644 + }, + { + "epoch": 11.531705395825128, + "grad_norm": 0.3884616196155548, + "learning_rate": 2.8453666666666668e-05, + "loss": 0.0135, + "step": 14645 + }, + { + "epoch": 11.532493107522647, + "grad_norm": 0.1896296739578247, + "learning_rate": 2.8453333333333334e-05, + "loss": 0.0132, + "step": 14646 + }, + { + "epoch": 11.533280819220165, + "grad_norm": 0.24139851331710815, + "learning_rate": 2.8453e-05, + "loss": 0.0071, + "step": 14647 + }, + { + "epoch": 11.534068530917684, + "grad_norm": 0.1821463257074356, + "learning_rate": 
2.845266666666667e-05, + "loss": 0.0068, + "step": 14648 + }, + { + "epoch": 11.534856242615202, + "grad_norm": 0.2244957834482193, + "learning_rate": 2.8452333333333332e-05, + "loss": 0.01, + "step": 14649 + }, + { + "epoch": 11.535643954312722, + "grad_norm": 0.1741654872894287, + "learning_rate": 2.8452e-05, + "loss": 0.0075, + "step": 14650 + }, + { + "epoch": 11.536431666010241, + "grad_norm": 0.12390413880348206, + "learning_rate": 2.8451666666666667e-05, + "loss": 0.0043, + "step": 14651 + }, + { + "epoch": 11.537219377707759, + "grad_norm": 0.5219795107841492, + "learning_rate": 2.8451333333333333e-05, + "loss": 0.0148, + "step": 14652 + }, + { + "epoch": 11.538007089405278, + "grad_norm": 0.5757539868354797, + "learning_rate": 2.8451000000000003e-05, + "loss": 0.0165, + "step": 14653 + }, + { + "epoch": 11.538794801102796, + "grad_norm": 0.43066874146461487, + "learning_rate": 2.8450666666666665e-05, + "loss": 0.0136, + "step": 14654 + }, + { + "epoch": 11.539582512800315, + "grad_norm": 0.2152806669473648, + "learning_rate": 2.8450333333333334e-05, + "loss": 0.0152, + "step": 14655 + }, + { + "epoch": 11.540370224497833, + "grad_norm": 0.13609595596790314, + "learning_rate": 2.845e-05, + "loss": 0.0095, + "step": 14656 + }, + { + "epoch": 11.541157936195352, + "grad_norm": 0.22392204403877258, + "learning_rate": 2.8449666666666666e-05, + "loss": 0.0083, + "step": 14657 + }, + { + "epoch": 11.541945647892872, + "grad_norm": 0.258293092250824, + "learning_rate": 2.8449333333333332e-05, + "loss": 0.0209, + "step": 14658 + }, + { + "epoch": 11.54273335959039, + "grad_norm": 0.36200281977653503, + "learning_rate": 2.8449e-05, + "loss": 0.013, + "step": 14659 + }, + { + "epoch": 11.543521071287909, + "grad_norm": 0.16377533972263336, + "learning_rate": 2.8448666666666667e-05, + "loss": 0.0077, + "step": 14660 + }, + { + "epoch": 11.544308782985427, + "grad_norm": 0.43804794549942017, + "learning_rate": 2.8448333333333333e-05, + "loss": 0.0205, + "step": 14661 + }, + { + "epoch": 11.545096494682946, + "grad_norm": 0.6863658428192139, + "learning_rate": 2.8448000000000003e-05, + "loss": 0.0238, + "step": 14662 + }, + { + "epoch": 11.545884206380464, + "grad_norm": 0.3864179253578186, + "learning_rate": 2.8447666666666665e-05, + "loss": 0.0153, + "step": 14663 + }, + { + "epoch": 11.546671918077983, + "grad_norm": 0.10135605931282043, + "learning_rate": 2.8447333333333335e-05, + "loss": 0.0094, + "step": 14664 + }, + { + "epoch": 11.547459629775503, + "grad_norm": 0.49865883588790894, + "learning_rate": 2.8447e-05, + "loss": 0.0139, + "step": 14665 + }, + { + "epoch": 11.54824734147302, + "grad_norm": 0.19522622227668762, + "learning_rate": 2.8446666666666666e-05, + "loss": 0.0104, + "step": 14666 + }, + { + "epoch": 11.54903505317054, + "grad_norm": 0.24149015545845032, + "learning_rate": 2.8446333333333336e-05, + "loss": 0.0129, + "step": 14667 + }, + { + "epoch": 11.549822764868058, + "grad_norm": 0.11091230064630508, + "learning_rate": 2.8446000000000002e-05, + "loss": 0.0064, + "step": 14668 + }, + { + "epoch": 11.550610476565577, + "grad_norm": 0.22517220675945282, + "learning_rate": 2.8445666666666668e-05, + "loss": 0.011, + "step": 14669 + }, + { + "epoch": 11.551398188263097, + "grad_norm": 0.40025678277015686, + "learning_rate": 2.8445333333333334e-05, + "loss": 0.0184, + "step": 14670 + }, + { + "epoch": 11.552185899960614, + "grad_norm": 0.6807453632354736, + "learning_rate": 2.8445000000000003e-05, + "loss": 0.1623, + "step": 14671 + }, + { + "epoch": 11.552973611658134, + 
"grad_norm": 0.5162743926048279, + "learning_rate": 2.8444666666666665e-05, + "loss": 0.1249, + "step": 14672 + }, + { + "epoch": 11.553761323355651, + "grad_norm": 0.4928436279296875, + "learning_rate": 2.8444333333333335e-05, + "loss": 0.1119, + "step": 14673 + }, + { + "epoch": 11.55454903505317, + "grad_norm": 0.39562350511550903, + "learning_rate": 2.8444e-05, + "loss": 0.0745, + "step": 14674 + }, + { + "epoch": 11.555336746750688, + "grad_norm": 0.4296559989452362, + "learning_rate": 2.8443666666666667e-05, + "loss": 0.0305, + "step": 14675 + }, + { + "epoch": 11.556124458448208, + "grad_norm": 0.24138468503952026, + "learning_rate": 2.8443333333333336e-05, + "loss": 0.0278, + "step": 14676 + }, + { + "epoch": 11.556912170145727, + "grad_norm": 0.16327445209026337, + "learning_rate": 2.8443000000000002e-05, + "loss": 0.0379, + "step": 14677 + }, + { + "epoch": 11.557699881843245, + "grad_norm": 0.15728110074996948, + "learning_rate": 2.8442666666666668e-05, + "loss": 0.0101, + "step": 14678 + }, + { + "epoch": 11.558487593540764, + "grad_norm": 0.1335471272468567, + "learning_rate": 2.8442333333333334e-05, + "loss": 0.0112, + "step": 14679 + }, + { + "epoch": 11.559275305238282, + "grad_norm": 0.14769697189331055, + "learning_rate": 2.8442e-05, + "loss": 0.012, + "step": 14680 + }, + { + "epoch": 11.560063016935802, + "grad_norm": 0.1951015144586563, + "learning_rate": 2.8441666666666666e-05, + "loss": 0.0068, + "step": 14681 + }, + { + "epoch": 11.56085072863332, + "grad_norm": 0.25362613797187805, + "learning_rate": 2.8441333333333335e-05, + "loss": 0.0093, + "step": 14682 + }, + { + "epoch": 11.561638440330839, + "grad_norm": 0.35010138154029846, + "learning_rate": 2.8440999999999998e-05, + "loss": 0.011, + "step": 14683 + }, + { + "epoch": 11.562426152028358, + "grad_norm": 0.17773181200027466, + "learning_rate": 2.8440666666666667e-05, + "loss": 0.008, + "step": 14684 + }, + { + "epoch": 11.563213863725876, + "grad_norm": 0.1586485058069229, + "learning_rate": 2.8440333333333336e-05, + "loss": 0.0098, + "step": 14685 + }, + { + "epoch": 11.564001575423395, + "grad_norm": 0.2568548917770386, + "learning_rate": 2.844e-05, + "loss": 0.0126, + "step": 14686 + }, + { + "epoch": 11.564789287120913, + "grad_norm": 0.4478967785835266, + "learning_rate": 2.8439666666666668e-05, + "loss": 0.0194, + "step": 14687 + }, + { + "epoch": 11.565576998818432, + "grad_norm": 0.14799700677394867, + "learning_rate": 2.8439333333333334e-05, + "loss": 0.014, + "step": 14688 + }, + { + "epoch": 11.566364710515952, + "grad_norm": 0.1728009432554245, + "learning_rate": 2.8439e-05, + "loss": 0.0114, + "step": 14689 + }, + { + "epoch": 11.56715242221347, + "grad_norm": 0.35979875922203064, + "learning_rate": 2.8438666666666666e-05, + "loss": 0.0142, + "step": 14690 + }, + { + "epoch": 11.567940133910989, + "grad_norm": 0.23522712290287018, + "learning_rate": 2.8438333333333335e-05, + "loss": 0.0146, + "step": 14691 + }, + { + "epoch": 11.568727845608507, + "grad_norm": 0.30441048741340637, + "learning_rate": 2.8438e-05, + "loss": 0.0403, + "step": 14692 + }, + { + "epoch": 11.569515557306026, + "grad_norm": 0.15834994614124298, + "learning_rate": 2.8437666666666667e-05, + "loss": 0.0078, + "step": 14693 + }, + { + "epoch": 11.570303269003544, + "grad_norm": 0.19478082656860352, + "learning_rate": 2.8437333333333336e-05, + "loss": 0.01, + "step": 14694 + }, + { + "epoch": 11.571090980701063, + "grad_norm": 0.3859444558620453, + "learning_rate": 2.8437e-05, + "loss": 0.0158, + "step": 14695 + }, + { + 
"epoch": 11.571878692398583, + "grad_norm": 0.29722076654434204, + "learning_rate": 2.8436666666666668e-05, + "loss": 0.0167, + "step": 14696 + }, + { + "epoch": 11.5726664040961, + "grad_norm": 0.154929518699646, + "learning_rate": 2.8436333333333334e-05, + "loss": 0.0086, + "step": 14697 + }, + { + "epoch": 11.57345411579362, + "grad_norm": 0.21173511445522308, + "learning_rate": 2.8436e-05, + "loss": 0.0078, + "step": 14698 + }, + { + "epoch": 11.574241827491138, + "grad_norm": 0.165974423289299, + "learning_rate": 2.8435666666666666e-05, + "loss": 0.0066, + "step": 14699 + }, + { + "epoch": 11.575029539188657, + "grad_norm": 0.36999180912971497, + "learning_rate": 2.8435333333333335e-05, + "loss": 0.0139, + "step": 14700 + }, + { + "epoch": 11.575817250886175, + "grad_norm": 0.45306530594825745, + "learning_rate": 2.8435e-05, + "loss": 0.0089, + "step": 14701 + }, + { + "epoch": 11.576604962583694, + "grad_norm": 0.3445071876049042, + "learning_rate": 2.8434666666666667e-05, + "loss": 0.0106, + "step": 14702 + }, + { + "epoch": 11.577392674281214, + "grad_norm": 1.2034192085266113, + "learning_rate": 2.8434333333333337e-05, + "loss": 0.0086, + "step": 14703 + }, + { + "epoch": 11.578180385978731, + "grad_norm": 0.23263013362884521, + "learning_rate": 2.8434e-05, + "loss": 0.0058, + "step": 14704 + }, + { + "epoch": 11.57896809767625, + "grad_norm": 0.24753494560718536, + "learning_rate": 2.843366666666667e-05, + "loss": 0.007, + "step": 14705 + }, + { + "epoch": 11.579755809373768, + "grad_norm": 0.255156546831131, + "learning_rate": 2.8433333333333334e-05, + "loss": 0.0093, + "step": 14706 + }, + { + "epoch": 11.580543521071288, + "grad_norm": 0.28917396068573, + "learning_rate": 2.8433e-05, + "loss": 0.0077, + "step": 14707 + }, + { + "epoch": 11.581331232768807, + "grad_norm": 0.3754309415817261, + "learning_rate": 2.8432666666666666e-05, + "loss": 0.0149, + "step": 14708 + }, + { + "epoch": 11.582118944466325, + "grad_norm": 0.12014846503734589, + "learning_rate": 2.8432333333333332e-05, + "loss": 0.0066, + "step": 14709 + }, + { + "epoch": 11.582906656163845, + "grad_norm": 0.61924147605896, + "learning_rate": 2.8432e-05, + "loss": 0.0238, + "step": 14710 + }, + { + "epoch": 11.583694367861362, + "grad_norm": 0.32516855001449585, + "learning_rate": 2.8431666666666667e-05, + "loss": 0.0151, + "step": 14711 + }, + { + "epoch": 11.584482079558882, + "grad_norm": 0.25100114941596985, + "learning_rate": 2.8431333333333333e-05, + "loss": 0.0114, + "step": 14712 + }, + { + "epoch": 11.5852697912564, + "grad_norm": 0.26175934076309204, + "learning_rate": 2.8431e-05, + "loss": 0.0113, + "step": 14713 + }, + { + "epoch": 11.586057502953919, + "grad_norm": 0.6523993015289307, + "learning_rate": 2.843066666666667e-05, + "loss": 0.0169, + "step": 14714 + }, + { + "epoch": 11.586845214651438, + "grad_norm": 0.29713428020477295, + "learning_rate": 2.843033333333333e-05, + "loss": 0.0155, + "step": 14715 + }, + { + "epoch": 11.587632926348956, + "grad_norm": 0.5046488046646118, + "learning_rate": 2.843e-05, + "loss": 0.0185, + "step": 14716 + }, + { + "epoch": 11.588420638046475, + "grad_norm": 0.5543912649154663, + "learning_rate": 2.842966666666667e-05, + "loss": 0.0112, + "step": 14717 + }, + { + "epoch": 11.589208349743993, + "grad_norm": 0.45889896154403687, + "learning_rate": 2.8429333333333332e-05, + "loss": 0.0174, + "step": 14718 + }, + { + "epoch": 11.589996061441513, + "grad_norm": 0.5194530487060547, + "learning_rate": 2.8429000000000002e-05, + "loss": 0.0214, + "step": 14719 + }, + { 
+ "epoch": 11.59078377313903, + "grad_norm": 0.31433528661727905, + "learning_rate": 2.8428666666666668e-05, + "loss": 0.019, + "step": 14720 + }, + { + "epoch": 11.59157148483655, + "grad_norm": 0.5631967186927795, + "learning_rate": 2.8428333333333334e-05, + "loss": 0.1926, + "step": 14721 + }, + { + "epoch": 11.592359196534069, + "grad_norm": 0.5323708653450012, + "learning_rate": 2.8428e-05, + "loss": 0.1308, + "step": 14722 + }, + { + "epoch": 11.593146908231587, + "grad_norm": 0.6477340459823608, + "learning_rate": 2.842766666666667e-05, + "loss": 0.2149, + "step": 14723 + }, + { + "epoch": 11.593934619929106, + "grad_norm": 0.3724861443042755, + "learning_rate": 2.842733333333333e-05, + "loss": 0.0779, + "step": 14724 + }, + { + "epoch": 11.594722331626624, + "grad_norm": 0.35304197669029236, + "learning_rate": 2.8427e-05, + "loss": 0.047, + "step": 14725 + }, + { + "epoch": 11.595510043324143, + "grad_norm": 0.4696406126022339, + "learning_rate": 2.842666666666667e-05, + "loss": 0.0379, + "step": 14726 + }, + { + "epoch": 11.596297755021663, + "grad_norm": 0.291839599609375, + "learning_rate": 2.8426333333333333e-05, + "loss": 0.022, + "step": 14727 + }, + { + "epoch": 11.59708546671918, + "grad_norm": 0.26849642395973206, + "learning_rate": 2.8426000000000002e-05, + "loss": 0.0282, + "step": 14728 + }, + { + "epoch": 11.5978731784167, + "grad_norm": 0.7746376395225525, + "learning_rate": 2.8425666666666668e-05, + "loss": 0.0643, + "step": 14729 + }, + { + "epoch": 11.598660890114218, + "grad_norm": 1.390557050704956, + "learning_rate": 2.8425333333333334e-05, + "loss": 0.0186, + "step": 14730 + }, + { + "epoch": 11.599448601811737, + "grad_norm": 0.28635409474372864, + "learning_rate": 2.8425e-05, + "loss": 0.0164, + "step": 14731 + }, + { + "epoch": 11.600236313509257, + "grad_norm": 0.31242814660072327, + "learning_rate": 2.842466666666667e-05, + "loss": 0.016, + "step": 14732 + }, + { + "epoch": 11.601024025206774, + "grad_norm": 0.4667927026748657, + "learning_rate": 2.842433333333333e-05, + "loss": 0.0148, + "step": 14733 + }, + { + "epoch": 11.601811736904294, + "grad_norm": 0.27434664964675903, + "learning_rate": 2.8424e-05, + "loss": 0.0096, + "step": 14734 + }, + { + "epoch": 11.602599448601811, + "grad_norm": 0.16004355251789093, + "learning_rate": 2.842366666666667e-05, + "loss": 0.0073, + "step": 14735 + }, + { + "epoch": 11.60338716029933, + "grad_norm": 0.2768729627132416, + "learning_rate": 2.8423333333333333e-05, + "loss": 0.0107, + "step": 14736 + }, + { + "epoch": 11.604174871996848, + "grad_norm": 0.1698644459247589, + "learning_rate": 2.8423000000000002e-05, + "loss": 0.0054, + "step": 14737 + }, + { + "epoch": 11.604962583694368, + "grad_norm": 0.22955860197544098, + "learning_rate": 2.8422666666666668e-05, + "loss": 0.0085, + "step": 14738 + }, + { + "epoch": 11.605750295391886, + "grad_norm": 0.07766865193843842, + "learning_rate": 2.8422333333333334e-05, + "loss": 0.003, + "step": 14739 + }, + { + "epoch": 11.606538007089405, + "grad_norm": 0.40409889817237854, + "learning_rate": 2.8422e-05, + "loss": 0.0105, + "step": 14740 + }, + { + "epoch": 11.607325718786925, + "grad_norm": 0.07951804250478745, + "learning_rate": 2.8421666666666666e-05, + "loss": 0.0044, + "step": 14741 + }, + { + "epoch": 11.608113430484442, + "grad_norm": 0.40366458892822266, + "learning_rate": 2.8421333333333335e-05, + "loss": 0.0184, + "step": 14742 + }, + { + "epoch": 11.608901142181962, + "grad_norm": 0.22408300638198853, + "learning_rate": 2.8421e-05, + "loss": 0.0108, + "step": 
14743 + }, + { + "epoch": 11.60968885387948, + "grad_norm": 0.27248138189315796, + "learning_rate": 2.8420666666666667e-05, + "loss": 0.0117, + "step": 14744 + }, + { + "epoch": 11.610476565576999, + "grad_norm": 0.13521555066108704, + "learning_rate": 2.8420333333333333e-05, + "loss": 0.0056, + "step": 14745 + }, + { + "epoch": 11.611264277274518, + "grad_norm": 0.9453673958778381, + "learning_rate": 2.8420000000000002e-05, + "loss": 0.0117, + "step": 14746 + }, + { + "epoch": 11.612051988972036, + "grad_norm": 0.11943837255239487, + "learning_rate": 2.8419666666666665e-05, + "loss": 0.0064, + "step": 14747 + }, + { + "epoch": 11.612839700669555, + "grad_norm": 0.10406183451414108, + "learning_rate": 2.8419333333333334e-05, + "loss": 0.0067, + "step": 14748 + }, + { + "epoch": 11.613627412367073, + "grad_norm": 0.20210261642932892, + "learning_rate": 2.8419e-05, + "loss": 0.0121, + "step": 14749 + }, + { + "epoch": 11.614415124064593, + "grad_norm": 0.37973976135253906, + "learning_rate": 2.8418666666666666e-05, + "loss": 0.0105, + "step": 14750 + }, + { + "epoch": 11.615202835762112, + "grad_norm": 0.1818361133337021, + "learning_rate": 2.8418333333333335e-05, + "loss": 0.0071, + "step": 14751 + }, + { + "epoch": 11.61599054745963, + "grad_norm": 0.433755099773407, + "learning_rate": 2.8418e-05, + "loss": 0.0223, + "step": 14752 + }, + { + "epoch": 11.61677825915715, + "grad_norm": 0.1685495674610138, + "learning_rate": 2.8417666666666667e-05, + "loss": 0.0095, + "step": 14753 + }, + { + "epoch": 11.617565970854667, + "grad_norm": 0.14453370869159698, + "learning_rate": 2.8417333333333333e-05, + "loss": 0.0083, + "step": 14754 + }, + { + "epoch": 11.618353682552186, + "grad_norm": 0.3053989112377167, + "learning_rate": 2.8417000000000003e-05, + "loss": 0.0128, + "step": 14755 + }, + { + "epoch": 11.619141394249704, + "grad_norm": 0.41286125779151917, + "learning_rate": 2.8416666666666665e-05, + "loss": 0.013, + "step": 14756 + }, + { + "epoch": 11.619929105947223, + "grad_norm": 0.34063422679901123, + "learning_rate": 2.8416333333333334e-05, + "loss": 0.0119, + "step": 14757 + }, + { + "epoch": 11.620716817644743, + "grad_norm": 0.16861552000045776, + "learning_rate": 2.8416e-05, + "loss": 0.0068, + "step": 14758 + }, + { + "epoch": 11.62150452934226, + "grad_norm": 0.159835547208786, + "learning_rate": 2.8415666666666666e-05, + "loss": 0.011, + "step": 14759 + }, + { + "epoch": 11.62229224103978, + "grad_norm": 0.1785086691379547, + "learning_rate": 2.8415333333333336e-05, + "loss": 0.0103, + "step": 14760 + }, + { + "epoch": 11.623079952737298, + "grad_norm": 0.15312442183494568, + "learning_rate": 2.8415e-05, + "loss": 0.0068, + "step": 14761 + }, + { + "epoch": 11.623867664434817, + "grad_norm": 0.32683753967285156, + "learning_rate": 2.8414666666666667e-05, + "loss": 0.0146, + "step": 14762 + }, + { + "epoch": 11.624655376132335, + "grad_norm": 0.17557455599308014, + "learning_rate": 2.8414333333333333e-05, + "loss": 0.0064, + "step": 14763 + }, + { + "epoch": 11.625443087829854, + "grad_norm": 0.2572583556175232, + "learning_rate": 2.8414000000000003e-05, + "loss": 0.0107, + "step": 14764 + }, + { + "epoch": 11.626230799527374, + "grad_norm": 0.12515656650066376, + "learning_rate": 2.8413666666666665e-05, + "loss": 0.0067, + "step": 14765 + }, + { + "epoch": 11.627018511224891, + "grad_norm": 0.3425070345401764, + "learning_rate": 2.8413333333333335e-05, + "loss": 0.0155, + "step": 14766 + }, + { + "epoch": 11.62780622292241, + "grad_norm": 0.20189732313156128, + "learning_rate": 
2.8413000000000004e-05, + "loss": 0.0087, + "step": 14767 + }, + { + "epoch": 11.628593934619929, + "grad_norm": 0.22497831284999847, + "learning_rate": 2.8412666666666666e-05, + "loss": 0.0086, + "step": 14768 + }, + { + "epoch": 11.629381646317448, + "grad_norm": 0.6447279453277588, + "learning_rate": 2.8412333333333336e-05, + "loss": 0.0244, + "step": 14769 + }, + { + "epoch": 11.630169358014967, + "grad_norm": 0.3663930892944336, + "learning_rate": 2.8412e-05, + "loss": 0.0151, + "step": 14770 + }, + { + "epoch": 11.630957069712485, + "grad_norm": 1.0154589414596558, + "learning_rate": 2.8411666666666668e-05, + "loss": 0.2183, + "step": 14771 + }, + { + "epoch": 11.631744781410005, + "grad_norm": 0.5717185139656067, + "learning_rate": 2.8411333333333334e-05, + "loss": 0.1353, + "step": 14772 + }, + { + "epoch": 11.632532493107522, + "grad_norm": 0.38722777366638184, + "learning_rate": 2.8411e-05, + "loss": 0.1018, + "step": 14773 + }, + { + "epoch": 11.633320204805042, + "grad_norm": 0.42810919880867004, + "learning_rate": 2.8410666666666665e-05, + "loss": 0.0935, + "step": 14774 + }, + { + "epoch": 11.63410791650256, + "grad_norm": 0.5138535499572754, + "learning_rate": 2.8410333333333335e-05, + "loss": 0.0731, + "step": 14775 + }, + { + "epoch": 11.634895628200079, + "grad_norm": 0.4193761348724365, + "learning_rate": 2.841e-05, + "loss": 0.0438, + "step": 14776 + }, + { + "epoch": 11.635683339897598, + "grad_norm": 0.17179511487483978, + "learning_rate": 2.8409666666666667e-05, + "loss": 0.012, + "step": 14777 + }, + { + "epoch": 11.636471051595116, + "grad_norm": 0.22213931381702423, + "learning_rate": 2.8409333333333336e-05, + "loss": 0.0244, + "step": 14778 + }, + { + "epoch": 11.637258763292635, + "grad_norm": 0.6385711431503296, + "learning_rate": 2.8409e-05, + "loss": 0.0127, + "step": 14779 + }, + { + "epoch": 11.638046474990153, + "grad_norm": 0.22362925112247467, + "learning_rate": 2.8408666666666668e-05, + "loss": 0.0097, + "step": 14780 + }, + { + "epoch": 11.638834186687673, + "grad_norm": 0.1479131132364273, + "learning_rate": 2.8408333333333334e-05, + "loss": 0.0073, + "step": 14781 + }, + { + "epoch": 11.63962189838519, + "grad_norm": 0.3891717493534088, + "learning_rate": 2.8408e-05, + "loss": 0.0252, + "step": 14782 + }, + { + "epoch": 11.64040961008271, + "grad_norm": 0.1422877162694931, + "learning_rate": 2.8407666666666666e-05, + "loss": 0.0056, + "step": 14783 + }, + { + "epoch": 11.64119732178023, + "grad_norm": 0.13706979155540466, + "learning_rate": 2.8407333333333335e-05, + "loss": 0.0087, + "step": 14784 + }, + { + "epoch": 11.641985033477747, + "grad_norm": 0.2669496238231659, + "learning_rate": 2.8407e-05, + "loss": 0.0146, + "step": 14785 + }, + { + "epoch": 11.642772745175266, + "grad_norm": 0.11840427666902542, + "learning_rate": 2.8406666666666667e-05, + "loss": 0.0084, + "step": 14786 + }, + { + "epoch": 11.643560456872784, + "grad_norm": 0.1768328845500946, + "learning_rate": 2.8406333333333336e-05, + "loss": 0.015, + "step": 14787 + }, + { + "epoch": 11.644348168570303, + "grad_norm": 0.5370997786521912, + "learning_rate": 2.8406e-05, + "loss": 0.0145, + "step": 14788 + }, + { + "epoch": 11.645135880267823, + "grad_norm": 0.18027757108211517, + "learning_rate": 2.8405666666666668e-05, + "loss": 0.008, + "step": 14789 + }, + { + "epoch": 11.64592359196534, + "grad_norm": 0.15227407217025757, + "learning_rate": 2.8405333333333334e-05, + "loss": 0.0049, + "step": 14790 + }, + { + "epoch": 11.64671130366286, + "grad_norm": 0.12621913850307465, + 
"learning_rate": 2.8405e-05, + "loss": 0.0071, + "step": 14791 + }, + { + "epoch": 11.647499015360378, + "grad_norm": 0.9170981049537659, + "learning_rate": 2.840466666666667e-05, + "loss": 0.0225, + "step": 14792 + }, + { + "epoch": 11.648286727057897, + "grad_norm": 0.20452648401260376, + "learning_rate": 2.8404333333333335e-05, + "loss": 0.0127, + "step": 14793 + }, + { + "epoch": 11.649074438755415, + "grad_norm": 0.18465760350227356, + "learning_rate": 2.8404e-05, + "loss": 0.0116, + "step": 14794 + }, + { + "epoch": 11.649862150452934, + "grad_norm": 0.2869703471660614, + "learning_rate": 2.8403666666666667e-05, + "loss": 0.0121, + "step": 14795 + }, + { + "epoch": 11.650649862150454, + "grad_norm": 0.16436754167079926, + "learning_rate": 2.8403333333333336e-05, + "loss": 0.0088, + "step": 14796 + }, + { + "epoch": 11.651437573847971, + "grad_norm": 0.25435128808021545, + "learning_rate": 2.8403e-05, + "loss": 0.0119, + "step": 14797 + }, + { + "epoch": 11.65222528554549, + "grad_norm": 0.15861207246780396, + "learning_rate": 2.8402666666666668e-05, + "loss": 0.0094, + "step": 14798 + }, + { + "epoch": 11.653012997243009, + "grad_norm": 0.3161252439022064, + "learning_rate": 2.8402333333333334e-05, + "loss": 0.0141, + "step": 14799 + }, + { + "epoch": 11.653800708940528, + "grad_norm": 0.14437377452850342, + "learning_rate": 2.8402e-05, + "loss": 0.0088, + "step": 14800 + }, + { + "epoch": 11.654588420638046, + "grad_norm": 0.16646677255630493, + "learning_rate": 2.840166666666667e-05, + "loss": 0.0065, + "step": 14801 + }, + { + "epoch": 11.655376132335565, + "grad_norm": 0.18652905523777008, + "learning_rate": 2.8401333333333332e-05, + "loss": 0.0146, + "step": 14802 + }, + { + "epoch": 11.656163844033085, + "grad_norm": 0.26223307847976685, + "learning_rate": 2.8401e-05, + "loss": 0.0141, + "step": 14803 + }, + { + "epoch": 11.656951555730602, + "grad_norm": 0.15996672213077545, + "learning_rate": 2.8400666666666667e-05, + "loss": 0.0068, + "step": 14804 + }, + { + "epoch": 11.657739267428122, + "grad_norm": 0.423633873462677, + "learning_rate": 2.8400333333333333e-05, + "loss": 0.0175, + "step": 14805 + }, + { + "epoch": 11.65852697912564, + "grad_norm": 0.17484937608242035, + "learning_rate": 2.84e-05, + "loss": 0.0109, + "step": 14806 + }, + { + "epoch": 11.659314690823159, + "grad_norm": 0.2965453565120697, + "learning_rate": 2.839966666666667e-05, + "loss": 0.0085, + "step": 14807 + }, + { + "epoch": 11.660102402520678, + "grad_norm": 0.42960038781166077, + "learning_rate": 2.8399333333333334e-05, + "loss": 0.0075, + "step": 14808 + }, + { + "epoch": 11.660890114218196, + "grad_norm": 0.25100067257881165, + "learning_rate": 2.8399e-05, + "loss": 0.0145, + "step": 14809 + }, + { + "epoch": 11.661677825915715, + "grad_norm": 0.22752757370471954, + "learning_rate": 2.839866666666667e-05, + "loss": 0.0121, + "step": 14810 + }, + { + "epoch": 11.662465537613233, + "grad_norm": 0.204904243350029, + "learning_rate": 2.8398333333333332e-05, + "loss": 0.0099, + "step": 14811 + }, + { + "epoch": 11.663253249310753, + "grad_norm": 0.4685540199279785, + "learning_rate": 2.8398e-05, + "loss": 0.0126, + "step": 14812 + }, + { + "epoch": 11.66404096100827, + "grad_norm": 0.3176911771297455, + "learning_rate": 2.8397666666666667e-05, + "loss": 0.0104, + "step": 14813 + }, + { + "epoch": 11.66482867270579, + "grad_norm": 0.15944023430347443, + "learning_rate": 2.8397333333333333e-05, + "loss": 0.008, + "step": 14814 + }, + { + "epoch": 11.66561638440331, + "grad_norm": 0.22070232033729553, + 
"learning_rate": 2.8397e-05, + "loss": 0.0118, + "step": 14815 + }, + { + "epoch": 11.666404096100827, + "grad_norm": 0.17278586328029633, + "learning_rate": 2.839666666666667e-05, + "loss": 0.0155, + "step": 14816 + }, + { + "epoch": 11.667191807798346, + "grad_norm": 0.30373314023017883, + "learning_rate": 2.8396333333333335e-05, + "loss": 0.0138, + "step": 14817 + }, + { + "epoch": 11.667979519495864, + "grad_norm": 0.2112508863210678, + "learning_rate": 2.8396e-05, + "loss": 0.0102, + "step": 14818 + }, + { + "epoch": 11.668767231193383, + "grad_norm": 0.45598307251930237, + "learning_rate": 2.839566666666667e-05, + "loss": 0.0108, + "step": 14819 + }, + { + "epoch": 11.669554942890901, + "grad_norm": 0.35760459303855896, + "learning_rate": 2.8395333333333332e-05, + "loss": 0.0132, + "step": 14820 + }, + { + "epoch": 11.67034265458842, + "grad_norm": 0.5214633941650391, + "learning_rate": 2.8395000000000002e-05, + "loss": 0.151, + "step": 14821 + }, + { + "epoch": 11.67113036628594, + "grad_norm": 0.7773906588554382, + "learning_rate": 2.8394666666666668e-05, + "loss": 0.1873, + "step": 14822 + }, + { + "epoch": 11.671918077983458, + "grad_norm": 0.4748157262802124, + "learning_rate": 2.8394333333333334e-05, + "loss": 0.1155, + "step": 14823 + }, + { + "epoch": 11.672705789680977, + "grad_norm": 0.5347843170166016, + "learning_rate": 2.8394e-05, + "loss": 0.113, + "step": 14824 + }, + { + "epoch": 11.673493501378495, + "grad_norm": 0.6265652179718018, + "learning_rate": 2.839366666666667e-05, + "loss": 0.0426, + "step": 14825 + }, + { + "epoch": 11.674281213076014, + "grad_norm": 0.8025290966033936, + "learning_rate": 2.8393333333333335e-05, + "loss": 0.0921, + "step": 14826 + }, + { + "epoch": 11.675068924773534, + "grad_norm": 0.18662065267562866, + "learning_rate": 2.8393e-05, + "loss": 0.0175, + "step": 14827 + }, + { + "epoch": 11.675856636471051, + "grad_norm": 0.3133064806461334, + "learning_rate": 2.839266666666667e-05, + "loss": 0.0201, + "step": 14828 + }, + { + "epoch": 11.67664434816857, + "grad_norm": 0.32104358077049255, + "learning_rate": 2.8392333333333333e-05, + "loss": 0.0354, + "step": 14829 + }, + { + "epoch": 11.677432059866089, + "grad_norm": 0.3093222975730896, + "learning_rate": 2.8392000000000002e-05, + "loss": 0.0166, + "step": 14830 + }, + { + "epoch": 11.678219771563608, + "grad_norm": 0.38352343440055847, + "learning_rate": 2.8391666666666664e-05, + "loss": 0.0163, + "step": 14831 + }, + { + "epoch": 11.679007483261126, + "grad_norm": 0.15871012210845947, + "learning_rate": 2.8391333333333334e-05, + "loss": 0.0109, + "step": 14832 + }, + { + "epoch": 11.679795194958645, + "grad_norm": 0.18234236538410187, + "learning_rate": 2.8391e-05, + "loss": 0.0107, + "step": 14833 + }, + { + "epoch": 11.680582906656165, + "grad_norm": 0.13126900792121887, + "learning_rate": 2.8390666666666666e-05, + "loss": 0.0117, + "step": 14834 + }, + { + "epoch": 11.681370618353682, + "grad_norm": 0.2209440916776657, + "learning_rate": 2.8390333333333335e-05, + "loss": 0.0127, + "step": 14835 + }, + { + "epoch": 11.682158330051202, + "grad_norm": 0.795861005783081, + "learning_rate": 2.839e-05, + "loss": 0.013, + "step": 14836 + }, + { + "epoch": 11.68294604174872, + "grad_norm": 0.2511570155620575, + "learning_rate": 2.8389666666666667e-05, + "loss": 0.0087, + "step": 14837 + }, + { + "epoch": 11.683733753446239, + "grad_norm": 0.149372398853302, + "learning_rate": 2.8389333333333333e-05, + "loss": 0.0106, + "step": 14838 + }, + { + "epoch": 11.684521465143757, + "grad_norm": 
0.4507848620414734, + "learning_rate": 2.8389000000000002e-05, + "loss": 0.0195, + "step": 14839 + }, + { + "epoch": 11.685309176841276, + "grad_norm": 0.1854102909564972, + "learning_rate": 2.8388666666666665e-05, + "loss": 0.0157, + "step": 14840 + }, + { + "epoch": 11.686096888538795, + "grad_norm": 0.22978508472442627, + "learning_rate": 2.8388333333333334e-05, + "loss": 0.0066, + "step": 14841 + }, + { + "epoch": 11.686884600236313, + "grad_norm": 0.33790692687034607, + "learning_rate": 2.8388000000000003e-05, + "loss": 0.0182, + "step": 14842 + }, + { + "epoch": 11.687672311933833, + "grad_norm": 0.7237936854362488, + "learning_rate": 2.8387666666666666e-05, + "loss": 0.0319, + "step": 14843 + }, + { + "epoch": 11.68846002363135, + "grad_norm": 0.24794188141822815, + "learning_rate": 2.8387333333333335e-05, + "loss": 0.009, + "step": 14844 + }, + { + "epoch": 11.68924773532887, + "grad_norm": 0.532772421836853, + "learning_rate": 2.8387e-05, + "loss": 0.0455, + "step": 14845 + }, + { + "epoch": 11.69003544702639, + "grad_norm": 0.4961252808570862, + "learning_rate": 2.8386666666666667e-05, + "loss": 0.0188, + "step": 14846 + }, + { + "epoch": 11.690823158723907, + "grad_norm": 0.30629080533981323, + "learning_rate": 2.8386333333333333e-05, + "loss": 0.0116, + "step": 14847 + }, + { + "epoch": 11.691610870421426, + "grad_norm": 0.3169872760772705, + "learning_rate": 2.8386000000000002e-05, + "loss": 0.0139, + "step": 14848 + }, + { + "epoch": 11.692398582118944, + "grad_norm": 0.2227480560541153, + "learning_rate": 2.8385666666666665e-05, + "loss": 0.0148, + "step": 14849 + }, + { + "epoch": 11.693186293816463, + "grad_norm": 0.37487831711769104, + "learning_rate": 2.8385333333333334e-05, + "loss": 0.0145, + "step": 14850 + }, + { + "epoch": 11.693974005513981, + "grad_norm": 0.44719091057777405, + "learning_rate": 2.8385000000000004e-05, + "loss": 0.0105, + "step": 14851 + }, + { + "epoch": 11.6947617172115, + "grad_norm": 0.3043826222419739, + "learning_rate": 2.8384666666666666e-05, + "loss": 0.0155, + "step": 14852 + }, + { + "epoch": 11.69554942890902, + "grad_norm": 0.16376157104969025, + "learning_rate": 2.8384333333333335e-05, + "loss": 0.0071, + "step": 14853 + }, + { + "epoch": 11.696337140606538, + "grad_norm": 0.3755529522895813, + "learning_rate": 2.8384e-05, + "loss": 0.0131, + "step": 14854 + }, + { + "epoch": 11.697124852304057, + "grad_norm": 0.17684495449066162, + "learning_rate": 2.8383666666666667e-05, + "loss": 0.0123, + "step": 14855 + }, + { + "epoch": 11.697912564001575, + "grad_norm": 0.36894312500953674, + "learning_rate": 2.8383333333333333e-05, + "loss": 0.0128, + "step": 14856 + }, + { + "epoch": 11.698700275699094, + "grad_norm": 0.4575793147087097, + "learning_rate": 2.8383000000000003e-05, + "loss": 0.017, + "step": 14857 + }, + { + "epoch": 11.699487987396612, + "grad_norm": 0.21060553193092346, + "learning_rate": 2.838266666666667e-05, + "loss": 0.0087, + "step": 14858 + }, + { + "epoch": 11.700275699094131, + "grad_norm": 0.14270740747451782, + "learning_rate": 2.8382333333333334e-05, + "loss": 0.0092, + "step": 14859 + }, + { + "epoch": 11.701063410791651, + "grad_norm": 0.24188026785850525, + "learning_rate": 2.8382e-05, + "loss": 0.0104, + "step": 14860 + }, + { + "epoch": 11.701851122489169, + "grad_norm": 0.2517240345478058, + "learning_rate": 2.8381666666666666e-05, + "loss": 0.0108, + "step": 14861 + }, + { + "epoch": 11.702638834186688, + "grad_norm": 0.25475019216537476, + "learning_rate": 2.8381333333333336e-05, + "loss": 0.0082, + "step": 
14862 + }, + { + "epoch": 11.703426545884206, + "grad_norm": 0.2363438457250595, + "learning_rate": 2.8380999999999998e-05, + "loss": 0.0148, + "step": 14863 + }, + { + "epoch": 11.704214257581725, + "grad_norm": 0.3572145402431488, + "learning_rate": 2.8380666666666667e-05, + "loss": 0.0142, + "step": 14864 + }, + { + "epoch": 11.705001969279245, + "grad_norm": 0.3003295660018921, + "learning_rate": 2.8380333333333333e-05, + "loss": 0.021, + "step": 14865 + }, + { + "epoch": 11.705789680976762, + "grad_norm": 0.1827978938817978, + "learning_rate": 2.838e-05, + "loss": 0.0083, + "step": 14866 + }, + { + "epoch": 11.706577392674282, + "grad_norm": 0.20093099772930145, + "learning_rate": 2.837966666666667e-05, + "loss": 0.0115, + "step": 14867 + }, + { + "epoch": 11.7073651043718, + "grad_norm": 0.22010184824466705, + "learning_rate": 2.8379333333333335e-05, + "loss": 0.0056, + "step": 14868 + }, + { + "epoch": 11.708152816069319, + "grad_norm": 0.23065291345119476, + "learning_rate": 2.8379e-05, + "loss": 0.015, + "step": 14869 + }, + { + "epoch": 11.708940527766837, + "grad_norm": 0.31223711371421814, + "learning_rate": 2.8378666666666666e-05, + "loss": 0.0075, + "step": 14870 + }, + { + "epoch": 11.709728239464356, + "grad_norm": 0.610490083694458, + "learning_rate": 2.8378333333333336e-05, + "loss": 0.201, + "step": 14871 + }, + { + "epoch": 11.710515951161875, + "grad_norm": 0.6957754492759705, + "learning_rate": 2.8378e-05, + "loss": 0.0929, + "step": 14872 + }, + { + "epoch": 11.711303662859393, + "grad_norm": 0.5005167722702026, + "learning_rate": 2.8377666666666668e-05, + "loss": 0.1559, + "step": 14873 + }, + { + "epoch": 11.712091374556913, + "grad_norm": 0.3341619670391083, + "learning_rate": 2.8377333333333334e-05, + "loss": 0.0856, + "step": 14874 + }, + { + "epoch": 11.71287908625443, + "grad_norm": 0.32394134998321533, + "learning_rate": 2.8377e-05, + "loss": 0.0663, + "step": 14875 + }, + { + "epoch": 11.71366679795195, + "grad_norm": 0.19036811590194702, + "learning_rate": 2.837666666666667e-05, + "loss": 0.0169, + "step": 14876 + }, + { + "epoch": 11.714454509649467, + "grad_norm": 0.2526274025440216, + "learning_rate": 2.8376333333333335e-05, + "loss": 0.0193, + "step": 14877 + }, + { + "epoch": 11.715242221346987, + "grad_norm": 0.7114115357398987, + "learning_rate": 2.8376e-05, + "loss": 0.0892, + "step": 14878 + }, + { + "epoch": 11.716029933044506, + "grad_norm": 0.18526123464107513, + "learning_rate": 2.8375666666666667e-05, + "loss": 0.0149, + "step": 14879 + }, + { + "epoch": 11.716817644742024, + "grad_norm": 0.17177022993564606, + "learning_rate": 2.8375333333333336e-05, + "loss": 0.0142, + "step": 14880 + }, + { + "epoch": 11.717605356439543, + "grad_norm": 0.26503506302833557, + "learning_rate": 2.8375e-05, + "loss": 0.0119, + "step": 14881 + }, + { + "epoch": 11.718393068137061, + "grad_norm": 0.22399750351905823, + "learning_rate": 2.8374666666666668e-05, + "loss": 0.0148, + "step": 14882 + }, + { + "epoch": 11.71918077983458, + "grad_norm": 0.07851666957139969, + "learning_rate": 2.8374333333333334e-05, + "loss": 0.0061, + "step": 14883 + }, + { + "epoch": 11.7199684915321, + "grad_norm": 0.14692825078964233, + "learning_rate": 2.8374e-05, + "loss": 0.0097, + "step": 14884 + }, + { + "epoch": 11.720756203229618, + "grad_norm": 0.1370217651128769, + "learning_rate": 2.837366666666667e-05, + "loss": 0.0095, + "step": 14885 + }, + { + "epoch": 11.721543914927137, + "grad_norm": 0.3303563892841339, + "learning_rate": 2.8373333333333335e-05, + "loss": 0.0208, + 
"step": 14886 + }, + { + "epoch": 11.722331626624655, + "grad_norm": 0.11647241562604904, + "learning_rate": 2.8373e-05, + "loss": 0.0084, + "step": 14887 + }, + { + "epoch": 11.723119338322174, + "grad_norm": 0.30715999007225037, + "learning_rate": 2.8372666666666667e-05, + "loss": 0.0085, + "step": 14888 + }, + { + "epoch": 11.723907050019692, + "grad_norm": 0.08548833429813385, + "learning_rate": 2.8372333333333336e-05, + "loss": 0.0074, + "step": 14889 + }, + { + "epoch": 11.724694761717211, + "grad_norm": 0.5386260747909546, + "learning_rate": 2.8372e-05, + "loss": 0.0147, + "step": 14890 + }, + { + "epoch": 11.725482473414731, + "grad_norm": 0.48880231380462646, + "learning_rate": 2.8371666666666668e-05, + "loss": 0.0184, + "step": 14891 + }, + { + "epoch": 11.726270185112249, + "grad_norm": 0.1278677135705948, + "learning_rate": 2.8371333333333334e-05, + "loss": 0.0066, + "step": 14892 + }, + { + "epoch": 11.727057896809768, + "grad_norm": 0.15814588963985443, + "learning_rate": 2.8371e-05, + "loss": 0.0096, + "step": 14893 + }, + { + "epoch": 11.727845608507286, + "grad_norm": 0.2985146641731262, + "learning_rate": 2.837066666666667e-05, + "loss": 0.0101, + "step": 14894 + }, + { + "epoch": 11.728633320204805, + "grad_norm": 0.1277742236852646, + "learning_rate": 2.8370333333333332e-05, + "loss": 0.0071, + "step": 14895 + }, + { + "epoch": 11.729421031902323, + "grad_norm": 0.25597497820854187, + "learning_rate": 2.837e-05, + "loss": 0.0104, + "step": 14896 + }, + { + "epoch": 11.730208743599842, + "grad_norm": 0.17592093348503113, + "learning_rate": 2.8369666666666667e-05, + "loss": 0.0158, + "step": 14897 + }, + { + "epoch": 11.730996455297362, + "grad_norm": 0.2049092948436737, + "learning_rate": 2.8369333333333333e-05, + "loss": 0.0099, + "step": 14898 + }, + { + "epoch": 11.73178416699488, + "grad_norm": 0.12916508316993713, + "learning_rate": 2.8369e-05, + "loss": 0.0055, + "step": 14899 + }, + { + "epoch": 11.732571878692399, + "grad_norm": 0.24389873445034027, + "learning_rate": 2.8368666666666668e-05, + "loss": 0.0234, + "step": 14900 + }, + { + "epoch": 11.733359590389917, + "grad_norm": 0.15334805846214294, + "learning_rate": 2.8368333333333334e-05, + "loss": 0.0071, + "step": 14901 + }, + { + "epoch": 11.734147302087436, + "grad_norm": 0.1302160620689392, + "learning_rate": 2.8368e-05, + "loss": 0.0057, + "step": 14902 + }, + { + "epoch": 11.734935013784956, + "grad_norm": 0.23397253453731537, + "learning_rate": 2.836766666666667e-05, + "loss": 0.0153, + "step": 14903 + }, + { + "epoch": 11.735722725482473, + "grad_norm": 0.5598808526992798, + "learning_rate": 2.8367333333333332e-05, + "loss": 0.0129, + "step": 14904 + }, + { + "epoch": 11.736510437179993, + "grad_norm": 0.15074731409549713, + "learning_rate": 2.8367e-05, + "loss": 0.0071, + "step": 14905 + }, + { + "epoch": 11.73729814887751, + "grad_norm": 0.23930087685585022, + "learning_rate": 2.8366666666666667e-05, + "loss": 0.0104, + "step": 14906 + }, + { + "epoch": 11.73808586057503, + "grad_norm": 0.2314247190952301, + "learning_rate": 2.8366333333333333e-05, + "loss": 0.0121, + "step": 14907 + }, + { + "epoch": 11.738873572272547, + "grad_norm": 0.6834248304367065, + "learning_rate": 2.8366000000000003e-05, + "loss": 0.0116, + "step": 14908 + }, + { + "epoch": 11.739661283970067, + "grad_norm": 0.36302563548088074, + "learning_rate": 2.836566666666667e-05, + "loss": 0.0196, + "step": 14909 + }, + { + "epoch": 11.740448995667586, + "grad_norm": 0.3000728487968445, + "learning_rate": 2.8365333333333334e-05, + 
"loss": 0.0091, + "step": 14910 + }, + { + "epoch": 11.741236707365104, + "grad_norm": 0.3581975996494293, + "learning_rate": 2.8365e-05, + "loss": 0.0102, + "step": 14911 + }, + { + "epoch": 11.742024419062624, + "grad_norm": 0.24587823450565338, + "learning_rate": 2.836466666666667e-05, + "loss": 0.0086, + "step": 14912 + }, + { + "epoch": 11.742812130760141, + "grad_norm": 0.2823675274848938, + "learning_rate": 2.8364333333333332e-05, + "loss": 0.0121, + "step": 14913 + }, + { + "epoch": 11.74359984245766, + "grad_norm": 0.323066383600235, + "learning_rate": 2.8364e-05, + "loss": 0.0097, + "step": 14914 + }, + { + "epoch": 11.744387554155178, + "grad_norm": 0.27275350689888, + "learning_rate": 2.8363666666666667e-05, + "loss": 0.0234, + "step": 14915 + }, + { + "epoch": 11.745175265852698, + "grad_norm": 0.4123269319534302, + "learning_rate": 2.8363333333333333e-05, + "loss": 0.0113, + "step": 14916 + }, + { + "epoch": 11.745962977550217, + "grad_norm": 0.11871758848428726, + "learning_rate": 2.8363000000000003e-05, + "loss": 0.0055, + "step": 14917 + }, + { + "epoch": 11.746750689247735, + "grad_norm": 0.4670640230178833, + "learning_rate": 2.836266666666667e-05, + "loss": 0.0177, + "step": 14918 + }, + { + "epoch": 11.747538400945254, + "grad_norm": 0.319537490606308, + "learning_rate": 2.8362333333333335e-05, + "loss": 0.0115, + "step": 14919 + }, + { + "epoch": 11.748326112642772, + "grad_norm": 0.2192443460226059, + "learning_rate": 2.8362e-05, + "loss": 0.0133, + "step": 14920 + }, + { + "epoch": 11.749113824340292, + "grad_norm": 0.6209616661071777, + "learning_rate": 2.8361666666666666e-05, + "loss": 0.1556, + "step": 14921 + }, + { + "epoch": 11.749901536037811, + "grad_norm": 0.7477407455444336, + "learning_rate": 2.8361333333333332e-05, + "loss": 0.1651, + "step": 14922 + }, + { + "epoch": 11.750689247735329, + "grad_norm": 0.5263681411743164, + "learning_rate": 2.8361000000000002e-05, + "loss": 0.1179, + "step": 14923 + }, + { + "epoch": 11.751476959432848, + "grad_norm": 0.554900586605072, + "learning_rate": 2.8360666666666664e-05, + "loss": 0.1006, + "step": 14924 + }, + { + "epoch": 11.752264671130366, + "grad_norm": 0.41716787219047546, + "learning_rate": 2.8360333333333334e-05, + "loss": 0.075, + "step": 14925 + }, + { + "epoch": 11.753052382827885, + "grad_norm": 0.4102745056152344, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.0347, + "step": 14926 + }, + { + "epoch": 11.753840094525403, + "grad_norm": 0.2926805317401886, + "learning_rate": 2.8359666666666665e-05, + "loss": 0.0168, + "step": 14927 + }, + { + "epoch": 11.754627806222922, + "grad_norm": 0.3400616943836212, + "learning_rate": 2.8359333333333335e-05, + "loss": 0.0575, + "step": 14928 + }, + { + "epoch": 11.755415517920442, + "grad_norm": 0.14193889498710632, + "learning_rate": 2.8359e-05, + "loss": 0.0088, + "step": 14929 + }, + { + "epoch": 11.75620322961796, + "grad_norm": 0.1955934762954712, + "learning_rate": 2.8358666666666667e-05, + "loss": 0.0114, + "step": 14930 + }, + { + "epoch": 11.756990941315479, + "grad_norm": 0.17945434153079987, + "learning_rate": 2.8358333333333333e-05, + "loss": 0.008, + "step": 14931 + }, + { + "epoch": 11.757778653012997, + "grad_norm": 0.2550886571407318, + "learning_rate": 2.8358000000000002e-05, + "loss": 0.0489, + "step": 14932 + }, + { + "epoch": 11.758566364710516, + "grad_norm": 0.22569264471530914, + "learning_rate": 2.8357666666666668e-05, + "loss": 0.0091, + "step": 14933 + }, + { + "epoch": 11.759354076408034, + "grad_norm": 0.18901440501213074, + 
"learning_rate": 2.8357333333333334e-05, + "loss": 0.009, + "step": 14934 + }, + { + "epoch": 11.760141788105553, + "grad_norm": 0.4241901636123657, + "learning_rate": 2.8357000000000003e-05, + "loss": 0.0172, + "step": 14935 + }, + { + "epoch": 11.760929499803073, + "grad_norm": 0.4451787769794464, + "learning_rate": 2.8356666666666666e-05, + "loss": 0.0142, + "step": 14936 + }, + { + "epoch": 11.76171721150059, + "grad_norm": 0.1003662496805191, + "learning_rate": 2.8356333333333335e-05, + "loss": 0.0055, + "step": 14937 + }, + { + "epoch": 11.76250492319811, + "grad_norm": 0.2683355212211609, + "learning_rate": 2.8356e-05, + "loss": 0.0161, + "step": 14938 + }, + { + "epoch": 11.763292634895627, + "grad_norm": 0.19654664397239685, + "learning_rate": 2.8355666666666667e-05, + "loss": 0.0112, + "step": 14939 + }, + { + "epoch": 11.764080346593147, + "grad_norm": 0.19045008718967438, + "learning_rate": 2.8355333333333333e-05, + "loss": 0.0051, + "step": 14940 + }, + { + "epoch": 11.764868058290666, + "grad_norm": 0.21782852709293365, + "learning_rate": 2.8355000000000002e-05, + "loss": 0.0085, + "step": 14941 + }, + { + "epoch": 11.765655769988184, + "grad_norm": 0.18528366088867188, + "learning_rate": 2.8354666666666668e-05, + "loss": 0.0086, + "step": 14942 + }, + { + "epoch": 11.766443481685704, + "grad_norm": 1.2154748439788818, + "learning_rate": 2.8354333333333334e-05, + "loss": 0.016, + "step": 14943 + }, + { + "epoch": 11.767231193383221, + "grad_norm": 0.13959985971450806, + "learning_rate": 2.8354000000000003e-05, + "loss": 0.008, + "step": 14944 + }, + { + "epoch": 11.76801890508074, + "grad_norm": 0.19111984968185425, + "learning_rate": 2.8353666666666666e-05, + "loss": 0.0063, + "step": 14945 + }, + { + "epoch": 11.768806616778258, + "grad_norm": 0.39312463998794556, + "learning_rate": 2.8353333333333335e-05, + "loss": 0.0102, + "step": 14946 + }, + { + "epoch": 11.769594328475778, + "grad_norm": 0.2428353726863861, + "learning_rate": 2.8353e-05, + "loss": 0.013, + "step": 14947 + }, + { + "epoch": 11.770382040173297, + "grad_norm": 0.17355695366859436, + "learning_rate": 2.8352666666666667e-05, + "loss": 0.0125, + "step": 14948 + }, + { + "epoch": 11.771169751870815, + "grad_norm": 0.11843522638082504, + "learning_rate": 2.8352333333333333e-05, + "loss": 0.0057, + "step": 14949 + }, + { + "epoch": 11.771957463568334, + "grad_norm": 0.1846674382686615, + "learning_rate": 2.8352000000000002e-05, + "loss": 0.0091, + "step": 14950 + }, + { + "epoch": 11.772745175265852, + "grad_norm": 0.34963664412498474, + "learning_rate": 2.8351666666666668e-05, + "loss": 0.0202, + "step": 14951 + }, + { + "epoch": 11.773532886963372, + "grad_norm": 0.14678704738616943, + "learning_rate": 2.8351333333333334e-05, + "loss": 0.0088, + "step": 14952 + }, + { + "epoch": 11.77432059866089, + "grad_norm": 0.37417158484458923, + "learning_rate": 2.8351e-05, + "loss": 0.0201, + "step": 14953 + }, + { + "epoch": 11.775108310358409, + "grad_norm": 0.33353275060653687, + "learning_rate": 2.8350666666666666e-05, + "loss": 0.0103, + "step": 14954 + }, + { + "epoch": 11.775896022055928, + "grad_norm": 0.46380260586738586, + "learning_rate": 2.8350333333333335e-05, + "loss": 0.0163, + "step": 14955 + }, + { + "epoch": 11.776683733753446, + "grad_norm": 0.49262094497680664, + "learning_rate": 2.8349999999999998e-05, + "loss": 0.0118, + "step": 14956 + }, + { + "epoch": 11.777471445450965, + "grad_norm": 0.18968887627124786, + "learning_rate": 2.8349666666666667e-05, + "loss": 0.0082, + "step": 14957 + }, + { + 
"epoch": 11.778259157148483, + "grad_norm": 0.5322263836860657, + "learning_rate": 2.8349333333333337e-05, + "loss": 0.006, + "step": 14958 + }, + { + "epoch": 11.779046868846002, + "grad_norm": 0.9953143000602722, + "learning_rate": 2.8349e-05, + "loss": 0.0119, + "step": 14959 + }, + { + "epoch": 11.779834580543522, + "grad_norm": 0.2818474769592285, + "learning_rate": 2.834866666666667e-05, + "loss": 0.0138, + "step": 14960 + }, + { + "epoch": 11.78062229224104, + "grad_norm": 0.3586304783821106, + "learning_rate": 2.8348333333333334e-05, + "loss": 0.0153, + "step": 14961 + }, + { + "epoch": 11.781410003938559, + "grad_norm": 0.18448574841022491, + "learning_rate": 2.8348e-05, + "loss": 0.0095, + "step": 14962 + }, + { + "epoch": 11.782197715636077, + "grad_norm": 0.6712980270385742, + "learning_rate": 2.8347666666666666e-05, + "loss": 0.0196, + "step": 14963 + }, + { + "epoch": 11.782985427333596, + "grad_norm": 0.272866427898407, + "learning_rate": 2.8347333333333336e-05, + "loss": 0.0134, + "step": 14964 + }, + { + "epoch": 11.783773139031114, + "grad_norm": 0.2928125262260437, + "learning_rate": 2.8346999999999998e-05, + "loss": 0.0106, + "step": 14965 + }, + { + "epoch": 11.784560850728633, + "grad_norm": 0.14945156872272491, + "learning_rate": 2.8346666666666667e-05, + "loss": 0.01, + "step": 14966 + }, + { + "epoch": 11.785348562426153, + "grad_norm": 0.43444162607192993, + "learning_rate": 2.8346333333333337e-05, + "loss": 0.0197, + "step": 14967 + }, + { + "epoch": 11.78613627412367, + "grad_norm": 0.26722630858421326, + "learning_rate": 2.8346e-05, + "loss": 0.0131, + "step": 14968 + }, + { + "epoch": 11.78692398582119, + "grad_norm": 0.6482908725738525, + "learning_rate": 2.834566666666667e-05, + "loss": 0.0256, + "step": 14969 + }, + { + "epoch": 11.787711697518708, + "grad_norm": 0.21543654799461365, + "learning_rate": 2.8345333333333335e-05, + "loss": 0.0115, + "step": 14970 + }, + { + "epoch": 11.788499409216227, + "grad_norm": 0.8821447491645813, + "learning_rate": 2.8345e-05, + "loss": 0.2911, + "step": 14971 + }, + { + "epoch": 11.789287120913745, + "grad_norm": 0.43935948610305786, + "learning_rate": 2.8344666666666666e-05, + "loss": 0.1178, + "step": 14972 + }, + { + "epoch": 11.790074832611264, + "grad_norm": 0.6550071835517883, + "learning_rate": 2.8344333333333336e-05, + "loss": 0.0955, + "step": 14973 + }, + { + "epoch": 11.790862544308784, + "grad_norm": 0.4643568694591522, + "learning_rate": 2.8344e-05, + "loss": 0.0718, + "step": 14974 + }, + { + "epoch": 11.791650256006301, + "grad_norm": 0.5925131440162659, + "learning_rate": 2.8343666666666668e-05, + "loss": 0.0706, + "step": 14975 + }, + { + "epoch": 11.79243796770382, + "grad_norm": 0.5575248003005981, + "learning_rate": 2.8343333333333337e-05, + "loss": 0.0562, + "step": 14976 + }, + { + "epoch": 11.793225679401338, + "grad_norm": 0.32537978887557983, + "learning_rate": 2.8343e-05, + "loss": 0.0272, + "step": 14977 + }, + { + "epoch": 11.794013391098858, + "grad_norm": 0.280186265707016, + "learning_rate": 2.834266666666667e-05, + "loss": 0.0234, + "step": 14978 + }, + { + "epoch": 11.794801102796377, + "grad_norm": 0.23488067090511322, + "learning_rate": 2.8342333333333335e-05, + "loss": 0.0152, + "step": 14979 + }, + { + "epoch": 11.795588814493895, + "grad_norm": 0.11845192313194275, + "learning_rate": 2.8342e-05, + "loss": 0.012, + "step": 14980 + }, + { + "epoch": 11.796376526191414, + "grad_norm": 0.17261484265327454, + "learning_rate": 2.8341666666666667e-05, + "loss": 0.0168, + "step": 14981 + }, 
+ { + "epoch": 11.797164237888932, + "grad_norm": 0.15226691961288452, + "learning_rate": 2.8341333333333333e-05, + "loss": 0.0091, + "step": 14982 + }, + { + "epoch": 11.797951949586452, + "grad_norm": 0.1353096067905426, + "learning_rate": 2.8341000000000002e-05, + "loss": 0.0132, + "step": 14983 + }, + { + "epoch": 11.798739661283971, + "grad_norm": 0.20001569390296936, + "learning_rate": 2.8340666666666668e-05, + "loss": 0.0366, + "step": 14984 + }, + { + "epoch": 11.799527372981489, + "grad_norm": 0.1444527506828308, + "learning_rate": 2.8340333333333334e-05, + "loss": 0.009, + "step": 14985 + }, + { + "epoch": 11.800315084679008, + "grad_norm": 0.21726171672344208, + "learning_rate": 2.834e-05, + "loss": 0.0127, + "step": 14986 + }, + { + "epoch": 11.801102796376526, + "grad_norm": 0.2557425796985626, + "learning_rate": 2.833966666666667e-05, + "loss": 0.0148, + "step": 14987 + }, + { + "epoch": 11.801890508074045, + "grad_norm": 0.14617253839969635, + "learning_rate": 2.833933333333333e-05, + "loss": 0.0091, + "step": 14988 + }, + { + "epoch": 11.802678219771563, + "grad_norm": 0.2207745760679245, + "learning_rate": 2.8339e-05, + "loss": 0.0113, + "step": 14989 + }, + { + "epoch": 11.803465931469082, + "grad_norm": 0.14747655391693115, + "learning_rate": 2.8338666666666667e-05, + "loss": 0.0113, + "step": 14990 + }, + { + "epoch": 11.8042536431666, + "grad_norm": 0.2300046980381012, + "learning_rate": 2.8338333333333333e-05, + "loss": 0.0126, + "step": 14991 + }, + { + "epoch": 11.80504135486412, + "grad_norm": 0.19407032430171967, + "learning_rate": 2.8338000000000002e-05, + "loss": 0.0066, + "step": 14992 + }, + { + "epoch": 11.805829066561639, + "grad_norm": 0.09053876996040344, + "learning_rate": 2.8337666666666668e-05, + "loss": 0.0068, + "step": 14993 + }, + { + "epoch": 11.806616778259157, + "grad_norm": 0.27250561118125916, + "learning_rate": 2.8337333333333334e-05, + "loss": 0.0101, + "step": 14994 + }, + { + "epoch": 11.807404489956676, + "grad_norm": 0.35306239128112793, + "learning_rate": 2.8337e-05, + "loss": 0.0104, + "step": 14995 + }, + { + "epoch": 11.808192201654194, + "grad_norm": 0.13687658309936523, + "learning_rate": 2.833666666666667e-05, + "loss": 0.006, + "step": 14996 + }, + { + "epoch": 11.808979913351713, + "grad_norm": 0.321855366230011, + "learning_rate": 2.8336333333333332e-05, + "loss": 0.0099, + "step": 14997 + }, + { + "epoch": 11.809767625049233, + "grad_norm": 0.4072970747947693, + "learning_rate": 2.8336e-05, + "loss": 0.016, + "step": 14998 + }, + { + "epoch": 11.81055533674675, + "grad_norm": 0.37942081689834595, + "learning_rate": 2.8335666666666667e-05, + "loss": 0.0141, + "step": 14999 + }, + { + "epoch": 11.81134304844427, + "grad_norm": 0.17283999919891357, + "learning_rate": 2.8335333333333333e-05, + "loss": 0.0089, + "step": 15000 + }, + { + "epoch": 11.81134304844427, + "eval_cer": 0.12491899318246624, + "eval_loss": 0.3657892644405365, + "eval_runtime": 16.9313, + "eval_samples_per_second": 17.955, + "eval_steps_per_second": 0.591, + "eval_wer": 0.40905602455871065, + "step": 15000 + }, + { + "epoch": 11.812130760141788, + "grad_norm": 0.0873795598745346, + "learning_rate": 2.8335000000000002e-05, + "loss": 0.0033, + "step": 15001 + }, + { + "epoch": 11.812918471839307, + "grad_norm": 0.13985180854797363, + "learning_rate": 2.8334666666666668e-05, + "loss": 0.0085, + "step": 15002 + }, + { + "epoch": 11.813706183536826, + "grad_norm": 0.15965285897254944, + "learning_rate": 2.8334333333333334e-05, + "loss": 0.006, + "step": 15003 + }, 
+ { + "epoch": 11.814493895234344, + "grad_norm": 0.3812499940395355, + "learning_rate": 2.8334e-05, + "loss": 0.0119, + "step": 15004 + }, + { + "epoch": 11.815281606931864, + "grad_norm": 0.29981112480163574, + "learning_rate": 2.833366666666667e-05, + "loss": 0.0096, + "step": 15005 + }, + { + "epoch": 11.816069318629381, + "grad_norm": 0.16554270684719086, + "learning_rate": 2.8333333333333332e-05, + "loss": 0.0094, + "step": 15006 + }, + { + "epoch": 11.8168570303269, + "grad_norm": 0.20255140960216522, + "learning_rate": 2.8333e-05, + "loss": 0.0119, + "step": 15007 + }, + { + "epoch": 11.817644742024418, + "grad_norm": 0.3072189390659332, + "learning_rate": 2.833266666666667e-05, + "loss": 0.0068, + "step": 15008 + }, + { + "epoch": 11.818432453721938, + "grad_norm": 0.3715985119342804, + "learning_rate": 2.8332333333333333e-05, + "loss": 0.0159, + "step": 15009 + }, + { + "epoch": 11.819220165419457, + "grad_norm": 0.25852081179618835, + "learning_rate": 2.8332000000000002e-05, + "loss": 0.0091, + "step": 15010 + }, + { + "epoch": 11.820007877116975, + "grad_norm": 0.43322163820266724, + "learning_rate": 2.833166666666667e-05, + "loss": 0.0114, + "step": 15011 + }, + { + "epoch": 11.820795588814494, + "grad_norm": 0.19051729142665863, + "learning_rate": 2.8331333333333334e-05, + "loss": 0.0084, + "step": 15012 + }, + { + "epoch": 11.821583300512012, + "grad_norm": 0.18871353566646576, + "learning_rate": 2.8331e-05, + "loss": 0.0105, + "step": 15013 + }, + { + "epoch": 11.822371012209532, + "grad_norm": 0.43013501167297363, + "learning_rate": 2.8330666666666666e-05, + "loss": 0.0131, + "step": 15014 + }, + { + "epoch": 11.82315872390705, + "grad_norm": 0.14463914930820465, + "learning_rate": 2.8330333333333332e-05, + "loss": 0.0044, + "step": 15015 + }, + { + "epoch": 11.823946435604569, + "grad_norm": 0.16388662159442902, + "learning_rate": 2.833e-05, + "loss": 0.0096, + "step": 15016 + }, + { + "epoch": 11.824734147302088, + "grad_norm": 0.6365162134170532, + "learning_rate": 2.8329666666666667e-05, + "loss": 0.0145, + "step": 15017 + }, + { + "epoch": 11.825521858999606, + "grad_norm": 0.2126867175102234, + "learning_rate": 2.8329333333333333e-05, + "loss": 0.0136, + "step": 15018 + }, + { + "epoch": 11.826309570697125, + "grad_norm": 0.47737544775009155, + "learning_rate": 2.8329000000000003e-05, + "loss": 0.0137, + "step": 15019 + }, + { + "epoch": 11.827097282394643, + "grad_norm": 0.5113485455513, + "learning_rate": 2.8328666666666665e-05, + "loss": 0.0117, + "step": 15020 + }, + { + "epoch": 11.827884994092162, + "grad_norm": 1.9253331422805786, + "learning_rate": 2.8328333333333335e-05, + "loss": 0.308, + "step": 15021 + }, + { + "epoch": 11.828672705789682, + "grad_norm": 0.47757822275161743, + "learning_rate": 2.8328e-05, + "loss": 0.1534, + "step": 15022 + }, + { + "epoch": 11.8294604174872, + "grad_norm": 0.42031583189964294, + "learning_rate": 2.8327666666666666e-05, + "loss": 0.1083, + "step": 15023 + }, + { + "epoch": 11.830248129184719, + "grad_norm": 0.4602862000465393, + "learning_rate": 2.8327333333333332e-05, + "loss": 0.1071, + "step": 15024 + }, + { + "epoch": 11.831035840882237, + "grad_norm": 0.6132071614265442, + "learning_rate": 2.8327000000000002e-05, + "loss": 0.0702, + "step": 15025 + }, + { + "epoch": 11.831823552579756, + "grad_norm": 0.4128910005092621, + "learning_rate": 2.8326666666666668e-05, + "loss": 0.0938, + "step": 15026 + }, + { + "epoch": 11.832611264277274, + "grad_norm": 0.7303581833839417, + "learning_rate": 2.8326333333333334e-05, + 
"loss": 0.0386, + "step": 15027 + }, + { + "epoch": 11.833398975974793, + "grad_norm": 0.36114203929901123, + "learning_rate": 2.8326000000000003e-05, + "loss": 0.0212, + "step": 15028 + }, + { + "epoch": 11.834186687672313, + "grad_norm": 0.15174603462219238, + "learning_rate": 2.8325666666666665e-05, + "loss": 0.011, + "step": 15029 + }, + { + "epoch": 11.83497439936983, + "grad_norm": 0.20409011840820312, + "learning_rate": 2.8325333333333335e-05, + "loss": 0.0061, + "step": 15030 + }, + { + "epoch": 11.83576211106735, + "grad_norm": 0.1362273395061493, + "learning_rate": 2.8325e-05, + "loss": 0.0075, + "step": 15031 + }, + { + "epoch": 11.836549822764868, + "grad_norm": 0.43665263056755066, + "learning_rate": 2.8324666666666667e-05, + "loss": 0.0224, + "step": 15032 + }, + { + "epoch": 11.837337534462387, + "grad_norm": 0.11503834277391434, + "learning_rate": 2.8324333333333336e-05, + "loss": 0.0083, + "step": 15033 + }, + { + "epoch": 11.838125246159905, + "grad_norm": 0.195225328207016, + "learning_rate": 2.8324000000000002e-05, + "loss": 0.008, + "step": 15034 + }, + { + "epoch": 11.838912957857424, + "grad_norm": 0.22950881719589233, + "learning_rate": 2.8323666666666668e-05, + "loss": 0.0185, + "step": 15035 + }, + { + "epoch": 11.839700669554944, + "grad_norm": 0.14181889593601227, + "learning_rate": 2.8323333333333334e-05, + "loss": 0.0102, + "step": 15036 + }, + { + "epoch": 11.840488381252461, + "grad_norm": 0.07980356365442276, + "learning_rate": 2.8323000000000003e-05, + "loss": 0.005, + "step": 15037 + }, + { + "epoch": 11.84127609294998, + "grad_norm": 0.16250941157341003, + "learning_rate": 2.8322666666666666e-05, + "loss": 0.008, + "step": 15038 + }, + { + "epoch": 11.842063804647498, + "grad_norm": 0.3419044613838196, + "learning_rate": 2.8322333333333335e-05, + "loss": 0.0106, + "step": 15039 + }, + { + "epoch": 11.842851516345018, + "grad_norm": 0.17724163830280304, + "learning_rate": 2.8322e-05, + "loss": 0.0116, + "step": 15040 + }, + { + "epoch": 11.843639228042537, + "grad_norm": 0.1164722591638565, + "learning_rate": 2.8321666666666667e-05, + "loss": 0.0063, + "step": 15041 + }, + { + "epoch": 11.844426939740055, + "grad_norm": 0.3261156678199768, + "learning_rate": 2.8321333333333336e-05, + "loss": 0.0139, + "step": 15042 + }, + { + "epoch": 11.845214651437574, + "grad_norm": 0.19463227689266205, + "learning_rate": 2.8321e-05, + "loss": 0.0107, + "step": 15043 + }, + { + "epoch": 11.846002363135092, + "grad_norm": 0.1755082607269287, + "learning_rate": 2.8320666666666668e-05, + "loss": 0.0078, + "step": 15044 + }, + { + "epoch": 11.846790074832612, + "grad_norm": 0.3560106158256531, + "learning_rate": 2.8320333333333334e-05, + "loss": 0.0159, + "step": 15045 + }, + { + "epoch": 11.84757778653013, + "grad_norm": 0.10646764934062958, + "learning_rate": 2.832e-05, + "loss": 0.0052, + "step": 15046 + }, + { + "epoch": 11.848365498227649, + "grad_norm": 0.14410732686519623, + "learning_rate": 2.8319666666666666e-05, + "loss": 0.0091, + "step": 15047 + }, + { + "epoch": 11.849153209925168, + "grad_norm": 0.2746017575263977, + "learning_rate": 2.8319333333333335e-05, + "loss": 0.0129, + "step": 15048 + }, + { + "epoch": 11.849940921622686, + "grad_norm": 0.46216970682144165, + "learning_rate": 2.8318999999999998e-05, + "loss": 0.0264, + "step": 15049 + }, + { + "epoch": 11.850728633320205, + "grad_norm": 0.21692758798599243, + "learning_rate": 2.8318666666666667e-05, + "loss": 0.0086, + "step": 15050 + }, + { + "epoch": 11.851516345017723, + "grad_norm": 
0.16410347819328308, + "learning_rate": 2.8318333333333336e-05, + "loss": 0.0079, + "step": 15051 + }, + { + "epoch": 11.852304056715242, + "grad_norm": 0.10432807356119156, + "learning_rate": 2.8318e-05, + "loss": 0.0055, + "step": 15052 + }, + { + "epoch": 11.85309176841276, + "grad_norm": 0.2925571799278259, + "learning_rate": 2.8317666666666668e-05, + "loss": 0.011, + "step": 15053 + }, + { + "epoch": 11.85387948011028, + "grad_norm": 0.6009413003921509, + "learning_rate": 2.8317333333333334e-05, + "loss": 0.0144, + "step": 15054 + }, + { + "epoch": 11.854667191807799, + "grad_norm": 0.44209587574005127, + "learning_rate": 2.8317e-05, + "loss": 0.0162, + "step": 15055 + }, + { + "epoch": 11.855454903505317, + "grad_norm": 0.296209454536438, + "learning_rate": 2.8316666666666666e-05, + "loss": 0.0219, + "step": 15056 + }, + { + "epoch": 11.856242615202836, + "grad_norm": 0.43253856897354126, + "learning_rate": 2.8316333333333335e-05, + "loss": 0.0144, + "step": 15057 + }, + { + "epoch": 11.857030326900354, + "grad_norm": 0.703800618648529, + "learning_rate": 2.8316e-05, + "loss": 0.0129, + "step": 15058 + }, + { + "epoch": 11.857818038597873, + "grad_norm": 0.2079361081123352, + "learning_rate": 2.8315666666666667e-05, + "loss": 0.0147, + "step": 15059 + }, + { + "epoch": 11.858605750295393, + "grad_norm": 0.2205347865819931, + "learning_rate": 2.8315333333333337e-05, + "loss": 0.0145, + "step": 15060 + }, + { + "epoch": 11.85939346199291, + "grad_norm": 0.11508960276842117, + "learning_rate": 2.8315e-05, + "loss": 0.009, + "step": 15061 + }, + { + "epoch": 11.86018117369043, + "grad_norm": 0.1405237466096878, + "learning_rate": 2.831466666666667e-05, + "loss": 0.0104, + "step": 15062 + }, + { + "epoch": 11.860968885387948, + "grad_norm": 0.18730862438678741, + "learning_rate": 2.8314333333333334e-05, + "loss": 0.0099, + "step": 15063 + }, + { + "epoch": 11.861756597085467, + "grad_norm": 0.3674246668815613, + "learning_rate": 2.8314e-05, + "loss": 0.0145, + "step": 15064 + }, + { + "epoch": 11.862544308782985, + "grad_norm": 0.2845221161842346, + "learning_rate": 2.8313666666666666e-05, + "loss": 0.0192, + "step": 15065 + }, + { + "epoch": 11.863332020480504, + "grad_norm": 0.29164302349090576, + "learning_rate": 2.8313333333333336e-05, + "loss": 0.0139, + "step": 15066 + }, + { + "epoch": 11.864119732178024, + "grad_norm": 0.23155173659324646, + "learning_rate": 2.8313e-05, + "loss": 0.007, + "step": 15067 + }, + { + "epoch": 11.864907443875541, + "grad_norm": 0.23699624836444855, + "learning_rate": 2.8312666666666667e-05, + "loss": 0.0125, + "step": 15068 + }, + { + "epoch": 11.86569515557306, + "grad_norm": 0.37555408477783203, + "learning_rate": 2.8312333333333337e-05, + "loss": 0.0123, + "step": 15069 + }, + { + "epoch": 11.866482867270578, + "grad_norm": 0.19098910689353943, + "learning_rate": 2.8312e-05, + "loss": 0.0075, + "step": 15070 + }, + { + "epoch": 11.867270578968098, + "grad_norm": 0.8113088607788086, + "learning_rate": 2.831166666666667e-05, + "loss": 0.2131, + "step": 15071 + }, + { + "epoch": 11.868058290665616, + "grad_norm": 0.5962550640106201, + "learning_rate": 2.831133333333333e-05, + "loss": 0.1218, + "step": 15072 + }, + { + "epoch": 11.868846002363135, + "grad_norm": 1.4803922176361084, + "learning_rate": 2.8311e-05, + "loss": 0.1235, + "step": 15073 + }, + { + "epoch": 11.869633714060654, + "grad_norm": 0.2776375114917755, + "learning_rate": 2.8310666666666666e-05, + "loss": 0.0633, + "step": 15074 + }, + { + "epoch": 11.870421425758172, + "grad_norm": 
0.3453705906867981, + "learning_rate": 2.8310333333333332e-05, + "loss": 0.0565, + "step": 15075 + }, + { + "epoch": 11.871209137455692, + "grad_norm": 0.29901447892189026, + "learning_rate": 2.8310000000000002e-05, + "loss": 0.072, + "step": 15076 + }, + { + "epoch": 11.87199684915321, + "grad_norm": 0.4092116951942444, + "learning_rate": 2.8309666666666668e-05, + "loss": 0.0288, + "step": 15077 + }, + { + "epoch": 11.872784560850729, + "grad_norm": 0.7314311265945435, + "learning_rate": 2.8309333333333334e-05, + "loss": 0.0195, + "step": 15078 + }, + { + "epoch": 11.873572272548248, + "grad_norm": 0.44410279393196106, + "learning_rate": 2.8309e-05, + "loss": 0.0195, + "step": 15079 + }, + { + "epoch": 11.874359984245766, + "grad_norm": 0.2037498503923416, + "learning_rate": 2.830866666666667e-05, + "loss": 0.0082, + "step": 15080 + }, + { + "epoch": 11.875147695943285, + "grad_norm": 0.3980848491191864, + "learning_rate": 2.830833333333333e-05, + "loss": 0.0186, + "step": 15081 + }, + { + "epoch": 11.875935407640803, + "grad_norm": 0.11481282114982605, + "learning_rate": 2.8308e-05, + "loss": 0.0098, + "step": 15082 + }, + { + "epoch": 11.876723119338322, + "grad_norm": 0.15182039141654968, + "learning_rate": 2.830766666666667e-05, + "loss": 0.0063, + "step": 15083 + }, + { + "epoch": 11.87751083103584, + "grad_norm": 0.214328333735466, + "learning_rate": 2.8307333333333333e-05, + "loss": 0.0088, + "step": 15084 + }, + { + "epoch": 11.87829854273336, + "grad_norm": 0.22545796632766724, + "learning_rate": 2.8307000000000002e-05, + "loss": 0.0104, + "step": 15085 + }, + { + "epoch": 11.879086254430879, + "grad_norm": 0.1888103187084198, + "learning_rate": 2.8306666666666668e-05, + "loss": 0.0152, + "step": 15086 + }, + { + "epoch": 11.879873966128397, + "grad_norm": 0.32749268412590027, + "learning_rate": 2.8306333333333334e-05, + "loss": 0.0175, + "step": 15087 + }, + { + "epoch": 11.880661677825916, + "grad_norm": 0.32535696029663086, + "learning_rate": 2.8306e-05, + "loss": 0.0231, + "step": 15088 + }, + { + "epoch": 11.881449389523434, + "grad_norm": 0.16245998442173004, + "learning_rate": 2.830566666666667e-05, + "loss": 0.0062, + "step": 15089 + }, + { + "epoch": 11.882237101220953, + "grad_norm": 0.2870718240737915, + "learning_rate": 2.830533333333333e-05, + "loss": 0.0147, + "step": 15090 + }, + { + "epoch": 11.883024812918471, + "grad_norm": 0.12078194320201874, + "learning_rate": 2.8305e-05, + "loss": 0.0093, + "step": 15091 + }, + { + "epoch": 11.88381252461599, + "grad_norm": 0.23722708225250244, + "learning_rate": 2.830466666666667e-05, + "loss": 0.0149, + "step": 15092 + }, + { + "epoch": 11.88460023631351, + "grad_norm": 0.24029017984867096, + "learning_rate": 2.8304333333333333e-05, + "loss": 0.0169, + "step": 15093 + }, + { + "epoch": 11.885387948011028, + "grad_norm": 0.25003620982170105, + "learning_rate": 2.8304000000000002e-05, + "loss": 0.0127, + "step": 15094 + }, + { + "epoch": 11.886175659708547, + "grad_norm": 0.3617786467075348, + "learning_rate": 2.8303666666666668e-05, + "loss": 0.007, + "step": 15095 + }, + { + "epoch": 11.886963371406065, + "grad_norm": 0.13228215277194977, + "learning_rate": 2.8303333333333334e-05, + "loss": 0.0065, + "step": 15096 + }, + { + "epoch": 11.887751083103584, + "grad_norm": 0.15088793635368347, + "learning_rate": 2.8303e-05, + "loss": 0.0117, + "step": 15097 + }, + { + "epoch": 11.888538794801104, + "grad_norm": 0.18299362063407898, + "learning_rate": 2.830266666666667e-05, + "loss": 0.0081, + "step": 15098 + }, + { + "epoch": 
11.889326506498621, + "grad_norm": 0.1577618420124054, + "learning_rate": 2.8302333333333332e-05, + "loss": 0.0123, + "step": 15099 + }, + { + "epoch": 11.89011421819614, + "grad_norm": 0.11311835795640945, + "learning_rate": 2.8302e-05, + "loss": 0.0088, + "step": 15100 + }, + { + "epoch": 11.890901929893658, + "grad_norm": 0.264013409614563, + "learning_rate": 2.830166666666667e-05, + "loss": 0.0096, + "step": 15101 + }, + { + "epoch": 11.891689641591178, + "grad_norm": 0.15250830352306366, + "learning_rate": 2.8301333333333333e-05, + "loss": 0.0056, + "step": 15102 + }, + { + "epoch": 11.892477353288696, + "grad_norm": 0.24088996648788452, + "learning_rate": 2.8301000000000002e-05, + "loss": 0.0171, + "step": 15103 + }, + { + "epoch": 11.893265064986215, + "grad_norm": 0.16552996635437012, + "learning_rate": 2.8300666666666665e-05, + "loss": 0.0061, + "step": 15104 + }, + { + "epoch": 11.894052776683735, + "grad_norm": 0.13164564967155457, + "learning_rate": 2.8300333333333334e-05, + "loss": 0.0068, + "step": 15105 + }, + { + "epoch": 11.894840488381252, + "grad_norm": 0.1928861141204834, + "learning_rate": 2.83e-05, + "loss": 0.0159, + "step": 15106 + }, + { + "epoch": 11.895628200078772, + "grad_norm": 0.4571320712566376, + "learning_rate": 2.8299666666666666e-05, + "loss": 0.0119, + "step": 15107 + }, + { + "epoch": 11.89641591177629, + "grad_norm": 0.09982146322727203, + "learning_rate": 2.8299333333333335e-05, + "loss": 0.0075, + "step": 15108 + }, + { + "epoch": 11.897203623473809, + "grad_norm": 0.5860010385513306, + "learning_rate": 2.8299e-05, + "loss": 0.009, + "step": 15109 + }, + { + "epoch": 11.897991335171326, + "grad_norm": 0.24035859107971191, + "learning_rate": 2.8298666666666667e-05, + "loss": 0.0134, + "step": 15110 + }, + { + "epoch": 11.898779046868846, + "grad_norm": 0.3024533689022064, + "learning_rate": 2.8298333333333333e-05, + "loss": 0.0093, + "step": 15111 + }, + { + "epoch": 11.899566758566365, + "grad_norm": 0.17590734362602234, + "learning_rate": 2.8298000000000002e-05, + "loss": 0.0072, + "step": 15112 + }, + { + "epoch": 11.900354470263883, + "grad_norm": 0.18549717962741852, + "learning_rate": 2.8297666666666665e-05, + "loss": 0.0123, + "step": 15113 + }, + { + "epoch": 11.901142181961402, + "grad_norm": 0.21999281644821167, + "learning_rate": 2.8297333333333334e-05, + "loss": 0.0105, + "step": 15114 + }, + { + "epoch": 11.90192989365892, + "grad_norm": 0.12922552227973938, + "learning_rate": 2.8297e-05, + "loss": 0.0046, + "step": 15115 + }, + { + "epoch": 11.90271760535644, + "grad_norm": 0.23652254045009613, + "learning_rate": 2.8296666666666666e-05, + "loss": 0.0118, + "step": 15116 + }, + { + "epoch": 11.903505317053959, + "grad_norm": 0.12327323853969574, + "learning_rate": 2.8296333333333336e-05, + "loss": 0.0038, + "step": 15117 + }, + { + "epoch": 11.904293028751477, + "grad_norm": 0.07739321142435074, + "learning_rate": 2.8296e-05, + "loss": 0.0036, + "step": 15118 + }, + { + "epoch": 11.905080740448996, + "grad_norm": 0.44151514768600464, + "learning_rate": 2.8295666666666667e-05, + "loss": 0.0128, + "step": 15119 + }, + { + "epoch": 11.905868452146514, + "grad_norm": 0.23133675754070282, + "learning_rate": 2.8295333333333333e-05, + "loss": 0.007, + "step": 15120 + }, + { + "epoch": 11.906656163844033, + "grad_norm": 0.8190683722496033, + "learning_rate": 2.8295000000000003e-05, + "loss": 0.2155, + "step": 15121 + }, + { + "epoch": 11.907443875541551, + "grad_norm": 0.5617139339447021, + "learning_rate": 2.8294666666666665e-05, + "loss": 
0.1452, + "step": 15122 + }, + { + "epoch": 11.90823158723907, + "grad_norm": 0.47455358505249023, + "learning_rate": 2.8294333333333335e-05, + "loss": 0.097, + "step": 15123 + }, + { + "epoch": 11.90901929893659, + "grad_norm": 0.466471791267395, + "learning_rate": 2.8294e-05, + "loss": 0.0689, + "step": 15124 + }, + { + "epoch": 11.909807010634108, + "grad_norm": 0.44122442603111267, + "learning_rate": 2.8293666666666666e-05, + "loss": 0.0787, + "step": 15125 + }, + { + "epoch": 11.910594722331627, + "grad_norm": 0.4063183665275574, + "learning_rate": 2.8293333333333336e-05, + "loss": 0.0522, + "step": 15126 + }, + { + "epoch": 11.911382434029145, + "grad_norm": 0.45614010095596313, + "learning_rate": 2.8293e-05, + "loss": 0.0196, + "step": 15127 + }, + { + "epoch": 11.912170145726664, + "grad_norm": 0.27270835638046265, + "learning_rate": 2.8292666666666668e-05, + "loss": 0.0263, + "step": 15128 + }, + { + "epoch": 11.912957857424182, + "grad_norm": 0.1264498382806778, + "learning_rate": 2.8292333333333334e-05, + "loss": 0.0072, + "step": 15129 + }, + { + "epoch": 11.913745569121701, + "grad_norm": 0.18779940903186798, + "learning_rate": 2.8292000000000003e-05, + "loss": 0.0151, + "step": 15130 + }, + { + "epoch": 11.91453328081922, + "grad_norm": 0.4667900800704956, + "learning_rate": 2.8291666666666665e-05, + "loss": 0.0122, + "step": 15131 + }, + { + "epoch": 11.915320992516738, + "grad_norm": 0.28822770714759827, + "learning_rate": 2.8291333333333335e-05, + "loss": 0.0127, + "step": 15132 + }, + { + "epoch": 11.916108704214258, + "grad_norm": 0.21935516595840454, + "learning_rate": 2.8291e-05, + "loss": 0.0135, + "step": 15133 + }, + { + "epoch": 11.916896415911776, + "grad_norm": 0.12350715696811676, + "learning_rate": 2.8290666666666667e-05, + "loss": 0.0057, + "step": 15134 + }, + { + "epoch": 11.917684127609295, + "grad_norm": 0.16788150370121002, + "learning_rate": 2.8290333333333336e-05, + "loss": 0.0079, + "step": 15135 + }, + { + "epoch": 11.918471839306815, + "grad_norm": 0.39020049571990967, + "learning_rate": 2.829e-05, + "loss": 0.0102, + "step": 15136 + }, + { + "epoch": 11.919259551004332, + "grad_norm": 0.1584109514951706, + "learning_rate": 2.8289666666666668e-05, + "loss": 0.0061, + "step": 15137 + }, + { + "epoch": 11.920047262701852, + "grad_norm": 0.2562220096588135, + "learning_rate": 2.8289333333333334e-05, + "loss": 0.0122, + "step": 15138 + }, + { + "epoch": 11.92083497439937, + "grad_norm": 0.236334890127182, + "learning_rate": 2.8289e-05, + "loss": 0.0084, + "step": 15139 + }, + { + "epoch": 11.921622686096889, + "grad_norm": 0.21742984652519226, + "learning_rate": 2.8288666666666666e-05, + "loss": 0.0059, + "step": 15140 + }, + { + "epoch": 11.922410397794406, + "grad_norm": 0.15540765225887299, + "learning_rate": 2.8288333333333335e-05, + "loss": 0.0166, + "step": 15141 + }, + { + "epoch": 11.923198109491926, + "grad_norm": 0.1582559049129486, + "learning_rate": 2.8288e-05, + "loss": 0.0086, + "step": 15142 + }, + { + "epoch": 11.923985821189445, + "grad_norm": 1.2978307008743286, + "learning_rate": 2.8287666666666667e-05, + "loss": 0.0084, + "step": 15143 + }, + { + "epoch": 11.924773532886963, + "grad_norm": 0.30108314752578735, + "learning_rate": 2.8287333333333336e-05, + "loss": 0.0071, + "step": 15144 + }, + { + "epoch": 11.925561244584483, + "grad_norm": 0.23499326407909393, + "learning_rate": 2.8287e-05, + "loss": 0.0118, + "step": 15145 + }, + { + "epoch": 11.926348956282, + "grad_norm": 0.16657806932926178, + "learning_rate": 
2.8286666666666668e-05, + "loss": 0.0107, + "step": 15146 + }, + { + "epoch": 11.92713666797952, + "grad_norm": 0.13587532937526703, + "learning_rate": 2.8286333333333334e-05, + "loss": 0.0048, + "step": 15147 + }, + { + "epoch": 11.927924379677037, + "grad_norm": 0.20549234747886658, + "learning_rate": 2.8286e-05, + "loss": 0.0144, + "step": 15148 + }, + { + "epoch": 11.928712091374557, + "grad_norm": 0.23784981667995453, + "learning_rate": 2.8285666666666666e-05, + "loss": 0.0157, + "step": 15149 + }, + { + "epoch": 11.929499803072076, + "grad_norm": 0.2697475850582123, + "learning_rate": 2.8285333333333335e-05, + "loss": 0.0092, + "step": 15150 + }, + { + "epoch": 11.930287514769594, + "grad_norm": 0.21764186024665833, + "learning_rate": 2.8285e-05, + "loss": 0.0114, + "step": 15151 + }, + { + "epoch": 11.931075226467113, + "grad_norm": 0.2329782098531723, + "learning_rate": 2.8284666666666667e-05, + "loss": 0.0078, + "step": 15152 + }, + { + "epoch": 11.931862938164631, + "grad_norm": 0.28862518072128296, + "learning_rate": 2.8284333333333336e-05, + "loss": 0.0146, + "step": 15153 + }, + { + "epoch": 11.93265064986215, + "grad_norm": 0.2566812038421631, + "learning_rate": 2.8284e-05, + "loss": 0.0138, + "step": 15154 + }, + { + "epoch": 11.93343836155967, + "grad_norm": 0.16919508576393127, + "learning_rate": 2.8283666666666668e-05, + "loss": 0.0093, + "step": 15155 + }, + { + "epoch": 11.934226073257188, + "grad_norm": 0.3303368091583252, + "learning_rate": 2.8283333333333334e-05, + "loss": 0.0159, + "step": 15156 + }, + { + "epoch": 11.935013784954707, + "grad_norm": 0.271275132894516, + "learning_rate": 2.8283e-05, + "loss": 0.0074, + "step": 15157 + }, + { + "epoch": 11.935801496652225, + "grad_norm": 0.2067061811685562, + "learning_rate": 2.828266666666667e-05, + "loss": 0.0118, + "step": 15158 + }, + { + "epoch": 11.936589208349744, + "grad_norm": 0.10393655300140381, + "learning_rate": 2.8282333333333335e-05, + "loss": 0.0056, + "step": 15159 + }, + { + "epoch": 11.937376920047262, + "grad_norm": 0.2076403945684433, + "learning_rate": 2.8282e-05, + "loss": 0.0036, + "step": 15160 + }, + { + "epoch": 11.938164631744781, + "grad_norm": 0.6669636964797974, + "learning_rate": 2.8281666666666667e-05, + "loss": 0.0169, + "step": 15161 + }, + { + "epoch": 11.9389523434423, + "grad_norm": 0.1821296364068985, + "learning_rate": 2.8281333333333337e-05, + "loss": 0.01, + "step": 15162 + }, + { + "epoch": 11.939740055139819, + "grad_norm": 0.28541651368141174, + "learning_rate": 2.8281e-05, + "loss": 0.0112, + "step": 15163 + }, + { + "epoch": 11.940527766837338, + "grad_norm": 0.6966181397438049, + "learning_rate": 2.828066666666667e-05, + "loss": 0.0187, + "step": 15164 + }, + { + "epoch": 11.941315478534856, + "grad_norm": 0.22333551943302155, + "learning_rate": 2.828033333333333e-05, + "loss": 0.0072, + "step": 15165 + }, + { + "epoch": 11.942103190232375, + "grad_norm": 0.20538972318172455, + "learning_rate": 2.828e-05, + "loss": 0.0091, + "step": 15166 + }, + { + "epoch": 11.942890901929893, + "grad_norm": 0.46478933095932007, + "learning_rate": 2.827966666666667e-05, + "loss": 0.0084, + "step": 15167 + }, + { + "epoch": 11.943678613627412, + "grad_norm": 0.27885863184928894, + "learning_rate": 2.8279333333333332e-05, + "loss": 0.0118, + "step": 15168 + }, + { + "epoch": 11.944466325324932, + "grad_norm": 0.3456260859966278, + "learning_rate": 2.8279e-05, + "loss": 0.0241, + "step": 15169 + }, + { + "epoch": 11.94525403702245, + "grad_norm": 0.38009965419769287, + "learning_rate": 
2.8278666666666667e-05, + "loss": 0.0173, + "step": 15170 + }, + { + "epoch": 11.946041748719969, + "grad_norm": 0.7063853740692139, + "learning_rate": 2.8278333333333333e-05, + "loss": 0.2689, + "step": 15171 + }, + { + "epoch": 11.946829460417487, + "grad_norm": 0.4197266101837158, + "learning_rate": 2.8278e-05, + "loss": 0.1229, + "step": 15172 + }, + { + "epoch": 11.947617172115006, + "grad_norm": 0.595113217830658, + "learning_rate": 2.827766666666667e-05, + "loss": 0.1154, + "step": 15173 + }, + { + "epoch": 11.948404883812525, + "grad_norm": 0.32489559054374695, + "learning_rate": 2.827733333333333e-05, + "loss": 0.0697, + "step": 15174 + }, + { + "epoch": 11.949192595510043, + "grad_norm": 0.5196365118026733, + "learning_rate": 2.8277e-05, + "loss": 0.0817, + "step": 15175 + }, + { + "epoch": 11.949980307207563, + "grad_norm": 0.20239044725894928, + "learning_rate": 2.827666666666667e-05, + "loss": 0.0166, + "step": 15176 + }, + { + "epoch": 11.95076801890508, + "grad_norm": 0.4760875105857849, + "learning_rate": 2.8276333333333332e-05, + "loss": 0.0127, + "step": 15177 + }, + { + "epoch": 11.9515557306026, + "grad_norm": 0.25203362107276917, + "learning_rate": 2.8276e-05, + "loss": 0.0148, + "step": 15178 + }, + { + "epoch": 11.952343442300117, + "grad_norm": 0.13607369363307953, + "learning_rate": 2.8275666666666668e-05, + "loss": 0.0074, + "step": 15179 + }, + { + "epoch": 11.953131153997637, + "grad_norm": 0.190272718667984, + "learning_rate": 2.8275333333333334e-05, + "loss": 0.0131, + "step": 15180 + }, + { + "epoch": 11.953918865695156, + "grad_norm": 0.24774034321308136, + "learning_rate": 2.8275e-05, + "loss": 0.0263, + "step": 15181 + }, + { + "epoch": 11.954706577392674, + "grad_norm": 0.26344287395477295, + "learning_rate": 2.827466666666667e-05, + "loss": 0.0237, + "step": 15182 + }, + { + "epoch": 11.955494289090193, + "grad_norm": 0.12548749148845673, + "learning_rate": 2.8274333333333335e-05, + "loss": 0.0098, + "step": 15183 + }, + { + "epoch": 11.956282000787711, + "grad_norm": 0.16511361300945282, + "learning_rate": 2.8274e-05, + "loss": 0.0069, + "step": 15184 + }, + { + "epoch": 11.95706971248523, + "grad_norm": 0.16827288269996643, + "learning_rate": 2.827366666666667e-05, + "loss": 0.0072, + "step": 15185 + }, + { + "epoch": 11.957857424182748, + "grad_norm": 0.16487127542495728, + "learning_rate": 2.8273333333333333e-05, + "loss": 0.0106, + "step": 15186 + }, + { + "epoch": 11.958645135880268, + "grad_norm": 0.34458109736442566, + "learning_rate": 2.8273000000000002e-05, + "loss": 0.0202, + "step": 15187 + }, + { + "epoch": 11.959432847577787, + "grad_norm": 0.09711945056915283, + "learning_rate": 2.8272666666666668e-05, + "loss": 0.0052, + "step": 15188 + }, + { + "epoch": 11.960220559275305, + "grad_norm": 0.20907743275165558, + "learning_rate": 2.8272333333333334e-05, + "loss": 0.0168, + "step": 15189 + }, + { + "epoch": 11.961008270972824, + "grad_norm": 0.1831585317850113, + "learning_rate": 2.8272e-05, + "loss": 0.0071, + "step": 15190 + }, + { + "epoch": 11.961795982670342, + "grad_norm": 0.16109995543956757, + "learning_rate": 2.827166666666667e-05, + "loss": 0.0054, + "step": 15191 + }, + { + "epoch": 11.962583694367861, + "grad_norm": 0.14529117941856384, + "learning_rate": 2.8271333333333335e-05, + "loss": 0.0092, + "step": 15192 + }, + { + "epoch": 11.96337140606538, + "grad_norm": 0.2521763741970062, + "learning_rate": 2.8271e-05, + "loss": 0.0162, + "step": 15193 + }, + { + "epoch": 11.964159117762899, + "grad_norm": 0.12934696674346924, + 
"learning_rate": 2.8270666666666667e-05, + "loss": 0.0063, + "step": 15194 + }, + { + "epoch": 11.964946829460418, + "grad_norm": 0.13866551220417023, + "learning_rate": 2.8270333333333333e-05, + "loss": 0.0054, + "step": 15195 + }, + { + "epoch": 11.965734541157936, + "grad_norm": 0.21335706114768982, + "learning_rate": 2.8270000000000002e-05, + "loss": 0.0156, + "step": 15196 + }, + { + "epoch": 11.966522252855455, + "grad_norm": 0.19618424773216248, + "learning_rate": 2.8269666666666665e-05, + "loss": 0.007, + "step": 15197 + }, + { + "epoch": 11.967309964552973, + "grad_norm": 0.2049410194158554, + "learning_rate": 2.8269333333333334e-05, + "loss": 0.0073, + "step": 15198 + }, + { + "epoch": 11.968097676250492, + "grad_norm": 0.2528725564479828, + "learning_rate": 2.8269e-05, + "loss": 0.0099, + "step": 15199 + }, + { + "epoch": 11.968885387948012, + "grad_norm": 0.21829365193843842, + "learning_rate": 2.8268666666666666e-05, + "loss": 0.0084, + "step": 15200 + }, + { + "epoch": 11.96967309964553, + "grad_norm": 0.39190223813056946, + "learning_rate": 2.8268333333333335e-05, + "loss": 0.0061, + "step": 15201 + }, + { + "epoch": 11.970460811343049, + "grad_norm": 0.20511354506015778, + "learning_rate": 2.8268e-05, + "loss": 0.0122, + "step": 15202 + }, + { + "epoch": 11.971248523040567, + "grad_norm": 0.08446861803531647, + "learning_rate": 2.8267666666666667e-05, + "loss": 0.0073, + "step": 15203 + }, + { + "epoch": 11.972036234738086, + "grad_norm": 0.22261083126068115, + "learning_rate": 2.8267333333333333e-05, + "loss": 0.0107, + "step": 15204 + }, + { + "epoch": 11.972823946435604, + "grad_norm": 0.048237163573503494, + "learning_rate": 2.8267000000000002e-05, + "loss": 0.0022, + "step": 15205 + }, + { + "epoch": 11.973611658133123, + "grad_norm": 0.3581071197986603, + "learning_rate": 2.8266666666666665e-05, + "loss": 0.0047, + "step": 15206 + }, + { + "epoch": 11.974399369830643, + "grad_norm": 0.15429820120334625, + "learning_rate": 2.8266333333333334e-05, + "loss": 0.007, + "step": 15207 + }, + { + "epoch": 11.97518708152816, + "grad_norm": 0.28068387508392334, + "learning_rate": 2.8266000000000003e-05, + "loss": 0.0131, + "step": 15208 + }, + { + "epoch": 11.97597479322568, + "grad_norm": 0.23049737513065338, + "learning_rate": 2.8265666666666666e-05, + "loss": 0.0122, + "step": 15209 + }, + { + "epoch": 11.976762504923197, + "grad_norm": 0.45736226439476013, + "learning_rate": 2.8265333333333335e-05, + "loss": 0.0176, + "step": 15210 + }, + { + "epoch": 11.977550216620717, + "grad_norm": 0.16873648762702942, + "learning_rate": 2.8265e-05, + "loss": 0.0129, + "step": 15211 + }, + { + "epoch": 11.978337928318236, + "grad_norm": 0.34864485263824463, + "learning_rate": 2.8264666666666667e-05, + "loss": 0.0165, + "step": 15212 + }, + { + "epoch": 11.979125640015754, + "grad_norm": 0.30226585268974304, + "learning_rate": 2.8264333333333333e-05, + "loss": 0.009, + "step": 15213 + }, + { + "epoch": 11.979913351713273, + "grad_norm": 0.41459277272224426, + "learning_rate": 2.8264000000000002e-05, + "loss": 0.0086, + "step": 15214 + }, + { + "epoch": 11.980701063410791, + "grad_norm": 0.22350670397281647, + "learning_rate": 2.8263666666666665e-05, + "loss": 0.0095, + "step": 15215 + }, + { + "epoch": 11.98148877510831, + "grad_norm": 0.1290869116783142, + "learning_rate": 2.8263333333333334e-05, + "loss": 0.0064, + "step": 15216 + }, + { + "epoch": 11.982276486805828, + "grad_norm": 0.19818015396595, + "learning_rate": 2.8263000000000004e-05, + "loss": 0.0091, + "step": 15217 + }, + { 
+ "epoch": 11.983064198503348, + "grad_norm": 0.2551735043525696, + "learning_rate": 2.8262666666666666e-05, + "loss": 0.0088, + "step": 15218 + }, + { + "epoch": 11.983851910200867, + "grad_norm": 0.24366246163845062, + "learning_rate": 2.8262333333333336e-05, + "loss": 0.0114, + "step": 15219 + }, + { + "epoch": 11.984639621898385, + "grad_norm": 0.21296317875385284, + "learning_rate": 2.8262e-05, + "loss": 0.0088, + "step": 15220 + }, + { + "epoch": 11.985427333595904, + "grad_norm": 0.4202103912830353, + "learning_rate": 2.8261666666666667e-05, + "loss": 0.1046, + "step": 15221 + }, + { + "epoch": 11.986215045293422, + "grad_norm": 0.3235439360141754, + "learning_rate": 2.8261333333333333e-05, + "loss": 0.0858, + "step": 15222 + }, + { + "epoch": 11.987002756990941, + "grad_norm": 0.32001832127571106, + "learning_rate": 2.8261e-05, + "loss": 0.0558, + "step": 15223 + }, + { + "epoch": 11.987790468688459, + "grad_norm": 0.11601747572422028, + "learning_rate": 2.8260666666666665e-05, + "loss": 0.0077, + "step": 15224 + }, + { + "epoch": 11.988578180385979, + "grad_norm": 0.6412959694862366, + "learning_rate": 2.8260333333333335e-05, + "loss": 0.0323, + "step": 15225 + }, + { + "epoch": 11.989365892083498, + "grad_norm": 0.12369298934936523, + "learning_rate": 2.826e-05, + "loss": 0.0087, + "step": 15226 + }, + { + "epoch": 11.990153603781016, + "grad_norm": 0.15577730536460876, + "learning_rate": 2.8259666666666666e-05, + "loss": 0.0093, + "step": 15227 + }, + { + "epoch": 11.990941315478535, + "grad_norm": 0.13406647741794586, + "learning_rate": 2.8259333333333336e-05, + "loss": 0.0074, + "step": 15228 + }, + { + "epoch": 11.991729027176053, + "grad_norm": 0.14995455741882324, + "learning_rate": 2.8258999999999998e-05, + "loss": 0.0107, + "step": 15229 + }, + { + "epoch": 11.992516738873572, + "grad_norm": 0.26391008496284485, + "learning_rate": 2.8258666666666668e-05, + "loss": 0.0091, + "step": 15230 + }, + { + "epoch": 11.993304450571092, + "grad_norm": 0.2585811913013458, + "learning_rate": 2.8258333333333334e-05, + "loss": 0.0118, + "step": 15231 + }, + { + "epoch": 11.99409216226861, + "grad_norm": 0.11068207770586014, + "learning_rate": 2.8258e-05, + "loss": 0.0071, + "step": 15232 + }, + { + "epoch": 11.994879873966129, + "grad_norm": 0.11860866099596024, + "learning_rate": 2.825766666666667e-05, + "loss": 0.005, + "step": 15233 + }, + { + "epoch": 11.995667585663647, + "grad_norm": 0.3265817165374756, + "learning_rate": 2.8257333333333335e-05, + "loss": 0.0152, + "step": 15234 + }, + { + "epoch": 11.996455297361166, + "grad_norm": 0.21795238554477692, + "learning_rate": 2.8257e-05, + "loss": 0.0169, + "step": 15235 + }, + { + "epoch": 11.997243009058685, + "grad_norm": 0.4492070972919464, + "learning_rate": 2.8256666666666667e-05, + "loss": 0.013, + "step": 15236 + }, + { + "epoch": 11.998030720756203, + "grad_norm": 0.4425395131111145, + "learning_rate": 2.8256333333333336e-05, + "loss": 0.0088, + "step": 15237 + }, + { + "epoch": 11.998818432453723, + "grad_norm": 0.15771299600601196, + "learning_rate": 2.8256e-05, + "loss": 0.0054, + "step": 15238 + }, + { + "epoch": 11.99960614415124, + "grad_norm": 0.5986523032188416, + "learning_rate": 2.8255666666666668e-05, + "loss": 0.0315, + "step": 15239 + }, + { + "epoch": 12.0, + "grad_norm": 0.17083315551280975, + "learning_rate": 2.8255333333333334e-05, + "loss": 0.0057, + "step": 15240 + }, + { + "epoch": 12.00078771169752, + "grad_norm": 0.48588287830352783, + "learning_rate": 2.8255e-05, + "loss": 0.1307, + "step": 15241 + }, 
+ { + "epoch": 12.001575423395037, + "grad_norm": 0.5397747755050659, + "learning_rate": 2.825466666666667e-05, + "loss": 0.1152, + "step": 15242 + }, + { + "epoch": 12.002363135092557, + "grad_norm": 0.45495739579200745, + "learning_rate": 2.8254333333333335e-05, + "loss": 0.1275, + "step": 15243 + }, + { + "epoch": 12.003150846790074, + "grad_norm": 0.3905108571052551, + "learning_rate": 2.8254e-05, + "loss": 0.0798, + "step": 15244 + }, + { + "epoch": 12.003938558487594, + "grad_norm": 0.40524858236312866, + "learning_rate": 2.8253666666666667e-05, + "loss": 0.0535, + "step": 15245 + }, + { + "epoch": 12.004726270185111, + "grad_norm": 0.42708852887153625, + "learning_rate": 2.8253333333333336e-05, + "loss": 0.0655, + "step": 15246 + }, + { + "epoch": 12.00551398188263, + "grad_norm": 0.1409161388874054, + "learning_rate": 2.8253e-05, + "loss": 0.0141, + "step": 15247 + }, + { + "epoch": 12.00630169358015, + "grad_norm": 0.16124966740608215, + "learning_rate": 2.8252666666666668e-05, + "loss": 0.0122, + "step": 15248 + }, + { + "epoch": 12.007089405277668, + "grad_norm": 0.14795617759227753, + "learning_rate": 2.8252333333333334e-05, + "loss": 0.0124, + "step": 15249 + }, + { + "epoch": 12.007877116975187, + "grad_norm": 0.08246877789497375, + "learning_rate": 2.8252e-05, + "loss": 0.0054, + "step": 15250 + }, + { + "epoch": 12.008664828672705, + "grad_norm": 0.15794096887111664, + "learning_rate": 2.825166666666667e-05, + "loss": 0.0072, + "step": 15251 + }, + { + "epoch": 12.009452540370225, + "grad_norm": 0.1517462581396103, + "learning_rate": 2.8251333333333335e-05, + "loss": 0.0075, + "step": 15252 + }, + { + "epoch": 12.010240252067744, + "grad_norm": 0.29872339963912964, + "learning_rate": 2.8251e-05, + "loss": 0.0081, + "step": 15253 + }, + { + "epoch": 12.011027963765262, + "grad_norm": 0.6365204453468323, + "learning_rate": 2.8250666666666667e-05, + "loss": 0.0125, + "step": 15254 + }, + { + "epoch": 12.011815675462781, + "grad_norm": 0.07853809744119644, + "learning_rate": 2.8250333333333333e-05, + "loss": 0.0027, + "step": 15255 + }, + { + "epoch": 12.012603387160299, + "grad_norm": 0.22332389652729034, + "learning_rate": 2.825e-05, + "loss": 0.0045, + "step": 15256 + }, + { + "epoch": 12.013391098857818, + "grad_norm": 0.25098589062690735, + "learning_rate": 2.8249666666666668e-05, + "loss": 0.0045, + "step": 15257 + }, + { + "epoch": 12.014178810555336, + "grad_norm": 0.10327248275279999, + "learning_rate": 2.8249333333333334e-05, + "loss": 0.0065, + "step": 15258 + }, + { + "epoch": 12.014966522252855, + "grad_norm": 0.1849689781665802, + "learning_rate": 2.8249e-05, + "loss": 0.007, + "step": 15259 + }, + { + "epoch": 12.015754233950375, + "grad_norm": 0.12288615852594376, + "learning_rate": 2.824866666666667e-05, + "loss": 0.0046, + "step": 15260 + }, + { + "epoch": 12.016541945647893, + "grad_norm": 0.2096136212348938, + "learning_rate": 2.8248333333333332e-05, + "loss": 0.0116, + "step": 15261 + }, + { + "epoch": 12.017329657345412, + "grad_norm": 0.19105647504329681, + "learning_rate": 2.8248e-05, + "loss": 0.0156, + "step": 15262 + }, + { + "epoch": 12.01811736904293, + "grad_norm": 0.401719868183136, + "learning_rate": 2.8247666666666667e-05, + "loss": 0.0104, + "step": 15263 + }, + { + "epoch": 12.01890508074045, + "grad_norm": 0.1322997659444809, + "learning_rate": 2.8247333333333333e-05, + "loss": 0.0075, + "step": 15264 + }, + { + "epoch": 12.019692792437967, + "grad_norm": 0.2320784479379654, + "learning_rate": 2.8247e-05, + "loss": 0.0141, + "step": 15265 + 
}, + { + "epoch": 12.020480504135486, + "grad_norm": 0.455590158700943, + "learning_rate": 2.824666666666667e-05, + "loss": 0.0153, + "step": 15266 + }, + { + "epoch": 12.021268215833006, + "grad_norm": 0.1992596834897995, + "learning_rate": 2.8246333333333334e-05, + "loss": 0.0069, + "step": 15267 + }, + { + "epoch": 12.022055927530523, + "grad_norm": 0.11615303158760071, + "learning_rate": 2.8246e-05, + "loss": 0.0039, + "step": 15268 + }, + { + "epoch": 12.022843639228043, + "grad_norm": 0.19403447210788727, + "learning_rate": 2.824566666666667e-05, + "loss": 0.0085, + "step": 15269 + }, + { + "epoch": 12.02363135092556, + "grad_norm": 0.2449449896812439, + "learning_rate": 2.8245333333333332e-05, + "loss": 0.0081, + "step": 15270 + }, + { + "epoch": 12.02441906262308, + "grad_norm": 0.22204530239105225, + "learning_rate": 2.8245e-05, + "loss": 0.0065, + "step": 15271 + }, + { + "epoch": 12.0252067743206, + "grad_norm": 0.2337706834077835, + "learning_rate": 2.8244666666666667e-05, + "loss": 0.0131, + "step": 15272 + }, + { + "epoch": 12.025994486018117, + "grad_norm": 0.2523253262042999, + "learning_rate": 2.8244333333333333e-05, + "loss": 0.0123, + "step": 15273 + }, + { + "epoch": 12.026782197715637, + "grad_norm": 0.12794198095798492, + "learning_rate": 2.8244e-05, + "loss": 0.0084, + "step": 15274 + }, + { + "epoch": 12.027569909413154, + "grad_norm": 0.1193154826760292, + "learning_rate": 2.824366666666667e-05, + "loss": 0.0036, + "step": 15275 + }, + { + "epoch": 12.028357621110674, + "grad_norm": 0.2420918494462967, + "learning_rate": 2.8243333333333335e-05, + "loss": 0.0114, + "step": 15276 + }, + { + "epoch": 12.029145332808191, + "grad_norm": 0.2883686125278473, + "learning_rate": 2.8243e-05, + "loss": 0.0117, + "step": 15277 + }, + { + "epoch": 12.02993304450571, + "grad_norm": 0.3236706554889679, + "learning_rate": 2.824266666666667e-05, + "loss": 0.0069, + "step": 15278 + }, + { + "epoch": 12.03072075620323, + "grad_norm": 0.08659595251083374, + "learning_rate": 2.8242333333333332e-05, + "loss": 0.0047, + "step": 15279 + }, + { + "epoch": 12.031508467900748, + "grad_norm": 0.23876971006393433, + "learning_rate": 2.8242e-05, + "loss": 0.0077, + "step": 15280 + }, + { + "epoch": 12.032296179598267, + "grad_norm": 0.1929863542318344, + "learning_rate": 2.8241666666666668e-05, + "loss": 0.0082, + "step": 15281 + }, + { + "epoch": 12.033083891295785, + "grad_norm": 0.19082558155059814, + "learning_rate": 2.8241333333333334e-05, + "loss": 0.0078, + "step": 15282 + }, + { + "epoch": 12.033871602993305, + "grad_norm": 0.18728086352348328, + "learning_rate": 2.8241000000000003e-05, + "loss": 0.0049, + "step": 15283 + }, + { + "epoch": 12.034659314690822, + "grad_norm": 0.3623099625110626, + "learning_rate": 2.8240666666666665e-05, + "loss": 0.0049, + "step": 15284 + }, + { + "epoch": 12.035447026388342, + "grad_norm": 0.37702447175979614, + "learning_rate": 2.8240333333333335e-05, + "loss": 0.0077, + "step": 15285 + }, + { + "epoch": 12.036234738085861, + "grad_norm": 0.35655903816223145, + "learning_rate": 2.824e-05, + "loss": 0.0081, + "step": 15286 + }, + { + "epoch": 12.037022449783379, + "grad_norm": 0.37391600012779236, + "learning_rate": 2.8239666666666667e-05, + "loss": 0.0108, + "step": 15287 + }, + { + "epoch": 12.037810161480898, + "grad_norm": 0.13988415896892548, + "learning_rate": 2.8239333333333333e-05, + "loss": 0.0044, + "step": 15288 + }, + { + "epoch": 12.038597873178416, + "grad_norm": 0.3514108955860138, + "learning_rate": 2.8239000000000002e-05, + "loss": 
0.0047, + "step": 15289 + }, + { + "epoch": 12.039385584875935, + "grad_norm": 0.18642885982990265, + "learning_rate": 2.8238666666666664e-05, + "loss": 0.0048, + "step": 15290 + }, + { + "epoch": 12.040173296573455, + "grad_norm": 0.5139287114143372, + "learning_rate": 2.8238333333333334e-05, + "loss": 0.1709, + "step": 15291 + }, + { + "epoch": 12.040961008270973, + "grad_norm": 0.4055432379245758, + "learning_rate": 2.8238000000000003e-05, + "loss": 0.1476, + "step": 15292 + }, + { + "epoch": 12.041748719968492, + "grad_norm": 0.6580935120582581, + "learning_rate": 2.8237666666666666e-05, + "loss": 0.069, + "step": 15293 + }, + { + "epoch": 12.04253643166601, + "grad_norm": 0.7612277865409851, + "learning_rate": 2.8237333333333335e-05, + "loss": 0.1057, + "step": 15294 + }, + { + "epoch": 12.04332414336353, + "grad_norm": 0.31673651933670044, + "learning_rate": 2.8237e-05, + "loss": 0.0382, + "step": 15295 + }, + { + "epoch": 12.044111855061047, + "grad_norm": 0.28075242042541504, + "learning_rate": 2.8236666666666667e-05, + "loss": 0.0666, + "step": 15296 + }, + { + "epoch": 12.044899566758566, + "grad_norm": 0.1892121285200119, + "learning_rate": 2.8236333333333333e-05, + "loss": 0.0219, + "step": 15297 + }, + { + "epoch": 12.045687278456086, + "grad_norm": 0.2061404287815094, + "learning_rate": 2.8236000000000002e-05, + "loss": 0.0096, + "step": 15298 + }, + { + "epoch": 12.046474990153603, + "grad_norm": 0.15467466413974762, + "learning_rate": 2.8235666666666665e-05, + "loss": 0.0162, + "step": 15299 + }, + { + "epoch": 12.047262701851123, + "grad_norm": 0.5732933282852173, + "learning_rate": 2.8235333333333334e-05, + "loss": 0.0168, + "step": 15300 + }, + { + "epoch": 12.04805041354864, + "grad_norm": 0.24213071167469025, + "learning_rate": 2.8235000000000003e-05, + "loss": 0.0123, + "step": 15301 + }, + { + "epoch": 12.04883812524616, + "grad_norm": 0.6003707647323608, + "learning_rate": 2.8234666666666666e-05, + "loss": 0.0176, + "step": 15302 + }, + { + "epoch": 12.04962583694368, + "grad_norm": 0.12470722943544388, + "learning_rate": 2.8234333333333335e-05, + "loss": 0.0087, + "step": 15303 + }, + { + "epoch": 12.050413548641197, + "grad_norm": 0.17301906645298004, + "learning_rate": 2.8234e-05, + "loss": 0.0128, + "step": 15304 + }, + { + "epoch": 12.051201260338717, + "grad_norm": 0.16793301701545715, + "learning_rate": 2.8233666666666667e-05, + "loss": 0.0088, + "step": 15305 + }, + { + "epoch": 12.051988972036234, + "grad_norm": 0.0966280996799469, + "learning_rate": 2.8233333333333333e-05, + "loss": 0.0039, + "step": 15306 + }, + { + "epoch": 12.052776683733754, + "grad_norm": 0.24520105123519897, + "learning_rate": 2.8233000000000002e-05, + "loss": 0.0093, + "step": 15307 + }, + { + "epoch": 12.053564395431271, + "grad_norm": 0.0866558626294136, + "learning_rate": 2.8232666666666668e-05, + "loss": 0.0034, + "step": 15308 + }, + { + "epoch": 12.054352107128791, + "grad_norm": 0.08687364310026169, + "learning_rate": 2.8232333333333334e-05, + "loss": 0.0059, + "step": 15309 + }, + { + "epoch": 12.05513981882631, + "grad_norm": 0.2974657714366913, + "learning_rate": 2.8232000000000003e-05, + "loss": 0.0117, + "step": 15310 + }, + { + "epoch": 12.055927530523828, + "grad_norm": 0.32152727246284485, + "learning_rate": 2.8231666666666666e-05, + "loss": 0.0187, + "step": 15311 + }, + { + "epoch": 12.056715242221347, + "grad_norm": 0.21036557853221893, + "learning_rate": 2.8231333333333335e-05, + "loss": 0.006, + "step": 15312 + }, + { + "epoch": 12.057502953918865, + "grad_norm": 
0.29861176013946533, + "learning_rate": 2.8231e-05, + "loss": 0.0152, + "step": 15313 + }, + { + "epoch": 12.058290665616385, + "grad_norm": 0.31831127405166626, + "learning_rate": 2.8230666666666667e-05, + "loss": 0.0114, + "step": 15314 + }, + { + "epoch": 12.059078377313902, + "grad_norm": 1.1348227262496948, + "learning_rate": 2.8230333333333333e-05, + "loss": 0.0092, + "step": 15315 + }, + { + "epoch": 12.059866089011422, + "grad_norm": 0.40260013937950134, + "learning_rate": 2.823e-05, + "loss": 0.0069, + "step": 15316 + }, + { + "epoch": 12.060653800708941, + "grad_norm": 0.14394640922546387, + "learning_rate": 2.822966666666667e-05, + "loss": 0.009, + "step": 15317 + }, + { + "epoch": 12.061441512406459, + "grad_norm": 0.14640961587429047, + "learning_rate": 2.8229333333333334e-05, + "loss": 0.0098, + "step": 15318 + }, + { + "epoch": 12.062229224103978, + "grad_norm": 0.18230946362018585, + "learning_rate": 2.8229e-05, + "loss": 0.0083, + "step": 15319 + }, + { + "epoch": 12.063016935801496, + "grad_norm": 0.2821504771709442, + "learning_rate": 2.8228666666666666e-05, + "loss": 0.0076, + "step": 15320 + }, + { + "epoch": 12.063804647499015, + "grad_norm": 0.12443806231021881, + "learning_rate": 2.8228333333333336e-05, + "loss": 0.0056, + "step": 15321 + }, + { + "epoch": 12.064592359196535, + "grad_norm": 0.3762710690498352, + "learning_rate": 2.8227999999999998e-05, + "loss": 0.012, + "step": 15322 + }, + { + "epoch": 12.065380070894053, + "grad_norm": 0.10943640023469925, + "learning_rate": 2.8227666666666667e-05, + "loss": 0.0061, + "step": 15323 + }, + { + "epoch": 12.066167782591572, + "grad_norm": 0.2703808546066284, + "learning_rate": 2.8227333333333333e-05, + "loss": 0.0099, + "step": 15324 + }, + { + "epoch": 12.06695549428909, + "grad_norm": 0.20772318542003632, + "learning_rate": 2.8227e-05, + "loss": 0.0104, + "step": 15325 + }, + { + "epoch": 12.06774320598661, + "grad_norm": 0.3104206621646881, + "learning_rate": 2.822666666666667e-05, + "loss": 0.0165, + "step": 15326 + }, + { + "epoch": 12.068530917684127, + "grad_norm": 0.14729765057563782, + "learning_rate": 2.8226333333333335e-05, + "loss": 0.01, + "step": 15327 + }, + { + "epoch": 12.069318629381646, + "grad_norm": 0.20131614804267883, + "learning_rate": 2.8226e-05, + "loss": 0.0066, + "step": 15328 + }, + { + "epoch": 12.070106341079166, + "grad_norm": 0.28167423605918884, + "learning_rate": 2.8225666666666666e-05, + "loss": 0.0127, + "step": 15329 + }, + { + "epoch": 12.070894052776683, + "grad_norm": 0.30348125100135803, + "learning_rate": 2.8225333333333336e-05, + "loss": 0.0206, + "step": 15330 + }, + { + "epoch": 12.071681764474203, + "grad_norm": 0.7702656984329224, + "learning_rate": 2.8224999999999998e-05, + "loss": 0.0179, + "step": 15331 + }, + { + "epoch": 12.07246947617172, + "grad_norm": 0.24427753686904907, + "learning_rate": 2.8224666666666668e-05, + "loss": 0.0112, + "step": 15332 + }, + { + "epoch": 12.07325718786924, + "grad_norm": 0.25835004448890686, + "learning_rate": 2.8224333333333337e-05, + "loss": 0.0062, + "step": 15333 + }, + { + "epoch": 12.074044899566758, + "grad_norm": 0.23075422644615173, + "learning_rate": 2.8224e-05, + "loss": 0.0108, + "step": 15334 + }, + { + "epoch": 12.074832611264277, + "grad_norm": 0.2561299800872803, + "learning_rate": 2.822366666666667e-05, + "loss": 0.0097, + "step": 15335 + }, + { + "epoch": 12.075620322961797, + "grad_norm": 0.18460102379322052, + "learning_rate": 2.8223333333333335e-05, + "loss": 0.0058, + "step": 15336 + }, + { + "epoch": 
12.076408034659314, + "grad_norm": 0.22077374160289764, + "learning_rate": 2.8223e-05, + "loss": 0.0044, + "step": 15337 + }, + { + "epoch": 12.077195746356834, + "grad_norm": 0.5551306009292603, + "learning_rate": 2.8222666666666667e-05, + "loss": 0.0084, + "step": 15338 + }, + { + "epoch": 12.077983458054351, + "grad_norm": 0.20008517801761627, + "learning_rate": 2.8222333333333336e-05, + "loss": 0.01, + "step": 15339 + }, + { + "epoch": 12.078771169751871, + "grad_norm": 0.48269832134246826, + "learning_rate": 2.8222e-05, + "loss": 0.0091, + "step": 15340 + }, + { + "epoch": 12.07955888144939, + "grad_norm": 0.748683512210846, + "learning_rate": 2.8221666666666668e-05, + "loss": 0.1951, + "step": 15341 + }, + { + "epoch": 12.080346593146908, + "grad_norm": 0.6244509816169739, + "learning_rate": 2.8221333333333337e-05, + "loss": 0.1851, + "step": 15342 + }, + { + "epoch": 12.081134304844428, + "grad_norm": 0.5179840922355652, + "learning_rate": 2.8221e-05, + "loss": 0.0859, + "step": 15343 + }, + { + "epoch": 12.081922016541945, + "grad_norm": 0.3985542356967926, + "learning_rate": 2.822066666666667e-05, + "loss": 0.0959, + "step": 15344 + }, + { + "epoch": 12.082709728239465, + "grad_norm": 0.3385522663593292, + "learning_rate": 2.822033333333333e-05, + "loss": 0.0432, + "step": 15345 + }, + { + "epoch": 12.083497439936982, + "grad_norm": 0.1923120617866516, + "learning_rate": 2.822e-05, + "loss": 0.0214, + "step": 15346 + }, + { + "epoch": 12.084285151634502, + "grad_norm": 0.4165063500404358, + "learning_rate": 2.8219666666666667e-05, + "loss": 0.0128, + "step": 15347 + }, + { + "epoch": 12.085072863332021, + "grad_norm": 0.19207225739955902, + "learning_rate": 2.8219333333333333e-05, + "loss": 0.0114, + "step": 15348 + }, + { + "epoch": 12.085860575029539, + "grad_norm": 0.204949751496315, + "learning_rate": 2.8219e-05, + "loss": 0.0095, + "step": 15349 + }, + { + "epoch": 12.086648286727058, + "grad_norm": 0.1389198750257492, + "learning_rate": 2.8218666666666668e-05, + "loss": 0.0061, + "step": 15350 + }, + { + "epoch": 12.087435998424576, + "grad_norm": 0.36668920516967773, + "learning_rate": 2.8218333333333334e-05, + "loss": 0.0135, + "step": 15351 + }, + { + "epoch": 12.088223710122096, + "grad_norm": 0.103516586124897, + "learning_rate": 2.8218e-05, + "loss": 0.0055, + "step": 15352 + }, + { + "epoch": 12.089011421819613, + "grad_norm": 0.18943601846694946, + "learning_rate": 2.821766666666667e-05, + "loss": 0.0157, + "step": 15353 + }, + { + "epoch": 12.089799133517133, + "grad_norm": 0.14549635350704193, + "learning_rate": 2.8217333333333332e-05, + "loss": 0.0062, + "step": 15354 + }, + { + "epoch": 12.090586845214652, + "grad_norm": 0.8421486616134644, + "learning_rate": 2.8217e-05, + "loss": 0.01, + "step": 15355 + }, + { + "epoch": 12.09137455691217, + "grad_norm": 0.14569300413131714, + "learning_rate": 2.8216666666666667e-05, + "loss": 0.0052, + "step": 15356 + }, + { + "epoch": 12.09216226860969, + "grad_norm": 0.20882311463356018, + "learning_rate": 2.8216333333333333e-05, + "loss": 0.0087, + "step": 15357 + }, + { + "epoch": 12.092949980307207, + "grad_norm": 0.26659446954727173, + "learning_rate": 2.8216000000000002e-05, + "loss": 0.013, + "step": 15358 + }, + { + "epoch": 12.093737692004726, + "grad_norm": 0.20161721110343933, + "learning_rate": 2.8215666666666668e-05, + "loss": 0.0142, + "step": 15359 + }, + { + "epoch": 12.094525403702246, + "grad_norm": 0.2444819211959839, + "learning_rate": 2.8215333333333334e-05, + "loss": 0.0123, + "step": 15360 + }, + { + 
"epoch": 12.095313115399764, + "grad_norm": 0.1908285766839981, + "learning_rate": 2.8215e-05, + "loss": 0.0108, + "step": 15361 + }, + { + "epoch": 12.096100827097283, + "grad_norm": 0.33403995633125305, + "learning_rate": 2.821466666666667e-05, + "loss": 0.016, + "step": 15362 + }, + { + "epoch": 12.0968885387948, + "grad_norm": 0.16317513585090637, + "learning_rate": 2.8214333333333332e-05, + "loss": 0.0064, + "step": 15363 + }, + { + "epoch": 12.09767625049232, + "grad_norm": 0.1473882645368576, + "learning_rate": 2.8214e-05, + "loss": 0.0077, + "step": 15364 + }, + { + "epoch": 12.098463962189838, + "grad_norm": 0.10429388284683228, + "learning_rate": 2.8213666666666667e-05, + "loss": 0.0056, + "step": 15365 + }, + { + "epoch": 12.099251673887357, + "grad_norm": 0.14236775040626526, + "learning_rate": 2.8213333333333333e-05, + "loss": 0.0049, + "step": 15366 + }, + { + "epoch": 12.100039385584877, + "grad_norm": 0.2061794549226761, + "learning_rate": 2.8213000000000002e-05, + "loss": 0.0086, + "step": 15367 + }, + { + "epoch": 12.100827097282394, + "grad_norm": 0.09422646462917328, + "learning_rate": 2.821266666666667e-05, + "loss": 0.0049, + "step": 15368 + }, + { + "epoch": 12.101614808979914, + "grad_norm": 0.4360932409763336, + "learning_rate": 2.8212333333333334e-05, + "loss": 0.0092, + "step": 15369 + }, + { + "epoch": 12.102402520677431, + "grad_norm": 0.18715964257717133, + "learning_rate": 2.8212e-05, + "loss": 0.0096, + "step": 15370 + }, + { + "epoch": 12.103190232374951, + "grad_norm": 0.17351940274238586, + "learning_rate": 2.821166666666667e-05, + "loss": 0.008, + "step": 15371 + }, + { + "epoch": 12.103977944072469, + "grad_norm": 0.2968997657299042, + "learning_rate": 2.8211333333333332e-05, + "loss": 0.0127, + "step": 15372 + }, + { + "epoch": 12.104765655769988, + "grad_norm": 0.700716495513916, + "learning_rate": 2.8211e-05, + "loss": 0.0082, + "step": 15373 + }, + { + "epoch": 12.105553367467508, + "grad_norm": 0.3693387806415558, + "learning_rate": 2.8210666666666667e-05, + "loss": 0.0131, + "step": 15374 + }, + { + "epoch": 12.106341079165025, + "grad_norm": 0.16389328241348267, + "learning_rate": 2.8210333333333333e-05, + "loss": 0.0105, + "step": 15375 + }, + { + "epoch": 12.107128790862545, + "grad_norm": 0.15340334177017212, + "learning_rate": 2.8210000000000003e-05, + "loss": 0.0074, + "step": 15376 + }, + { + "epoch": 12.107916502560062, + "grad_norm": 0.08240417391061783, + "learning_rate": 2.8209666666666665e-05, + "loss": 0.0057, + "step": 15377 + }, + { + "epoch": 12.108704214257582, + "grad_norm": 0.09408498555421829, + "learning_rate": 2.8209333333333335e-05, + "loss": 0.0061, + "step": 15378 + }, + { + "epoch": 12.109491925955101, + "grad_norm": 0.6152880191802979, + "learning_rate": 2.8209e-05, + "loss": 0.0084, + "step": 15379 + }, + { + "epoch": 12.110279637652619, + "grad_norm": 0.214158833026886, + "learning_rate": 2.8208666666666666e-05, + "loss": 0.0102, + "step": 15380 + }, + { + "epoch": 12.111067349350138, + "grad_norm": 0.3473976254463196, + "learning_rate": 2.8208333333333332e-05, + "loss": 0.0083, + "step": 15381 + }, + { + "epoch": 12.111855061047656, + "grad_norm": 0.5653436183929443, + "learning_rate": 2.8208e-05, + "loss": 0.0065, + "step": 15382 + }, + { + "epoch": 12.112642772745176, + "grad_norm": 0.44644224643707275, + "learning_rate": 2.8207666666666668e-05, + "loss": 0.0179, + "step": 15383 + }, + { + "epoch": 12.113430484442693, + "grad_norm": 0.17275646328926086, + "learning_rate": 2.8207333333333334e-05, + "loss": 0.0101, + 
"step": 15384 + }, + { + "epoch": 12.114218196140213, + "grad_norm": 0.2096022218465805, + "learning_rate": 2.8207000000000003e-05, + "loss": 0.0082, + "step": 15385 + }, + { + "epoch": 12.115005907837732, + "grad_norm": 0.2178313434123993, + "learning_rate": 2.8206666666666665e-05, + "loss": 0.0127, + "step": 15386 + }, + { + "epoch": 12.11579361953525, + "grad_norm": 1.113511085510254, + "learning_rate": 2.8206333333333335e-05, + "loss": 0.0107, + "step": 15387 + }, + { + "epoch": 12.11658133123277, + "grad_norm": 0.7918652892112732, + "learning_rate": 2.8206e-05, + "loss": 0.0139, + "step": 15388 + }, + { + "epoch": 12.117369042930287, + "grad_norm": 0.305467814207077, + "learning_rate": 2.8205666666666667e-05, + "loss": 0.0181, + "step": 15389 + }, + { + "epoch": 12.118156754627806, + "grad_norm": 0.29075518250465393, + "learning_rate": 2.8205333333333333e-05, + "loss": 0.0095, + "step": 15390 + }, + { + "epoch": 12.118944466325324, + "grad_norm": 0.4627517759799957, + "learning_rate": 2.8205000000000002e-05, + "loss": 0.1401, + "step": 15391 + }, + { + "epoch": 12.119732178022844, + "grad_norm": 0.5360866189002991, + "learning_rate": 2.8204666666666668e-05, + "loss": 0.1076, + "step": 15392 + }, + { + "epoch": 12.120519889720363, + "grad_norm": 0.49143633246421814, + "learning_rate": 2.8204333333333334e-05, + "loss": 0.0935, + "step": 15393 + }, + { + "epoch": 12.12130760141788, + "grad_norm": 0.3355049788951874, + "learning_rate": 2.8204000000000003e-05, + "loss": 0.0757, + "step": 15394 + }, + { + "epoch": 12.1220953131154, + "grad_norm": 0.32389360666275024, + "learning_rate": 2.8203666666666666e-05, + "loss": 0.07, + "step": 15395 + }, + { + "epoch": 12.122883024812918, + "grad_norm": 0.5551542639732361, + "learning_rate": 2.8203333333333335e-05, + "loss": 0.034, + "step": 15396 + }, + { + "epoch": 12.123670736510437, + "grad_norm": 0.22655314207077026, + "learning_rate": 2.8203e-05, + "loss": 0.0636, + "step": 15397 + }, + { + "epoch": 12.124458448207957, + "grad_norm": 0.2002800852060318, + "learning_rate": 2.8202666666666667e-05, + "loss": 0.0153, + "step": 15398 + }, + { + "epoch": 12.125246159905474, + "grad_norm": 0.149496391415596, + "learning_rate": 2.8202333333333333e-05, + "loss": 0.0102, + "step": 15399 + }, + { + "epoch": 12.126033871602994, + "grad_norm": 0.12399471551179886, + "learning_rate": 2.8202000000000002e-05, + "loss": 0.0076, + "step": 15400 + }, + { + "epoch": 12.126821583300512, + "grad_norm": 0.1373102366924286, + "learning_rate": 2.8201666666666668e-05, + "loss": 0.0114, + "step": 15401 + }, + { + "epoch": 12.127609294998031, + "grad_norm": 0.35464152693748474, + "learning_rate": 2.8201333333333334e-05, + "loss": 0.012, + "step": 15402 + }, + { + "epoch": 12.128397006695549, + "grad_norm": 0.08529004454612732, + "learning_rate": 2.8201000000000003e-05, + "loss": 0.0054, + "step": 15403 + }, + { + "epoch": 12.129184718393068, + "grad_norm": 0.3071659207344055, + "learning_rate": 2.8200666666666666e-05, + "loss": 0.0078, + "step": 15404 + }, + { + "epoch": 12.129972430090588, + "grad_norm": 0.1966327726840973, + "learning_rate": 2.8200333333333335e-05, + "loss": 0.0064, + "step": 15405 + }, + { + "epoch": 12.130760141788105, + "grad_norm": 0.20174022018909454, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.025, + "step": 15406 + }, + { + "epoch": 12.131547853485625, + "grad_norm": 0.21998153626918793, + "learning_rate": 2.8199666666666667e-05, + "loss": 0.0135, + "step": 15407 + }, + { + "epoch": 12.132335565183142, + "grad_norm": 
0.13236218690872192, + "learning_rate": 2.8199333333333336e-05, + "loss": 0.0084, + "step": 15408 + }, + { + "epoch": 12.133123276880662, + "grad_norm": 0.44848573207855225, + "learning_rate": 2.8199e-05, + "loss": 0.0135, + "step": 15409 + }, + { + "epoch": 12.13391098857818, + "grad_norm": 0.28997090458869934, + "learning_rate": 2.8198666666666668e-05, + "loss": 0.0116, + "step": 15410 + }, + { + "epoch": 12.134698700275699, + "grad_norm": 0.20791666209697723, + "learning_rate": 2.8198333333333334e-05, + "loss": 0.0115, + "step": 15411 + }, + { + "epoch": 12.135486411973218, + "grad_norm": 0.12756237387657166, + "learning_rate": 2.8198e-05, + "loss": 0.0064, + "step": 15412 + }, + { + "epoch": 12.136274123670736, + "grad_norm": 0.2795947194099426, + "learning_rate": 2.8197666666666666e-05, + "loss": 0.0297, + "step": 15413 + }, + { + "epoch": 12.137061835368256, + "grad_norm": 0.09132719784975052, + "learning_rate": 2.8197333333333335e-05, + "loss": 0.0054, + "step": 15414 + }, + { + "epoch": 12.137849547065773, + "grad_norm": 0.24425235390663147, + "learning_rate": 2.8196999999999998e-05, + "loss": 0.011, + "step": 15415 + }, + { + "epoch": 12.138637258763293, + "grad_norm": 0.1346789449453354, + "learning_rate": 2.8196666666666667e-05, + "loss": 0.004, + "step": 15416 + }, + { + "epoch": 12.139424970460812, + "grad_norm": 0.13242757320404053, + "learning_rate": 2.8196333333333337e-05, + "loss": 0.007, + "step": 15417 + }, + { + "epoch": 12.14021268215833, + "grad_norm": 0.14283441007137299, + "learning_rate": 2.8196e-05, + "loss": 0.005, + "step": 15418 + }, + { + "epoch": 12.14100039385585, + "grad_norm": 0.10133419930934906, + "learning_rate": 2.819566666666667e-05, + "loss": 0.0033, + "step": 15419 + }, + { + "epoch": 12.141788105553367, + "grad_norm": 0.14719459414482117, + "learning_rate": 2.8195333333333334e-05, + "loss": 0.0063, + "step": 15420 + }, + { + "epoch": 12.142575817250886, + "grad_norm": 0.1466098576784134, + "learning_rate": 2.8195e-05, + "loss": 0.0077, + "step": 15421 + }, + { + "epoch": 12.143363528948404, + "grad_norm": 0.17686539888381958, + "learning_rate": 2.8194666666666666e-05, + "loss": 0.0109, + "step": 15422 + }, + { + "epoch": 12.144151240645924, + "grad_norm": 0.3092848062515259, + "learning_rate": 2.8194333333333336e-05, + "loss": 0.0083, + "step": 15423 + }, + { + "epoch": 12.144938952343443, + "grad_norm": 0.17168354988098145, + "learning_rate": 2.8194e-05, + "loss": 0.0055, + "step": 15424 + }, + { + "epoch": 12.14572666404096, + "grad_norm": 0.45778805017471313, + "learning_rate": 2.8193666666666667e-05, + "loss": 0.0101, + "step": 15425 + }, + { + "epoch": 12.14651437573848, + "grad_norm": 0.5473909378051758, + "learning_rate": 2.8193333333333337e-05, + "loss": 0.0124, + "step": 15426 + }, + { + "epoch": 12.147302087435998, + "grad_norm": 0.19072121381759644, + "learning_rate": 2.8193e-05, + "loss": 0.0102, + "step": 15427 + }, + { + "epoch": 12.148089799133517, + "grad_norm": 0.23649385571479797, + "learning_rate": 2.819266666666667e-05, + "loss": 0.0082, + "step": 15428 + }, + { + "epoch": 12.148877510831035, + "grad_norm": 0.25746724009513855, + "learning_rate": 2.8192333333333335e-05, + "loss": 0.0116, + "step": 15429 + }, + { + "epoch": 12.149665222528554, + "grad_norm": 0.5391885638237, + "learning_rate": 2.8192e-05, + "loss": 0.0144, + "step": 15430 + }, + { + "epoch": 12.150452934226074, + "grad_norm": 0.14692895114421844, + "learning_rate": 2.8191666666666666e-05, + "loss": 0.0068, + "step": 15431 + }, + { + "epoch": 12.151240645923592, + 
"grad_norm": 0.21068060398101807, + "learning_rate": 2.8191333333333336e-05, + "loss": 0.0095, + "step": 15432 + }, + { + "epoch": 12.152028357621111, + "grad_norm": 0.3478395640850067, + "learning_rate": 2.8191e-05, + "loss": 0.0154, + "step": 15433 + }, + { + "epoch": 12.152816069318629, + "grad_norm": 0.3546059727668762, + "learning_rate": 2.8190666666666668e-05, + "loss": 0.0058, + "step": 15434 + }, + { + "epoch": 12.153603781016148, + "grad_norm": 0.3351501524448395, + "learning_rate": 2.8190333333333334e-05, + "loss": 0.0165, + "step": 15435 + }, + { + "epoch": 12.154391492713668, + "grad_norm": 0.16014733910560608, + "learning_rate": 2.819e-05, + "loss": 0.0063, + "step": 15436 + }, + { + "epoch": 12.155179204411185, + "grad_norm": 0.3086172044277191, + "learning_rate": 2.818966666666667e-05, + "loss": 0.0153, + "step": 15437 + }, + { + "epoch": 12.155966916108705, + "grad_norm": 0.3078186810016632, + "learning_rate": 2.818933333333333e-05, + "loss": 0.0134, + "step": 15438 + }, + { + "epoch": 12.156754627806222, + "grad_norm": 0.3270341455936432, + "learning_rate": 2.8189e-05, + "loss": 0.0225, + "step": 15439 + }, + { + "epoch": 12.157542339503742, + "grad_norm": 0.2789638042449951, + "learning_rate": 2.8188666666666667e-05, + "loss": 0.0125, + "step": 15440 + }, + { + "epoch": 12.15833005120126, + "grad_norm": 0.7785143256187439, + "learning_rate": 2.8188333333333333e-05, + "loss": 0.1578, + "step": 15441 + }, + { + "epoch": 12.159117762898779, + "grad_norm": 0.640640377998352, + "learning_rate": 2.8188000000000002e-05, + "loss": 0.176, + "step": 15442 + }, + { + "epoch": 12.159905474596298, + "grad_norm": 0.4528503119945526, + "learning_rate": 2.8187666666666668e-05, + "loss": 0.0961, + "step": 15443 + }, + { + "epoch": 12.160693186293816, + "grad_norm": 0.6064000129699707, + "learning_rate": 2.8187333333333334e-05, + "loss": 0.1204, + "step": 15444 + }, + { + "epoch": 12.161480897991336, + "grad_norm": 0.49949464201927185, + "learning_rate": 2.8187e-05, + "loss": 0.0428, + "step": 15445 + }, + { + "epoch": 12.162268609688853, + "grad_norm": 0.4573555886745453, + "learning_rate": 2.818666666666667e-05, + "loss": 0.0747, + "step": 15446 + }, + { + "epoch": 12.163056321386373, + "grad_norm": 0.25161656737327576, + "learning_rate": 2.818633333333333e-05, + "loss": 0.024, + "step": 15447 + }, + { + "epoch": 12.16384403308389, + "grad_norm": 0.39855724573135376, + "learning_rate": 2.8186e-05, + "loss": 0.0194, + "step": 15448 + }, + { + "epoch": 12.16463174478141, + "grad_norm": 0.1944975107908249, + "learning_rate": 2.8185666666666667e-05, + "loss": 0.0383, + "step": 15449 + }, + { + "epoch": 12.16541945647893, + "grad_norm": 0.6134234070777893, + "learning_rate": 2.8185333333333333e-05, + "loss": 0.0179, + "step": 15450 + }, + { + "epoch": 12.166207168176447, + "grad_norm": 0.368299275636673, + "learning_rate": 2.8185000000000002e-05, + "loss": 0.0138, + "step": 15451 + }, + { + "epoch": 12.166994879873966, + "grad_norm": 0.14851848781108856, + "learning_rate": 2.8184666666666668e-05, + "loss": 0.0128, + "step": 15452 + }, + { + "epoch": 12.167782591571484, + "grad_norm": 0.137277290225029, + "learning_rate": 2.8184333333333334e-05, + "loss": 0.0393, + "step": 15453 + }, + { + "epoch": 12.168570303269004, + "grad_norm": 0.14038805663585663, + "learning_rate": 2.8184e-05, + "loss": 0.0065, + "step": 15454 + }, + { + "epoch": 12.169358014966523, + "grad_norm": 0.2844415307044983, + "learning_rate": 2.818366666666667e-05, + "loss": 0.0158, + "step": 15455 + }, + { + "epoch": 
12.17014572666404, + "grad_norm": 0.24186596274375916, + "learning_rate": 2.8183333333333332e-05, + "loss": 0.0107, + "step": 15456 + }, + { + "epoch": 12.17093343836156, + "grad_norm": 0.146803617477417, + "learning_rate": 2.8183e-05, + "loss": 0.0044, + "step": 15457 + }, + { + "epoch": 12.171721150059078, + "grad_norm": 0.13018237054347992, + "learning_rate": 2.818266666666667e-05, + "loss": 0.0054, + "step": 15458 + }, + { + "epoch": 12.172508861756597, + "grad_norm": 0.3166652321815491, + "learning_rate": 2.8182333333333333e-05, + "loss": 0.0092, + "step": 15459 + }, + { + "epoch": 12.173296573454115, + "grad_norm": 0.20625744760036469, + "learning_rate": 2.8182000000000002e-05, + "loss": 0.0108, + "step": 15460 + }, + { + "epoch": 12.174084285151634, + "grad_norm": 0.1552630066871643, + "learning_rate": 2.8181666666666668e-05, + "loss": 0.0047, + "step": 15461 + }, + { + "epoch": 12.174871996849154, + "grad_norm": 0.09929351508617401, + "learning_rate": 2.8181333333333334e-05, + "loss": 0.0061, + "step": 15462 + }, + { + "epoch": 12.175659708546672, + "grad_norm": 0.15155461430549622, + "learning_rate": 2.8181e-05, + "loss": 0.0075, + "step": 15463 + }, + { + "epoch": 12.176447420244191, + "grad_norm": 0.28883522748947144, + "learning_rate": 2.818066666666667e-05, + "loss": 0.012, + "step": 15464 + }, + { + "epoch": 12.177235131941709, + "grad_norm": 0.204809308052063, + "learning_rate": 2.8180333333333332e-05, + "loss": 0.0076, + "step": 15465 + }, + { + "epoch": 12.178022843639228, + "grad_norm": 0.15170113742351532, + "learning_rate": 2.818e-05, + "loss": 0.0061, + "step": 15466 + }, + { + "epoch": 12.178810555336748, + "grad_norm": 0.41547247767448425, + "learning_rate": 2.8179666666666667e-05, + "loss": 0.0156, + "step": 15467 + }, + { + "epoch": 12.179598267034265, + "grad_norm": 0.14774270355701447, + "learning_rate": 2.8179333333333333e-05, + "loss": 0.006, + "step": 15468 + }, + { + "epoch": 12.180385978731785, + "grad_norm": 0.14074793457984924, + "learning_rate": 2.8179000000000002e-05, + "loss": 0.0093, + "step": 15469 + }, + { + "epoch": 12.181173690429302, + "grad_norm": 0.2491932362318039, + "learning_rate": 2.8178666666666665e-05, + "loss": 0.0075, + "step": 15470 + }, + { + "epoch": 12.181961402126822, + "grad_norm": 0.28135257959365845, + "learning_rate": 2.8178333333333334e-05, + "loss": 0.0117, + "step": 15471 + }, + { + "epoch": 12.18274911382434, + "grad_norm": 0.23907899856567383, + "learning_rate": 2.8178e-05, + "loss": 0.0074, + "step": 15472 + }, + { + "epoch": 12.183536825521859, + "grad_norm": 0.17094431817531586, + "learning_rate": 2.8177666666666666e-05, + "loss": 0.0057, + "step": 15473 + }, + { + "epoch": 12.184324537219378, + "grad_norm": 0.16276688873767853, + "learning_rate": 2.8177333333333336e-05, + "loss": 0.0058, + "step": 15474 + }, + { + "epoch": 12.185112248916896, + "grad_norm": 0.21493001282215118, + "learning_rate": 2.8177e-05, + "loss": 0.018, + "step": 15475 + }, + { + "epoch": 12.185899960614416, + "grad_norm": 0.11825601756572723, + "learning_rate": 2.8176666666666667e-05, + "loss": 0.0058, + "step": 15476 + }, + { + "epoch": 12.186687672311933, + "grad_norm": 0.3738262951374054, + "learning_rate": 2.8176333333333333e-05, + "loss": 0.0151, + "step": 15477 + }, + { + "epoch": 12.187475384009453, + "grad_norm": 0.49015742540359497, + "learning_rate": 2.8176000000000003e-05, + "loss": 0.0168, + "step": 15478 + }, + { + "epoch": 12.18826309570697, + "grad_norm": 0.11106470227241516, + "learning_rate": 2.8175666666666665e-05, + "loss": 
0.0065, + "step": 15479 + }, + { + "epoch": 12.18905080740449, + "grad_norm": 0.4207763373851776, + "learning_rate": 2.8175333333333335e-05, + "loss": 0.0173, + "step": 15480 + }, + { + "epoch": 12.18983851910201, + "grad_norm": 0.17737430334091187, + "learning_rate": 2.8175e-05, + "loss": 0.0057, + "step": 15481 + }, + { + "epoch": 12.190626230799527, + "grad_norm": 0.18477840721607208, + "learning_rate": 2.8174666666666666e-05, + "loss": 0.0086, + "step": 15482 + }, + { + "epoch": 12.191413942497046, + "grad_norm": 0.34466955065727234, + "learning_rate": 2.8174333333333336e-05, + "loss": 0.0089, + "step": 15483 + }, + { + "epoch": 12.192201654194564, + "grad_norm": 0.3108138144016266, + "learning_rate": 2.8174e-05, + "loss": 0.0098, + "step": 15484 + }, + { + "epoch": 12.192989365892084, + "grad_norm": 0.16316568851470947, + "learning_rate": 2.8173666666666668e-05, + "loss": 0.0055, + "step": 15485 + }, + { + "epoch": 12.193777077589603, + "grad_norm": 0.13035447895526886, + "learning_rate": 2.8173333333333334e-05, + "loss": 0.0068, + "step": 15486 + }, + { + "epoch": 12.19456478928712, + "grad_norm": 0.24202463030815125, + "learning_rate": 2.8173000000000003e-05, + "loss": 0.0091, + "step": 15487 + }, + { + "epoch": 12.19535250098464, + "grad_norm": 0.2026604562997818, + "learning_rate": 2.8172666666666665e-05, + "loss": 0.0074, + "step": 15488 + }, + { + "epoch": 12.196140212682158, + "grad_norm": 0.34542179107666016, + "learning_rate": 2.8172333333333335e-05, + "loss": 0.0165, + "step": 15489 + }, + { + "epoch": 12.196927924379677, + "grad_norm": 0.5487165451049805, + "learning_rate": 2.8172e-05, + "loss": 0.011, + "step": 15490 + }, + { + "epoch": 12.197715636077195, + "grad_norm": 0.5324872136116028, + "learning_rate": 2.8171666666666667e-05, + "loss": 0.1841, + "step": 15491 + }, + { + "epoch": 12.198503347774714, + "grad_norm": 0.5001672506332397, + "learning_rate": 2.8171333333333336e-05, + "loss": 0.1243, + "step": 15492 + }, + { + "epoch": 12.199291059472234, + "grad_norm": 0.6023962497711182, + "learning_rate": 2.8171000000000002e-05, + "loss": 0.1077, + "step": 15493 + }, + { + "epoch": 12.200078771169752, + "grad_norm": 0.5358093976974487, + "learning_rate": 2.8170666666666668e-05, + "loss": 0.0814, + "step": 15494 + }, + { + "epoch": 12.200866482867271, + "grad_norm": 0.39528319239616394, + "learning_rate": 2.8170333333333334e-05, + "loss": 0.0419, + "step": 15495 + }, + { + "epoch": 12.201654194564789, + "grad_norm": 0.28171849250793457, + "learning_rate": 2.817e-05, + "loss": 0.0309, + "step": 15496 + }, + { + "epoch": 12.202441906262308, + "grad_norm": 0.45562100410461426, + "learning_rate": 2.8169666666666666e-05, + "loss": 0.0469, + "step": 15497 + }, + { + "epoch": 12.203229617959826, + "grad_norm": 0.14269959926605225, + "learning_rate": 2.8169333333333335e-05, + "loss": 0.0099, + "step": 15498 + }, + { + "epoch": 12.204017329657345, + "grad_norm": 0.3220239281654358, + "learning_rate": 2.8169e-05, + "loss": 0.0154, + "step": 15499 + }, + { + "epoch": 12.204805041354865, + "grad_norm": 0.22731080651283264, + "learning_rate": 2.8168666666666667e-05, + "loss": 0.0204, + "step": 15500 + }, + { + "epoch": 12.205592753052382, + "grad_norm": 0.1579306274652481, + "learning_rate": 2.8168333333333336e-05, + "loss": 0.0096, + "step": 15501 + }, + { + "epoch": 12.206380464749902, + "grad_norm": 0.4685879945755005, + "learning_rate": 2.8168e-05, + "loss": 0.0144, + "step": 15502 + }, + { + "epoch": 12.20716817644742, + "grad_norm": 0.24778176844120026, + "learning_rate": 
2.8167666666666668e-05, + "loss": 0.0114, + "step": 15503 + }, + { + "epoch": 12.207955888144939, + "grad_norm": 0.11899016797542572, + "learning_rate": 2.8167333333333334e-05, + "loss": 0.0236, + "step": 15504 + }, + { + "epoch": 12.208743599842458, + "grad_norm": 0.11570972204208374, + "learning_rate": 2.8167e-05, + "loss": 0.0042, + "step": 15505 + }, + { + "epoch": 12.209531311539976, + "grad_norm": 0.2900594472885132, + "learning_rate": 2.8166666666666666e-05, + "loss": 0.0122, + "step": 15506 + }, + { + "epoch": 12.210319023237496, + "grad_norm": 0.2032894492149353, + "learning_rate": 2.8166333333333335e-05, + "loss": 0.0154, + "step": 15507 + }, + { + "epoch": 12.211106734935013, + "grad_norm": 0.17178836464881897, + "learning_rate": 2.8166e-05, + "loss": 0.0072, + "step": 15508 + }, + { + "epoch": 12.211894446632533, + "grad_norm": 0.12499798834323883, + "learning_rate": 2.8165666666666667e-05, + "loss": 0.0074, + "step": 15509 + }, + { + "epoch": 12.21268215833005, + "grad_norm": 0.278888463973999, + "learning_rate": 2.8165333333333336e-05, + "loss": 0.0119, + "step": 15510 + }, + { + "epoch": 12.21346987002757, + "grad_norm": 0.4898206293582916, + "learning_rate": 2.8165e-05, + "loss": 0.0123, + "step": 15511 + }, + { + "epoch": 12.21425758172509, + "grad_norm": 0.2153252363204956, + "learning_rate": 2.8164666666666668e-05, + "loss": 0.0077, + "step": 15512 + }, + { + "epoch": 12.215045293422607, + "grad_norm": 0.2311144471168518, + "learning_rate": 2.8164333333333334e-05, + "loss": 0.0121, + "step": 15513 + }, + { + "epoch": 12.215833005120126, + "grad_norm": 0.11645089089870453, + "learning_rate": 2.8164e-05, + "loss": 0.005, + "step": 15514 + }, + { + "epoch": 12.216620716817644, + "grad_norm": 0.17182959616184235, + "learning_rate": 2.8163666666666666e-05, + "loss": 0.0084, + "step": 15515 + }, + { + "epoch": 12.217408428515164, + "grad_norm": 0.19103170931339264, + "learning_rate": 2.8163333333333335e-05, + "loss": 0.0157, + "step": 15516 + }, + { + "epoch": 12.218196140212681, + "grad_norm": 0.4507151246070862, + "learning_rate": 2.8163e-05, + "loss": 0.0234, + "step": 15517 + }, + { + "epoch": 12.2189838519102, + "grad_norm": 0.15098324418067932, + "learning_rate": 2.8162666666666667e-05, + "loss": 0.0057, + "step": 15518 + }, + { + "epoch": 12.21977156360772, + "grad_norm": 0.19027996063232422, + "learning_rate": 2.8162333333333336e-05, + "loss": 0.0081, + "step": 15519 + }, + { + "epoch": 12.220559275305238, + "grad_norm": 0.2975047826766968, + "learning_rate": 2.8162e-05, + "loss": 0.013, + "step": 15520 + }, + { + "epoch": 12.221346987002757, + "grad_norm": 0.46025320887565613, + "learning_rate": 2.816166666666667e-05, + "loss": 0.0159, + "step": 15521 + }, + { + "epoch": 12.222134698700275, + "grad_norm": 0.15887585282325745, + "learning_rate": 2.8161333333333334e-05, + "loss": 0.0066, + "step": 15522 + }, + { + "epoch": 12.222922410397794, + "grad_norm": 0.21662631630897522, + "learning_rate": 2.8161e-05, + "loss": 0.01, + "step": 15523 + }, + { + "epoch": 12.223710122095314, + "grad_norm": 0.2794441282749176, + "learning_rate": 2.816066666666667e-05, + "loss": 0.0142, + "step": 15524 + }, + { + "epoch": 12.224497833792832, + "grad_norm": 0.07730868458747864, + "learning_rate": 2.8160333333333336e-05, + "loss": 0.0038, + "step": 15525 + }, + { + "epoch": 12.225285545490351, + "grad_norm": 0.21185600757598877, + "learning_rate": 2.816e-05, + "loss": 0.0088, + "step": 15526 + }, + { + "epoch": 12.226073257187869, + "grad_norm": 0.28091755509376526, + "learning_rate": 
2.8159666666666667e-05, + "loss": 0.0223, + "step": 15527 + }, + { + "epoch": 12.226860968885388, + "grad_norm": 0.2931147515773773, + "learning_rate": 2.8159333333333333e-05, + "loss": 0.0095, + "step": 15528 + }, + { + "epoch": 12.227648680582906, + "grad_norm": 0.22422809898853302, + "learning_rate": 2.8159e-05, + "loss": 0.0081, + "step": 15529 + }, + { + "epoch": 12.228436392280425, + "grad_norm": 0.13341191411018372, + "learning_rate": 2.815866666666667e-05, + "loss": 0.007, + "step": 15530 + }, + { + "epoch": 12.229224103977945, + "grad_norm": 0.16261975467205048, + "learning_rate": 2.815833333333333e-05, + "loss": 0.0061, + "step": 15531 + }, + { + "epoch": 12.230011815675462, + "grad_norm": 0.20184452831745148, + "learning_rate": 2.8158e-05, + "loss": 0.0089, + "step": 15532 + }, + { + "epoch": 12.230799527372982, + "grad_norm": 0.156964510679245, + "learning_rate": 2.815766666666667e-05, + "loss": 0.004, + "step": 15533 + }, + { + "epoch": 12.2315872390705, + "grad_norm": 0.21493135392665863, + "learning_rate": 2.8157333333333332e-05, + "loss": 0.009, + "step": 15534 + }, + { + "epoch": 12.232374950768019, + "grad_norm": 0.3320150375366211, + "learning_rate": 2.8157e-05, + "loss": 0.0155, + "step": 15535 + }, + { + "epoch": 12.233162662465537, + "grad_norm": 0.2103278785943985, + "learning_rate": 2.8156666666666668e-05, + "loss": 0.0125, + "step": 15536 + }, + { + "epoch": 12.233950374163056, + "grad_norm": 0.3649730086326599, + "learning_rate": 2.8156333333333334e-05, + "loss": 0.0082, + "step": 15537 + }, + { + "epoch": 12.234738085860576, + "grad_norm": 0.13295218348503113, + "learning_rate": 2.8156e-05, + "loss": 0.0082, + "step": 15538 + }, + { + "epoch": 12.235525797558093, + "grad_norm": 0.20039039850234985, + "learning_rate": 2.815566666666667e-05, + "loss": 0.0054, + "step": 15539 + }, + { + "epoch": 12.236313509255613, + "grad_norm": 0.509155809879303, + "learning_rate": 2.815533333333333e-05, + "loss": 0.0114, + "step": 15540 + }, + { + "epoch": 12.23710122095313, + "grad_norm": 0.5155431032180786, + "learning_rate": 2.8155e-05, + "loss": 0.1851, + "step": 15541 + }, + { + "epoch": 12.23788893265065, + "grad_norm": 0.6071720123291016, + "learning_rate": 2.815466666666667e-05, + "loss": 0.1333, + "step": 15542 + }, + { + "epoch": 12.23867664434817, + "grad_norm": 0.2907487154006958, + "learning_rate": 2.8154333333333333e-05, + "loss": 0.0737, + "step": 15543 + }, + { + "epoch": 12.239464356045687, + "grad_norm": 0.25891900062561035, + "learning_rate": 2.8154000000000002e-05, + "loss": 0.0572, + "step": 15544 + }, + { + "epoch": 12.240252067743207, + "grad_norm": 0.6039171814918518, + "learning_rate": 2.8153666666666668e-05, + "loss": 0.0636, + "step": 15545 + }, + { + "epoch": 12.241039779440724, + "grad_norm": 0.35199764370918274, + "learning_rate": 2.8153333333333334e-05, + "loss": 0.0399, + "step": 15546 + }, + { + "epoch": 12.241827491138244, + "grad_norm": 0.10397189110517502, + "learning_rate": 2.8153e-05, + "loss": 0.01, + "step": 15547 + }, + { + "epoch": 12.242615202835761, + "grad_norm": 0.463049054145813, + "learning_rate": 2.815266666666667e-05, + "loss": 0.0344, + "step": 15548 + }, + { + "epoch": 12.24340291453328, + "grad_norm": 0.14201360940933228, + "learning_rate": 2.8152333333333335e-05, + "loss": 0.0064, + "step": 15549 + }, + { + "epoch": 12.2441906262308, + "grad_norm": 0.19198070466518402, + "learning_rate": 2.8152e-05, + "loss": 0.0086, + "step": 15550 + }, + { + "epoch": 12.244978337928318, + "grad_norm": 0.09281625598669052, + "learning_rate": 
2.815166666666667e-05, + "loss": 0.0061, + "step": 15551 + }, + { + "epoch": 12.245766049625837, + "grad_norm": 0.5837111473083496, + "learning_rate": 2.8151333333333333e-05, + "loss": 0.0144, + "step": 15552 + }, + { + "epoch": 12.246553761323355, + "grad_norm": 0.1143636554479599, + "learning_rate": 2.8151000000000002e-05, + "loss": 0.0046, + "step": 15553 + }, + { + "epoch": 12.247341473020875, + "grad_norm": 0.2015443742275238, + "learning_rate": 2.8150666666666668e-05, + "loss": 0.0054, + "step": 15554 + }, + { + "epoch": 12.248129184718394, + "grad_norm": 0.3380185067653656, + "learning_rate": 2.8150333333333334e-05, + "loss": 0.009, + "step": 15555 + }, + { + "epoch": 12.248916896415912, + "grad_norm": 0.18752051889896393, + "learning_rate": 2.815e-05, + "loss": 0.004, + "step": 15556 + }, + { + "epoch": 12.249704608113431, + "grad_norm": 0.17236541211605072, + "learning_rate": 2.8149666666666666e-05, + "loss": 0.0088, + "step": 15557 + }, + { + "epoch": 12.250492319810949, + "grad_norm": 0.19630572199821472, + "learning_rate": 2.8149333333333335e-05, + "loss": 0.0099, + "step": 15558 + }, + { + "epoch": 12.251280031508468, + "grad_norm": 0.10502517968416214, + "learning_rate": 2.8149e-05, + "loss": 0.006, + "step": 15559 + }, + { + "epoch": 12.252067743205986, + "grad_norm": 0.42908570170402527, + "learning_rate": 2.8148666666666667e-05, + "loss": 0.014, + "step": 15560 + }, + { + "epoch": 12.252855454903505, + "grad_norm": 0.15027882158756256, + "learning_rate": 2.8148333333333333e-05, + "loss": 0.0102, + "step": 15561 + }, + { + "epoch": 12.253643166601025, + "grad_norm": 0.16145102679729462, + "learning_rate": 2.8148000000000002e-05, + "loss": 0.0088, + "step": 15562 + }, + { + "epoch": 12.254430878298542, + "grad_norm": 0.18525385856628418, + "learning_rate": 2.8147666666666665e-05, + "loss": 0.0045, + "step": 15563 + }, + { + "epoch": 12.255218589996062, + "grad_norm": 0.4091652035713196, + "learning_rate": 2.8147333333333334e-05, + "loss": 0.0116, + "step": 15564 + }, + { + "epoch": 12.25600630169358, + "grad_norm": 0.14732667803764343, + "learning_rate": 2.8147e-05, + "loss": 0.0056, + "step": 15565 + }, + { + "epoch": 12.256794013391099, + "grad_norm": 0.180865079164505, + "learning_rate": 2.8146666666666666e-05, + "loss": 0.0063, + "step": 15566 + }, + { + "epoch": 12.257581725088617, + "grad_norm": 0.18898271024227142, + "learning_rate": 2.8146333333333335e-05, + "loss": 0.0053, + "step": 15567 + }, + { + "epoch": 12.258369436786136, + "grad_norm": 0.499001145362854, + "learning_rate": 2.8146e-05, + "loss": 0.0099, + "step": 15568 + }, + { + "epoch": 12.259157148483656, + "grad_norm": 0.3697686493396759, + "learning_rate": 2.8145666666666667e-05, + "loss": 0.0125, + "step": 15569 + }, + { + "epoch": 12.259944860181173, + "grad_norm": 0.10529621690511703, + "learning_rate": 2.8145333333333333e-05, + "loss": 0.003, + "step": 15570 + }, + { + "epoch": 12.260732571878693, + "grad_norm": 0.3424420654773712, + "learning_rate": 2.8145000000000002e-05, + "loss": 0.0092, + "step": 15571 + }, + { + "epoch": 12.26152028357621, + "grad_norm": 0.25079289078712463, + "learning_rate": 2.8144666666666665e-05, + "loss": 0.0067, + "step": 15572 + }, + { + "epoch": 12.26230799527373, + "grad_norm": 0.1912071257829666, + "learning_rate": 2.8144333333333334e-05, + "loss": 0.0077, + "step": 15573 + }, + { + "epoch": 12.26309570697125, + "grad_norm": 0.5779695510864258, + "learning_rate": 2.8144000000000004e-05, + "loss": 0.0161, + "step": 15574 + }, + { + "epoch": 12.263883418668767, + 
"grad_norm": 0.29027071595191956, + "learning_rate": 2.8143666666666666e-05, + "loss": 0.0196, + "step": 15575 + }, + { + "epoch": 12.264671130366287, + "grad_norm": 0.11341401189565659, + "learning_rate": 2.8143333333333335e-05, + "loss": 0.0053, + "step": 15576 + }, + { + "epoch": 12.265458842063804, + "grad_norm": 0.13615302741527557, + "learning_rate": 2.8143e-05, + "loss": 0.0073, + "step": 15577 + }, + { + "epoch": 12.266246553761324, + "grad_norm": 0.25682419538497925, + "learning_rate": 2.8142666666666667e-05, + "loss": 0.0126, + "step": 15578 + }, + { + "epoch": 12.267034265458841, + "grad_norm": 0.39176416397094727, + "learning_rate": 2.8142333333333333e-05, + "loss": 0.0193, + "step": 15579 + }, + { + "epoch": 12.26782197715636, + "grad_norm": 1.3378115892410278, + "learning_rate": 2.8142000000000003e-05, + "loss": 0.0067, + "step": 15580 + }, + { + "epoch": 12.26860968885388, + "grad_norm": 0.2791236340999603, + "learning_rate": 2.8141666666666665e-05, + "loss": 0.0075, + "step": 15581 + }, + { + "epoch": 12.269397400551398, + "grad_norm": 0.3107852339744568, + "learning_rate": 2.8141333333333334e-05, + "loss": 0.0098, + "step": 15582 + }, + { + "epoch": 12.270185112248917, + "grad_norm": 0.15496927499771118, + "learning_rate": 2.8141000000000004e-05, + "loss": 0.0048, + "step": 15583 + }, + { + "epoch": 12.270972823946435, + "grad_norm": 0.1673751324415207, + "learning_rate": 2.8140666666666666e-05, + "loss": 0.0046, + "step": 15584 + }, + { + "epoch": 12.271760535643955, + "grad_norm": 0.1468704789876938, + "learning_rate": 2.8140333333333336e-05, + "loss": 0.0093, + "step": 15585 + }, + { + "epoch": 12.272548247341472, + "grad_norm": 0.3731079399585724, + "learning_rate": 2.8139999999999998e-05, + "loss": 0.0127, + "step": 15586 + }, + { + "epoch": 12.273335959038992, + "grad_norm": 0.4872099459171295, + "learning_rate": 2.8139666666666668e-05, + "loss": 0.0087, + "step": 15587 + }, + { + "epoch": 12.274123670736511, + "grad_norm": 0.15712302923202515, + "learning_rate": 2.8139333333333333e-05, + "loss": 0.008, + "step": 15588 + }, + { + "epoch": 12.274911382434029, + "grad_norm": 0.7626677751541138, + "learning_rate": 2.8139e-05, + "loss": 0.0194, + "step": 15589 + }, + { + "epoch": 12.275699094131548, + "grad_norm": 0.4426371455192566, + "learning_rate": 2.8138666666666665e-05, + "loss": 0.0127, + "step": 15590 + }, + { + "epoch": 12.276486805829066, + "grad_norm": 0.5298207998275757, + "learning_rate": 2.8138333333333335e-05, + "loss": 0.1785, + "step": 15591 + }, + { + "epoch": 12.277274517526585, + "grad_norm": 0.551148533821106, + "learning_rate": 2.8138e-05, + "loss": 0.1124, + "step": 15592 + }, + { + "epoch": 12.278062229224105, + "grad_norm": 0.5794615745544434, + "learning_rate": 2.8137666666666667e-05, + "loss": 0.1226, + "step": 15593 + }, + { + "epoch": 12.278849940921623, + "grad_norm": 0.3597247004508972, + "learning_rate": 2.8137333333333336e-05, + "loss": 0.0471, + "step": 15594 + }, + { + "epoch": 12.279637652619142, + "grad_norm": 0.33010950684547424, + "learning_rate": 2.8137e-05, + "loss": 0.0397, + "step": 15595 + }, + { + "epoch": 12.28042536431666, + "grad_norm": 0.32380858063697815, + "learning_rate": 2.8136666666666668e-05, + "loss": 0.0331, + "step": 15596 + }, + { + "epoch": 12.281213076014179, + "grad_norm": 0.229989156126976, + "learning_rate": 2.8136333333333334e-05, + "loss": 0.0129, + "step": 15597 + }, + { + "epoch": 12.282000787711697, + "grad_norm": 0.35402047634124756, + "learning_rate": 2.8136e-05, + "loss": 0.0209, + "step": 15598 + }, 
+ { + "epoch": 12.282788499409216, + "grad_norm": 0.15031535923480988, + "learning_rate": 2.813566666666667e-05, + "loss": 0.0055, + "step": 15599 + }, + { + "epoch": 12.283576211106736, + "grad_norm": 0.24482080340385437, + "learning_rate": 2.8135333333333335e-05, + "loss": 0.0128, + "step": 15600 + }, + { + "epoch": 12.284363922804253, + "grad_norm": 0.167738139629364, + "learning_rate": 2.8135e-05, + "loss": 0.0119, + "step": 15601 + }, + { + "epoch": 12.285151634501773, + "grad_norm": 0.8057288527488708, + "learning_rate": 2.8134666666666667e-05, + "loss": 0.0137, + "step": 15602 + }, + { + "epoch": 12.28593934619929, + "grad_norm": 0.17011429369449615, + "learning_rate": 2.8134333333333336e-05, + "loss": 0.0093, + "step": 15603 + }, + { + "epoch": 12.28672705789681, + "grad_norm": 0.13965147733688354, + "learning_rate": 2.8134e-05, + "loss": 0.0064, + "step": 15604 + }, + { + "epoch": 12.287514769594328, + "grad_norm": 0.3335871994495392, + "learning_rate": 2.8133666666666668e-05, + "loss": 0.0133, + "step": 15605 + }, + { + "epoch": 12.288302481291847, + "grad_norm": 0.20553992688655853, + "learning_rate": 2.8133333333333334e-05, + "loss": 0.0107, + "step": 15606 + }, + { + "epoch": 12.289090192989367, + "grad_norm": 0.3196520209312439, + "learning_rate": 2.8133e-05, + "loss": 0.014, + "step": 15607 + }, + { + "epoch": 12.289877904686884, + "grad_norm": 0.16856488585472107, + "learning_rate": 2.813266666666667e-05, + "loss": 0.0088, + "step": 15608 + }, + { + "epoch": 12.290665616384404, + "grad_norm": 0.18961890041828156, + "learning_rate": 2.8132333333333335e-05, + "loss": 0.0102, + "step": 15609 + }, + { + "epoch": 12.291453328081921, + "grad_norm": 0.12318965792655945, + "learning_rate": 2.8132e-05, + "loss": 0.0057, + "step": 15610 + }, + { + "epoch": 12.29224103977944, + "grad_norm": 0.2340862900018692, + "learning_rate": 2.8131666666666667e-05, + "loss": 0.0103, + "step": 15611 + }, + { + "epoch": 12.29302875147696, + "grad_norm": 0.22937890887260437, + "learning_rate": 2.8131333333333336e-05, + "loss": 0.009, + "step": 15612 + }, + { + "epoch": 12.293816463174478, + "grad_norm": 0.2863045334815979, + "learning_rate": 2.8131e-05, + "loss": 0.0146, + "step": 15613 + }, + { + "epoch": 12.294604174871997, + "grad_norm": 0.521881639957428, + "learning_rate": 2.8130666666666668e-05, + "loss": 0.0141, + "step": 15614 + }, + { + "epoch": 12.295391886569515, + "grad_norm": 0.13672317564487457, + "learning_rate": 2.8130333333333334e-05, + "loss": 0.0116, + "step": 15615 + }, + { + "epoch": 12.296179598267035, + "grad_norm": 0.17085552215576172, + "learning_rate": 2.813e-05, + "loss": 0.008, + "step": 15616 + }, + { + "epoch": 12.296967309964552, + "grad_norm": 0.16922645270824432, + "learning_rate": 2.812966666666667e-05, + "loss": 0.0101, + "step": 15617 + }, + { + "epoch": 12.297755021662072, + "grad_norm": 0.18927304446697235, + "learning_rate": 2.8129333333333332e-05, + "loss": 0.0073, + "step": 15618 + }, + { + "epoch": 12.298542733359591, + "grad_norm": 0.8591091632843018, + "learning_rate": 2.8129e-05, + "loss": 0.0101, + "step": 15619 + }, + { + "epoch": 12.299330445057109, + "grad_norm": 0.24441662430763245, + "learning_rate": 2.8128666666666667e-05, + "loss": 0.0102, + "step": 15620 + }, + { + "epoch": 12.300118156754628, + "grad_norm": 0.173714280128479, + "learning_rate": 2.8128333333333333e-05, + "loss": 0.0077, + "step": 15621 + }, + { + "epoch": 12.300905868452146, + "grad_norm": 0.24959853291511536, + "learning_rate": 2.8128e-05, + "loss": 0.0091, + "step": 15622 + }, + 
{ + "epoch": 12.301693580149665, + "grad_norm": 0.38613662123680115, + "learning_rate": 2.812766666666667e-05, + "loss": 0.013, + "step": 15623 + }, + { + "epoch": 12.302481291847183, + "grad_norm": 0.34788504242897034, + "learning_rate": 2.8127333333333334e-05, + "loss": 0.0148, + "step": 15624 + }, + { + "epoch": 12.303269003544703, + "grad_norm": 0.17976981401443481, + "learning_rate": 2.8127e-05, + "loss": 0.0128, + "step": 15625 + }, + { + "epoch": 12.304056715242222, + "grad_norm": 0.17468686401844025, + "learning_rate": 2.812666666666667e-05, + "loss": 0.0142, + "step": 15626 + }, + { + "epoch": 12.30484442693974, + "grad_norm": 0.6016737222671509, + "learning_rate": 2.8126333333333332e-05, + "loss": 0.0144, + "step": 15627 + }, + { + "epoch": 12.30563213863726, + "grad_norm": 0.3326856195926666, + "learning_rate": 2.8126e-05, + "loss": 0.0086, + "step": 15628 + }, + { + "epoch": 12.306419850334777, + "grad_norm": 0.10948844254016876, + "learning_rate": 2.8125666666666667e-05, + "loss": 0.0044, + "step": 15629 + }, + { + "epoch": 12.307207562032296, + "grad_norm": 0.18966642022132874, + "learning_rate": 2.8125333333333333e-05, + "loss": 0.0081, + "step": 15630 + }, + { + "epoch": 12.307995273729816, + "grad_norm": 0.5178632140159607, + "learning_rate": 2.8125e-05, + "loss": 0.0133, + "step": 15631 + }, + { + "epoch": 12.308782985427333, + "grad_norm": 0.1616244614124298, + "learning_rate": 2.812466666666667e-05, + "loss": 0.0059, + "step": 15632 + }, + { + "epoch": 12.309570697124853, + "grad_norm": 0.40337738394737244, + "learning_rate": 2.8124333333333334e-05, + "loss": 0.021, + "step": 15633 + }, + { + "epoch": 12.31035840882237, + "grad_norm": 0.19099916517734528, + "learning_rate": 2.8124e-05, + "loss": 0.0094, + "step": 15634 + }, + { + "epoch": 12.31114612051989, + "grad_norm": 0.15612879395484924, + "learning_rate": 2.812366666666667e-05, + "loss": 0.0059, + "step": 15635 + }, + { + "epoch": 12.311933832217408, + "grad_norm": 0.09955783933401108, + "learning_rate": 2.8123333333333332e-05, + "loss": 0.0032, + "step": 15636 + }, + { + "epoch": 12.312721543914927, + "grad_norm": 0.5561476349830627, + "learning_rate": 2.8123e-05, + "loss": 0.0274, + "step": 15637 + }, + { + "epoch": 12.313509255612447, + "grad_norm": 0.2417834997177124, + "learning_rate": 2.8122666666666668e-05, + "loss": 0.0105, + "step": 15638 + }, + { + "epoch": 12.314296967309964, + "grad_norm": 0.2780158221721649, + "learning_rate": 2.8122333333333333e-05, + "loss": 0.0125, + "step": 15639 + }, + { + "epoch": 12.315084679007484, + "grad_norm": 0.8386333584785461, + "learning_rate": 2.8122e-05, + "loss": 0.0141, + "step": 15640 + }, + { + "epoch": 12.315872390705001, + "grad_norm": 0.6510702967643738, + "learning_rate": 2.812166666666667e-05, + "loss": 0.2189, + "step": 15641 + }, + { + "epoch": 12.31666010240252, + "grad_norm": 0.44224417209625244, + "learning_rate": 2.8121333333333335e-05, + "loss": 0.106, + "step": 15642 + }, + { + "epoch": 12.317447814100039, + "grad_norm": 1.073510766029358, + "learning_rate": 2.8121e-05, + "loss": 0.0869, + "step": 15643 + }, + { + "epoch": 12.318235525797558, + "grad_norm": 0.4159106910228729, + "learning_rate": 2.812066666666667e-05, + "loss": 0.0788, + "step": 15644 + }, + { + "epoch": 12.319023237495077, + "grad_norm": 0.19047227501869202, + "learning_rate": 2.8120333333333332e-05, + "loss": 0.033, + "step": 15645 + }, + { + "epoch": 12.319810949192595, + "grad_norm": 0.2797084152698517, + "learning_rate": 2.8120000000000002e-05, + "loss": 0.0236, + "step": 15646 + 
}, + { + "epoch": 12.320598660890115, + "grad_norm": 0.22373418509960175, + "learning_rate": 2.8119666666666664e-05, + "loss": 0.0127, + "step": 15647 + }, + { + "epoch": 12.321386372587632, + "grad_norm": 0.24607035517692566, + "learning_rate": 2.8119333333333334e-05, + "loss": 0.0206, + "step": 15648 + }, + { + "epoch": 12.322174084285152, + "grad_norm": 0.25867488980293274, + "learning_rate": 2.8119000000000003e-05, + "loss": 0.0122, + "step": 15649 + }, + { + "epoch": 12.322961795982671, + "grad_norm": 0.3315587639808655, + "learning_rate": 2.8118666666666666e-05, + "loss": 0.0269, + "step": 15650 + }, + { + "epoch": 12.323749507680189, + "grad_norm": 0.20079174637794495, + "learning_rate": 2.8118333333333335e-05, + "loss": 0.0142, + "step": 15651 + }, + { + "epoch": 12.324537219377708, + "grad_norm": 0.2654336094856262, + "learning_rate": 2.8118e-05, + "loss": 0.0139, + "step": 15652 + }, + { + "epoch": 12.325324931075226, + "grad_norm": 0.07916440069675446, + "learning_rate": 2.8117666666666667e-05, + "loss": 0.0055, + "step": 15653 + }, + { + "epoch": 12.326112642772745, + "grad_norm": 0.16192446649074554, + "learning_rate": 2.8117333333333333e-05, + "loss": 0.0102, + "step": 15654 + }, + { + "epoch": 12.326900354470263, + "grad_norm": 0.13560937345027924, + "learning_rate": 2.8117000000000002e-05, + "loss": 0.0062, + "step": 15655 + }, + { + "epoch": 12.327688066167783, + "grad_norm": 0.23207956552505493, + "learning_rate": 2.8116666666666665e-05, + "loss": 0.0168, + "step": 15656 + }, + { + "epoch": 12.328475777865302, + "grad_norm": 0.17141325771808624, + "learning_rate": 2.8116333333333334e-05, + "loss": 0.0111, + "step": 15657 + }, + { + "epoch": 12.32926348956282, + "grad_norm": 0.09420546889305115, + "learning_rate": 2.8116000000000003e-05, + "loss": 0.0072, + "step": 15658 + }, + { + "epoch": 12.33005120126034, + "grad_norm": 0.09310154616832733, + "learning_rate": 2.8115666666666666e-05, + "loss": 0.0039, + "step": 15659 + }, + { + "epoch": 12.330838912957857, + "grad_norm": 0.09281733632087708, + "learning_rate": 2.8115333333333335e-05, + "loss": 0.0052, + "step": 15660 + }, + { + "epoch": 12.331626624655376, + "grad_norm": 0.22314195334911346, + "learning_rate": 2.8115e-05, + "loss": 0.0108, + "step": 15661 + }, + { + "epoch": 12.332414336352894, + "grad_norm": 0.26906833052635193, + "learning_rate": 2.8114666666666667e-05, + "loss": 0.0135, + "step": 15662 + }, + { + "epoch": 12.333202048050413, + "grad_norm": 0.25588515400886536, + "learning_rate": 2.8114333333333333e-05, + "loss": 0.0074, + "step": 15663 + }, + { + "epoch": 12.333989759747933, + "grad_norm": 0.2090378999710083, + "learning_rate": 2.8114000000000002e-05, + "loss": 0.0135, + "step": 15664 + }, + { + "epoch": 12.33477747144545, + "grad_norm": 0.21807833015918732, + "learning_rate": 2.8113666666666665e-05, + "loss": 0.0052, + "step": 15665 + }, + { + "epoch": 12.33556518314297, + "grad_norm": 0.24751360714435577, + "learning_rate": 2.8113333333333334e-05, + "loss": 0.0082, + "step": 15666 + }, + { + "epoch": 12.336352894840488, + "grad_norm": 0.2989620864391327, + "learning_rate": 2.8113000000000003e-05, + "loss": 0.0155, + "step": 15667 + }, + { + "epoch": 12.337140606538007, + "grad_norm": 0.1818322092294693, + "learning_rate": 2.8112666666666666e-05, + "loss": 0.0116, + "step": 15668 + }, + { + "epoch": 12.337928318235527, + "grad_norm": 0.603197455406189, + "learning_rate": 2.8112333333333335e-05, + "loss": 0.0103, + "step": 15669 + }, + { + "epoch": 12.338716029933044, + "grad_norm": 
0.34639623761177063, + "learning_rate": 2.8112e-05, + "loss": 0.0142, + "step": 15670 + }, + { + "epoch": 12.339503741630564, + "grad_norm": 0.07655934989452362, + "learning_rate": 2.8111666666666667e-05, + "loss": 0.0039, + "step": 15671 + }, + { + "epoch": 12.340291453328081, + "grad_norm": 0.3525053858757019, + "learning_rate": 2.8111333333333333e-05, + "loss": 0.0127, + "step": 15672 + }, + { + "epoch": 12.3410791650256, + "grad_norm": 0.211248978972435, + "learning_rate": 2.8111000000000002e-05, + "loss": 0.0359, + "step": 15673 + }, + { + "epoch": 12.341866876723119, + "grad_norm": 0.0881672129034996, + "learning_rate": 2.811066666666667e-05, + "loss": 0.0036, + "step": 15674 + }, + { + "epoch": 12.342654588420638, + "grad_norm": 0.13536514341831207, + "learning_rate": 2.8110333333333334e-05, + "loss": 0.008, + "step": 15675 + }, + { + "epoch": 12.343442300118157, + "grad_norm": 0.2595255970954895, + "learning_rate": 2.8110000000000004e-05, + "loss": 0.0087, + "step": 15676 + }, + { + "epoch": 12.344230011815675, + "grad_norm": 0.2094029188156128, + "learning_rate": 2.8109666666666666e-05, + "loss": 0.0045, + "step": 15677 + }, + { + "epoch": 12.345017723513195, + "grad_norm": 0.19161272048950195, + "learning_rate": 2.8109333333333335e-05, + "loss": 0.0078, + "step": 15678 + }, + { + "epoch": 12.345805435210712, + "grad_norm": 0.39887091517448425, + "learning_rate": 2.8108999999999998e-05, + "loss": 0.009, + "step": 15679 + }, + { + "epoch": 12.346593146908232, + "grad_norm": 0.16888557374477386, + "learning_rate": 2.8108666666666667e-05, + "loss": 0.0102, + "step": 15680 + }, + { + "epoch": 12.34738085860575, + "grad_norm": 0.22826138138771057, + "learning_rate": 2.8108333333333333e-05, + "loss": 0.0089, + "step": 15681 + }, + { + "epoch": 12.348168570303269, + "grad_norm": 0.19630447030067444, + "learning_rate": 2.8108e-05, + "loss": 0.0054, + "step": 15682 + }, + { + "epoch": 12.348956282000788, + "grad_norm": 0.6121242642402649, + "learning_rate": 2.810766666666667e-05, + "loss": 0.0238, + "step": 15683 + }, + { + "epoch": 12.349743993698306, + "grad_norm": 0.24589411914348602, + "learning_rate": 2.8107333333333334e-05, + "loss": 0.0117, + "step": 15684 + }, + { + "epoch": 12.350531705395825, + "grad_norm": 0.22013674676418304, + "learning_rate": 2.8107e-05, + "loss": 0.0097, + "step": 15685 + }, + { + "epoch": 12.351319417093343, + "grad_norm": 0.2507689595222473, + "learning_rate": 2.8106666666666666e-05, + "loss": 0.011, + "step": 15686 + }, + { + "epoch": 12.352107128790863, + "grad_norm": 0.0841793641448021, + "learning_rate": 2.8106333333333336e-05, + "loss": 0.003, + "step": 15687 + }, + { + "epoch": 12.352894840488382, + "grad_norm": 0.2499595582485199, + "learning_rate": 2.8105999999999998e-05, + "loss": 0.0101, + "step": 15688 + }, + { + "epoch": 12.3536825521859, + "grad_norm": 0.307180255651474, + "learning_rate": 2.8105666666666668e-05, + "loss": 0.0126, + "step": 15689 + }, + { + "epoch": 12.35447026388342, + "grad_norm": 2.012462854385376, + "learning_rate": 2.8105333333333333e-05, + "loss": 0.0274, + "step": 15690 + }, + { + "epoch": 12.355257975580937, + "grad_norm": 0.5091241598129272, + "learning_rate": 2.8105e-05, + "loss": 0.1957, + "step": 15691 + }, + { + "epoch": 12.356045687278456, + "grad_norm": 0.5409130454063416, + "learning_rate": 2.810466666666667e-05, + "loss": 0.111, + "step": 15692 + }, + { + "epoch": 12.356833398975974, + "grad_norm": 0.6118851900100708, + "learning_rate": 2.8104333333333335e-05, + "loss": 0.1017, + "step": 15693 + }, + { + 
"epoch": 12.357621110673493, + "grad_norm": 0.5765065550804138, + "learning_rate": 2.8104e-05, + "loss": 0.0801, + "step": 15694 + }, + { + "epoch": 12.358408822371013, + "grad_norm": 0.3135763108730316, + "learning_rate": 2.8103666666666667e-05, + "loss": 0.047, + "step": 15695 + }, + { + "epoch": 12.35919653406853, + "grad_norm": 0.2015930563211441, + "learning_rate": 2.8103333333333336e-05, + "loss": 0.021, + "step": 15696 + }, + { + "epoch": 12.35998424576605, + "grad_norm": 0.25659245252609253, + "learning_rate": 2.8103e-05, + "loss": 0.0226, + "step": 15697 + }, + { + "epoch": 12.360771957463568, + "grad_norm": 0.43256905674934387, + "learning_rate": 2.8102666666666668e-05, + "loss": 0.0252, + "step": 15698 + }, + { + "epoch": 12.361559669161087, + "grad_norm": 0.23712852597236633, + "learning_rate": 2.8102333333333337e-05, + "loss": 0.0175, + "step": 15699 + }, + { + "epoch": 12.362347380858605, + "grad_norm": 0.26089906692504883, + "learning_rate": 2.8102e-05, + "loss": 0.0157, + "step": 15700 + }, + { + "epoch": 12.363135092556124, + "grad_norm": 0.27705755829811096, + "learning_rate": 2.810166666666667e-05, + "loss": 0.0092, + "step": 15701 + }, + { + "epoch": 12.363922804253644, + "grad_norm": 0.2544595003128052, + "learning_rate": 2.8101333333333335e-05, + "loss": 0.0133, + "step": 15702 + }, + { + "epoch": 12.364710515951161, + "grad_norm": 0.09117716550827026, + "learning_rate": 2.8101e-05, + "loss": 0.0041, + "step": 15703 + }, + { + "epoch": 12.365498227648681, + "grad_norm": 0.12409152090549469, + "learning_rate": 2.8100666666666667e-05, + "loss": 0.0062, + "step": 15704 + }, + { + "epoch": 12.366285939346199, + "grad_norm": 0.0859769806265831, + "learning_rate": 2.8100333333333336e-05, + "loss": 0.0054, + "step": 15705 + }, + { + "epoch": 12.367073651043718, + "grad_norm": 0.33101966977119446, + "learning_rate": 2.81e-05, + "loss": 0.0107, + "step": 15706 + }, + { + "epoch": 12.367861362741237, + "grad_norm": 0.17214824259281158, + "learning_rate": 2.8099666666666668e-05, + "loss": 0.0117, + "step": 15707 + }, + { + "epoch": 12.368649074438755, + "grad_norm": 0.13771256804466248, + "learning_rate": 2.8099333333333334e-05, + "loss": 0.0068, + "step": 15708 + }, + { + "epoch": 12.369436786136275, + "grad_norm": 0.1556989997625351, + "learning_rate": 2.8099e-05, + "loss": 0.0077, + "step": 15709 + }, + { + "epoch": 12.370224497833792, + "grad_norm": 0.24938087165355682, + "learning_rate": 2.809866666666667e-05, + "loss": 0.0094, + "step": 15710 + }, + { + "epoch": 12.371012209531312, + "grad_norm": 0.3051822781562805, + "learning_rate": 2.809833333333333e-05, + "loss": 0.0108, + "step": 15711 + }, + { + "epoch": 12.37179992122883, + "grad_norm": 0.19646978378295898, + "learning_rate": 2.8098e-05, + "loss": 0.0438, + "step": 15712 + }, + { + "epoch": 12.372587632926349, + "grad_norm": 0.31694912910461426, + "learning_rate": 2.8097666666666667e-05, + "loss": 0.0105, + "step": 15713 + }, + { + "epoch": 12.373375344623868, + "grad_norm": 0.29168829321861267, + "learning_rate": 2.8097333333333333e-05, + "loss": 0.0092, + "step": 15714 + }, + { + "epoch": 12.374163056321386, + "grad_norm": 0.35397082567214966, + "learning_rate": 2.8097e-05, + "loss": 0.0118, + "step": 15715 + }, + { + "epoch": 12.374950768018905, + "grad_norm": 0.1482931673526764, + "learning_rate": 2.8096666666666668e-05, + "loss": 0.0047, + "step": 15716 + }, + { + "epoch": 12.375738479716423, + "grad_norm": 0.3624628782272339, + "learning_rate": 2.8096333333333334e-05, + "loss": 0.0163, + "step": 15717 + }, + { 
+ "epoch": 12.376526191413943, + "grad_norm": 0.20224089920520782, + "learning_rate": 2.8096e-05, + "loss": 0.014, + "step": 15718 + }, + { + "epoch": 12.37731390311146, + "grad_norm": 0.2230791598558426, + "learning_rate": 2.809566666666667e-05, + "loss": 0.0107, + "step": 15719 + }, + { + "epoch": 12.37810161480898, + "grad_norm": 0.2322644293308258, + "learning_rate": 2.8095333333333332e-05, + "loss": 0.0122, + "step": 15720 + }, + { + "epoch": 12.3788893265065, + "grad_norm": 0.7919154167175293, + "learning_rate": 2.8095e-05, + "loss": 0.0168, + "step": 15721 + }, + { + "epoch": 12.379677038204017, + "grad_norm": 0.1255168318748474, + "learning_rate": 2.8094666666666667e-05, + "loss": 0.0069, + "step": 15722 + }, + { + "epoch": 12.380464749901536, + "grad_norm": 0.11279736459255219, + "learning_rate": 2.8094333333333333e-05, + "loss": 0.0053, + "step": 15723 + }, + { + "epoch": 12.381252461599054, + "grad_norm": 0.20724770426750183, + "learning_rate": 2.8094000000000002e-05, + "loss": 0.0134, + "step": 15724 + }, + { + "epoch": 12.382040173296573, + "grad_norm": 0.08451779186725616, + "learning_rate": 2.809366666666667e-05, + "loss": 0.0058, + "step": 15725 + }, + { + "epoch": 12.382827884994093, + "grad_norm": 0.14885148406028748, + "learning_rate": 2.8093333333333334e-05, + "loss": 0.0078, + "step": 15726 + }, + { + "epoch": 12.38361559669161, + "grad_norm": 0.2281603366136551, + "learning_rate": 2.8093e-05, + "loss": 0.0172, + "step": 15727 + }, + { + "epoch": 12.38440330838913, + "grad_norm": 0.2813457250595093, + "learning_rate": 2.809266666666667e-05, + "loss": 0.0092, + "step": 15728 + }, + { + "epoch": 12.385191020086648, + "grad_norm": 0.294548898935318, + "learning_rate": 2.8092333333333332e-05, + "loss": 0.0165, + "step": 15729 + }, + { + "epoch": 12.385978731784167, + "grad_norm": 0.2183280736207962, + "learning_rate": 2.8092e-05, + "loss": 0.0106, + "step": 15730 + }, + { + "epoch": 12.386766443481685, + "grad_norm": 0.10595737397670746, + "learning_rate": 2.8091666666666667e-05, + "loss": 0.0062, + "step": 15731 + }, + { + "epoch": 12.387554155179204, + "grad_norm": 0.1383112072944641, + "learning_rate": 2.8091333333333333e-05, + "loss": 0.0049, + "step": 15732 + }, + { + "epoch": 12.388341866876724, + "grad_norm": 0.4037777781486511, + "learning_rate": 2.8091000000000003e-05, + "loss": 0.011, + "step": 15733 + }, + { + "epoch": 12.389129578574241, + "grad_norm": 0.21017690002918243, + "learning_rate": 2.809066666666667e-05, + "loss": 0.0124, + "step": 15734 + }, + { + "epoch": 12.389917290271761, + "grad_norm": 0.1744094043970108, + "learning_rate": 2.8090333333333334e-05, + "loss": 0.0084, + "step": 15735 + }, + { + "epoch": 12.390705001969279, + "grad_norm": 0.1822592169046402, + "learning_rate": 2.809e-05, + "loss": 0.0062, + "step": 15736 + }, + { + "epoch": 12.391492713666798, + "grad_norm": 0.3126107156276703, + "learning_rate": 2.8089666666666666e-05, + "loss": 0.0094, + "step": 15737 + }, + { + "epoch": 12.392280425364318, + "grad_norm": 0.21026787161827087, + "learning_rate": 2.8089333333333332e-05, + "loss": 0.0077, + "step": 15738 + }, + { + "epoch": 12.393068137061835, + "grad_norm": 0.1841021180152893, + "learning_rate": 2.8089e-05, + "loss": 0.0064, + "step": 15739 + }, + { + "epoch": 12.393855848759355, + "grad_norm": 0.22054937481880188, + "learning_rate": 2.8088666666666664e-05, + "loss": 0.0076, + "step": 15740 + }, + { + "epoch": 12.394643560456872, + "grad_norm": 0.541093647480011, + "learning_rate": 2.8088333333333333e-05, + "loss": 0.1833, + "step": 
15741 + }, + { + "epoch": 12.395431272154392, + "grad_norm": 1.5155282020568848, + "learning_rate": 2.8088000000000003e-05, + "loss": 0.0912, + "step": 15742 + }, + { + "epoch": 12.39621898385191, + "grad_norm": 0.4663562774658203, + "learning_rate": 2.8087666666666665e-05, + "loss": 0.0671, + "step": 15743 + }, + { + "epoch": 12.397006695549429, + "grad_norm": 0.6929229497909546, + "learning_rate": 2.8087333333333335e-05, + "loss": 0.0655, + "step": 15744 + }, + { + "epoch": 12.397794407246948, + "grad_norm": 0.37060481309890747, + "learning_rate": 2.8087e-05, + "loss": 0.0485, + "step": 15745 + }, + { + "epoch": 12.398582118944466, + "grad_norm": 0.3341464102268219, + "learning_rate": 2.8086666666666667e-05, + "loss": 0.0281, + "step": 15746 + }, + { + "epoch": 12.399369830641986, + "grad_norm": 0.3326646387577057, + "learning_rate": 2.8086333333333332e-05, + "loss": 0.0167, + "step": 15747 + }, + { + "epoch": 12.400157542339503, + "grad_norm": 0.13820302486419678, + "learning_rate": 2.8086000000000002e-05, + "loss": 0.0148, + "step": 15748 + }, + { + "epoch": 12.400945254037023, + "grad_norm": 0.15888327360153198, + "learning_rate": 2.8085666666666668e-05, + "loss": 0.0131, + "step": 15749 + }, + { + "epoch": 12.40173296573454, + "grad_norm": 0.1335151195526123, + "learning_rate": 2.8085333333333334e-05, + "loss": 0.0015, + "step": 15750 + }, + { + "epoch": 12.40252067743206, + "grad_norm": 0.18438251316547394, + "learning_rate": 2.8085000000000003e-05, + "loss": 0.0281, + "step": 15751 + }, + { + "epoch": 12.40330838912958, + "grad_norm": 0.13389497995376587, + "learning_rate": 2.8084666666666666e-05, + "loss": 0.0055, + "step": 15752 + }, + { + "epoch": 12.404096100827097, + "grad_norm": 0.12311471253633499, + "learning_rate": 2.8084333333333335e-05, + "loss": 0.0099, + "step": 15753 + }, + { + "epoch": 12.404883812524616, + "grad_norm": 0.9706339240074158, + "learning_rate": 2.8084e-05, + "loss": 0.0223, + "step": 15754 + }, + { + "epoch": 12.405671524222134, + "grad_norm": 0.1784125566482544, + "learning_rate": 2.8083666666666667e-05, + "loss": 0.0117, + "step": 15755 + }, + { + "epoch": 12.406459235919653, + "grad_norm": 0.23472733795642853, + "learning_rate": 2.8083333333333333e-05, + "loss": 0.0092, + "step": 15756 + }, + { + "epoch": 12.407246947617173, + "grad_norm": 0.2145538479089737, + "learning_rate": 2.8083000000000002e-05, + "loss": 0.0144, + "step": 15757 + }, + { + "epoch": 12.40803465931469, + "grad_norm": 0.1490279883146286, + "learning_rate": 2.8082666666666668e-05, + "loss": 0.0081, + "step": 15758 + }, + { + "epoch": 12.40882237101221, + "grad_norm": 1.8260372877120972, + "learning_rate": 2.8082333333333334e-05, + "loss": 0.0221, + "step": 15759 + }, + { + "epoch": 12.409610082709728, + "grad_norm": 0.15690767765045166, + "learning_rate": 2.8082000000000003e-05, + "loss": 0.0123, + "step": 15760 + }, + { + "epoch": 12.410397794407247, + "grad_norm": 0.36835116147994995, + "learning_rate": 2.8081666666666666e-05, + "loss": 0.0087, + "step": 15761 + }, + { + "epoch": 12.411185506104765, + "grad_norm": 0.6664104461669922, + "learning_rate": 2.8081333333333335e-05, + "loss": 0.0114, + "step": 15762 + }, + { + "epoch": 12.411973217802284, + "grad_norm": 0.1667444258928299, + "learning_rate": 2.8081e-05, + "loss": 0.0054, + "step": 15763 + }, + { + "epoch": 12.412760929499804, + "grad_norm": 0.5146225094795227, + "learning_rate": 2.8080666666666667e-05, + "loss": 0.0136, + "step": 15764 + }, + { + "epoch": 12.413548641197321, + "grad_norm": 0.21495738625526428, + 
"learning_rate": 2.8080333333333333e-05, + "loss": 0.0058, + "step": 15765 + }, + { + "epoch": 12.414336352894841, + "grad_norm": 0.621526837348938, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.0092, + "step": 15766 + }, + { + "epoch": 12.415124064592359, + "grad_norm": 0.2651360034942627, + "learning_rate": 2.8079666666666668e-05, + "loss": 0.0135, + "step": 15767 + }, + { + "epoch": 12.415911776289878, + "grad_norm": 0.1258573979139328, + "learning_rate": 2.8079333333333334e-05, + "loss": 0.0064, + "step": 15768 + }, + { + "epoch": 12.416699487987396, + "grad_norm": 0.18546763062477112, + "learning_rate": 2.8079e-05, + "loss": 0.0088, + "step": 15769 + }, + { + "epoch": 12.417487199684915, + "grad_norm": 0.38231393694877625, + "learning_rate": 2.8078666666666666e-05, + "loss": 0.0136, + "step": 15770 + }, + { + "epoch": 12.418274911382435, + "grad_norm": 0.20978733897209167, + "learning_rate": 2.8078333333333335e-05, + "loss": 0.0103, + "step": 15771 + }, + { + "epoch": 12.419062623079952, + "grad_norm": 0.21728992462158203, + "learning_rate": 2.8077999999999998e-05, + "loss": 0.0083, + "step": 15772 + }, + { + "epoch": 12.419850334777472, + "grad_norm": 0.3003341257572174, + "learning_rate": 2.8077666666666667e-05, + "loss": 0.0075, + "step": 15773 + }, + { + "epoch": 12.42063804647499, + "grad_norm": 0.1492672562599182, + "learning_rate": 2.8077333333333336e-05, + "loss": 0.0071, + "step": 15774 + }, + { + "epoch": 12.421425758172509, + "grad_norm": 0.8008854985237122, + "learning_rate": 2.8077e-05, + "loss": 0.019, + "step": 15775 + }, + { + "epoch": 12.422213469870028, + "grad_norm": 0.22522398829460144, + "learning_rate": 2.807666666666667e-05, + "loss": 0.0105, + "step": 15776 + }, + { + "epoch": 12.423001181567546, + "grad_norm": 0.2025655061006546, + "learning_rate": 2.8076333333333334e-05, + "loss": 0.0087, + "step": 15777 + }, + { + "epoch": 12.423788893265066, + "grad_norm": 0.22749708592891693, + "learning_rate": 2.8076e-05, + "loss": 0.0098, + "step": 15778 + }, + { + "epoch": 12.424576604962583, + "grad_norm": 0.1725713163614273, + "learning_rate": 2.8075666666666666e-05, + "loss": 0.008, + "step": 15779 + }, + { + "epoch": 12.425364316660103, + "grad_norm": 0.22272709012031555, + "learning_rate": 2.8075333333333335e-05, + "loss": 0.0076, + "step": 15780 + }, + { + "epoch": 12.42615202835762, + "grad_norm": 0.5035314559936523, + "learning_rate": 2.8074999999999998e-05, + "loss": 0.0138, + "step": 15781 + }, + { + "epoch": 12.42693974005514, + "grad_norm": 0.24022118747234344, + "learning_rate": 2.8074666666666667e-05, + "loss": 0.0077, + "step": 15782 + }, + { + "epoch": 12.42772745175266, + "grad_norm": 0.23848868906497955, + "learning_rate": 2.8074333333333337e-05, + "loss": 0.0114, + "step": 15783 + }, + { + "epoch": 12.428515163450177, + "grad_norm": 0.2117944210767746, + "learning_rate": 2.8074e-05, + "loss": 0.0128, + "step": 15784 + }, + { + "epoch": 12.429302875147696, + "grad_norm": 0.5574262142181396, + "learning_rate": 2.807366666666667e-05, + "loss": 0.0195, + "step": 15785 + }, + { + "epoch": 12.430090586845214, + "grad_norm": 0.4086807370185852, + "learning_rate": 2.8073333333333334e-05, + "loss": 0.0284, + "step": 15786 + }, + { + "epoch": 12.430878298542734, + "grad_norm": 0.36621391773223877, + "learning_rate": 2.8073e-05, + "loss": 0.0078, + "step": 15787 + }, + { + "epoch": 12.431666010240253, + "grad_norm": 1.0321815013885498, + "learning_rate": 2.8072666666666666e-05, + "loss": 0.0202, + "step": 15788 + }, + { + "epoch": 12.43245372193777, + 
"grad_norm": 0.39918088912963867, + "learning_rate": 2.8072333333333336e-05, + "loss": 0.0147, + "step": 15789 + }, + { + "epoch": 12.43324143363529, + "grad_norm": 0.2532637119293213, + "learning_rate": 2.8071999999999998e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 12.434029145332808, + "grad_norm": 0.9143795967102051, + "learning_rate": 2.8071666666666668e-05, + "loss": 0.2175, + "step": 15791 + }, + { + "epoch": 12.434816857030327, + "grad_norm": 0.508628785610199, + "learning_rate": 2.8071333333333337e-05, + "loss": 0.0931, + "step": 15792 + }, + { + "epoch": 12.435604568727845, + "grad_norm": 0.3787236213684082, + "learning_rate": 2.8071e-05, + "loss": 0.1349, + "step": 15793 + }, + { + "epoch": 12.436392280425364, + "grad_norm": 0.439097136259079, + "learning_rate": 2.807066666666667e-05, + "loss": 0.0681, + "step": 15794 + }, + { + "epoch": 12.437179992122884, + "grad_norm": 0.28233447670936584, + "learning_rate": 2.8070333333333335e-05, + "loss": 0.065, + "step": 15795 + }, + { + "epoch": 12.437967703820402, + "grad_norm": 0.2787061333656311, + "learning_rate": 2.807e-05, + "loss": 0.0304, + "step": 15796 + }, + { + "epoch": 12.438755415517921, + "grad_norm": 0.2488386332988739, + "learning_rate": 2.8069666666666667e-05, + "loss": 0.0317, + "step": 15797 + }, + { + "epoch": 12.439543127215439, + "grad_norm": 0.24127043783664703, + "learning_rate": 2.8069333333333332e-05, + "loss": 0.0098, + "step": 15798 + }, + { + "epoch": 12.440330838912958, + "grad_norm": 0.24996107816696167, + "learning_rate": 2.8069000000000002e-05, + "loss": 0.0121, + "step": 15799 + }, + { + "epoch": 12.441118550610476, + "grad_norm": 0.11157383024692535, + "learning_rate": 2.8068666666666668e-05, + "loss": 0.0091, + "step": 15800 + }, + { + "epoch": 12.441906262307995, + "grad_norm": 0.0898984745144844, + "learning_rate": 2.8068333333333334e-05, + "loss": 0.006, + "step": 15801 + }, + { + "epoch": 12.442693974005515, + "grad_norm": 0.09322338551282883, + "learning_rate": 2.8068e-05, + "loss": 0.0059, + "step": 15802 + }, + { + "epoch": 12.443481685703032, + "grad_norm": 0.2407228797674179, + "learning_rate": 2.806766666666667e-05, + "loss": 0.0111, + "step": 15803 + }, + { + "epoch": 12.444269397400552, + "grad_norm": 0.23399698734283447, + "learning_rate": 2.806733333333333e-05, + "loss": 0.0095, + "step": 15804 + }, + { + "epoch": 12.44505710909807, + "grad_norm": 0.19812420010566711, + "learning_rate": 2.8067e-05, + "loss": 0.0144, + "step": 15805 + }, + { + "epoch": 12.445844820795589, + "grad_norm": 0.11242125183343887, + "learning_rate": 2.8066666666666667e-05, + "loss": 0.0034, + "step": 15806 + }, + { + "epoch": 12.446632532493108, + "grad_norm": 0.305007666349411, + "learning_rate": 2.8066333333333333e-05, + "loss": 0.0172, + "step": 15807 + }, + { + "epoch": 12.447420244190626, + "grad_norm": 0.5746996402740479, + "learning_rate": 2.8066000000000002e-05, + "loss": 0.0111, + "step": 15808 + }, + { + "epoch": 12.448207955888146, + "grad_norm": 0.1381150782108307, + "learning_rate": 2.8065666666666668e-05, + "loss": 0.0099, + "step": 15809 + }, + { + "epoch": 12.448995667585663, + "grad_norm": 0.09728100150823593, + "learning_rate": 2.8065333333333334e-05, + "loss": 0.0052, + "step": 15810 + }, + { + "epoch": 12.449783379283183, + "grad_norm": 0.3684859573841095, + "learning_rate": 2.8065e-05, + "loss": 0.0221, + "step": 15811 + }, + { + "epoch": 12.4505710909807, + "grad_norm": 0.20528976619243622, + "learning_rate": 2.806466666666667e-05, + "loss": 0.0128, + "step": 15812 + }, + { + 
"epoch": 12.45135880267822, + "grad_norm": 0.27941352128982544, + "learning_rate": 2.806433333333333e-05, + "loss": 0.0073, + "step": 15813 + }, + { + "epoch": 12.45214651437574, + "grad_norm": 0.12916408479213715, + "learning_rate": 2.8064e-05, + "loss": 0.0096, + "step": 15814 + }, + { + "epoch": 12.452934226073257, + "grad_norm": 0.39114910364151, + "learning_rate": 2.8063666666666667e-05, + "loss": 0.0539, + "step": 15815 + }, + { + "epoch": 12.453721937770776, + "grad_norm": 0.46694818139076233, + "learning_rate": 2.8063333333333333e-05, + "loss": 0.0091, + "step": 15816 + }, + { + "epoch": 12.454509649468294, + "grad_norm": 0.2651364803314209, + "learning_rate": 2.8063000000000002e-05, + "loss": 0.0083, + "step": 15817 + }, + { + "epoch": 12.455297361165814, + "grad_norm": 0.2418326437473297, + "learning_rate": 2.8062666666666668e-05, + "loss": 0.0124, + "step": 15818 + }, + { + "epoch": 12.456085072863331, + "grad_norm": 0.1192190870642662, + "learning_rate": 2.8062333333333334e-05, + "loss": 0.0089, + "step": 15819 + }, + { + "epoch": 12.45687278456085, + "grad_norm": 0.39346322417259216, + "learning_rate": 2.8062e-05, + "loss": 0.0123, + "step": 15820 + }, + { + "epoch": 12.45766049625837, + "grad_norm": 0.30561959743499756, + "learning_rate": 2.806166666666667e-05, + "loss": 0.0109, + "step": 15821 + }, + { + "epoch": 12.458448207955888, + "grad_norm": 0.1591537892818451, + "learning_rate": 2.8061333333333332e-05, + "loss": 0.0086, + "step": 15822 + }, + { + "epoch": 12.459235919653407, + "grad_norm": 0.1476021260023117, + "learning_rate": 2.8061e-05, + "loss": 0.0087, + "step": 15823 + }, + { + "epoch": 12.460023631350925, + "grad_norm": 0.16840019822120667, + "learning_rate": 2.806066666666667e-05, + "loss": 0.0125, + "step": 15824 + }, + { + "epoch": 12.460811343048444, + "grad_norm": 0.25780487060546875, + "learning_rate": 2.8060333333333333e-05, + "loss": 0.0107, + "step": 15825 + }, + { + "epoch": 12.461599054745964, + "grad_norm": 0.24904794991016388, + "learning_rate": 2.8060000000000002e-05, + "loss": 0.0068, + "step": 15826 + }, + { + "epoch": 12.462386766443482, + "grad_norm": 0.517077624797821, + "learning_rate": 2.805966666666667e-05, + "loss": 0.0112, + "step": 15827 + }, + { + "epoch": 12.463174478141001, + "grad_norm": 0.23498620092868805, + "learning_rate": 2.8059333333333334e-05, + "loss": 0.0095, + "step": 15828 + }, + { + "epoch": 12.463962189838519, + "grad_norm": 0.22308731079101562, + "learning_rate": 2.8059e-05, + "loss": 0.0104, + "step": 15829 + }, + { + "epoch": 12.464749901536038, + "grad_norm": 0.2520102858543396, + "learning_rate": 2.8058666666666666e-05, + "loss": 0.008, + "step": 15830 + }, + { + "epoch": 12.465537613233556, + "grad_norm": 0.16490067541599274, + "learning_rate": 2.8058333333333332e-05, + "loss": 0.0064, + "step": 15831 + }, + { + "epoch": 12.466325324931075, + "grad_norm": 0.16341017186641693, + "learning_rate": 2.8058e-05, + "loss": 0.0109, + "step": 15832 + }, + { + "epoch": 12.467113036628595, + "grad_norm": 0.529669463634491, + "learning_rate": 2.8057666666666667e-05, + "loss": 0.0119, + "step": 15833 + }, + { + "epoch": 12.467900748326112, + "grad_norm": 0.4876493513584137, + "learning_rate": 2.8057333333333333e-05, + "loss": 0.012, + "step": 15834 + }, + { + "epoch": 12.468688460023632, + "grad_norm": 0.2307247370481491, + "learning_rate": 2.8057000000000003e-05, + "loss": 0.0102, + "step": 15835 + }, + { + "epoch": 12.46947617172115, + "grad_norm": 1.1196001768112183, + "learning_rate": 2.8056666666666665e-05, + "loss": 
0.0169, + "step": 15836 + }, + { + "epoch": 12.470263883418669, + "grad_norm": 0.6745665073394775, + "learning_rate": 2.8056333333333334e-05, + "loss": 0.0141, + "step": 15837 + }, + { + "epoch": 12.471051595116187, + "grad_norm": 0.27449724078178406, + "learning_rate": 2.8056e-05, + "loss": 0.0162, + "step": 15838 + }, + { + "epoch": 12.471839306813706, + "grad_norm": 0.5522574782371521, + "learning_rate": 2.8055666666666666e-05, + "loss": 0.0121, + "step": 15839 + }, + { + "epoch": 12.472627018511226, + "grad_norm": 0.11800843477249146, + "learning_rate": 2.8055333333333332e-05, + "loss": 0.0039, + "step": 15840 + }, + { + "epoch": 12.473414730208743, + "grad_norm": 0.7705029249191284, + "learning_rate": 2.8055e-05, + "loss": 0.1577, + "step": 15841 + }, + { + "epoch": 12.474202441906263, + "grad_norm": 0.6831575632095337, + "learning_rate": 2.8054666666666668e-05, + "loss": 0.1497, + "step": 15842 + }, + { + "epoch": 12.47499015360378, + "grad_norm": 0.38240906596183777, + "learning_rate": 2.8054333333333333e-05, + "loss": 0.0742, + "step": 15843 + }, + { + "epoch": 12.4757778653013, + "grad_norm": 0.35243040323257446, + "learning_rate": 2.8054000000000003e-05, + "loss": 0.0724, + "step": 15844 + }, + { + "epoch": 12.47656557699882, + "grad_norm": 0.2186931073665619, + "learning_rate": 2.8053666666666665e-05, + "loss": 0.0435, + "step": 15845 + }, + { + "epoch": 12.477353288696337, + "grad_norm": 0.423915296792984, + "learning_rate": 2.8053333333333335e-05, + "loss": 0.0564, + "step": 15846 + }, + { + "epoch": 12.478141000393856, + "grad_norm": 0.3589932322502136, + "learning_rate": 2.8053e-05, + "loss": 0.0297, + "step": 15847 + }, + { + "epoch": 12.478928712091374, + "grad_norm": 0.21999283134937286, + "learning_rate": 2.8052666666666667e-05, + "loss": 0.0436, + "step": 15848 + }, + { + "epoch": 12.479716423788894, + "grad_norm": 0.1483861654996872, + "learning_rate": 2.8052333333333336e-05, + "loss": 0.0118, + "step": 15849 + }, + { + "epoch": 12.480504135486411, + "grad_norm": 0.14085669815540314, + "learning_rate": 2.8052000000000002e-05, + "loss": 0.0132, + "step": 15850 + }, + { + "epoch": 12.48129184718393, + "grad_norm": 0.3245398998260498, + "learning_rate": 2.8051666666666668e-05, + "loss": 0.0149, + "step": 15851 + }, + { + "epoch": 12.48207955888145, + "grad_norm": 0.1132202297449112, + "learning_rate": 2.8051333333333334e-05, + "loss": 0.0071, + "step": 15852 + }, + { + "epoch": 12.482867270578968, + "grad_norm": 0.1661561131477356, + "learning_rate": 2.8051000000000003e-05, + "loss": 0.0101, + "step": 15853 + }, + { + "epoch": 12.483654982276487, + "grad_norm": 0.24611248075962067, + "learning_rate": 2.8050666666666666e-05, + "loss": 0.0109, + "step": 15854 + }, + { + "epoch": 12.484442693974005, + "grad_norm": 0.15944436192512512, + "learning_rate": 2.8050333333333335e-05, + "loss": 0.0106, + "step": 15855 + }, + { + "epoch": 12.485230405671524, + "grad_norm": 0.16165058314800262, + "learning_rate": 2.805e-05, + "loss": 0.0098, + "step": 15856 + }, + { + "epoch": 12.486018117369042, + "grad_norm": 0.540827751159668, + "learning_rate": 2.8049666666666667e-05, + "loss": 0.0117, + "step": 15857 + }, + { + "epoch": 12.486805829066562, + "grad_norm": 0.8313676118850708, + "learning_rate": 2.8049333333333336e-05, + "loss": 0.0087, + "step": 15858 + }, + { + "epoch": 12.487593540764081, + "grad_norm": 0.27530574798583984, + "learning_rate": 2.8049e-05, + "loss": 0.0127, + "step": 15859 + }, + { + "epoch": 12.488381252461599, + "grad_norm": 0.16431984305381775, + 
"learning_rate": 2.8048666666666668e-05, + "loss": 0.0152, + "step": 15860 + }, + { + "epoch": 12.489168964159118, + "grad_norm": 0.23146909475326538, + "learning_rate": 2.8048333333333334e-05, + "loss": 0.0104, + "step": 15861 + }, + { + "epoch": 12.489956675856636, + "grad_norm": 0.11220633238554001, + "learning_rate": 2.8048e-05, + "loss": 0.0069, + "step": 15862 + }, + { + "epoch": 12.490744387554155, + "grad_norm": 0.12827512621879578, + "learning_rate": 2.8047666666666666e-05, + "loss": 0.0086, + "step": 15863 + }, + { + "epoch": 12.491532099251675, + "grad_norm": 0.13889430463314056, + "learning_rate": 2.8047333333333335e-05, + "loss": 0.0046, + "step": 15864 + }, + { + "epoch": 12.492319810949192, + "grad_norm": 0.15425337851047516, + "learning_rate": 2.8047e-05, + "loss": 0.0104, + "step": 15865 + }, + { + "epoch": 12.493107522646712, + "grad_norm": 0.18059372901916504, + "learning_rate": 2.8046666666666667e-05, + "loss": 0.012, + "step": 15866 + }, + { + "epoch": 12.49389523434423, + "grad_norm": 0.6903148293495178, + "learning_rate": 2.8046333333333336e-05, + "loss": 0.0125, + "step": 15867 + }, + { + "epoch": 12.494682946041749, + "grad_norm": 0.18473440408706665, + "learning_rate": 2.8046e-05, + "loss": 0.0074, + "step": 15868 + }, + { + "epoch": 12.495470657739267, + "grad_norm": 0.1290004998445511, + "learning_rate": 2.8045666666666668e-05, + "loss": 0.0077, + "step": 15869 + }, + { + "epoch": 12.496258369436786, + "grad_norm": 0.4639712870121002, + "learning_rate": 2.8045333333333334e-05, + "loss": 0.0133, + "step": 15870 + }, + { + "epoch": 12.497046081134306, + "grad_norm": 0.4819900691509247, + "learning_rate": 2.8045e-05, + "loss": 0.0174, + "step": 15871 + }, + { + "epoch": 12.497833792831823, + "grad_norm": 0.2745397686958313, + "learning_rate": 2.8044666666666666e-05, + "loss": 0.0073, + "step": 15872 + }, + { + "epoch": 12.498621504529343, + "grad_norm": 0.09868840128183365, + "learning_rate": 2.8044333333333335e-05, + "loss": 0.0051, + "step": 15873 + }, + { + "epoch": 12.49940921622686, + "grad_norm": 0.3232852816581726, + "learning_rate": 2.8044e-05, + "loss": 0.009, + "step": 15874 + }, + { + "epoch": 12.50019692792438, + "grad_norm": 0.18958202004432678, + "learning_rate": 2.8043666666666667e-05, + "loss": 0.0077, + "step": 15875 + }, + { + "epoch": 12.500984639621898, + "grad_norm": 0.14392288029193878, + "learning_rate": 2.8043333333333336e-05, + "loss": 0.0081, + "step": 15876 + }, + { + "epoch": 12.501772351319417, + "grad_norm": 0.19633162021636963, + "learning_rate": 2.8043e-05, + "loss": 0.0062, + "step": 15877 + }, + { + "epoch": 12.502560063016936, + "grad_norm": 0.25441139936447144, + "learning_rate": 2.804266666666667e-05, + "loss": 0.0153, + "step": 15878 + }, + { + "epoch": 12.503347774714454, + "grad_norm": 0.5923601984977722, + "learning_rate": 2.8042333333333334e-05, + "loss": 0.0223, + "step": 15879 + }, + { + "epoch": 12.504135486411974, + "grad_norm": 0.3246802091598511, + "learning_rate": 2.8042e-05, + "loss": 0.0116, + "step": 15880 + }, + { + "epoch": 12.504923198109491, + "grad_norm": 0.3144563138484955, + "learning_rate": 2.8041666666666666e-05, + "loss": 0.0112, + "step": 15881 + }, + { + "epoch": 12.50571090980701, + "grad_norm": 0.3502810001373291, + "learning_rate": 2.8041333333333335e-05, + "loss": 0.0165, + "step": 15882 + }, + { + "epoch": 12.50649862150453, + "grad_norm": 0.24537765979766846, + "learning_rate": 2.8041e-05, + "loss": 0.0083, + "step": 15883 + }, + { + "epoch": 12.507286333202048, + "grad_norm": 0.2362603396177292, 
+ "learning_rate": 2.8040666666666667e-05, + "loss": 0.0079, + "step": 15884 + }, + { + "epoch": 12.508074044899567, + "grad_norm": 0.27994269132614136, + "learning_rate": 2.8040333333333337e-05, + "loss": 0.0139, + "step": 15885 + }, + { + "epoch": 12.508861756597085, + "grad_norm": 0.44618475437164307, + "learning_rate": 2.804e-05, + "loss": 0.0111, + "step": 15886 + }, + { + "epoch": 12.509649468294604, + "grad_norm": 0.28496235609054565, + "learning_rate": 2.803966666666667e-05, + "loss": 0.0121, + "step": 15887 + }, + { + "epoch": 12.510437179992122, + "grad_norm": 0.15900318324565887, + "learning_rate": 2.8039333333333334e-05, + "loss": 0.0043, + "step": 15888 + }, + { + "epoch": 12.511224891689642, + "grad_norm": 0.22712378203868866, + "learning_rate": 2.8039e-05, + "loss": 0.0084, + "step": 15889 + }, + { + "epoch": 12.512012603387161, + "grad_norm": 0.29247528314590454, + "learning_rate": 2.8038666666666666e-05, + "loss": 0.0112, + "step": 15890 + }, + { + "epoch": 12.512800315084679, + "grad_norm": 0.7297523617744446, + "learning_rate": 2.8038333333333332e-05, + "loss": 0.276, + "step": 15891 + }, + { + "epoch": 12.513588026782198, + "grad_norm": 0.502360999584198, + "learning_rate": 2.8038e-05, + "loss": 0.1033, + "step": 15892 + }, + { + "epoch": 12.514375738479716, + "grad_norm": 0.33638033270835876, + "learning_rate": 2.8037666666666668e-05, + "loss": 0.0878, + "step": 15893 + }, + { + "epoch": 12.515163450177235, + "grad_norm": 0.4485207796096802, + "learning_rate": 2.8037333333333333e-05, + "loss": 0.0544, + "step": 15894 + }, + { + "epoch": 12.515951161874753, + "grad_norm": 0.38342908024787903, + "learning_rate": 2.8037e-05, + "loss": 0.052, + "step": 15895 + }, + { + "epoch": 12.516738873572272, + "grad_norm": 0.29354479908943176, + "learning_rate": 2.803666666666667e-05, + "loss": 0.0848, + "step": 15896 + }, + { + "epoch": 12.517526585269792, + "grad_norm": 0.532835841178894, + "learning_rate": 2.803633333333333e-05, + "loss": 0.059, + "step": 15897 + }, + { + "epoch": 12.51831429696731, + "grad_norm": 0.28463441133499146, + "learning_rate": 2.8036e-05, + "loss": 0.0169, + "step": 15898 + }, + { + "epoch": 12.519102008664829, + "grad_norm": 0.1399240642786026, + "learning_rate": 2.803566666666667e-05, + "loss": 0.0059, + "step": 15899 + }, + { + "epoch": 12.519889720362347, + "grad_norm": 0.25933489203453064, + "learning_rate": 2.8035333333333332e-05, + "loss": 0.0206, + "step": 15900 + }, + { + "epoch": 12.520677432059866, + "grad_norm": 0.17080342769622803, + "learning_rate": 2.8035000000000002e-05, + "loss": 0.0111, + "step": 15901 + }, + { + "epoch": 12.521465143757386, + "grad_norm": 0.21515364944934845, + "learning_rate": 2.8034666666666668e-05, + "loss": 0.0122, + "step": 15902 + }, + { + "epoch": 12.522252855454903, + "grad_norm": 0.3218812048435211, + "learning_rate": 2.8034333333333334e-05, + "loss": 0.0442, + "step": 15903 + }, + { + "epoch": 12.523040567152423, + "grad_norm": 0.32852885127067566, + "learning_rate": 2.8034e-05, + "loss": 0.009, + "step": 15904 + }, + { + "epoch": 12.52382827884994, + "grad_norm": 0.1294478178024292, + "learning_rate": 2.803366666666667e-05, + "loss": 0.0101, + "step": 15905 + }, + { + "epoch": 12.52461599054746, + "grad_norm": 0.8301399350166321, + "learning_rate": 2.803333333333333e-05, + "loss": 0.013, + "step": 15906 + }, + { + "epoch": 12.525403702244978, + "grad_norm": 0.24150852859020233, + "learning_rate": 2.8033e-05, + "loss": 0.0081, + "step": 15907 + }, + { + "epoch": 12.526191413942497, + "grad_norm": 
0.18020844459533691, + "learning_rate": 2.803266666666667e-05, + "loss": 0.0064, + "step": 15908 + }, + { + "epoch": 12.526979125640016, + "grad_norm": 0.2549254596233368, + "learning_rate": 2.8032333333333333e-05, + "loss": 0.0135, + "step": 15909 + }, + { + "epoch": 12.527766837337534, + "grad_norm": 0.2080152928829193, + "learning_rate": 2.8032000000000002e-05, + "loss": 0.0076, + "step": 15910 + }, + { + "epoch": 12.528554549035054, + "grad_norm": 0.2977714538574219, + "learning_rate": 2.8031666666666668e-05, + "loss": 0.0116, + "step": 15911 + }, + { + "epoch": 12.529342260732571, + "grad_norm": 0.31229570508003235, + "learning_rate": 2.8031333333333334e-05, + "loss": 0.0157, + "step": 15912 + }, + { + "epoch": 12.53012997243009, + "grad_norm": 0.3114875257015228, + "learning_rate": 2.8031e-05, + "loss": 0.0195, + "step": 15913 + }, + { + "epoch": 12.530917684127608, + "grad_norm": 0.2395208328962326, + "learning_rate": 2.803066666666667e-05, + "loss": 0.0114, + "step": 15914 + }, + { + "epoch": 12.531705395825128, + "grad_norm": 0.12679241597652435, + "learning_rate": 2.8030333333333335e-05, + "loss": 0.0076, + "step": 15915 + }, + { + "epoch": 12.532493107522647, + "grad_norm": 0.4438953697681427, + "learning_rate": 2.803e-05, + "loss": 0.0244, + "step": 15916 + }, + { + "epoch": 12.533280819220165, + "grad_norm": 0.3683900833129883, + "learning_rate": 2.802966666666667e-05, + "loss": 0.0138, + "step": 15917 + }, + { + "epoch": 12.534068530917684, + "grad_norm": 0.13556155562400818, + "learning_rate": 2.8029333333333333e-05, + "loss": 0.0088, + "step": 15918 + }, + { + "epoch": 12.534856242615202, + "grad_norm": 0.2101430743932724, + "learning_rate": 2.8029000000000002e-05, + "loss": 0.0078, + "step": 15919 + }, + { + "epoch": 12.535643954312722, + "grad_norm": 0.17538733780384064, + "learning_rate": 2.8028666666666665e-05, + "loss": 0.0088, + "step": 15920 + }, + { + "epoch": 12.536431666010241, + "grad_norm": 0.3308374881744385, + "learning_rate": 2.8028333333333334e-05, + "loss": 0.0134, + "step": 15921 + }, + { + "epoch": 12.537219377707759, + "grad_norm": 0.18810996413230896, + "learning_rate": 2.8028e-05, + "loss": 0.012, + "step": 15922 + }, + { + "epoch": 12.538007089405278, + "grad_norm": 0.13181564211845398, + "learning_rate": 2.8027666666666666e-05, + "loss": 0.0099, + "step": 15923 + }, + { + "epoch": 12.538794801102796, + "grad_norm": 0.1792856603860855, + "learning_rate": 2.8027333333333335e-05, + "loss": 0.0094, + "step": 15924 + }, + { + "epoch": 12.539582512800315, + "grad_norm": 0.18119238317012787, + "learning_rate": 2.8027e-05, + "loss": 0.0107, + "step": 15925 + }, + { + "epoch": 12.540370224497833, + "grad_norm": 0.1205357164144516, + "learning_rate": 2.8026666666666667e-05, + "loss": 0.0055, + "step": 15926 + }, + { + "epoch": 12.541157936195352, + "grad_norm": 0.13396593928337097, + "learning_rate": 2.8026333333333333e-05, + "loss": 0.0101, + "step": 15927 + }, + { + "epoch": 12.541945647892872, + "grad_norm": 0.1916956603527069, + "learning_rate": 2.8026000000000002e-05, + "loss": 0.0073, + "step": 15928 + }, + { + "epoch": 12.54273335959039, + "grad_norm": 0.1764678806066513, + "learning_rate": 2.8025666666666665e-05, + "loss": 0.0103, + "step": 15929 + }, + { + "epoch": 12.543521071287909, + "grad_norm": 0.09399151057004929, + "learning_rate": 2.8025333333333334e-05, + "loss": 0.0058, + "step": 15930 + }, + { + "epoch": 12.544308782985427, + "grad_norm": 0.1483575701713562, + "learning_rate": 2.8025e-05, + "loss": 0.0086, + "step": 15931 + }, + { + 
"epoch": 12.545096494682946, + "grad_norm": 0.21123269200325012, + "learning_rate": 2.8024666666666666e-05, + "loss": 0.0079, + "step": 15932 + }, + { + "epoch": 12.545884206380464, + "grad_norm": 0.17926760017871857, + "learning_rate": 2.8024333333333335e-05, + "loss": 0.0077, + "step": 15933 + }, + { + "epoch": 12.546671918077983, + "grad_norm": 0.19840820133686066, + "learning_rate": 2.8024e-05, + "loss": 0.0119, + "step": 15934 + }, + { + "epoch": 12.547459629775503, + "grad_norm": 0.30686071515083313, + "learning_rate": 2.8023666666666667e-05, + "loss": 0.0188, + "step": 15935 + }, + { + "epoch": 12.54824734147302, + "grad_norm": 0.21594463288784027, + "learning_rate": 2.8023333333333333e-05, + "loss": 0.0087, + "step": 15936 + }, + { + "epoch": 12.54903505317054, + "grad_norm": 0.28916382789611816, + "learning_rate": 2.8023000000000003e-05, + "loss": 0.0112, + "step": 15937 + }, + { + "epoch": 12.549822764868058, + "grad_norm": 0.15704503655433655, + "learning_rate": 2.8022666666666665e-05, + "loss": 0.0062, + "step": 15938 + }, + { + "epoch": 12.550610476565577, + "grad_norm": 0.4164106249809265, + "learning_rate": 2.8022333333333334e-05, + "loss": 0.0138, + "step": 15939 + }, + { + "epoch": 12.551398188263097, + "grad_norm": 0.25687912106513977, + "learning_rate": 2.8022e-05, + "loss": 0.0097, + "step": 15940 + }, + { + "epoch": 12.552185899960614, + "grad_norm": 0.40773507952690125, + "learning_rate": 2.8021666666666666e-05, + "loss": 0.1595, + "step": 15941 + }, + { + "epoch": 12.552973611658134, + "grad_norm": 0.4337042272090912, + "learning_rate": 2.8021333333333336e-05, + "loss": 0.1171, + "step": 15942 + }, + { + "epoch": 12.553761323355651, + "grad_norm": 0.33278828859329224, + "learning_rate": 2.8021e-05, + "loss": 0.0759, + "step": 15943 + }, + { + "epoch": 12.55454903505317, + "grad_norm": 0.4621215760707855, + "learning_rate": 2.8020666666666668e-05, + "loss": 0.0671, + "step": 15944 + }, + { + "epoch": 12.555336746750688, + "grad_norm": 0.3595724403858185, + "learning_rate": 2.8020333333333333e-05, + "loss": 0.121, + "step": 15945 + }, + { + "epoch": 12.556124458448208, + "grad_norm": 0.5211712718009949, + "learning_rate": 2.8020000000000003e-05, + "loss": 0.0369, + "step": 15946 + }, + { + "epoch": 12.556912170145727, + "grad_norm": 0.3522174060344696, + "learning_rate": 2.8019666666666665e-05, + "loss": 0.0229, + "step": 15947 + }, + { + "epoch": 12.557699881843245, + "grad_norm": 0.19604571163654327, + "learning_rate": 2.8019333333333335e-05, + "loss": 0.0149, + "step": 15948 + }, + { + "epoch": 12.558487593540764, + "grad_norm": 0.27287572622299194, + "learning_rate": 2.8019e-05, + "loss": 0.0318, + "step": 15949 + }, + { + "epoch": 12.559275305238282, + "grad_norm": 0.12748166918754578, + "learning_rate": 2.8018666666666667e-05, + "loss": 0.0101, + "step": 15950 + }, + { + "epoch": 12.560063016935802, + "grad_norm": 0.4897858500480652, + "learning_rate": 2.8018333333333336e-05, + "loss": 0.0092, + "step": 15951 + }, + { + "epoch": 12.56085072863332, + "grad_norm": 0.16219542920589447, + "learning_rate": 2.8018e-05, + "loss": 0.009, + "step": 15952 + }, + { + "epoch": 12.561638440330839, + "grad_norm": 0.24424731731414795, + "learning_rate": 2.8017666666666668e-05, + "loss": 0.0153, + "step": 15953 + }, + { + "epoch": 12.562426152028358, + "grad_norm": 0.08773892372846603, + "learning_rate": 2.8017333333333334e-05, + "loss": 0.0065, + "step": 15954 + }, + { + "epoch": 12.563213863725876, + "grad_norm": 0.13432805240154266, + "learning_rate": 2.8017e-05, + "loss": 
0.0095, + "step": 15955 + }, + { + "epoch": 12.564001575423395, + "grad_norm": 0.11902330815792084, + "learning_rate": 2.8016666666666666e-05, + "loss": 0.0079, + "step": 15956 + }, + { + "epoch": 12.564789287120913, + "grad_norm": 0.08957304060459137, + "learning_rate": 2.8016333333333335e-05, + "loss": 0.0077, + "step": 15957 + }, + { + "epoch": 12.565576998818432, + "grad_norm": 0.12032180279493332, + "learning_rate": 2.8016e-05, + "loss": 0.0069, + "step": 15958 + }, + { + "epoch": 12.566364710515952, + "grad_norm": 0.07813628762960434, + "learning_rate": 2.8015666666666667e-05, + "loss": 0.0034, + "step": 15959 + }, + { + "epoch": 12.56715242221347, + "grad_norm": 0.1947801560163498, + "learning_rate": 2.8015333333333336e-05, + "loss": 0.0061, + "step": 15960 + }, + { + "epoch": 12.567940133910989, + "grad_norm": 0.11603552848100662, + "learning_rate": 2.8015e-05, + "loss": 0.0046, + "step": 15961 + }, + { + "epoch": 12.568727845608507, + "grad_norm": 0.1823427379131317, + "learning_rate": 2.8014666666666668e-05, + "loss": 0.0109, + "step": 15962 + }, + { + "epoch": 12.569515557306026, + "grad_norm": 0.1914941817522049, + "learning_rate": 2.8014333333333334e-05, + "loss": 0.007, + "step": 15963 + }, + { + "epoch": 12.570303269003544, + "grad_norm": 0.14942467212677002, + "learning_rate": 2.8014e-05, + "loss": 0.0071, + "step": 15964 + }, + { + "epoch": 12.571090980701063, + "grad_norm": 0.20003101229667664, + "learning_rate": 2.801366666666667e-05, + "loss": 0.0087, + "step": 15965 + }, + { + "epoch": 12.571878692398583, + "grad_norm": 0.11730498820543289, + "learning_rate": 2.8013333333333335e-05, + "loss": 0.0072, + "step": 15966 + }, + { + "epoch": 12.5726664040961, + "grad_norm": 0.33677583932876587, + "learning_rate": 2.8013e-05, + "loss": 0.0101, + "step": 15967 + }, + { + "epoch": 12.57345411579362, + "grad_norm": 0.14642438292503357, + "learning_rate": 2.8012666666666667e-05, + "loss": 0.007, + "step": 15968 + }, + { + "epoch": 12.574241827491138, + "grad_norm": 0.276937872171402, + "learning_rate": 2.8012333333333336e-05, + "loss": 0.0113, + "step": 15969 + }, + { + "epoch": 12.575029539188657, + "grad_norm": 0.1787809580564499, + "learning_rate": 2.8012e-05, + "loss": 0.0089, + "step": 15970 + }, + { + "epoch": 12.575817250886175, + "grad_norm": 0.24747557938098907, + "learning_rate": 2.8011666666666668e-05, + "loss": 0.011, + "step": 15971 + }, + { + "epoch": 12.576604962583694, + "grad_norm": 0.2490312159061432, + "learning_rate": 2.8011333333333334e-05, + "loss": 0.0045, + "step": 15972 + }, + { + "epoch": 12.577392674281214, + "grad_norm": 0.23341712355613708, + "learning_rate": 2.8011e-05, + "loss": 0.0117, + "step": 15973 + }, + { + "epoch": 12.578180385978731, + "grad_norm": 0.1864408701658249, + "learning_rate": 2.801066666666667e-05, + "loss": 0.0114, + "step": 15974 + }, + { + "epoch": 12.57896809767625, + "grad_norm": 0.2549959421157837, + "learning_rate": 2.8010333333333335e-05, + "loss": 0.0116, + "step": 15975 + }, + { + "epoch": 12.579755809373768, + "grad_norm": 0.19879472255706787, + "learning_rate": 2.801e-05, + "loss": 0.0096, + "step": 15976 + }, + { + "epoch": 12.580543521071288, + "grad_norm": 0.1574733704328537, + "learning_rate": 2.8009666666666667e-05, + "loss": 0.0056, + "step": 15977 + }, + { + "epoch": 12.581331232768807, + "grad_norm": 0.23429466784000397, + "learning_rate": 2.8009333333333336e-05, + "loss": 0.0068, + "step": 15978 + }, + { + "epoch": 12.582118944466325, + "grad_norm": 0.5814771056175232, + "learning_rate": 2.8009e-05, + "loss": 
0.0118, + "step": 15979 + }, + { + "epoch": 12.582906656163845, + "grad_norm": 0.4089342951774597, + "learning_rate": 2.800866666666667e-05, + "loss": 0.0142, + "step": 15980 + }, + { + "epoch": 12.583694367861362, + "grad_norm": 0.4255625605583191, + "learning_rate": 2.800833333333333e-05, + "loss": 0.0203, + "step": 15981 + }, + { + "epoch": 12.584482079558882, + "grad_norm": 0.6674103736877441, + "learning_rate": 2.8008e-05, + "loss": 0.0104, + "step": 15982 + }, + { + "epoch": 12.5852697912564, + "grad_norm": 0.2521721124649048, + "learning_rate": 2.800766666666667e-05, + "loss": 0.01, + "step": 15983 + }, + { + "epoch": 12.586057502953919, + "grad_norm": 0.1856241226196289, + "learning_rate": 2.8007333333333332e-05, + "loss": 0.0077, + "step": 15984 + }, + { + "epoch": 12.586845214651438, + "grad_norm": 0.13589175045490265, + "learning_rate": 2.8007e-05, + "loss": 0.0055, + "step": 15985 + }, + { + "epoch": 12.587632926348956, + "grad_norm": 0.17949406802654266, + "learning_rate": 2.8006666666666667e-05, + "loss": 0.0058, + "step": 15986 + }, + { + "epoch": 12.588420638046475, + "grad_norm": 0.15069033205509186, + "learning_rate": 2.8006333333333333e-05, + "loss": 0.0083, + "step": 15987 + }, + { + "epoch": 12.589208349743993, + "grad_norm": 0.1608428657054901, + "learning_rate": 2.8006e-05, + "loss": 0.0094, + "step": 15988 + }, + { + "epoch": 12.589996061441513, + "grad_norm": 0.3110886514186859, + "learning_rate": 2.800566666666667e-05, + "loss": 0.0134, + "step": 15989 + }, + { + "epoch": 12.59078377313903, + "grad_norm": 0.14006946980953217, + "learning_rate": 2.8005333333333334e-05, + "loss": 0.0035, + "step": 15990 + }, + { + "epoch": 12.59157148483655, + "grad_norm": 0.5204048156738281, + "learning_rate": 2.8005e-05, + "loss": 0.1661, + "step": 15991 + }, + { + "epoch": 12.592359196534069, + "grad_norm": 0.601081907749176, + "learning_rate": 2.800466666666667e-05, + "loss": 0.1785, + "step": 15992 + }, + { + "epoch": 12.593146908231587, + "grad_norm": 0.33948177099227905, + "learning_rate": 2.8004333333333332e-05, + "loss": 0.0886, + "step": 15993 + }, + { + "epoch": 12.593934619929106, + "grad_norm": 0.32274487614631653, + "learning_rate": 2.8004e-05, + "loss": 0.0793, + "step": 15994 + }, + { + "epoch": 12.594722331626624, + "grad_norm": 0.3732445240020752, + "learning_rate": 2.8003666666666668e-05, + "loss": 0.0436, + "step": 15995 + }, + { + "epoch": 12.595510043324143, + "grad_norm": 0.28343331813812256, + "learning_rate": 2.8003333333333333e-05, + "loss": 0.0404, + "step": 15996 + }, + { + "epoch": 12.596297755021663, + "grad_norm": 0.33373385667800903, + "learning_rate": 2.8003e-05, + "loss": 0.0287, + "step": 15997 + }, + { + "epoch": 12.59708546671918, + "grad_norm": 0.20478951930999756, + "learning_rate": 2.800266666666667e-05, + "loss": 0.0201, + "step": 15998 + }, + { + "epoch": 12.5978731784167, + "grad_norm": 0.22438207268714905, + "learning_rate": 2.8002333333333335e-05, + "loss": 0.0095, + "step": 15999 + }, + { + "epoch": 12.598660890114218, + "grad_norm": 0.09988577663898468, + "learning_rate": 2.8002e-05, + "loss": 0.0068, + "step": 16000 + }, + { + "epoch": 12.598660890114218, + "eval_cer": 0.11237265728283692, + "eval_loss": 0.3054291307926178, + "eval_runtime": 16.5026, + "eval_samples_per_second": 18.421, + "eval_steps_per_second": 0.606, + "eval_wer": 0.3917881811204912, + "step": 16000 + }, + { + "epoch": 12.599448601811737, + "grad_norm": 0.14574271440505981, + "learning_rate": 2.800166666666667e-05, + "loss": 0.0061, + "step": 16001 + }, + { + 
"epoch": 12.600236313509257, + "grad_norm": 0.16665329039096832, + "learning_rate": 2.8001333333333332e-05, + "loss": 0.0309, + "step": 16002 + }, + { + "epoch": 12.601024025206774, + "grad_norm": 0.11944793164730072, + "learning_rate": 2.8001000000000002e-05, + "loss": 0.0069, + "step": 16003 + }, + { + "epoch": 12.601811736904294, + "grad_norm": 0.11311313509941101, + "learning_rate": 2.8000666666666668e-05, + "loss": 0.0051, + "step": 16004 + }, + { + "epoch": 12.602599448601811, + "grad_norm": 0.17916497588157654, + "learning_rate": 2.8000333333333334e-05, + "loss": 0.01, + "step": 16005 + }, + { + "epoch": 12.60338716029933, + "grad_norm": 0.23751460015773773, + "learning_rate": 2.8e-05, + "loss": 0.0129, + "step": 16006 + }, + { + "epoch": 12.604174871996848, + "grad_norm": 0.09845653921365738, + "learning_rate": 2.799966666666667e-05, + "loss": 0.0092, + "step": 16007 + }, + { + "epoch": 12.604962583694368, + "grad_norm": 0.29714077711105347, + "learning_rate": 2.7999333333333335e-05, + "loss": 0.0117, + "step": 16008 + }, + { + "epoch": 12.605750295391886, + "grad_norm": 0.13163623213768005, + "learning_rate": 2.7999e-05, + "loss": 0.0063, + "step": 16009 + }, + { + "epoch": 12.606538007089405, + "grad_norm": 0.21642498672008514, + "learning_rate": 2.7998666666666667e-05, + "loss": 0.0123, + "step": 16010 + }, + { + "epoch": 12.607325718786925, + "grad_norm": 0.13917958736419678, + "learning_rate": 2.7998333333333333e-05, + "loss": 0.0079, + "step": 16011 + }, + { + "epoch": 12.608113430484442, + "grad_norm": 0.1128886267542839, + "learning_rate": 2.7998000000000002e-05, + "loss": 0.0099, + "step": 16012 + }, + { + "epoch": 12.608901142181962, + "grad_norm": 0.25542083382606506, + "learning_rate": 2.7997666666666665e-05, + "loss": 0.0132, + "step": 16013 + }, + { + "epoch": 12.60968885387948, + "grad_norm": 0.15633077919483185, + "learning_rate": 2.7997333333333334e-05, + "loss": 0.0096, + "step": 16014 + }, + { + "epoch": 12.610476565576999, + "grad_norm": 0.13385312259197235, + "learning_rate": 2.7997000000000003e-05, + "loss": 0.0093, + "step": 16015 + }, + { + "epoch": 12.611264277274518, + "grad_norm": 0.1269933432340622, + "learning_rate": 2.7996666666666666e-05, + "loss": 0.0052, + "step": 16016 + }, + { + "epoch": 12.612051988972036, + "grad_norm": 0.17959652841091156, + "learning_rate": 2.7996333333333335e-05, + "loss": 0.035, + "step": 16017 + }, + { + "epoch": 12.612839700669555, + "grad_norm": 0.15817369520664215, + "learning_rate": 2.7996e-05, + "loss": 0.0084, + "step": 16018 + }, + { + "epoch": 12.613627412367073, + "grad_norm": 0.23982036113739014, + "learning_rate": 2.7995666666666667e-05, + "loss": 0.0153, + "step": 16019 + }, + { + "epoch": 12.614415124064593, + "grad_norm": 0.12239546328783035, + "learning_rate": 2.7995333333333333e-05, + "loss": 0.0064, + "step": 16020 + }, + { + "epoch": 12.615202835762112, + "grad_norm": 0.3477782905101776, + "learning_rate": 2.7995000000000002e-05, + "loss": 0.011, + "step": 16021 + }, + { + "epoch": 12.61599054745963, + "grad_norm": 0.2834319770336151, + "learning_rate": 2.7994666666666665e-05, + "loss": 0.0126, + "step": 16022 + }, + { + "epoch": 12.61677825915715, + "grad_norm": 0.17151260375976562, + "learning_rate": 2.7994333333333334e-05, + "loss": 0.0066, + "step": 16023 + }, + { + "epoch": 12.617565970854667, + "grad_norm": 0.27091771364212036, + "learning_rate": 2.7994000000000003e-05, + "loss": 0.0059, + "step": 16024 + }, + { + "epoch": 12.618353682552186, + "grad_norm": 0.13081932067871094, + "learning_rate": 
2.7993666666666666e-05, + "loss": 0.0075, + "step": 16025 + }, + { + "epoch": 12.619141394249704, + "grad_norm": 0.0761503353714943, + "learning_rate": 2.7993333333333335e-05, + "loss": 0.0042, + "step": 16026 + }, + { + "epoch": 12.619929105947223, + "grad_norm": 0.08355507999658585, + "learning_rate": 2.7993e-05, + "loss": 0.0046, + "step": 16027 + }, + { + "epoch": 12.620716817644743, + "grad_norm": 0.20982466638088226, + "learning_rate": 2.7992666666666667e-05, + "loss": 0.0078, + "step": 16028 + }, + { + "epoch": 12.62150452934226, + "grad_norm": 0.1747661530971527, + "learning_rate": 2.7992333333333333e-05, + "loss": 0.0064, + "step": 16029 + }, + { + "epoch": 12.62229224103978, + "grad_norm": 0.15770235657691956, + "learning_rate": 2.7992000000000002e-05, + "loss": 0.0088, + "step": 16030 + }, + { + "epoch": 12.623079952737298, + "grad_norm": 0.4727642834186554, + "learning_rate": 2.7991666666666665e-05, + "loss": 0.0115, + "step": 16031 + }, + { + "epoch": 12.623867664434817, + "grad_norm": 0.13234782218933105, + "learning_rate": 2.7991333333333334e-05, + "loss": 0.005, + "step": 16032 + }, + { + "epoch": 12.624655376132335, + "grad_norm": 0.4486909508705139, + "learning_rate": 2.7991000000000004e-05, + "loss": 0.0105, + "step": 16033 + }, + { + "epoch": 12.625443087829854, + "grad_norm": 0.1901310235261917, + "learning_rate": 2.7990666666666666e-05, + "loss": 0.0069, + "step": 16034 + }, + { + "epoch": 12.626230799527374, + "grad_norm": 0.06998317688703537, + "learning_rate": 2.7990333333333335e-05, + "loss": 0.0036, + "step": 16035 + }, + { + "epoch": 12.627018511224891, + "grad_norm": 0.24017830193042755, + "learning_rate": 2.799e-05, + "loss": 0.0117, + "step": 16036 + }, + { + "epoch": 12.62780622292241, + "grad_norm": 0.2754761576652527, + "learning_rate": 2.7989666666666667e-05, + "loss": 0.009, + "step": 16037 + }, + { + "epoch": 12.628593934619929, + "grad_norm": 0.22049863636493683, + "learning_rate": 2.7989333333333333e-05, + "loss": 0.0064, + "step": 16038 + }, + { + "epoch": 12.629381646317448, + "grad_norm": 0.2153993397951126, + "learning_rate": 2.7989000000000003e-05, + "loss": 0.0081, + "step": 16039 + }, + { + "epoch": 12.630169358014967, + "grad_norm": 0.3900080919265747, + "learning_rate": 2.798866666666667e-05, + "loss": 0.0112, + "step": 16040 + }, + { + "epoch": 12.630957069712485, + "grad_norm": 0.5614567995071411, + "learning_rate": 2.7988333333333334e-05, + "loss": 0.158, + "step": 16041 + }, + { + "epoch": 12.631744781410005, + "grad_norm": 0.484210342168808, + "learning_rate": 2.7988e-05, + "loss": 0.1046, + "step": 16042 + }, + { + "epoch": 12.632532493107522, + "grad_norm": 0.5816476941108704, + "learning_rate": 2.7987666666666666e-05, + "loss": 0.0688, + "step": 16043 + }, + { + "epoch": 12.633320204805042, + "grad_norm": 0.5349624752998352, + "learning_rate": 2.7987333333333336e-05, + "loss": 0.0663, + "step": 16044 + }, + { + "epoch": 12.63410791650256, + "grad_norm": 0.47901374101638794, + "learning_rate": 2.7986999999999998e-05, + "loss": 0.0766, + "step": 16045 + }, + { + "epoch": 12.634895628200079, + "grad_norm": 0.2148965448141098, + "learning_rate": 2.7986666666666668e-05, + "loss": 0.0206, + "step": 16046 + }, + { + "epoch": 12.635683339897598, + "grad_norm": 0.19055791199207306, + "learning_rate": 2.7986333333333333e-05, + "loss": 0.0335, + "step": 16047 + }, + { + "epoch": 12.636471051595116, + "grad_norm": 0.3875463902950287, + "learning_rate": 2.7986e-05, + "loss": 0.0312, + "step": 16048 + }, + { + "epoch": 12.637258763292635, + 
"grad_norm": 0.2821081280708313, + "learning_rate": 2.798566666666667e-05, + "loss": 0.0234, + "step": 16049 + }, + { + "epoch": 12.638046474990153, + "grad_norm": 0.2448875904083252, + "learning_rate": 2.7985333333333335e-05, + "loss": 0.0335, + "step": 16050 + }, + { + "epoch": 12.638834186687673, + "grad_norm": 0.2385857105255127, + "learning_rate": 2.7985e-05, + "loss": 0.0119, + "step": 16051 + }, + { + "epoch": 12.63962189838519, + "grad_norm": 0.4530807137489319, + "learning_rate": 2.7984666666666667e-05, + "loss": 0.032, + "step": 16052 + }, + { + "epoch": 12.64040961008271, + "grad_norm": 0.4468100965023041, + "learning_rate": 2.7984333333333336e-05, + "loss": 0.0114, + "step": 16053 + }, + { + "epoch": 12.64119732178023, + "grad_norm": 0.3414275050163269, + "learning_rate": 2.7984e-05, + "loss": 0.0104, + "step": 16054 + }, + { + "epoch": 12.641985033477747, + "grad_norm": 0.24828927218914032, + "learning_rate": 2.7983666666666668e-05, + "loss": 0.006, + "step": 16055 + }, + { + "epoch": 12.642772745175266, + "grad_norm": 0.2559967041015625, + "learning_rate": 2.7983333333333334e-05, + "loss": 0.005, + "step": 16056 + }, + { + "epoch": 12.643560456872784, + "grad_norm": 0.2887651026248932, + "learning_rate": 2.7983e-05, + "loss": 0.0139, + "step": 16057 + }, + { + "epoch": 12.644348168570303, + "grad_norm": 0.17798395454883575, + "learning_rate": 2.798266666666667e-05, + "loss": 0.0085, + "step": 16058 + }, + { + "epoch": 12.645135880267823, + "grad_norm": 0.19831353425979614, + "learning_rate": 2.7982333333333335e-05, + "loss": 0.0079, + "step": 16059 + }, + { + "epoch": 12.64592359196534, + "grad_norm": 0.2374373823404312, + "learning_rate": 2.7982e-05, + "loss": 0.0142, + "step": 16060 + }, + { + "epoch": 12.64671130366286, + "grad_norm": 0.15448321402072906, + "learning_rate": 2.7981666666666667e-05, + "loss": 0.0079, + "step": 16061 + }, + { + "epoch": 12.647499015360378, + "grad_norm": 0.2855042517185211, + "learning_rate": 2.7981333333333336e-05, + "loss": 0.01, + "step": 16062 + }, + { + "epoch": 12.648286727057897, + "grad_norm": 0.15132546424865723, + "learning_rate": 2.7981e-05, + "loss": 0.008, + "step": 16063 + }, + { + "epoch": 12.649074438755415, + "grad_norm": 0.11226892471313477, + "learning_rate": 2.7980666666666668e-05, + "loss": 0.0104, + "step": 16064 + }, + { + "epoch": 12.649862150452934, + "grad_norm": 0.21653081476688385, + "learning_rate": 2.7980333333333337e-05, + "loss": 0.0082, + "step": 16065 + }, + { + "epoch": 12.650649862150454, + "grad_norm": 0.10395772755146027, + "learning_rate": 2.798e-05, + "loss": 0.0042, + "step": 16066 + }, + { + "epoch": 12.651437573847971, + "grad_norm": 0.2552696764469147, + "learning_rate": 2.797966666666667e-05, + "loss": 0.0115, + "step": 16067 + }, + { + "epoch": 12.65222528554549, + "grad_norm": 0.16312497854232788, + "learning_rate": 2.7979333333333335e-05, + "loss": 0.0076, + "step": 16068 + }, + { + "epoch": 12.653012997243009, + "grad_norm": 0.22795426845550537, + "learning_rate": 2.7979e-05, + "loss": 0.008, + "step": 16069 + }, + { + "epoch": 12.653800708940528, + "grad_norm": 0.21964867413043976, + "learning_rate": 2.7978666666666667e-05, + "loss": 0.0103, + "step": 16070 + }, + { + "epoch": 12.654588420638046, + "grad_norm": 0.15390975773334503, + "learning_rate": 2.7978333333333333e-05, + "loss": 0.008, + "step": 16071 + }, + { + "epoch": 12.655376132335565, + "grad_norm": 0.20147143304347992, + "learning_rate": 2.7978e-05, + "loss": 0.0106, + "step": 16072 + }, + { + "epoch": 12.656163844033085, + 
"grad_norm": 0.18713094294071198, + "learning_rate": 2.7977666666666668e-05, + "loss": 0.0111, + "step": 16073 + }, + { + "epoch": 12.656951555730602, + "grad_norm": 0.36015719175338745, + "learning_rate": 2.7977333333333334e-05, + "loss": 0.0082, + "step": 16074 + }, + { + "epoch": 12.657739267428122, + "grad_norm": 0.16703025996685028, + "learning_rate": 2.7977e-05, + "loss": 0.0078, + "step": 16075 + }, + { + "epoch": 12.65852697912564, + "grad_norm": 0.4147128760814667, + "learning_rate": 2.797666666666667e-05, + "loss": 0.0122, + "step": 16076 + }, + { + "epoch": 12.659314690823159, + "grad_norm": 0.19040919840335846, + "learning_rate": 2.7976333333333332e-05, + "loss": 0.0067, + "step": 16077 + }, + { + "epoch": 12.660102402520678, + "grad_norm": 0.22660566866397858, + "learning_rate": 2.7976e-05, + "loss": 0.0075, + "step": 16078 + }, + { + "epoch": 12.660890114218196, + "grad_norm": 0.16505484282970428, + "learning_rate": 2.7975666666666667e-05, + "loss": 0.0076, + "step": 16079 + }, + { + "epoch": 12.661677825915715, + "grad_norm": 0.21331647038459778, + "learning_rate": 2.7975333333333333e-05, + "loss": 0.0068, + "step": 16080 + }, + { + "epoch": 12.662465537613233, + "grad_norm": 0.27821531891822815, + "learning_rate": 2.7975e-05, + "loss": 0.0137, + "step": 16081 + }, + { + "epoch": 12.663253249310753, + "grad_norm": 0.11275710165500641, + "learning_rate": 2.7974666666666668e-05, + "loss": 0.005, + "step": 16082 + }, + { + "epoch": 12.66404096100827, + "grad_norm": 0.09441031515598297, + "learning_rate": 2.7974333333333334e-05, + "loss": 0.0063, + "step": 16083 + }, + { + "epoch": 12.66482867270579, + "grad_norm": 0.16141463816165924, + "learning_rate": 2.7974e-05, + "loss": 0.0126, + "step": 16084 + }, + { + "epoch": 12.66561638440331, + "grad_norm": 0.1560875028371811, + "learning_rate": 2.797366666666667e-05, + "loss": 0.0112, + "step": 16085 + }, + { + "epoch": 12.666404096100827, + "grad_norm": 0.24516361951828003, + "learning_rate": 2.7973333333333332e-05, + "loss": 0.0162, + "step": 16086 + }, + { + "epoch": 12.667191807798346, + "grad_norm": 0.2058125138282776, + "learning_rate": 2.7973e-05, + "loss": 0.0045, + "step": 16087 + }, + { + "epoch": 12.667979519495864, + "grad_norm": 0.22062994539737701, + "learning_rate": 2.7972666666666667e-05, + "loss": 0.0079, + "step": 16088 + }, + { + "epoch": 12.668767231193383, + "grad_norm": 0.2774559259414673, + "learning_rate": 2.7972333333333333e-05, + "loss": 0.0123, + "step": 16089 + }, + { + "epoch": 12.669554942890901, + "grad_norm": 0.47358450293540955, + "learning_rate": 2.7972000000000003e-05, + "loss": 0.0275, + "step": 16090 + }, + { + "epoch": 12.67034265458842, + "grad_norm": 0.609592854976654, + "learning_rate": 2.797166666666667e-05, + "loss": 0.1576, + "step": 16091 + }, + { + "epoch": 12.67113036628594, + "grad_norm": 0.43854275345802307, + "learning_rate": 2.7971333333333334e-05, + "loss": 0.1487, + "step": 16092 + }, + { + "epoch": 12.671918077983458, + "grad_norm": 0.44181689620018005, + "learning_rate": 2.7971e-05, + "loss": 0.0767, + "step": 16093 + }, + { + "epoch": 12.672705789680977, + "grad_norm": 0.4653741419315338, + "learning_rate": 2.797066666666667e-05, + "loss": 0.0788, + "step": 16094 + }, + { + "epoch": 12.673493501378495, + "grad_norm": 0.3904432952404022, + "learning_rate": 2.7970333333333332e-05, + "loss": 0.0546, + "step": 16095 + }, + { + "epoch": 12.674281213076014, + "grad_norm": 0.5121660828590393, + "learning_rate": 2.797e-05, + "loss": 0.0344, + "step": 16096 + }, + { + "epoch": 
12.675068924773534, + "grad_norm": 0.2583841383457184, + "learning_rate": 2.7969666666666667e-05, + "loss": 0.0275, + "step": 16097 + }, + { + "epoch": 12.675856636471051, + "grad_norm": 0.13905255496501923, + "learning_rate": 2.7969333333333333e-05, + "loss": 0.0185, + "step": 16098 + }, + { + "epoch": 12.67664434816857, + "grad_norm": 0.11450202763080597, + "learning_rate": 2.7969000000000003e-05, + "loss": 0.0267, + "step": 16099 + }, + { + "epoch": 12.677432059866089, + "grad_norm": 0.43917223811149597, + "learning_rate": 2.7968666666666665e-05, + "loss": 0.0099, + "step": 16100 + }, + { + "epoch": 12.678219771563608, + "grad_norm": 0.14626175165176392, + "learning_rate": 2.7968333333333335e-05, + "loss": 0.0083, + "step": 16101 + }, + { + "epoch": 12.679007483261126, + "grad_norm": 0.17641961574554443, + "learning_rate": 2.7968e-05, + "loss": 0.0108, + "step": 16102 + }, + { + "epoch": 12.679795194958645, + "grad_norm": 0.22956235706806183, + "learning_rate": 2.7967666666666666e-05, + "loss": 0.0125, + "step": 16103 + }, + { + "epoch": 12.680582906656165, + "grad_norm": 0.15748366713523865, + "learning_rate": 2.7967333333333332e-05, + "loss": 0.0049, + "step": 16104 + }, + { + "epoch": 12.681370618353682, + "grad_norm": 0.16224870085716248, + "learning_rate": 2.7967000000000002e-05, + "loss": 0.0082, + "step": 16105 + }, + { + "epoch": 12.682158330051202, + "grad_norm": 0.11596690118312836, + "learning_rate": 2.7966666666666664e-05, + "loss": 0.0065, + "step": 16106 + }, + { + "epoch": 12.68294604174872, + "grad_norm": 0.14225205779075623, + "learning_rate": 2.7966333333333334e-05, + "loss": 0.0072, + "step": 16107 + }, + { + "epoch": 12.683733753446239, + "grad_norm": 0.16857078671455383, + "learning_rate": 2.7966000000000003e-05, + "loss": 0.009, + "step": 16108 + }, + { + "epoch": 12.684521465143757, + "grad_norm": 0.2798473834991455, + "learning_rate": 2.7965666666666666e-05, + "loss": 0.0088, + "step": 16109 + }, + { + "epoch": 12.685309176841276, + "grad_norm": 0.13624702394008636, + "learning_rate": 2.7965333333333335e-05, + "loss": 0.0069, + "step": 16110 + }, + { + "epoch": 12.686096888538795, + "grad_norm": 0.0825338289141655, + "learning_rate": 2.7965e-05, + "loss": 0.0065, + "step": 16111 + }, + { + "epoch": 12.686884600236313, + "grad_norm": 0.2972523272037506, + "learning_rate": 2.7964666666666667e-05, + "loss": 0.007, + "step": 16112 + }, + { + "epoch": 12.687672311933833, + "grad_norm": 0.10725875943899155, + "learning_rate": 2.7964333333333333e-05, + "loss": 0.0063, + "step": 16113 + }, + { + "epoch": 12.68846002363135, + "grad_norm": 0.20817647874355316, + "learning_rate": 2.7964000000000002e-05, + "loss": 0.0068, + "step": 16114 + }, + { + "epoch": 12.68924773532887, + "grad_norm": 0.2202547937631607, + "learning_rate": 2.7963666666666668e-05, + "loss": 0.0089, + "step": 16115 + }, + { + "epoch": 12.69003544702639, + "grad_norm": 0.5576842427253723, + "learning_rate": 2.7963333333333334e-05, + "loss": 0.0119, + "step": 16116 + }, + { + "epoch": 12.690823158723907, + "grad_norm": 0.13015079498291016, + "learning_rate": 2.7963000000000003e-05, + "loss": 0.0071, + "step": 16117 + }, + { + "epoch": 12.691610870421426, + "grad_norm": 0.2191544473171234, + "learning_rate": 2.7962666666666666e-05, + "loss": 0.0176, + "step": 16118 + }, + { + "epoch": 12.692398582118944, + "grad_norm": 0.14804689586162567, + "learning_rate": 2.7962333333333335e-05, + "loss": 0.0072, + "step": 16119 + }, + { + "epoch": 12.693186293816463, + "grad_norm": 0.3842185437679291, + "learning_rate": 
2.7962e-05, + "loss": 0.0108, + "step": 16120 + }, + { + "epoch": 12.693974005513981, + "grad_norm": 0.3179798126220703, + "learning_rate": 2.7961666666666667e-05, + "loss": 0.0097, + "step": 16121 + }, + { + "epoch": 12.6947617172115, + "grad_norm": 0.1613743156194687, + "learning_rate": 2.7961333333333333e-05, + "loss": 0.0075, + "step": 16122 + }, + { + "epoch": 12.69554942890902, + "grad_norm": 0.24825099110603333, + "learning_rate": 2.7961000000000002e-05, + "loss": 0.0088, + "step": 16123 + }, + { + "epoch": 12.696337140606538, + "grad_norm": 0.11573708802461624, + "learning_rate": 2.7960666666666668e-05, + "loss": 0.0045, + "step": 16124 + }, + { + "epoch": 12.697124852304057, + "grad_norm": 0.28502750396728516, + "learning_rate": 2.7960333333333334e-05, + "loss": 0.0076, + "step": 16125 + }, + { + "epoch": 12.697912564001575, + "grad_norm": 0.1462554931640625, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.0053, + "step": 16126 + }, + { + "epoch": 12.698700275699094, + "grad_norm": 0.1676461547613144, + "learning_rate": 2.7959666666666666e-05, + "loss": 0.0075, + "step": 16127 + }, + { + "epoch": 12.699487987396612, + "grad_norm": 0.3215475082397461, + "learning_rate": 2.7959333333333335e-05, + "loss": 0.0093, + "step": 16128 + }, + { + "epoch": 12.700275699094131, + "grad_norm": 0.10357668250799179, + "learning_rate": 2.7959e-05, + "loss": 0.0049, + "step": 16129 + }, + { + "epoch": 12.701063410791651, + "grad_norm": 0.3757361173629761, + "learning_rate": 2.7958666666666667e-05, + "loss": 0.0136, + "step": 16130 + }, + { + "epoch": 12.701851122489169, + "grad_norm": 0.13927553594112396, + "learning_rate": 2.7958333333333333e-05, + "loss": 0.0066, + "step": 16131 + }, + { + "epoch": 12.702638834186688, + "grad_norm": 0.47057297825813293, + "learning_rate": 2.7958e-05, + "loss": 0.0141, + "step": 16132 + }, + { + "epoch": 12.703426545884206, + "grad_norm": 0.48437607288360596, + "learning_rate": 2.7957666666666668e-05, + "loss": 0.01, + "step": 16133 + }, + { + "epoch": 12.704214257581725, + "grad_norm": 0.2230672687292099, + "learning_rate": 2.7957333333333334e-05, + "loss": 0.0082, + "step": 16134 + }, + { + "epoch": 12.705001969279245, + "grad_norm": 0.5830339193344116, + "learning_rate": 2.7957e-05, + "loss": 0.0115, + "step": 16135 + }, + { + "epoch": 12.705789680976762, + "grad_norm": 0.3645531237125397, + "learning_rate": 2.7956666666666666e-05, + "loss": 0.0118, + "step": 16136 + }, + { + "epoch": 12.706577392674282, + "grad_norm": 0.15843479335308075, + "learning_rate": 2.7956333333333335e-05, + "loss": 0.0055, + "step": 16137 + }, + { + "epoch": 12.7073651043718, + "grad_norm": 0.20551156997680664, + "learning_rate": 2.7955999999999998e-05, + "loss": 0.009, + "step": 16138 + }, + { + "epoch": 12.708152816069319, + "grad_norm": 1.1534541845321655, + "learning_rate": 2.7955666666666667e-05, + "loss": 0.023, + "step": 16139 + }, + { + "epoch": 12.708940527766837, + "grad_norm": 0.11293768137693405, + "learning_rate": 2.7955333333333337e-05, + "loss": 0.0051, + "step": 16140 + }, + { + "epoch": 12.709728239464356, + "grad_norm": 0.56775963306427, + "learning_rate": 2.7955e-05, + "loss": 0.1899, + "step": 16141 + }, + { + "epoch": 12.710515951161875, + "grad_norm": 0.7048707008361816, + "learning_rate": 2.795466666666667e-05, + "loss": 0.122, + "step": 16142 + }, + { + "epoch": 12.711303662859393, + "grad_norm": 0.4956205487251282, + "learning_rate": 2.7954333333333334e-05, + "loss": 0.088, + "step": 16143 + }, + { + "epoch": 12.712091374556913, + "grad_norm": 
0.6192354559898376, + "learning_rate": 2.7954e-05, + "loss": 0.0947, + "step": 16144 + }, + { + "epoch": 12.71287908625443, + "grad_norm": 0.2588057219982147, + "learning_rate": 2.7953666666666666e-05, + "loss": 0.0511, + "step": 16145 + }, + { + "epoch": 12.71366679795195, + "grad_norm": 0.4678553342819214, + "learning_rate": 2.7953333333333336e-05, + "loss": 0.0373, + "step": 16146 + }, + { + "epoch": 12.714454509649467, + "grad_norm": 0.22973181307315826, + "learning_rate": 2.7952999999999998e-05, + "loss": 0.0343, + "step": 16147 + }, + { + "epoch": 12.715242221346987, + "grad_norm": 0.24336938560009003, + "learning_rate": 2.7952666666666667e-05, + "loss": 0.0302, + "step": 16148 + }, + { + "epoch": 12.716029933044506, + "grad_norm": 0.10527234524488449, + "learning_rate": 2.7952333333333337e-05, + "loss": 0.0075, + "step": 16149 + }, + { + "epoch": 12.716817644742024, + "grad_norm": 0.34613072872161865, + "learning_rate": 2.7952e-05, + "loss": 0.0098, + "step": 16150 + }, + { + "epoch": 12.717605356439543, + "grad_norm": 0.1846241056919098, + "learning_rate": 2.795166666666667e-05, + "loss": 0.0125, + "step": 16151 + }, + { + "epoch": 12.718393068137061, + "grad_norm": 0.5164477825164795, + "learning_rate": 2.7951333333333335e-05, + "loss": 0.0204, + "step": 16152 + }, + { + "epoch": 12.71918077983458, + "grad_norm": 0.17042486369609833, + "learning_rate": 2.7951e-05, + "loss": 0.0066, + "step": 16153 + }, + { + "epoch": 12.7199684915321, + "grad_norm": 0.13261806964874268, + "learning_rate": 2.7950666666666666e-05, + "loss": 0.0072, + "step": 16154 + }, + { + "epoch": 12.720756203229618, + "grad_norm": 0.144711434841156, + "learning_rate": 2.7950333333333336e-05, + "loss": 0.0109, + "step": 16155 + }, + { + "epoch": 12.721543914927137, + "grad_norm": 0.12806667387485504, + "learning_rate": 2.795e-05, + "loss": 0.0111, + "step": 16156 + }, + { + "epoch": 12.722331626624655, + "grad_norm": 0.14936237037181854, + "learning_rate": 2.7949666666666668e-05, + "loss": 0.0084, + "step": 16157 + }, + { + "epoch": 12.723119338322174, + "grad_norm": 0.51587975025177, + "learning_rate": 2.7949333333333337e-05, + "loss": 0.0055, + "step": 16158 + }, + { + "epoch": 12.723907050019692, + "grad_norm": 0.12590423226356506, + "learning_rate": 2.7949e-05, + "loss": 0.0076, + "step": 16159 + }, + { + "epoch": 12.724694761717211, + "grad_norm": 0.22005894780158997, + "learning_rate": 2.794866666666667e-05, + "loss": 0.0098, + "step": 16160 + }, + { + "epoch": 12.725482473414731, + "grad_norm": 0.10775920003652573, + "learning_rate": 2.794833333333333e-05, + "loss": 0.0086, + "step": 16161 + }, + { + "epoch": 12.726270185112249, + "grad_norm": 0.19957990944385529, + "learning_rate": 2.7948e-05, + "loss": 0.0073, + "step": 16162 + }, + { + "epoch": 12.727057896809768, + "grad_norm": 0.21687453985214233, + "learning_rate": 2.7947666666666667e-05, + "loss": 0.0121, + "step": 16163 + }, + { + "epoch": 12.727845608507286, + "grad_norm": 0.18052570521831512, + "learning_rate": 2.7947333333333333e-05, + "loss": 0.0089, + "step": 16164 + }, + { + "epoch": 12.728633320204805, + "grad_norm": 0.2379801720380783, + "learning_rate": 2.7947000000000002e-05, + "loss": 0.0056, + "step": 16165 + }, + { + "epoch": 12.729421031902323, + "grad_norm": 0.22797532379627228, + "learning_rate": 2.7946666666666668e-05, + "loss": 0.0091, + "step": 16166 + }, + { + "epoch": 12.730208743599842, + "grad_norm": 0.17719526588916779, + "learning_rate": 2.7946333333333334e-05, + "loss": 0.0085, + "step": 16167 + }, + { + "epoch": 
12.730996455297362, + "grad_norm": 0.3791084587574005, + "learning_rate": 2.7946e-05, + "loss": 0.0119, + "step": 16168 + }, + { + "epoch": 12.73178416699488, + "grad_norm": 0.21651969850063324, + "learning_rate": 2.794566666666667e-05, + "loss": 0.0143, + "step": 16169 + }, + { + "epoch": 12.732571878692399, + "grad_norm": 0.16245390474796295, + "learning_rate": 2.794533333333333e-05, + "loss": 0.0062, + "step": 16170 + }, + { + "epoch": 12.733359590389917, + "grad_norm": 0.22301529347896576, + "learning_rate": 2.7945e-05, + "loss": 0.0092, + "step": 16171 + }, + { + "epoch": 12.734147302087436, + "grad_norm": 0.1544768065214157, + "learning_rate": 2.7944666666666667e-05, + "loss": 0.0092, + "step": 16172 + }, + { + "epoch": 12.734935013784956, + "grad_norm": 0.1892623007297516, + "learning_rate": 2.7944333333333333e-05, + "loss": 0.0118, + "step": 16173 + }, + { + "epoch": 12.735722725482473, + "grad_norm": 0.8177206516265869, + "learning_rate": 2.7944000000000002e-05, + "loss": 0.017, + "step": 16174 + }, + { + "epoch": 12.736510437179993, + "grad_norm": 0.49196451902389526, + "learning_rate": 2.7943666666666668e-05, + "loss": 0.0087, + "step": 16175 + }, + { + "epoch": 12.73729814887751, + "grad_norm": 0.24339276552200317, + "learning_rate": 2.7943333333333334e-05, + "loss": 0.0086, + "step": 16176 + }, + { + "epoch": 12.73808586057503, + "grad_norm": 0.10401853919029236, + "learning_rate": 2.7943e-05, + "loss": 0.0071, + "step": 16177 + }, + { + "epoch": 12.738873572272547, + "grad_norm": 0.16240231692790985, + "learning_rate": 2.794266666666667e-05, + "loss": 0.0112, + "step": 16178 + }, + { + "epoch": 12.739661283970067, + "grad_norm": 0.08432085812091827, + "learning_rate": 2.7942333333333332e-05, + "loss": 0.0055, + "step": 16179 + }, + { + "epoch": 12.740448995667586, + "grad_norm": 0.33796679973602295, + "learning_rate": 2.7942e-05, + "loss": 0.018, + "step": 16180 + }, + { + "epoch": 12.741236707365104, + "grad_norm": 0.20345504581928253, + "learning_rate": 2.7941666666666667e-05, + "loss": 0.0123, + "step": 16181 + }, + { + "epoch": 12.742024419062624, + "grad_norm": 0.28554198145866394, + "learning_rate": 2.7941333333333333e-05, + "loss": 0.0172, + "step": 16182 + }, + { + "epoch": 12.742812130760141, + "grad_norm": 0.14824435114860535, + "learning_rate": 2.7941000000000002e-05, + "loss": 0.0086, + "step": 16183 + }, + { + "epoch": 12.74359984245766, + "grad_norm": 0.14225804805755615, + "learning_rate": 2.7940666666666668e-05, + "loss": 0.0082, + "step": 16184 + }, + { + "epoch": 12.744387554155178, + "grad_norm": 0.3895396590232849, + "learning_rate": 2.7940333333333334e-05, + "loss": 0.0194, + "step": 16185 + }, + { + "epoch": 12.745175265852698, + "grad_norm": 0.3455139696598053, + "learning_rate": 2.794e-05, + "loss": 0.0128, + "step": 16186 + }, + { + "epoch": 12.745962977550217, + "grad_norm": 0.4879066050052643, + "learning_rate": 2.793966666666667e-05, + "loss": 0.011, + "step": 16187 + }, + { + "epoch": 12.746750689247735, + "grad_norm": 0.4656219482421875, + "learning_rate": 2.7939333333333332e-05, + "loss": 0.0115, + "step": 16188 + }, + { + "epoch": 12.747538400945254, + "grad_norm": 0.23239655792713165, + "learning_rate": 2.7939e-05, + "loss": 0.0131, + "step": 16189 + }, + { + "epoch": 12.748326112642772, + "grad_norm": 0.44863998889923096, + "learning_rate": 2.793866666666667e-05, + "loss": 0.0306, + "step": 16190 + }, + { + "epoch": 12.749113824340292, + "grad_norm": 0.5980526208877563, + "learning_rate": 2.7938333333333333e-05, + "loss": 0.2186, + "step": 
16191 + }, + { + "epoch": 12.749901536037811, + "grad_norm": 0.39564183354377747, + "learning_rate": 2.7938000000000003e-05, + "loss": 0.1091, + "step": 16192 + }, + { + "epoch": 12.750689247735329, + "grad_norm": 0.5571467280387878, + "learning_rate": 2.7937666666666665e-05, + "loss": 0.1181, + "step": 16193 + }, + { + "epoch": 12.751476959432848, + "grad_norm": 0.3819016218185425, + "learning_rate": 2.7937333333333334e-05, + "loss": 0.0482, + "step": 16194 + }, + { + "epoch": 12.752264671130366, + "grad_norm": 0.43324020504951477, + "learning_rate": 2.7937e-05, + "loss": 0.0731, + "step": 16195 + }, + { + "epoch": 12.753052382827885, + "grad_norm": 0.22188973426818848, + "learning_rate": 2.7936666666666666e-05, + "loss": 0.0283, + "step": 16196 + }, + { + "epoch": 12.753840094525403, + "grad_norm": 0.20941732823848724, + "learning_rate": 2.7936333333333332e-05, + "loss": 0.0154, + "step": 16197 + }, + { + "epoch": 12.754627806222922, + "grad_norm": 0.2798798084259033, + "learning_rate": 2.7936e-05, + "loss": 0.0439, + "step": 16198 + }, + { + "epoch": 12.755415517920442, + "grad_norm": 0.15612578392028809, + "learning_rate": 2.7935666666666667e-05, + "loss": 0.0053, + "step": 16199 + }, + { + "epoch": 12.75620322961796, + "grad_norm": 0.17523470520973206, + "learning_rate": 2.7935333333333333e-05, + "loss": 0.0087, + "step": 16200 + }, + { + "epoch": 12.756990941315479, + "grad_norm": 0.4084024131298065, + "learning_rate": 2.7935000000000003e-05, + "loss": 0.0141, + "step": 16201 + }, + { + "epoch": 12.757778653012997, + "grad_norm": 0.12591993808746338, + "learning_rate": 2.7934666666666665e-05, + "loss": 0.0033, + "step": 16202 + }, + { + "epoch": 12.758566364710516, + "grad_norm": 0.14501149952411652, + "learning_rate": 2.7934333333333335e-05, + "loss": 0.0099, + "step": 16203 + }, + { + "epoch": 12.759354076408034, + "grad_norm": 0.20071350038051605, + "learning_rate": 2.7934e-05, + "loss": 0.0069, + "step": 16204 + }, + { + "epoch": 12.760141788105553, + "grad_norm": 0.12066517770290375, + "learning_rate": 2.7933666666666666e-05, + "loss": 0.0047, + "step": 16205 + }, + { + "epoch": 12.760929499803073, + "grad_norm": 0.17569728195667267, + "learning_rate": 2.7933333333333332e-05, + "loss": 0.0126, + "step": 16206 + }, + { + "epoch": 12.76171721150059, + "grad_norm": 0.20223475992679596, + "learning_rate": 2.7933000000000002e-05, + "loss": 0.0117, + "step": 16207 + }, + { + "epoch": 12.76250492319811, + "grad_norm": 0.23777732253074646, + "learning_rate": 2.7932666666666668e-05, + "loss": 0.0095, + "step": 16208 + }, + { + "epoch": 12.763292634895627, + "grad_norm": 0.2933274507522583, + "learning_rate": 2.7932333333333334e-05, + "loss": 0.0305, + "step": 16209 + }, + { + "epoch": 12.764080346593147, + "grad_norm": 0.19458648562431335, + "learning_rate": 2.7932000000000003e-05, + "loss": 0.0087, + "step": 16210 + }, + { + "epoch": 12.764868058290666, + "grad_norm": 0.12643098831176758, + "learning_rate": 2.7931666666666665e-05, + "loss": 0.0084, + "step": 16211 + }, + { + "epoch": 12.765655769988184, + "grad_norm": 0.13524502515792847, + "learning_rate": 2.7931333333333335e-05, + "loss": 0.0086, + "step": 16212 + }, + { + "epoch": 12.766443481685704, + "grad_norm": 0.3879307806491852, + "learning_rate": 2.7931e-05, + "loss": 0.0166, + "step": 16213 + }, + { + "epoch": 12.767231193383221, + "grad_norm": 0.29014429450035095, + "learning_rate": 2.7930666666666667e-05, + "loss": 0.0139, + "step": 16214 + }, + { + "epoch": 12.76801890508074, + "grad_norm": 0.2802991569042206, + 
"learning_rate": 2.7930333333333336e-05, + "loss": 0.01, + "step": 16215 + }, + { + "epoch": 12.768806616778258, + "grad_norm": 0.23414190113544464, + "learning_rate": 2.7930000000000002e-05, + "loss": 0.0088, + "step": 16216 + }, + { + "epoch": 12.769594328475778, + "grad_norm": 0.1250772625207901, + "learning_rate": 2.7929666666666668e-05, + "loss": 0.0066, + "step": 16217 + }, + { + "epoch": 12.770382040173297, + "grad_norm": 0.2241474837064743, + "learning_rate": 2.7929333333333334e-05, + "loss": 0.0053, + "step": 16218 + }, + { + "epoch": 12.771169751870815, + "grad_norm": 0.09758781641721725, + "learning_rate": 2.7929000000000003e-05, + "loss": 0.0043, + "step": 16219 + }, + { + "epoch": 12.771957463568334, + "grad_norm": 0.2133363038301468, + "learning_rate": 2.7928666666666666e-05, + "loss": 0.0086, + "step": 16220 + }, + { + "epoch": 12.772745175265852, + "grad_norm": 0.31706511974334717, + "learning_rate": 2.7928333333333335e-05, + "loss": 0.0127, + "step": 16221 + }, + { + "epoch": 12.773532886963372, + "grad_norm": 0.11509855091571808, + "learning_rate": 2.7927999999999998e-05, + "loss": 0.0057, + "step": 16222 + }, + { + "epoch": 12.77432059866089, + "grad_norm": 0.15504632890224457, + "learning_rate": 2.7927666666666667e-05, + "loss": 0.0086, + "step": 16223 + }, + { + "epoch": 12.775108310358409, + "grad_norm": 0.09071952104568481, + "learning_rate": 2.7927333333333336e-05, + "loss": 0.0044, + "step": 16224 + }, + { + "epoch": 12.775896022055928, + "grad_norm": 0.23158811032772064, + "learning_rate": 2.7927e-05, + "loss": 0.0193, + "step": 16225 + }, + { + "epoch": 12.776683733753446, + "grad_norm": 0.2722005248069763, + "learning_rate": 2.7926666666666668e-05, + "loss": 0.0108, + "step": 16226 + }, + { + "epoch": 12.777471445450965, + "grad_norm": 0.19880250096321106, + "learning_rate": 2.7926333333333334e-05, + "loss": 0.0095, + "step": 16227 + }, + { + "epoch": 12.778259157148483, + "grad_norm": 0.2841813266277313, + "learning_rate": 2.7926e-05, + "loss": 0.0079, + "step": 16228 + }, + { + "epoch": 12.779046868846002, + "grad_norm": 0.22556856274604797, + "learning_rate": 2.7925666666666666e-05, + "loss": 0.0144, + "step": 16229 + }, + { + "epoch": 12.779834580543522, + "grad_norm": 0.20260971784591675, + "learning_rate": 2.7925333333333335e-05, + "loss": 0.014, + "step": 16230 + }, + { + "epoch": 12.78062229224104, + "grad_norm": 0.36930814385414124, + "learning_rate": 2.7924999999999998e-05, + "loss": 0.0113, + "step": 16231 + }, + { + "epoch": 12.781410003938559, + "grad_norm": 0.1439410299062729, + "learning_rate": 2.7924666666666667e-05, + "loss": 0.0071, + "step": 16232 + }, + { + "epoch": 12.782197715636077, + "grad_norm": 0.31437623500823975, + "learning_rate": 2.7924333333333336e-05, + "loss": 0.0132, + "step": 16233 + }, + { + "epoch": 12.782985427333596, + "grad_norm": 0.2752645015716553, + "learning_rate": 2.7924e-05, + "loss": 0.0092, + "step": 16234 + }, + { + "epoch": 12.783773139031114, + "grad_norm": 0.22796517610549927, + "learning_rate": 2.7923666666666668e-05, + "loss": 0.0084, + "step": 16235 + }, + { + "epoch": 12.784560850728633, + "grad_norm": 0.2308921217918396, + "learning_rate": 2.7923333333333334e-05, + "loss": 0.0122, + "step": 16236 + }, + { + "epoch": 12.785348562426153, + "grad_norm": 0.12168800830841064, + "learning_rate": 2.7923e-05, + "loss": 0.0056, + "step": 16237 + }, + { + "epoch": 12.78613627412367, + "grad_norm": 0.23941655457019806, + "learning_rate": 2.7922666666666666e-05, + "loss": 0.0103, + "step": 16238 + }, + { + "epoch": 
12.78692398582119, + "grad_norm": 0.3654789924621582, + "learning_rate": 2.7922333333333335e-05, + "loss": 0.011, + "step": 16239 + }, + { + "epoch": 12.787711697518708, + "grad_norm": 0.4270029664039612, + "learning_rate": 2.7922e-05, + "loss": 0.0102, + "step": 16240 + }, + { + "epoch": 12.788499409216227, + "grad_norm": 0.6591128706932068, + "learning_rate": 2.7921666666666667e-05, + "loss": 0.2026, + "step": 16241 + }, + { + "epoch": 12.789287120913745, + "grad_norm": 0.4414259195327759, + "learning_rate": 2.7921333333333337e-05, + "loss": 0.1161, + "step": 16242 + }, + { + "epoch": 12.790074832611264, + "grad_norm": 0.39135950803756714, + "learning_rate": 2.7921e-05, + "loss": 0.079, + "step": 16243 + }, + { + "epoch": 12.790862544308784, + "grad_norm": 0.8086352944374084, + "learning_rate": 2.792066666666667e-05, + "loss": 0.0806, + "step": 16244 + }, + { + "epoch": 12.791650256006301, + "grad_norm": 0.5353102684020996, + "learning_rate": 2.7920333333333334e-05, + "loss": 0.0643, + "step": 16245 + }, + { + "epoch": 12.79243796770382, + "grad_norm": 0.18257135152816772, + "learning_rate": 2.792e-05, + "loss": 0.0379, + "step": 16246 + }, + { + "epoch": 12.793225679401338, + "grad_norm": 0.29072338342666626, + "learning_rate": 2.7919666666666666e-05, + "loss": 0.0142, + "step": 16247 + }, + { + "epoch": 12.794013391098858, + "grad_norm": 0.1491587609052658, + "learning_rate": 2.7919333333333336e-05, + "loss": 0.0186, + "step": 16248 + }, + { + "epoch": 12.794801102796377, + "grad_norm": 0.10999877005815506, + "learning_rate": 2.7919e-05, + "loss": 0.007, + "step": 16249 + }, + { + "epoch": 12.795588814493895, + "grad_norm": 0.1099114939570427, + "learning_rate": 2.7918666666666667e-05, + "loss": 0.0087, + "step": 16250 + }, + { + "epoch": 12.796376526191414, + "grad_norm": 0.1965056210756302, + "learning_rate": 2.7918333333333333e-05, + "loss": 0.0191, + "step": 16251 + }, + { + "epoch": 12.797164237888932, + "grad_norm": 0.14225445687770844, + "learning_rate": 2.7918e-05, + "loss": 0.0111, + "step": 16252 + }, + { + "epoch": 12.797951949586452, + "grad_norm": 0.15372507274150848, + "learning_rate": 2.791766666666667e-05, + "loss": 0.0096, + "step": 16253 + }, + { + "epoch": 12.798739661283971, + "grad_norm": 0.1817503571510315, + "learning_rate": 2.791733333333333e-05, + "loss": 0.0087, + "step": 16254 + }, + { + "epoch": 12.799527372981489, + "grad_norm": 0.08811932802200317, + "learning_rate": 2.7917e-05, + "loss": 0.0043, + "step": 16255 + }, + { + "epoch": 12.800315084679008, + "grad_norm": 0.12269105017185211, + "learning_rate": 2.7916666666666666e-05, + "loss": 0.0063, + "step": 16256 + }, + { + "epoch": 12.801102796376526, + "grad_norm": 0.19937652349472046, + "learning_rate": 2.7916333333333332e-05, + "loss": 0.0081, + "step": 16257 + }, + { + "epoch": 12.801890508074045, + "grad_norm": 0.19930344820022583, + "learning_rate": 2.7916000000000002e-05, + "loss": 0.0102, + "step": 16258 + }, + { + "epoch": 12.802678219771563, + "grad_norm": 0.15079253911972046, + "learning_rate": 2.7915666666666668e-05, + "loss": 0.0123, + "step": 16259 + }, + { + "epoch": 12.803465931469082, + "grad_norm": 0.13615180552005768, + "learning_rate": 2.7915333333333334e-05, + "loss": 0.013, + "step": 16260 + }, + { + "epoch": 12.8042536431666, + "grad_norm": 0.13682080805301666, + "learning_rate": 2.7915e-05, + "loss": 0.007, + "step": 16261 + }, + { + "epoch": 12.80504135486412, + "grad_norm": 0.3281131386756897, + "learning_rate": 2.791466666666667e-05, + "loss": 0.0179, + "step": 16262 + }, + { + 
"epoch": 12.805829066561639, + "grad_norm": 0.4334314167499542, + "learning_rate": 2.791433333333333e-05, + "loss": 0.0091, + "step": 16263 + }, + { + "epoch": 12.806616778259157, + "grad_norm": 0.3370717167854309, + "learning_rate": 2.7914e-05, + "loss": 0.0097, + "step": 16264 + }, + { + "epoch": 12.807404489956676, + "grad_norm": 0.22224171459674835, + "learning_rate": 2.791366666666667e-05, + "loss": 0.0107, + "step": 16265 + }, + { + "epoch": 12.808192201654194, + "grad_norm": 0.9358347654342651, + "learning_rate": 2.7913333333333333e-05, + "loss": 0.025, + "step": 16266 + }, + { + "epoch": 12.808979913351713, + "grad_norm": 0.3853949308395386, + "learning_rate": 2.7913000000000002e-05, + "loss": 0.0171, + "step": 16267 + }, + { + "epoch": 12.809767625049233, + "grad_norm": 0.17995189130306244, + "learning_rate": 2.7912666666666668e-05, + "loss": 0.0073, + "step": 16268 + }, + { + "epoch": 12.81055533674675, + "grad_norm": 0.1779133826494217, + "learning_rate": 2.7912333333333334e-05, + "loss": 0.0058, + "step": 16269 + }, + { + "epoch": 12.81134304844427, + "grad_norm": 0.1425490826368332, + "learning_rate": 2.7912e-05, + "loss": 0.0102, + "step": 16270 + }, + { + "epoch": 12.812130760141788, + "grad_norm": 0.11077791452407837, + "learning_rate": 2.791166666666667e-05, + "loss": 0.0061, + "step": 16271 + }, + { + "epoch": 12.812918471839307, + "grad_norm": 0.3171681761741638, + "learning_rate": 2.791133333333333e-05, + "loss": 0.014, + "step": 16272 + }, + { + "epoch": 12.813706183536826, + "grad_norm": 0.10120585560798645, + "learning_rate": 2.7911e-05, + "loss": 0.004, + "step": 16273 + }, + { + "epoch": 12.814493895234344, + "grad_norm": 0.2274048626422882, + "learning_rate": 2.791066666666667e-05, + "loss": 0.0124, + "step": 16274 + }, + { + "epoch": 12.815281606931864, + "grad_norm": 0.16100850701332092, + "learning_rate": 2.7910333333333333e-05, + "loss": 0.007, + "step": 16275 + }, + { + "epoch": 12.816069318629381, + "grad_norm": 0.3439982235431671, + "learning_rate": 2.7910000000000002e-05, + "loss": 0.0146, + "step": 16276 + }, + { + "epoch": 12.8168570303269, + "grad_norm": 0.24584458768367767, + "learning_rate": 2.7909666666666668e-05, + "loss": 0.0119, + "step": 16277 + }, + { + "epoch": 12.817644742024418, + "grad_norm": 0.14197713136672974, + "learning_rate": 2.7909333333333334e-05, + "loss": 0.0077, + "step": 16278 + }, + { + "epoch": 12.818432453721938, + "grad_norm": 0.18641963601112366, + "learning_rate": 2.7909e-05, + "loss": 0.0126, + "step": 16279 + }, + { + "epoch": 12.819220165419457, + "grad_norm": 0.14507657289505005, + "learning_rate": 2.790866666666667e-05, + "loss": 0.0067, + "step": 16280 + }, + { + "epoch": 12.820007877116975, + "grad_norm": 0.13205072283744812, + "learning_rate": 2.7908333333333332e-05, + "loss": 0.0051, + "step": 16281 + }, + { + "epoch": 12.820795588814494, + "grad_norm": 0.19286060333251953, + "learning_rate": 2.7908e-05, + "loss": 0.0145, + "step": 16282 + }, + { + "epoch": 12.821583300512012, + "grad_norm": 0.15269625186920166, + "learning_rate": 2.7907666666666667e-05, + "loss": 0.0045, + "step": 16283 + }, + { + "epoch": 12.822371012209532, + "grad_norm": 0.39671164751052856, + "learning_rate": 2.7907333333333333e-05, + "loss": 0.0121, + "step": 16284 + }, + { + "epoch": 12.82315872390705, + "grad_norm": 0.17004738748073578, + "learning_rate": 2.7907000000000002e-05, + "loss": 0.0082, + "step": 16285 + }, + { + "epoch": 12.823946435604569, + "grad_norm": 0.2750309705734253, + "learning_rate": 2.7906666666666665e-05, + "loss": 
0.0094, + "step": 16286 + }, + { + "epoch": 12.824734147302088, + "grad_norm": 0.12773461639881134, + "learning_rate": 2.7906333333333334e-05, + "loss": 0.0058, + "step": 16287 + }, + { + "epoch": 12.825521858999606, + "grad_norm": 1.5936611890792847, + "learning_rate": 2.7906e-05, + "loss": 0.01, + "step": 16288 + }, + { + "epoch": 12.826309570697125, + "grad_norm": 0.5740278959274292, + "learning_rate": 2.7905666666666666e-05, + "loss": 0.021, + "step": 16289 + }, + { + "epoch": 12.827097282394643, + "grad_norm": 0.31807470321655273, + "learning_rate": 2.7905333333333335e-05, + "loss": 0.0207, + "step": 16290 + }, + { + "epoch": 12.827884994092162, + "grad_norm": 0.5423628091812134, + "learning_rate": 2.7905e-05, + "loss": 0.1699, + "step": 16291 + }, + { + "epoch": 12.828672705789682, + "grad_norm": 0.5456141829490662, + "learning_rate": 2.7904666666666667e-05, + "loss": 0.189, + "step": 16292 + }, + { + "epoch": 12.8294604174872, + "grad_norm": 0.6510260105133057, + "learning_rate": 2.7904333333333333e-05, + "loss": 0.2142, + "step": 16293 + }, + { + "epoch": 12.830248129184719, + "grad_norm": 0.40499091148376465, + "learning_rate": 2.7904000000000003e-05, + "loss": 0.1013, + "step": 16294 + }, + { + "epoch": 12.831035840882237, + "grad_norm": 0.41239312291145325, + "learning_rate": 2.7903666666666665e-05, + "loss": 0.046, + "step": 16295 + }, + { + "epoch": 12.831823552579756, + "grad_norm": 0.36266952753067017, + "learning_rate": 2.7903333333333334e-05, + "loss": 0.0393, + "step": 16296 + }, + { + "epoch": 12.832611264277274, + "grad_norm": 0.24149709939956665, + "learning_rate": 2.7903e-05, + "loss": 0.0145, + "step": 16297 + }, + { + "epoch": 12.833398975974793, + "grad_norm": 0.6834068894386292, + "learning_rate": 2.7902666666666666e-05, + "loss": 0.0289, + "step": 16298 + }, + { + "epoch": 12.834186687672313, + "grad_norm": 0.4533454179763794, + "learning_rate": 2.7902333333333336e-05, + "loss": 0.0177, + "step": 16299 + }, + { + "epoch": 12.83497439936983, + "grad_norm": 0.1835239678621292, + "learning_rate": 2.7902e-05, + "loss": 0.0117, + "step": 16300 + }, + { + "epoch": 12.83576211106735, + "grad_norm": 0.28847363591194153, + "learning_rate": 2.7901666666666667e-05, + "loss": 0.0154, + "step": 16301 + }, + { + "epoch": 12.836549822764868, + "grad_norm": 0.18481793999671936, + "learning_rate": 2.7901333333333333e-05, + "loss": 0.0194, + "step": 16302 + }, + { + "epoch": 12.837337534462387, + "grad_norm": 0.22818498313426971, + "learning_rate": 2.7901000000000003e-05, + "loss": 0.0187, + "step": 16303 + }, + { + "epoch": 12.838125246159905, + "grad_norm": 0.2421848028898239, + "learning_rate": 2.7900666666666665e-05, + "loss": 0.0123, + "step": 16304 + }, + { + "epoch": 12.838912957857424, + "grad_norm": 0.2993636727333069, + "learning_rate": 2.7900333333333335e-05, + "loss": 0.04, + "step": 16305 + }, + { + "epoch": 12.839700669554944, + "grad_norm": 0.7179872989654541, + "learning_rate": 2.79e-05, + "loss": 0.0139, + "step": 16306 + }, + { + "epoch": 12.840488381252461, + "grad_norm": 0.5812466144561768, + "learning_rate": 2.7899666666666666e-05, + "loss": 0.0122, + "step": 16307 + }, + { + "epoch": 12.84127609294998, + "grad_norm": 0.09676577150821686, + "learning_rate": 2.7899333333333336e-05, + "loss": 0.0037, + "step": 16308 + }, + { + "epoch": 12.842063804647498, + "grad_norm": 0.24050527811050415, + "learning_rate": 2.7899000000000002e-05, + "loss": 0.0069, + "step": 16309 + }, + { + "epoch": 12.842851516345018, + "grad_norm": 0.38300710916519165, + "learning_rate": 
2.7898666666666668e-05, + "loss": 0.0207, + "step": 16310 + }, + { + "epoch": 12.843639228042537, + "grad_norm": 0.26326778531074524, + "learning_rate": 2.7898333333333334e-05, + "loss": 0.0097, + "step": 16311 + }, + { + "epoch": 12.844426939740055, + "grad_norm": 0.1825352907180786, + "learning_rate": 2.7898e-05, + "loss": 0.0145, + "step": 16312 + }, + { + "epoch": 12.845214651437574, + "grad_norm": 0.18730947375297546, + "learning_rate": 2.7897666666666665e-05, + "loss": 0.0063, + "step": 16313 + }, + { + "epoch": 12.846002363135092, + "grad_norm": 0.17093981802463531, + "learning_rate": 2.7897333333333335e-05, + "loss": 0.0067, + "step": 16314 + }, + { + "epoch": 12.846790074832612, + "grad_norm": 0.1559232473373413, + "learning_rate": 2.7897e-05, + "loss": 0.0075, + "step": 16315 + }, + { + "epoch": 12.84757778653013, + "grad_norm": 0.21982669830322266, + "learning_rate": 2.7896666666666667e-05, + "loss": 0.0128, + "step": 16316 + }, + { + "epoch": 12.848365498227649, + "grad_norm": 0.11374326795339584, + "learning_rate": 2.7896333333333336e-05, + "loss": 0.0063, + "step": 16317 + }, + { + "epoch": 12.849153209925168, + "grad_norm": 0.20308370888233185, + "learning_rate": 2.7896e-05, + "loss": 0.0056, + "step": 16318 + }, + { + "epoch": 12.849940921622686, + "grad_norm": 0.2743051052093506, + "learning_rate": 2.7895666666666668e-05, + "loss": 0.0157, + "step": 16319 + }, + { + "epoch": 12.850728633320205, + "grad_norm": 0.19774556159973145, + "learning_rate": 2.7895333333333334e-05, + "loss": 0.0134, + "step": 16320 + }, + { + "epoch": 12.851516345017723, + "grad_norm": 0.20915627479553223, + "learning_rate": 2.7895e-05, + "loss": 0.0125, + "step": 16321 + }, + { + "epoch": 12.852304056715242, + "grad_norm": 0.39681142568588257, + "learning_rate": 2.7894666666666666e-05, + "loss": 0.0135, + "step": 16322 + }, + { + "epoch": 12.85309176841276, + "grad_norm": 0.20677880942821503, + "learning_rate": 2.7894333333333335e-05, + "loss": 0.0089, + "step": 16323 + }, + { + "epoch": 12.85387948011028, + "grad_norm": 0.17394936084747314, + "learning_rate": 2.7894e-05, + "loss": 0.0104, + "step": 16324 + }, + { + "epoch": 12.854667191807799, + "grad_norm": 0.17670466005802155, + "learning_rate": 2.7893666666666667e-05, + "loss": 0.0096, + "step": 16325 + }, + { + "epoch": 12.855454903505317, + "grad_norm": 0.2465471476316452, + "learning_rate": 2.7893333333333336e-05, + "loss": 0.011, + "step": 16326 + }, + { + "epoch": 12.856242615202836, + "grad_norm": 0.24707703292369843, + "learning_rate": 2.7893e-05, + "loss": 0.0082, + "step": 16327 + }, + { + "epoch": 12.857030326900354, + "grad_norm": 0.27372094988822937, + "learning_rate": 2.7892666666666668e-05, + "loss": 0.0084, + "step": 16328 + }, + { + "epoch": 12.857818038597873, + "grad_norm": 0.19996234774589539, + "learning_rate": 2.7892333333333334e-05, + "loss": 0.0076, + "step": 16329 + }, + { + "epoch": 12.858605750295393, + "grad_norm": 0.2641517221927643, + "learning_rate": 2.7892e-05, + "loss": 0.011, + "step": 16330 + }, + { + "epoch": 12.85939346199291, + "grad_norm": 2.9126720428466797, + "learning_rate": 2.7891666666666666e-05, + "loss": 0.0095, + "step": 16331 + }, + { + "epoch": 12.86018117369043, + "grad_norm": 0.26402711868286133, + "learning_rate": 2.7891333333333335e-05, + "loss": 0.015, + "step": 16332 + }, + { + "epoch": 12.860968885387948, + "grad_norm": 0.3931419253349304, + "learning_rate": 2.7891e-05, + "loss": 0.0115, + "step": 16333 + }, + { + "epoch": 12.861756597085467, + "grad_norm": 0.22469863295555115, + 
"learning_rate": 2.7890666666666667e-05, + "loss": 0.0078, + "step": 16334 + }, + { + "epoch": 12.862544308782985, + "grad_norm": 0.29407212138175964, + "learning_rate": 2.7890333333333336e-05, + "loss": 0.0133, + "step": 16335 + }, + { + "epoch": 12.863332020480504, + "grad_norm": 0.23493200540542603, + "learning_rate": 2.789e-05, + "loss": 0.0046, + "step": 16336 + }, + { + "epoch": 12.864119732178024, + "grad_norm": 0.19841881096363068, + "learning_rate": 2.7889666666666668e-05, + "loss": 0.0216, + "step": 16337 + }, + { + "epoch": 12.864907443875541, + "grad_norm": 0.3753519356250763, + "learning_rate": 2.7889333333333334e-05, + "loss": 0.0135, + "step": 16338 + }, + { + "epoch": 12.86569515557306, + "grad_norm": 0.35784441232681274, + "learning_rate": 2.7889e-05, + "loss": 0.0127, + "step": 16339 + }, + { + "epoch": 12.866482867270578, + "grad_norm": 0.3045785427093506, + "learning_rate": 2.788866666666667e-05, + "loss": 0.0095, + "step": 16340 + }, + { + "epoch": 12.867270578968098, + "grad_norm": 0.6615331172943115, + "learning_rate": 2.7888333333333335e-05, + "loss": 0.2052, + "step": 16341 + }, + { + "epoch": 12.868058290665616, + "grad_norm": 0.5886141657829285, + "learning_rate": 2.7888e-05, + "loss": 0.1134, + "step": 16342 + }, + { + "epoch": 12.868846002363135, + "grad_norm": 0.45329296588897705, + "learning_rate": 2.7887666666666667e-05, + "loss": 0.1351, + "step": 16343 + }, + { + "epoch": 12.869633714060654, + "grad_norm": 0.5017944574356079, + "learning_rate": 2.7887333333333333e-05, + "loss": 0.0963, + "step": 16344 + }, + { + "epoch": 12.870421425758172, + "grad_norm": 0.22943396866321564, + "learning_rate": 2.7887e-05, + "loss": 0.0373, + "step": 16345 + }, + { + "epoch": 12.871209137455692, + "grad_norm": 0.2523016035556793, + "learning_rate": 2.788666666666667e-05, + "loss": 0.0482, + "step": 16346 + }, + { + "epoch": 12.87199684915321, + "grad_norm": 0.24105985462665558, + "learning_rate": 2.788633333333333e-05, + "loss": 0.0264, + "step": 16347 + }, + { + "epoch": 12.872784560850729, + "grad_norm": 0.17998073995113373, + "learning_rate": 2.7886e-05, + "loss": 0.0162, + "step": 16348 + }, + { + "epoch": 12.873572272548248, + "grad_norm": 0.25902578234672546, + "learning_rate": 2.788566666666667e-05, + "loss": 0.0221, + "step": 16349 + }, + { + "epoch": 12.874359984245766, + "grad_norm": 0.12900738418102264, + "learning_rate": 2.7885333333333332e-05, + "loss": 0.0089, + "step": 16350 + }, + { + "epoch": 12.875147695943285, + "grad_norm": 0.20697256922721863, + "learning_rate": 2.7885e-05, + "loss": 0.0066, + "step": 16351 + }, + { + "epoch": 12.875935407640803, + "grad_norm": 0.181672140955925, + "learning_rate": 2.7884666666666667e-05, + "loss": 0.0096, + "step": 16352 + }, + { + "epoch": 12.876723119338322, + "grad_norm": 0.12628452479839325, + "learning_rate": 2.7884333333333333e-05, + "loss": 0.0097, + "step": 16353 + }, + { + "epoch": 12.87751083103584, + "grad_norm": 0.689086377620697, + "learning_rate": 2.7884e-05, + "loss": 0.0252, + "step": 16354 + }, + { + "epoch": 12.87829854273336, + "grad_norm": 0.13316090404987335, + "learning_rate": 2.788366666666667e-05, + "loss": 0.0068, + "step": 16355 + }, + { + "epoch": 12.879086254430879, + "grad_norm": 0.3702711760997772, + "learning_rate": 2.788333333333333e-05, + "loss": 0.0121, + "step": 16356 + }, + { + "epoch": 12.879873966128397, + "grad_norm": 0.23225486278533936, + "learning_rate": 2.7883e-05, + "loss": 0.015, + "step": 16357 + }, + { + "epoch": 12.880661677825916, + "grad_norm": 0.13526037335395813, + 
"learning_rate": 2.788266666666667e-05, + "loss": 0.0076, + "step": 16358 + }, + { + "epoch": 12.881449389523434, + "grad_norm": 0.9913129806518555, + "learning_rate": 2.7882333333333332e-05, + "loss": 0.013, + "step": 16359 + }, + { + "epoch": 12.882237101220953, + "grad_norm": 0.34149083495140076, + "learning_rate": 2.7882000000000002e-05, + "loss": 0.0103, + "step": 16360 + }, + { + "epoch": 12.883024812918471, + "grad_norm": 0.1725146621465683, + "learning_rate": 2.7881666666666668e-05, + "loss": 0.0081, + "step": 16361 + }, + { + "epoch": 12.88381252461599, + "grad_norm": 0.09893491119146347, + "learning_rate": 2.7881333333333334e-05, + "loss": 0.0034, + "step": 16362 + }, + { + "epoch": 12.88460023631351, + "grad_norm": 0.3163292407989502, + "learning_rate": 2.7881e-05, + "loss": 0.0154, + "step": 16363 + }, + { + "epoch": 12.885387948011028, + "grad_norm": 0.5582239031791687, + "learning_rate": 2.788066666666667e-05, + "loss": 0.0095, + "step": 16364 + }, + { + "epoch": 12.886175659708547, + "grad_norm": 0.22936509549617767, + "learning_rate": 2.7880333333333335e-05, + "loss": 0.0151, + "step": 16365 + }, + { + "epoch": 12.886963371406065, + "grad_norm": 0.44471511244773865, + "learning_rate": 2.788e-05, + "loss": 0.0159, + "step": 16366 + }, + { + "epoch": 12.887751083103584, + "grad_norm": 0.3717482388019562, + "learning_rate": 2.787966666666667e-05, + "loss": 0.013, + "step": 16367 + }, + { + "epoch": 12.888538794801104, + "grad_norm": 0.49130913615226746, + "learning_rate": 2.7879333333333333e-05, + "loss": 0.0099, + "step": 16368 + }, + { + "epoch": 12.889326506498621, + "grad_norm": 0.35102906823158264, + "learning_rate": 2.7879000000000002e-05, + "loss": 0.0146, + "step": 16369 + }, + { + "epoch": 12.89011421819614, + "grad_norm": 0.19278791546821594, + "learning_rate": 2.7878666666666668e-05, + "loss": 0.0105, + "step": 16370 + }, + { + "epoch": 12.890901929893658, + "grad_norm": 0.21106302738189697, + "learning_rate": 2.7878333333333334e-05, + "loss": 0.0104, + "step": 16371 + }, + { + "epoch": 12.891689641591178, + "grad_norm": 0.3362337648868561, + "learning_rate": 2.7878e-05, + "loss": 0.0099, + "step": 16372 + }, + { + "epoch": 12.892477353288696, + "grad_norm": 0.19145046174526215, + "learning_rate": 2.7877666666666666e-05, + "loss": 0.0102, + "step": 16373 + }, + { + "epoch": 12.893265064986215, + "grad_norm": 0.1618327498435974, + "learning_rate": 2.7877333333333335e-05, + "loss": 0.0089, + "step": 16374 + }, + { + "epoch": 12.894052776683735, + "grad_norm": 0.2532574534416199, + "learning_rate": 2.7877e-05, + "loss": 0.0171, + "step": 16375 + }, + { + "epoch": 12.894840488381252, + "grad_norm": 0.2875487506389618, + "learning_rate": 2.7876666666666667e-05, + "loss": 0.0145, + "step": 16376 + }, + { + "epoch": 12.895628200078772, + "grad_norm": 0.1680598109960556, + "learning_rate": 2.7876333333333333e-05, + "loss": 0.0069, + "step": 16377 + }, + { + "epoch": 12.89641591177629, + "grad_norm": 0.16030003130435944, + "learning_rate": 2.7876000000000002e-05, + "loss": 0.0053, + "step": 16378 + }, + { + "epoch": 12.897203623473809, + "grad_norm": 0.6148320436477661, + "learning_rate": 2.7875666666666665e-05, + "loss": 0.0116, + "step": 16379 + }, + { + "epoch": 12.897991335171326, + "grad_norm": 0.33363568782806396, + "learning_rate": 2.7875333333333334e-05, + "loss": 0.0214, + "step": 16380 + }, + { + "epoch": 12.898779046868846, + "grad_norm": 0.1601312905550003, + "learning_rate": 2.7875e-05, + "loss": 0.014, + "step": 16381 + }, + { + "epoch": 12.899566758566365, + 
"grad_norm": 0.16803225874900818, + "learning_rate": 2.7874666666666666e-05, + "loss": 0.0082, + "step": 16382 + }, + { + "epoch": 12.900354470263883, + "grad_norm": 0.24592739343643188, + "learning_rate": 2.7874333333333335e-05, + "loss": 0.0123, + "step": 16383 + }, + { + "epoch": 12.901142181961402, + "grad_norm": 0.2734162509441376, + "learning_rate": 2.7874e-05, + "loss": 0.011, + "step": 16384 + }, + { + "epoch": 12.90192989365892, + "grad_norm": 0.6908361911773682, + "learning_rate": 2.7873666666666667e-05, + "loss": 0.0263, + "step": 16385 + }, + { + "epoch": 12.90271760535644, + "grad_norm": 0.23468631505966187, + "learning_rate": 2.7873333333333333e-05, + "loss": 0.0106, + "step": 16386 + }, + { + "epoch": 12.903505317053959, + "grad_norm": 0.2001519352197647, + "learning_rate": 2.7873000000000002e-05, + "loss": 0.0073, + "step": 16387 + }, + { + "epoch": 12.904293028751477, + "grad_norm": 0.22226421535015106, + "learning_rate": 2.7872666666666665e-05, + "loss": 0.0095, + "step": 16388 + }, + { + "epoch": 12.905080740448996, + "grad_norm": 0.23972652852535248, + "learning_rate": 2.7872333333333334e-05, + "loss": 0.0082, + "step": 16389 + }, + { + "epoch": 12.905868452146514, + "grad_norm": 0.46415939927101135, + "learning_rate": 2.7872000000000004e-05, + "loss": 0.0174, + "step": 16390 + }, + { + "epoch": 12.906656163844033, + "grad_norm": 0.5465098023414612, + "learning_rate": 2.7871666666666666e-05, + "loss": 0.1823, + "step": 16391 + }, + { + "epoch": 12.907443875541551, + "grad_norm": 0.40326011180877686, + "learning_rate": 2.7871333333333335e-05, + "loss": 0.1884, + "step": 16392 + }, + { + "epoch": 12.90823158723907, + "grad_norm": 0.5776021480560303, + "learning_rate": 2.7871e-05, + "loss": 0.1225, + "step": 16393 + }, + { + "epoch": 12.90901929893659, + "grad_norm": 0.41944026947021484, + "learning_rate": 2.7870666666666667e-05, + "loss": 0.0689, + "step": 16394 + }, + { + "epoch": 12.909807010634108, + "grad_norm": 0.39367935061454773, + "learning_rate": 2.7870333333333333e-05, + "loss": 0.0533, + "step": 16395 + }, + { + "epoch": 12.910594722331627, + "grad_norm": 0.48624518513679504, + "learning_rate": 2.7870000000000003e-05, + "loss": 0.0662, + "step": 16396 + }, + { + "epoch": 12.911382434029145, + "grad_norm": 0.4259655177593231, + "learning_rate": 2.7869666666666665e-05, + "loss": 0.0156, + "step": 16397 + }, + { + "epoch": 12.912170145726664, + "grad_norm": 0.13819122314453125, + "learning_rate": 2.7869333333333334e-05, + "loss": 0.0108, + "step": 16398 + }, + { + "epoch": 12.912957857424182, + "grad_norm": 0.3567659854888916, + "learning_rate": 2.7869000000000004e-05, + "loss": 0.0401, + "step": 16399 + }, + { + "epoch": 12.913745569121701, + "grad_norm": 0.410005122423172, + "learning_rate": 2.7868666666666666e-05, + "loss": 0.0176, + "step": 16400 + }, + { + "epoch": 12.91453328081922, + "grad_norm": 0.2068573385477066, + "learning_rate": 2.7868333333333336e-05, + "loss": 0.0096, + "step": 16401 + }, + { + "epoch": 12.915320992516738, + "grad_norm": 0.2356121987104416, + "learning_rate": 2.7868e-05, + "loss": 0.0097, + "step": 16402 + }, + { + "epoch": 12.916108704214258, + "grad_norm": 0.20596855878829956, + "learning_rate": 2.7867666666666667e-05, + "loss": 0.0125, + "step": 16403 + }, + { + "epoch": 12.916896415911776, + "grad_norm": 0.1808975785970688, + "learning_rate": 2.7867333333333333e-05, + "loss": 0.0123, + "step": 16404 + }, + { + "epoch": 12.917684127609295, + "grad_norm": 0.2473451793193817, + "learning_rate": 2.7867e-05, + "loss": 0.01, + "step": 
16405 + }, + { + "epoch": 12.918471839306815, + "grad_norm": 0.29224735498428345, + "learning_rate": 2.7866666666666665e-05, + "loss": 0.0215, + "step": 16406 + }, + { + "epoch": 12.919259551004332, + "grad_norm": 0.09970398992300034, + "learning_rate": 2.7866333333333335e-05, + "loss": 0.0073, + "step": 16407 + }, + { + "epoch": 12.920047262701852, + "grad_norm": 0.1361275315284729, + "learning_rate": 2.7866e-05, + "loss": 0.009, + "step": 16408 + }, + { + "epoch": 12.92083497439937, + "grad_norm": 0.19991478323936462, + "learning_rate": 2.7865666666666666e-05, + "loss": 0.0076, + "step": 16409 + }, + { + "epoch": 12.921622686096889, + "grad_norm": 0.4790264368057251, + "learning_rate": 2.7865333333333336e-05, + "loss": 0.0166, + "step": 16410 + }, + { + "epoch": 12.922410397794406, + "grad_norm": 0.1969681680202484, + "learning_rate": 2.7865e-05, + "loss": 0.0112, + "step": 16411 + }, + { + "epoch": 12.923198109491926, + "grad_norm": 0.15173563361167908, + "learning_rate": 2.7864666666666668e-05, + "loss": 0.0166, + "step": 16412 + }, + { + "epoch": 12.923985821189445, + "grad_norm": 0.15229302644729614, + "learning_rate": 2.7864333333333334e-05, + "loss": 0.0104, + "step": 16413 + }, + { + "epoch": 12.924773532886963, + "grad_norm": 0.22190535068511963, + "learning_rate": 2.7864e-05, + "loss": 0.0109, + "step": 16414 + }, + { + "epoch": 12.925561244584483, + "grad_norm": 0.156368687748909, + "learning_rate": 2.786366666666667e-05, + "loss": 0.0049, + "step": 16415 + }, + { + "epoch": 12.926348956282, + "grad_norm": 0.10252214223146439, + "learning_rate": 2.7863333333333335e-05, + "loss": 0.0085, + "step": 16416 + }, + { + "epoch": 12.92713666797952, + "grad_norm": 0.12180536240339279, + "learning_rate": 2.7863e-05, + "loss": 0.0065, + "step": 16417 + }, + { + "epoch": 12.927924379677037, + "grad_norm": 0.2723405361175537, + "learning_rate": 2.7862666666666667e-05, + "loss": 0.0098, + "step": 16418 + }, + { + "epoch": 12.928712091374557, + "grad_norm": 0.19846530258655548, + "learning_rate": 2.7862333333333336e-05, + "loss": 0.0109, + "step": 16419 + }, + { + "epoch": 12.929499803072076, + "grad_norm": 0.2481425255537033, + "learning_rate": 2.7862e-05, + "loss": 0.0125, + "step": 16420 + }, + { + "epoch": 12.930287514769594, + "grad_norm": 0.2725277543067932, + "learning_rate": 2.7861666666666668e-05, + "loss": 0.0096, + "step": 16421 + }, + { + "epoch": 12.931075226467113, + "grad_norm": 0.22993507981300354, + "learning_rate": 2.7861333333333334e-05, + "loss": 0.01, + "step": 16422 + }, + { + "epoch": 12.931862938164631, + "grad_norm": 0.38457760214805603, + "learning_rate": 2.7861e-05, + "loss": 0.0134, + "step": 16423 + }, + { + "epoch": 12.93265064986215, + "grad_norm": 0.46220502257347107, + "learning_rate": 2.786066666666667e-05, + "loss": 0.0081, + "step": 16424 + }, + { + "epoch": 12.93343836155967, + "grad_norm": 0.2119682878255844, + "learning_rate": 2.7860333333333335e-05, + "loss": 0.0102, + "step": 16425 + }, + { + "epoch": 12.934226073257188, + "grad_norm": 0.17024119198322296, + "learning_rate": 2.786e-05, + "loss": 0.0111, + "step": 16426 + }, + { + "epoch": 12.935013784954707, + "grad_norm": 0.15429569780826569, + "learning_rate": 2.7859666666666667e-05, + "loss": 0.0112, + "step": 16427 + }, + { + "epoch": 12.935801496652225, + "grad_norm": 0.3251214623451233, + "learning_rate": 2.7859333333333336e-05, + "loss": 0.0112, + "step": 16428 + }, + { + "epoch": 12.936589208349744, + "grad_norm": 0.3960396647453308, + "learning_rate": 2.7859e-05, + "loss": 0.012, + "step": 
16429 + }, + { + "epoch": 12.937376920047262, + "grad_norm": 0.45859095454216003, + "learning_rate": 2.7858666666666668e-05, + "loss": 0.0169, + "step": 16430 + }, + { + "epoch": 12.938164631744781, + "grad_norm": 0.21267549693584442, + "learning_rate": 2.7858333333333334e-05, + "loss": 0.0069, + "step": 16431 + }, + { + "epoch": 12.9389523434423, + "grad_norm": 0.2327824980020523, + "learning_rate": 2.7858e-05, + "loss": 0.0129, + "step": 16432 + }, + { + "epoch": 12.939740055139819, + "grad_norm": 0.19917747378349304, + "learning_rate": 2.785766666666667e-05, + "loss": 0.0103, + "step": 16433 + }, + { + "epoch": 12.940527766837338, + "grad_norm": 0.4857499301433563, + "learning_rate": 2.7857333333333332e-05, + "loss": 0.0078, + "step": 16434 + }, + { + "epoch": 12.941315478534856, + "grad_norm": 0.29711446166038513, + "learning_rate": 2.7857e-05, + "loss": 0.0113, + "step": 16435 + }, + { + "epoch": 12.942103190232375, + "grad_norm": 0.1514996886253357, + "learning_rate": 2.7856666666666667e-05, + "loss": 0.0069, + "step": 16436 + }, + { + "epoch": 12.942890901929893, + "grad_norm": 0.6606900691986084, + "learning_rate": 2.7856333333333333e-05, + "loss": 0.016, + "step": 16437 + }, + { + "epoch": 12.943678613627412, + "grad_norm": 0.24846918880939484, + "learning_rate": 2.7856e-05, + "loss": 0.0073, + "step": 16438 + }, + { + "epoch": 12.944466325324932, + "grad_norm": 0.20981983840465546, + "learning_rate": 2.7855666666666668e-05, + "loss": 0.0151, + "step": 16439 + }, + { + "epoch": 12.94525403702245, + "grad_norm": 0.17133408784866333, + "learning_rate": 2.7855333333333334e-05, + "loss": 0.0079, + "step": 16440 + }, + { + "epoch": 12.946041748719969, + "grad_norm": 0.6065212488174438, + "learning_rate": 2.7855e-05, + "loss": 0.1497, + "step": 16441 + }, + { + "epoch": 12.946829460417487, + "grad_norm": 0.5443993210792542, + "learning_rate": 2.785466666666667e-05, + "loss": 0.1226, + "step": 16442 + }, + { + "epoch": 12.947617172115006, + "grad_norm": 0.5267118215560913, + "learning_rate": 2.7854333333333332e-05, + "loss": 0.157, + "step": 16443 + }, + { + "epoch": 12.948404883812525, + "grad_norm": 0.5786182284355164, + "learning_rate": 2.7854e-05, + "loss": 0.0935, + "step": 16444 + }, + { + "epoch": 12.949192595510043, + "grad_norm": 0.374318391084671, + "learning_rate": 2.7853666666666667e-05, + "loss": 0.0772, + "step": 16445 + }, + { + "epoch": 12.949980307207563, + "grad_norm": 0.34388333559036255, + "learning_rate": 2.7853333333333333e-05, + "loss": 0.0208, + "step": 16446 + }, + { + "epoch": 12.95076801890508, + "grad_norm": 0.2992127537727356, + "learning_rate": 2.7853e-05, + "loss": 0.0165, + "step": 16447 + }, + { + "epoch": 12.9515557306026, + "grad_norm": 0.21784071624279022, + "learning_rate": 2.785266666666667e-05, + "loss": 0.0141, + "step": 16448 + }, + { + "epoch": 12.952343442300117, + "grad_norm": 0.361820250749588, + "learning_rate": 2.7852333333333334e-05, + "loss": 0.0185, + "step": 16449 + }, + { + "epoch": 12.953131153997637, + "grad_norm": 0.21921953558921814, + "learning_rate": 2.7852e-05, + "loss": 0.0164, + "step": 16450 + }, + { + "epoch": 12.953918865695156, + "grad_norm": 0.2163253128528595, + "learning_rate": 2.785166666666667e-05, + "loss": 0.0121, + "step": 16451 + }, + { + "epoch": 12.954706577392674, + "grad_norm": 0.29706209897994995, + "learning_rate": 2.7851333333333332e-05, + "loss": 0.0146, + "step": 16452 + }, + { + "epoch": 12.955494289090193, + "grad_norm": 0.3205137252807617, + "learning_rate": 2.7851e-05, + "loss": 0.0114, + "step": 16453 
+ }, + { + "epoch": 12.956282000787711, + "grad_norm": 0.3330293595790863, + "learning_rate": 2.7850666666666667e-05, + "loss": 0.0068, + "step": 16454 + }, + { + "epoch": 12.95706971248523, + "grad_norm": 0.23072312772274017, + "learning_rate": 2.7850333333333333e-05, + "loss": 0.0126, + "step": 16455 + }, + { + "epoch": 12.957857424182748, + "grad_norm": 0.21912601590156555, + "learning_rate": 2.7850000000000003e-05, + "loss": 0.0138, + "step": 16456 + }, + { + "epoch": 12.958645135880268, + "grad_norm": 0.19703595340251923, + "learning_rate": 2.784966666666667e-05, + "loss": 0.0142, + "step": 16457 + }, + { + "epoch": 12.959432847577787, + "grad_norm": 0.5420488119125366, + "learning_rate": 2.7849333333333335e-05, + "loss": 0.0191, + "step": 16458 + }, + { + "epoch": 12.960220559275305, + "grad_norm": 0.15310071408748627, + "learning_rate": 2.7849e-05, + "loss": 0.0102, + "step": 16459 + }, + { + "epoch": 12.961008270972824, + "grad_norm": 0.17502103745937347, + "learning_rate": 2.784866666666667e-05, + "loss": 0.0101, + "step": 16460 + }, + { + "epoch": 12.961795982670342, + "grad_norm": 0.20086658000946045, + "learning_rate": 2.7848333333333332e-05, + "loss": 0.0121, + "step": 16461 + }, + { + "epoch": 12.962583694367861, + "grad_norm": 0.15269635617733002, + "learning_rate": 2.7848000000000002e-05, + "loss": 0.0065, + "step": 16462 + }, + { + "epoch": 12.96337140606538, + "grad_norm": 0.14025206863880157, + "learning_rate": 2.7847666666666664e-05, + "loss": 0.0052, + "step": 16463 + }, + { + "epoch": 12.964159117762899, + "grad_norm": 0.1705653816461563, + "learning_rate": 2.7847333333333334e-05, + "loss": 0.007, + "step": 16464 + }, + { + "epoch": 12.964946829460418, + "grad_norm": 0.16623857617378235, + "learning_rate": 2.7847000000000003e-05, + "loss": 0.0067, + "step": 16465 + }, + { + "epoch": 12.965734541157936, + "grad_norm": 0.3067316710948944, + "learning_rate": 2.7846666666666665e-05, + "loss": 0.0395, + "step": 16466 + }, + { + "epoch": 12.966522252855455, + "grad_norm": 0.09911314398050308, + "learning_rate": 2.7846333333333335e-05, + "loss": 0.0051, + "step": 16467 + }, + { + "epoch": 12.967309964552973, + "grad_norm": 0.22238941490650177, + "learning_rate": 2.7846e-05, + "loss": 0.0116, + "step": 16468 + }, + { + "epoch": 12.968097676250492, + "grad_norm": 0.12639546394348145, + "learning_rate": 2.7845666666666667e-05, + "loss": 0.0054, + "step": 16469 + }, + { + "epoch": 12.968885387948012, + "grad_norm": 0.19734053313732147, + "learning_rate": 2.7845333333333333e-05, + "loss": 0.0123, + "step": 16470 + }, + { + "epoch": 12.96967309964553, + "grad_norm": 0.12115045636892319, + "learning_rate": 2.7845000000000002e-05, + "loss": 0.0084, + "step": 16471 + }, + { + "epoch": 12.970460811343049, + "grad_norm": 0.1334354132413864, + "learning_rate": 2.7844666666666664e-05, + "loss": 0.0061, + "step": 16472 + }, + { + "epoch": 12.971248523040567, + "grad_norm": 0.4130747318267822, + "learning_rate": 2.7844333333333334e-05, + "loss": 0.0119, + "step": 16473 + }, + { + "epoch": 12.972036234738086, + "grad_norm": 0.25943365693092346, + "learning_rate": 2.7844000000000003e-05, + "loss": 0.0092, + "step": 16474 + }, + { + "epoch": 12.972823946435604, + "grad_norm": 0.21500858664512634, + "learning_rate": 2.7843666666666666e-05, + "loss": 0.0084, + "step": 16475 + }, + { + "epoch": 12.973611658133123, + "grad_norm": 0.4343121349811554, + "learning_rate": 2.7843333333333335e-05, + "loss": 0.0063, + "step": 16476 + }, + { + "epoch": 12.974399369830643, + "grad_norm": 
0.19826805591583252, + "learning_rate": 2.7843e-05, + "loss": 0.0088, + "step": 16477 + }, + { + "epoch": 12.97518708152816, + "grad_norm": 0.24527201056480408, + "learning_rate": 2.7842666666666667e-05, + "loss": 0.0101, + "step": 16478 + }, + { + "epoch": 12.97597479322568, + "grad_norm": 0.14085884392261505, + "learning_rate": 2.7842333333333333e-05, + "loss": 0.0051, + "step": 16479 + }, + { + "epoch": 12.976762504923197, + "grad_norm": 0.36385735869407654, + "learning_rate": 2.7842000000000002e-05, + "loss": 0.0136, + "step": 16480 + }, + { + "epoch": 12.977550216620717, + "grad_norm": 0.17945697903633118, + "learning_rate": 2.7841666666666668e-05, + "loss": 0.006, + "step": 16481 + }, + { + "epoch": 12.978337928318236, + "grad_norm": 0.15687620639801025, + "learning_rate": 2.7841333333333334e-05, + "loss": 0.004, + "step": 16482 + }, + { + "epoch": 12.979125640015754, + "grad_norm": 0.10926385968923569, + "learning_rate": 2.7841000000000003e-05, + "loss": 0.0036, + "step": 16483 + }, + { + "epoch": 12.979913351713273, + "grad_norm": 0.4520625174045563, + "learning_rate": 2.7840666666666666e-05, + "loss": 0.0114, + "step": 16484 + }, + { + "epoch": 12.980701063410791, + "grad_norm": 0.41368475556373596, + "learning_rate": 2.7840333333333335e-05, + "loss": 0.0086, + "step": 16485 + }, + { + "epoch": 12.98148877510831, + "grad_norm": 0.26758047938346863, + "learning_rate": 2.784e-05, + "loss": 0.0112, + "step": 16486 + }, + { + "epoch": 12.982276486805828, + "grad_norm": 0.4544539749622345, + "learning_rate": 2.7839666666666667e-05, + "loss": 0.0129, + "step": 16487 + }, + { + "epoch": 12.983064198503348, + "grad_norm": 0.14942175149917603, + "learning_rate": 2.7839333333333333e-05, + "loss": 0.0028, + "step": 16488 + }, + { + "epoch": 12.983851910200867, + "grad_norm": 0.29085591435432434, + "learning_rate": 2.7839000000000002e-05, + "loss": 0.0076, + "step": 16489 + }, + { + "epoch": 12.984639621898385, + "grad_norm": 0.2215236872434616, + "learning_rate": 2.7838666666666668e-05, + "loss": 0.0124, + "step": 16490 + }, + { + "epoch": 12.985427333595904, + "grad_norm": 0.3877023458480835, + "learning_rate": 2.7838333333333334e-05, + "loss": 0.0946, + "step": 16491 + }, + { + "epoch": 12.986215045293422, + "grad_norm": 0.40295645594596863, + "learning_rate": 2.7838000000000004e-05, + "loss": 0.0685, + "step": 16492 + }, + { + "epoch": 12.987002756990941, + "grad_norm": 0.14172525703907013, + "learning_rate": 2.7837666666666666e-05, + "loss": 0.0148, + "step": 16493 + }, + { + "epoch": 12.987790468688459, + "grad_norm": 0.17370949685573578, + "learning_rate": 2.7837333333333335e-05, + "loss": 0.0351, + "step": 16494 + }, + { + "epoch": 12.988578180385979, + "grad_norm": 0.16817966103553772, + "learning_rate": 2.7836999999999998e-05, + "loss": 0.0079, + "step": 16495 + }, + { + "epoch": 12.989365892083498, + "grad_norm": 0.22745974361896515, + "learning_rate": 2.7836666666666667e-05, + "loss": 0.009, + "step": 16496 + }, + { + "epoch": 12.990153603781016, + "grad_norm": 0.13803990185260773, + "learning_rate": 2.7836333333333333e-05, + "loss": 0.0067, + "step": 16497 + }, + { + "epoch": 12.990941315478535, + "grad_norm": 0.2903100550174713, + "learning_rate": 2.7836e-05, + "loss": 0.0101, + "step": 16498 + }, + { + "epoch": 12.991729027176053, + "grad_norm": 0.20229274034500122, + "learning_rate": 2.783566666666667e-05, + "loss": 0.0069, + "step": 16499 + }, + { + "epoch": 12.992516738873572, + "grad_norm": 0.33769750595092773, + "learning_rate": 2.7835333333333334e-05, + "loss": 0.0106, + 
"step": 16500 + }, + { + "epoch": 12.993304450571092, + "grad_norm": 0.25203222036361694, + "learning_rate": 2.7835e-05, + "loss": 0.0159, + "step": 16501 + }, + { + "epoch": 12.99409216226861, + "grad_norm": 0.17414802312850952, + "learning_rate": 2.7834666666666666e-05, + "loss": 0.011, + "step": 16502 + }, + { + "epoch": 12.994879873966129, + "grad_norm": 0.35557496547698975, + "learning_rate": 2.7834333333333336e-05, + "loss": 0.0063, + "step": 16503 + }, + { + "epoch": 12.995667585663647, + "grad_norm": 0.21258017420768738, + "learning_rate": 2.7833999999999998e-05, + "loss": 0.0123, + "step": 16504 + }, + { + "epoch": 12.996455297361166, + "grad_norm": 0.1690303385257721, + "learning_rate": 2.7833666666666667e-05, + "loss": 0.006, + "step": 16505 + }, + { + "epoch": 12.997243009058685, + "grad_norm": 0.22378137707710266, + "learning_rate": 2.7833333333333337e-05, + "loss": 0.0106, + "step": 16506 + }, + { + "epoch": 12.998030720756203, + "grad_norm": 0.32908567786216736, + "learning_rate": 2.7833e-05, + "loss": 0.0106, + "step": 16507 + }, + { + "epoch": 12.998818432453723, + "grad_norm": 0.18427802622318268, + "learning_rate": 2.783266666666667e-05, + "loss": 0.0062, + "step": 16508 + }, + { + "epoch": 12.99960614415124, + "grad_norm": 0.37017470598220825, + "learning_rate": 2.7832333333333335e-05, + "loss": 0.0091, + "step": 16509 + }, + { + "epoch": 13.0, + "grad_norm": 0.27130159735679626, + "learning_rate": 2.7832e-05, + "loss": 0.0029, + "step": 16510 + }, + { + "epoch": 13.00078771169752, + "grad_norm": 0.4760155975818634, + "learning_rate": 2.7831666666666666e-05, + "loss": 0.1437, + "step": 16511 + }, + { + "epoch": 13.001575423395037, + "grad_norm": 0.46373727917671204, + "learning_rate": 2.7831333333333336e-05, + "loss": 0.1484, + "step": 16512 + }, + { + "epoch": 13.002363135092557, + "grad_norm": 0.3870587646961212, + "learning_rate": 2.7831e-05, + "loss": 0.0989, + "step": 16513 + }, + { + "epoch": 13.003150846790074, + "grad_norm": 0.27275070548057556, + "learning_rate": 2.7830666666666668e-05, + "loss": 0.0603, + "step": 16514 + }, + { + "epoch": 13.003938558487594, + "grad_norm": 0.3353940546512604, + "learning_rate": 2.7830333333333337e-05, + "loss": 0.0917, + "step": 16515 + }, + { + "epoch": 13.004726270185111, + "grad_norm": 0.2748880684375763, + "learning_rate": 2.783e-05, + "loss": 0.0294, + "step": 16516 + }, + { + "epoch": 13.00551398188263, + "grad_norm": 0.13664080202579498, + "learning_rate": 2.782966666666667e-05, + "loss": 0.0099, + "step": 16517 + }, + { + "epoch": 13.00630169358015, + "grad_norm": 0.22246350347995758, + "learning_rate": 2.7829333333333335e-05, + "loss": 0.0325, + "step": 16518 + }, + { + "epoch": 13.007089405277668, + "grad_norm": 0.18438342213630676, + "learning_rate": 2.7829e-05, + "loss": 0.0252, + "step": 16519 + }, + { + "epoch": 13.007877116975187, + "grad_norm": 0.14170585572719574, + "learning_rate": 2.7828666666666667e-05, + "loss": 0.0125, + "step": 16520 + }, + { + "epoch": 13.008664828672705, + "grad_norm": 0.1996072381734848, + "learning_rate": 2.7828333333333336e-05, + "loss": 0.0158, + "step": 16521 + }, + { + "epoch": 13.009452540370225, + "grad_norm": 0.2714686095714569, + "learning_rate": 2.7828e-05, + "loss": 0.0295, + "step": 16522 + }, + { + "epoch": 13.010240252067744, + "grad_norm": 0.14584006369113922, + "learning_rate": 2.7827666666666668e-05, + "loss": 0.0075, + "step": 16523 + }, + { + "epoch": 13.011027963765262, + "grad_norm": 0.09622644633054733, + "learning_rate": 2.7827333333333334e-05, + "loss": 0.0232, 
+ "step": 16524 + }, + { + "epoch": 13.011815675462781, + "grad_norm": 0.15975446999073029, + "learning_rate": 2.7827e-05, + "loss": 0.0073, + "step": 16525 + }, + { + "epoch": 13.012603387160299, + "grad_norm": 0.49411237239837646, + "learning_rate": 2.782666666666667e-05, + "loss": 0.0095, + "step": 16526 + }, + { + "epoch": 13.013391098857818, + "grad_norm": 0.097672238945961, + "learning_rate": 2.782633333333333e-05, + "loss": 0.0063, + "step": 16527 + }, + { + "epoch": 13.014178810555336, + "grad_norm": 0.2313781976699829, + "learning_rate": 2.7826e-05, + "loss": 0.0117, + "step": 16528 + }, + { + "epoch": 13.014966522252855, + "grad_norm": 0.0839649885892868, + "learning_rate": 2.7825666666666667e-05, + "loss": 0.0026, + "step": 16529 + }, + { + "epoch": 13.015754233950375, + "grad_norm": 0.15469568967819214, + "learning_rate": 2.7825333333333333e-05, + "loss": 0.0108, + "step": 16530 + }, + { + "epoch": 13.016541945647893, + "grad_norm": 0.19707465171813965, + "learning_rate": 2.7825000000000002e-05, + "loss": 0.0139, + "step": 16531 + }, + { + "epoch": 13.017329657345412, + "grad_norm": 0.10943644493818283, + "learning_rate": 2.7824666666666668e-05, + "loss": 0.0059, + "step": 16532 + }, + { + "epoch": 13.01811736904293, + "grad_norm": 0.0852910503745079, + "learning_rate": 2.7824333333333334e-05, + "loss": 0.0042, + "step": 16533 + }, + { + "epoch": 13.01890508074045, + "grad_norm": 0.14544881880283356, + "learning_rate": 2.7824e-05, + "loss": 0.0052, + "step": 16534 + }, + { + "epoch": 13.019692792437967, + "grad_norm": 0.16183209419250488, + "learning_rate": 2.782366666666667e-05, + "loss": 0.0089, + "step": 16535 + }, + { + "epoch": 13.020480504135486, + "grad_norm": 0.2420482039451599, + "learning_rate": 2.7823333333333332e-05, + "loss": 0.0092, + "step": 16536 + }, + { + "epoch": 13.021268215833006, + "grad_norm": 0.7131679654121399, + "learning_rate": 2.7823e-05, + "loss": 0.011, + "step": 16537 + }, + { + "epoch": 13.022055927530523, + "grad_norm": 0.4859992265701294, + "learning_rate": 2.7822666666666667e-05, + "loss": 0.0077, + "step": 16538 + }, + { + "epoch": 13.022843639228043, + "grad_norm": 0.15981416404247284, + "learning_rate": 2.7822333333333333e-05, + "loss": 0.01, + "step": 16539 + }, + { + "epoch": 13.02363135092556, + "grad_norm": 0.16674327850341797, + "learning_rate": 2.7822000000000002e-05, + "loss": 0.009, + "step": 16540 + }, + { + "epoch": 13.02441906262308, + "grad_norm": 0.15713705122470856, + "learning_rate": 2.7821666666666668e-05, + "loss": 0.0063, + "step": 16541 + }, + { + "epoch": 13.0252067743206, + "grad_norm": 0.09975647181272507, + "learning_rate": 2.7821333333333334e-05, + "loss": 0.0062, + "step": 16542 + }, + { + "epoch": 13.025994486018117, + "grad_norm": 0.24120955169200897, + "learning_rate": 2.7821e-05, + "loss": 0.0105, + "step": 16543 + }, + { + "epoch": 13.026782197715637, + "grad_norm": 0.09998119622468948, + "learning_rate": 2.782066666666667e-05, + "loss": 0.0033, + "step": 16544 + }, + { + "epoch": 13.027569909413154, + "grad_norm": 0.24667073786258698, + "learning_rate": 2.7820333333333332e-05, + "loss": 0.0063, + "step": 16545 + }, + { + "epoch": 13.028357621110674, + "grad_norm": 0.24447765946388245, + "learning_rate": 2.782e-05, + "loss": 0.0097, + "step": 16546 + }, + { + "epoch": 13.029145332808191, + "grad_norm": 0.17121350765228271, + "learning_rate": 2.7819666666666667e-05, + "loss": 0.0083, + "step": 16547 + }, + { + "epoch": 13.02993304450571, + "grad_norm": 0.20003128051757812, + "learning_rate": 
2.7819333333333333e-05, + "loss": 0.0085, + "step": 16548 + }, + { + "epoch": 13.03072075620323, + "grad_norm": 0.12738974392414093, + "learning_rate": 2.7819000000000002e-05, + "loss": 0.0047, + "step": 16549 + }, + { + "epoch": 13.031508467900748, + "grad_norm": 0.47738757729530334, + "learning_rate": 2.781866666666667e-05, + "loss": 0.0153, + "step": 16550 + }, + { + "epoch": 13.032296179598267, + "grad_norm": 0.18989306688308716, + "learning_rate": 2.7818333333333334e-05, + "loss": 0.0071, + "step": 16551 + }, + { + "epoch": 13.033083891295785, + "grad_norm": 0.2065511792898178, + "learning_rate": 2.7818e-05, + "loss": 0.0047, + "step": 16552 + }, + { + "epoch": 13.033871602993305, + "grad_norm": 0.2844446897506714, + "learning_rate": 2.781766666666667e-05, + "loss": 0.0086, + "step": 16553 + }, + { + "epoch": 13.034659314690822, + "grad_norm": 0.19795364141464233, + "learning_rate": 2.7817333333333332e-05, + "loss": 0.0082, + "step": 16554 + }, + { + "epoch": 13.035447026388342, + "grad_norm": 0.3587705194950104, + "learning_rate": 2.7817e-05, + "loss": 0.0066, + "step": 16555 + }, + { + "epoch": 13.036234738085861, + "grad_norm": 0.12816786766052246, + "learning_rate": 2.7816666666666667e-05, + "loss": 0.0042, + "step": 16556 + }, + { + "epoch": 13.037022449783379, + "grad_norm": 0.20537735521793365, + "learning_rate": 2.7816333333333333e-05, + "loss": 0.0079, + "step": 16557 + }, + { + "epoch": 13.037810161480898, + "grad_norm": 0.18494191765785217, + "learning_rate": 2.7816000000000003e-05, + "loss": 0.0065, + "step": 16558 + }, + { + "epoch": 13.038597873178416, + "grad_norm": 0.5272058844566345, + "learning_rate": 2.7815666666666665e-05, + "loss": 0.011, + "step": 16559 + }, + { + "epoch": 13.039385584875935, + "grad_norm": 0.28033190965652466, + "learning_rate": 2.7815333333333335e-05, + "loss": 0.0095, + "step": 16560 + }, + { + "epoch": 13.040173296573455, + "grad_norm": 0.6700624823570251, + "learning_rate": 2.7815e-05, + "loss": 0.159, + "step": 16561 + }, + { + "epoch": 13.040961008270973, + "grad_norm": 0.3681842088699341, + "learning_rate": 2.7814666666666666e-05, + "loss": 0.0944, + "step": 16562 + }, + { + "epoch": 13.041748719968492, + "grad_norm": 0.31497278809547424, + "learning_rate": 2.7814333333333332e-05, + "loss": 0.0965, + "step": 16563 + }, + { + "epoch": 13.04253643166601, + "grad_norm": 0.3661981225013733, + "learning_rate": 2.7814000000000002e-05, + "loss": 0.0713, + "step": 16564 + }, + { + "epoch": 13.04332414336353, + "grad_norm": 0.5247658491134644, + "learning_rate": 2.7813666666666668e-05, + "loss": 0.0465, + "step": 16565 + }, + { + "epoch": 13.044111855061047, + "grad_norm": 0.17638856172561646, + "learning_rate": 2.7813333333333334e-05, + "loss": 0.0151, + "step": 16566 + }, + { + "epoch": 13.044899566758566, + "grad_norm": 0.23529669642448425, + "learning_rate": 2.7813000000000003e-05, + "loss": 0.0339, + "step": 16567 + }, + { + "epoch": 13.045687278456086, + "grad_norm": 0.2248692363500595, + "learning_rate": 2.7812666666666665e-05, + "loss": 0.0158, + "step": 16568 + }, + { + "epoch": 13.046474990153603, + "grad_norm": 0.350913941860199, + "learning_rate": 2.7812333333333335e-05, + "loss": 0.0166, + "step": 16569 + }, + { + "epoch": 13.047262701851123, + "grad_norm": 0.2523869574069977, + "learning_rate": 2.7812e-05, + "loss": 0.0136, + "step": 16570 + }, + { + "epoch": 13.04805041354864, + "grad_norm": 0.13504792749881744, + "learning_rate": 2.7811666666666667e-05, + "loss": 0.0092, + "step": 16571 + }, + { + "epoch": 13.04883812524616, + 
"grad_norm": 0.5104215741157532, + "learning_rate": 2.7811333333333333e-05, + "loss": 0.0089, + "step": 16572 + }, + { + "epoch": 13.04962583694368, + "grad_norm": 0.1710074096918106, + "learning_rate": 2.7811000000000002e-05, + "loss": 0.0074, + "step": 16573 + }, + { + "epoch": 13.050413548641197, + "grad_norm": 0.14365485310554504, + "learning_rate": 2.7810666666666668e-05, + "loss": 0.0096, + "step": 16574 + }, + { + "epoch": 13.051201260338717, + "grad_norm": 0.21605277061462402, + "learning_rate": 2.7810333333333334e-05, + "loss": 0.0107, + "step": 16575 + }, + { + "epoch": 13.051988972036234, + "grad_norm": 0.2308778166770935, + "learning_rate": 2.7810000000000003e-05, + "loss": 0.008, + "step": 16576 + }, + { + "epoch": 13.052776683733754, + "grad_norm": 0.1952846348285675, + "learning_rate": 2.7809666666666666e-05, + "loss": 0.0076, + "step": 16577 + }, + { + "epoch": 13.053564395431271, + "grad_norm": 0.13011585175991058, + "learning_rate": 2.7809333333333335e-05, + "loss": 0.0079, + "step": 16578 + }, + { + "epoch": 13.054352107128791, + "grad_norm": 0.09018164873123169, + "learning_rate": 2.7809e-05, + "loss": 0.0042, + "step": 16579 + }, + { + "epoch": 13.05513981882631, + "grad_norm": 0.253857284784317, + "learning_rate": 2.7808666666666667e-05, + "loss": 0.0088, + "step": 16580 + }, + { + "epoch": 13.055927530523828, + "grad_norm": 0.4468587636947632, + "learning_rate": 2.7808333333333336e-05, + "loss": 0.0057, + "step": 16581 + }, + { + "epoch": 13.056715242221347, + "grad_norm": 0.18900273740291595, + "learning_rate": 2.7808000000000002e-05, + "loss": 0.0097, + "step": 16582 + }, + { + "epoch": 13.057502953918865, + "grad_norm": 0.24122466146945953, + "learning_rate": 2.7807666666666668e-05, + "loss": 0.0059, + "step": 16583 + }, + { + "epoch": 13.058290665616385, + "grad_norm": 0.17857787013053894, + "learning_rate": 2.7807333333333334e-05, + "loss": 0.0069, + "step": 16584 + }, + { + "epoch": 13.059078377313902, + "grad_norm": 0.2383933812379837, + "learning_rate": 2.7807e-05, + "loss": 0.0051, + "step": 16585 + }, + { + "epoch": 13.059866089011422, + "grad_norm": 0.24129289388656616, + "learning_rate": 2.7806666666666666e-05, + "loss": 0.0084, + "step": 16586 + }, + { + "epoch": 13.060653800708941, + "grad_norm": 0.13319090008735657, + "learning_rate": 2.7806333333333335e-05, + "loss": 0.0043, + "step": 16587 + }, + { + "epoch": 13.061441512406459, + "grad_norm": 0.22678513824939728, + "learning_rate": 2.7805999999999998e-05, + "loss": 0.0091, + "step": 16588 + }, + { + "epoch": 13.062229224103978, + "grad_norm": 0.15168558061122894, + "learning_rate": 2.7805666666666667e-05, + "loss": 0.0061, + "step": 16589 + }, + { + "epoch": 13.063016935801496, + "grad_norm": 0.15914343297481537, + "learning_rate": 2.7805333333333336e-05, + "loss": 0.0026, + "step": 16590 + }, + { + "epoch": 13.063804647499015, + "grad_norm": 0.11461188644170761, + "learning_rate": 2.7805e-05, + "loss": 0.0041, + "step": 16591 + }, + { + "epoch": 13.064592359196535, + "grad_norm": 0.28037020564079285, + "learning_rate": 2.7804666666666668e-05, + "loss": 0.009, + "step": 16592 + }, + { + "epoch": 13.065380070894053, + "grad_norm": 0.09737776964902878, + "learning_rate": 2.7804333333333334e-05, + "loss": 0.0045, + "step": 16593 + }, + { + "epoch": 13.066167782591572, + "grad_norm": 0.14166833460330963, + "learning_rate": 2.7804e-05, + "loss": 0.0073, + "step": 16594 + }, + { + "epoch": 13.06695549428909, + "grad_norm": 0.09864964336156845, + "learning_rate": 2.7803666666666666e-05, + "loss": 0.0038, + 
"step": 16595 + }, + { + "epoch": 13.06774320598661, + "grad_norm": 0.07350867986679077, + "learning_rate": 2.7803333333333335e-05, + "loss": 0.0042, + "step": 16596 + }, + { + "epoch": 13.068530917684127, + "grad_norm": 0.1211908608675003, + "learning_rate": 2.7802999999999998e-05, + "loss": 0.0052, + "step": 16597 + }, + { + "epoch": 13.069318629381646, + "grad_norm": 0.38074007630348206, + "learning_rate": 2.7802666666666667e-05, + "loss": 0.0214, + "step": 16598 + }, + { + "epoch": 13.070106341079166, + "grad_norm": 0.0925106480717659, + "learning_rate": 2.7802333333333337e-05, + "loss": 0.0039, + "step": 16599 + }, + { + "epoch": 13.070894052776683, + "grad_norm": 0.10683818906545639, + "learning_rate": 2.7802e-05, + "loss": 0.0042, + "step": 16600 + }, + { + "epoch": 13.071681764474203, + "grad_norm": 0.14847558736801147, + "learning_rate": 2.780166666666667e-05, + "loss": 0.0047, + "step": 16601 + }, + { + "epoch": 13.07246947617172, + "grad_norm": 0.19963422417640686, + "learning_rate": 2.7801333333333334e-05, + "loss": 0.0063, + "step": 16602 + }, + { + "epoch": 13.07325718786924, + "grad_norm": 0.23478394746780396, + "learning_rate": 2.7801e-05, + "loss": 0.008, + "step": 16603 + }, + { + "epoch": 13.074044899566758, + "grad_norm": 0.453490287065506, + "learning_rate": 2.7800666666666666e-05, + "loss": 0.0085, + "step": 16604 + }, + { + "epoch": 13.074832611264277, + "grad_norm": 0.27510643005371094, + "learning_rate": 2.7800333333333336e-05, + "loss": 0.0104, + "step": 16605 + }, + { + "epoch": 13.075620322961797, + "grad_norm": 0.31386470794677734, + "learning_rate": 2.78e-05, + "loss": 0.0089, + "step": 16606 + }, + { + "epoch": 13.076408034659314, + "grad_norm": 0.3799976706504822, + "learning_rate": 2.7799666666666667e-05, + "loss": 0.0058, + "step": 16607 + }, + { + "epoch": 13.077195746356834, + "grad_norm": 0.23350022733211517, + "learning_rate": 2.7799333333333337e-05, + "loss": 0.0069, + "step": 16608 + }, + { + "epoch": 13.077983458054351, + "grad_norm": 0.46341997385025024, + "learning_rate": 2.7799e-05, + "loss": 0.0167, + "step": 16609 + }, + { + "epoch": 13.078771169751871, + "grad_norm": 0.3220592141151428, + "learning_rate": 2.779866666666667e-05, + "loss": 0.0082, + "step": 16610 + }, + { + "epoch": 13.07955888144939, + "grad_norm": 0.48825299739837646, + "learning_rate": 2.7798333333333335e-05, + "loss": 0.1357, + "step": 16611 + }, + { + "epoch": 13.080346593146908, + "grad_norm": 0.5059518814086914, + "learning_rate": 2.7798e-05, + "loss": 0.106, + "step": 16612 + }, + { + "epoch": 13.081134304844428, + "grad_norm": 0.6527137160301208, + "learning_rate": 2.7797666666666666e-05, + "loss": 0.0773, + "step": 16613 + }, + { + "epoch": 13.081922016541945, + "grad_norm": 0.3964948356151581, + "learning_rate": 2.7797333333333332e-05, + "loss": 0.0717, + "step": 16614 + }, + { + "epoch": 13.082709728239465, + "grad_norm": 0.3143252432346344, + "learning_rate": 2.7797e-05, + "loss": 0.0669, + "step": 16615 + }, + { + "epoch": 13.083497439936982, + "grad_norm": 0.16031400859355927, + "learning_rate": 2.7796666666666668e-05, + "loss": 0.0255, + "step": 16616 + }, + { + "epoch": 13.084285151634502, + "grad_norm": 0.1997620165348053, + "learning_rate": 2.7796333333333334e-05, + "loss": 0.0077, + "step": 16617 + }, + { + "epoch": 13.085072863332021, + "grad_norm": 0.22332613170146942, + "learning_rate": 2.7796e-05, + "loss": 0.0199, + "step": 16618 + }, + { + "epoch": 13.085860575029539, + "grad_norm": 0.21396009624004364, + "learning_rate": 2.779566666666667e-05, + 
"loss": 0.0073, + "step": 16619 + }, + { + "epoch": 13.086648286727058, + "grad_norm": 0.18583568930625916, + "learning_rate": 2.779533333333333e-05, + "loss": 0.0117, + "step": 16620 + }, + { + "epoch": 13.087435998424576, + "grad_norm": 0.15738236904144287, + "learning_rate": 2.7795e-05, + "loss": 0.0088, + "step": 16621 + }, + { + "epoch": 13.088223710122096, + "grad_norm": 0.09543132781982422, + "learning_rate": 2.7794666666666667e-05, + "loss": 0.0027, + "step": 16622 + }, + { + "epoch": 13.089011421819613, + "grad_norm": 0.12631812691688538, + "learning_rate": 2.7794333333333333e-05, + "loss": 0.0061, + "step": 16623 + }, + { + "epoch": 13.089799133517133, + "grad_norm": 0.2790418863296509, + "learning_rate": 2.7794000000000002e-05, + "loss": 0.0128, + "step": 16624 + }, + { + "epoch": 13.090586845214652, + "grad_norm": 0.16102896630764008, + "learning_rate": 2.7793666666666668e-05, + "loss": 0.0075, + "step": 16625 + }, + { + "epoch": 13.09137455691217, + "grad_norm": 0.21209992468357086, + "learning_rate": 2.7793333333333334e-05, + "loss": 0.0069, + "step": 16626 + }, + { + "epoch": 13.09216226860969, + "grad_norm": 0.06272324919700623, + "learning_rate": 2.7793e-05, + "loss": 0.0027, + "step": 16627 + }, + { + "epoch": 13.092949980307207, + "grad_norm": 0.1552603840827942, + "learning_rate": 2.779266666666667e-05, + "loss": 0.0081, + "step": 16628 + }, + { + "epoch": 13.093737692004726, + "grad_norm": 0.1348915547132492, + "learning_rate": 2.779233333333333e-05, + "loss": 0.0088, + "step": 16629 + }, + { + "epoch": 13.094525403702246, + "grad_norm": 0.1139843612909317, + "learning_rate": 2.7792e-05, + "loss": 0.0069, + "step": 16630 + }, + { + "epoch": 13.095313115399764, + "grad_norm": 0.16070955991744995, + "learning_rate": 2.779166666666667e-05, + "loss": 0.0067, + "step": 16631 + }, + { + "epoch": 13.096100827097283, + "grad_norm": 0.14205236732959747, + "learning_rate": 2.7791333333333333e-05, + "loss": 0.0097, + "step": 16632 + }, + { + "epoch": 13.0968885387948, + "grad_norm": 0.12148372828960419, + "learning_rate": 2.7791000000000002e-05, + "loss": 0.0062, + "step": 16633 + }, + { + "epoch": 13.09767625049232, + "grad_norm": 0.11061045527458191, + "learning_rate": 2.7790666666666668e-05, + "loss": 0.006, + "step": 16634 + }, + { + "epoch": 13.098463962189838, + "grad_norm": 0.09097205847501755, + "learning_rate": 2.7790333333333334e-05, + "loss": 0.0063, + "step": 16635 + }, + { + "epoch": 13.099251673887357, + "grad_norm": 0.11991500854492188, + "learning_rate": 2.779e-05, + "loss": 0.0076, + "step": 16636 + }, + { + "epoch": 13.100039385584877, + "grad_norm": 0.15926194190979004, + "learning_rate": 2.778966666666667e-05, + "loss": 0.0096, + "step": 16637 + }, + { + "epoch": 13.100827097282394, + "grad_norm": 0.71519935131073, + "learning_rate": 2.7789333333333332e-05, + "loss": 0.0125, + "step": 16638 + }, + { + "epoch": 13.101614808979914, + "grad_norm": 0.4257453978061676, + "learning_rate": 2.7789e-05, + "loss": 0.0044, + "step": 16639 + }, + { + "epoch": 13.102402520677431, + "grad_norm": 0.6103782057762146, + "learning_rate": 2.778866666666667e-05, + "loss": 0.0107, + "step": 16640 + }, + { + "epoch": 13.103190232374951, + "grad_norm": 0.2554223835468292, + "learning_rate": 2.7788333333333333e-05, + "loss": 0.0108, + "step": 16641 + }, + { + "epoch": 13.103977944072469, + "grad_norm": 0.15145908296108246, + "learning_rate": 2.7788000000000002e-05, + "loss": 0.0064, + "step": 16642 + }, + { + "epoch": 13.104765655769988, + "grad_norm": 0.2895471453666687, + 
"learning_rate": 2.7787666666666668e-05, + "loss": 0.0051, + "step": 16643 + }, + { + "epoch": 13.105553367467508, + "grad_norm": 0.3786683976650238, + "learning_rate": 2.7787333333333334e-05, + "loss": 0.0426, + "step": 16644 + }, + { + "epoch": 13.106341079165025, + "grad_norm": 0.30491793155670166, + "learning_rate": 2.7787e-05, + "loss": 0.0138, + "step": 16645 + }, + { + "epoch": 13.107128790862545, + "grad_norm": 0.3007437288761139, + "learning_rate": 2.7786666666666666e-05, + "loss": 0.0132, + "step": 16646 + }, + { + "epoch": 13.107916502560062, + "grad_norm": 0.11606451123952866, + "learning_rate": 2.7786333333333332e-05, + "loss": 0.005, + "step": 16647 + }, + { + "epoch": 13.108704214257582, + "grad_norm": 0.44195687770843506, + "learning_rate": 2.7786e-05, + "loss": 0.0103, + "step": 16648 + }, + { + "epoch": 13.109491925955101, + "grad_norm": 0.29297924041748047, + "learning_rate": 2.7785666666666667e-05, + "loss": 0.0079, + "step": 16649 + }, + { + "epoch": 13.110279637652619, + "grad_norm": 0.3892709016799927, + "learning_rate": 2.7785333333333333e-05, + "loss": 0.012, + "step": 16650 + }, + { + "epoch": 13.111067349350138, + "grad_norm": 0.10659424215555191, + "learning_rate": 2.7785000000000002e-05, + "loss": 0.0033, + "step": 16651 + }, + { + "epoch": 13.111855061047656, + "grad_norm": 0.311532586812973, + "learning_rate": 2.7784666666666665e-05, + "loss": 0.0134, + "step": 16652 + }, + { + "epoch": 13.112642772745176, + "grad_norm": 2.242705821990967, + "learning_rate": 2.7784333333333334e-05, + "loss": 0.0214, + "step": 16653 + }, + { + "epoch": 13.113430484442693, + "grad_norm": 0.21192434430122375, + "learning_rate": 2.7784e-05, + "loss": 0.0085, + "step": 16654 + }, + { + "epoch": 13.114218196140213, + "grad_norm": 0.13473118841648102, + "learning_rate": 2.7783666666666666e-05, + "loss": 0.0046, + "step": 16655 + }, + { + "epoch": 13.115005907837732, + "grad_norm": 0.11230610311031342, + "learning_rate": 2.7783333333333336e-05, + "loss": 0.0049, + "step": 16656 + }, + { + "epoch": 13.11579361953525, + "grad_norm": 0.38601499795913696, + "learning_rate": 2.7783e-05, + "loss": 0.0056, + "step": 16657 + }, + { + "epoch": 13.11658133123277, + "grad_norm": 0.15774543583393097, + "learning_rate": 2.7782666666666667e-05, + "loss": 0.0076, + "step": 16658 + }, + { + "epoch": 13.117369042930287, + "grad_norm": 0.3186524510383606, + "learning_rate": 2.7782333333333333e-05, + "loss": 0.0143, + "step": 16659 + }, + { + "epoch": 13.118156754627806, + "grad_norm": 0.4019593298435211, + "learning_rate": 2.7782000000000003e-05, + "loss": 0.015, + "step": 16660 + }, + { + "epoch": 13.118944466325324, + "grad_norm": 0.6424950361251831, + "learning_rate": 2.7781666666666665e-05, + "loss": 0.1599, + "step": 16661 + }, + { + "epoch": 13.119732178022844, + "grad_norm": 0.39547064900398254, + "learning_rate": 2.7781333333333335e-05, + "loss": 0.0772, + "step": 16662 + }, + { + "epoch": 13.120519889720363, + "grad_norm": 0.34035298228263855, + "learning_rate": 2.7781e-05, + "loss": 0.08, + "step": 16663 + }, + { + "epoch": 13.12130760141788, + "grad_norm": 0.2724452614784241, + "learning_rate": 2.7780666666666666e-05, + "loss": 0.0547, + "step": 16664 + }, + { + "epoch": 13.1220953131154, + "grad_norm": 0.3276134729385376, + "learning_rate": 2.7780333333333336e-05, + "loss": 0.0446, + "step": 16665 + }, + { + "epoch": 13.122883024812918, + "grad_norm": 0.22394587099552155, + "learning_rate": 2.778e-05, + "loss": 0.0297, + "step": 16666 + }, + { + "epoch": 13.123670736510437, + "grad_norm": 
0.21665482223033905, + "learning_rate": 2.7779666666666668e-05, + "loss": 0.0172, + "step": 16667 + }, + { + "epoch": 13.124458448207957, + "grad_norm": 0.1974758356809616, + "learning_rate": 2.7779333333333334e-05, + "loss": 0.0142, + "step": 16668 + }, + { + "epoch": 13.125246159905474, + "grad_norm": 0.14598245918750763, + "learning_rate": 2.7779000000000003e-05, + "loss": 0.0092, + "step": 16669 + }, + { + "epoch": 13.126033871602994, + "grad_norm": 0.2749582529067993, + "learning_rate": 2.7778666666666665e-05, + "loss": 0.01, + "step": 16670 + }, + { + "epoch": 13.126821583300512, + "grad_norm": 0.2751579284667969, + "learning_rate": 2.7778333333333335e-05, + "loss": 0.0107, + "step": 16671 + }, + { + "epoch": 13.127609294998031, + "grad_norm": 0.17250467836856842, + "learning_rate": 2.7778e-05, + "loss": 0.0088, + "step": 16672 + }, + { + "epoch": 13.128397006695549, + "grad_norm": 0.13905945420265198, + "learning_rate": 2.7777666666666667e-05, + "loss": 0.0071, + "step": 16673 + }, + { + "epoch": 13.129184718393068, + "grad_norm": 0.15570953488349915, + "learning_rate": 2.7777333333333336e-05, + "loss": 0.006, + "step": 16674 + }, + { + "epoch": 13.129972430090588, + "grad_norm": 0.08385498076677322, + "learning_rate": 2.7777e-05, + "loss": 0.0034, + "step": 16675 + }, + { + "epoch": 13.130760141788105, + "grad_norm": 0.12547999620437622, + "learning_rate": 2.7776666666666668e-05, + "loss": 0.0043, + "step": 16676 + }, + { + "epoch": 13.131547853485625, + "grad_norm": 0.12556080520153046, + "learning_rate": 2.7776333333333334e-05, + "loss": 0.008, + "step": 16677 + }, + { + "epoch": 13.132335565183142, + "grad_norm": 0.3701355457305908, + "learning_rate": 2.7776e-05, + "loss": 0.0129, + "step": 16678 + }, + { + "epoch": 13.133123276880662, + "grad_norm": 0.07027062028646469, + "learning_rate": 2.7775666666666666e-05, + "loss": 0.0023, + "step": 16679 + }, + { + "epoch": 13.13391098857818, + "grad_norm": 0.2010810673236847, + "learning_rate": 2.7775333333333335e-05, + "loss": 0.0131, + "step": 16680 + }, + { + "epoch": 13.134698700275699, + "grad_norm": 0.13014079630374908, + "learning_rate": 2.7775e-05, + "loss": 0.0047, + "step": 16681 + }, + { + "epoch": 13.135486411973218, + "grad_norm": 0.45071494579315186, + "learning_rate": 2.7774666666666667e-05, + "loss": 0.0067, + "step": 16682 + }, + { + "epoch": 13.136274123670736, + "grad_norm": 0.24677592515945435, + "learning_rate": 2.7774333333333336e-05, + "loss": 0.0079, + "step": 16683 + }, + { + "epoch": 13.137061835368256, + "grad_norm": 0.1137036457657814, + "learning_rate": 2.7774e-05, + "loss": 0.0064, + "step": 16684 + }, + { + "epoch": 13.137849547065773, + "grad_norm": 0.0893140584230423, + "learning_rate": 2.7773666666666668e-05, + "loss": 0.0075, + "step": 16685 + }, + { + "epoch": 13.138637258763293, + "grad_norm": 0.1148107498884201, + "learning_rate": 2.7773333333333334e-05, + "loss": 0.0062, + "step": 16686 + }, + { + "epoch": 13.139424970460812, + "grad_norm": 0.1509954184293747, + "learning_rate": 2.7773e-05, + "loss": 0.0079, + "step": 16687 + }, + { + "epoch": 13.14021268215833, + "grad_norm": 0.2974506616592407, + "learning_rate": 2.7772666666666666e-05, + "loss": 0.0083, + "step": 16688 + }, + { + "epoch": 13.14100039385585, + "grad_norm": 0.17342986166477203, + "learning_rate": 2.7772333333333335e-05, + "loss": 0.0076, + "step": 16689 + }, + { + "epoch": 13.141788105553367, + "grad_norm": 0.4600293040275574, + "learning_rate": 2.7772e-05, + "loss": 0.0096, + "step": 16690 + }, + { + "epoch": 13.142575817250886, 
+ "grad_norm": 0.18300023674964905, + "learning_rate": 2.7771666666666667e-05, + "loss": 0.0082, + "step": 16691 + }, + { + "epoch": 13.143363528948404, + "grad_norm": 0.07283845543861389, + "learning_rate": 2.7771333333333336e-05, + "loss": 0.0039, + "step": 16692 + }, + { + "epoch": 13.144151240645924, + "grad_norm": 0.12944836914539337, + "learning_rate": 2.7771e-05, + "loss": 0.008, + "step": 16693 + }, + { + "epoch": 13.144938952343443, + "grad_norm": 0.07037035375833511, + "learning_rate": 2.7770666666666668e-05, + "loss": 0.0032, + "step": 16694 + }, + { + "epoch": 13.14572666404096, + "grad_norm": 0.29182979464530945, + "learning_rate": 2.7770333333333334e-05, + "loss": 0.0114, + "step": 16695 + }, + { + "epoch": 13.14651437573848, + "grad_norm": 0.32624003291130066, + "learning_rate": 2.777e-05, + "loss": 0.0126, + "step": 16696 + }, + { + "epoch": 13.147302087435998, + "grad_norm": 0.1305868774652481, + "learning_rate": 2.7769666666666666e-05, + "loss": 0.0074, + "step": 16697 + }, + { + "epoch": 13.148089799133517, + "grad_norm": 0.14156334102153778, + "learning_rate": 2.7769333333333335e-05, + "loss": 0.0064, + "step": 16698 + }, + { + "epoch": 13.148877510831035, + "grad_norm": 0.18479757010936737, + "learning_rate": 2.7769e-05, + "loss": 0.0052, + "step": 16699 + }, + { + "epoch": 13.149665222528554, + "grad_norm": 0.4993114471435547, + "learning_rate": 2.7768666666666667e-05, + "loss": 0.0178, + "step": 16700 + }, + { + "epoch": 13.150452934226074, + "grad_norm": 0.29515668749809265, + "learning_rate": 2.7768333333333337e-05, + "loss": 0.0116, + "step": 16701 + }, + { + "epoch": 13.151240645923592, + "grad_norm": 1.0460240840911865, + "learning_rate": 2.7768e-05, + "loss": 0.0151, + "step": 16702 + }, + { + "epoch": 13.152028357621111, + "grad_norm": 0.30358219146728516, + "learning_rate": 2.776766666666667e-05, + "loss": 0.0175, + "step": 16703 + }, + { + "epoch": 13.152816069318629, + "grad_norm": 0.12383975088596344, + "learning_rate": 2.7767333333333334e-05, + "loss": 0.0055, + "step": 16704 + }, + { + "epoch": 13.153603781016148, + "grad_norm": 0.12470561265945435, + "learning_rate": 2.7767e-05, + "loss": 0.006, + "step": 16705 + }, + { + "epoch": 13.154391492713668, + "grad_norm": 0.24387721717357635, + "learning_rate": 2.776666666666667e-05, + "loss": 0.0095, + "step": 16706 + }, + { + "epoch": 13.155179204411185, + "grad_norm": 0.4811011254787445, + "learning_rate": 2.7766333333333332e-05, + "loss": 0.0167, + "step": 16707 + }, + { + "epoch": 13.155966916108705, + "grad_norm": 0.21162746846675873, + "learning_rate": 2.7766e-05, + "loss": 0.0083, + "step": 16708 + }, + { + "epoch": 13.156754627806222, + "grad_norm": 0.25748923420906067, + "learning_rate": 2.7765666666666667e-05, + "loss": 0.0115, + "step": 16709 + }, + { + "epoch": 13.157542339503742, + "grad_norm": 0.07542765140533447, + "learning_rate": 2.7765333333333333e-05, + "loss": 0.0033, + "step": 16710 + }, + { + "epoch": 13.15833005120126, + "grad_norm": 0.5912162661552429, + "learning_rate": 2.7765e-05, + "loss": 0.2065, + "step": 16711 + }, + { + "epoch": 13.159117762898779, + "grad_norm": 0.5407730937004089, + "learning_rate": 2.776466666666667e-05, + "loss": 0.115, + "step": 16712 + }, + { + "epoch": 13.159905474596298, + "grad_norm": 0.3261996805667877, + "learning_rate": 2.776433333333333e-05, + "loss": 0.0656, + "step": 16713 + }, + { + "epoch": 13.160693186293816, + "grad_norm": 0.4258989989757538, + "learning_rate": 2.7764e-05, + "loss": 0.0821, + "step": 16714 + }, + { + "epoch": 
13.161480897991336, + "grad_norm": 0.3252391517162323, + "learning_rate": 2.776366666666667e-05, + "loss": 0.0615, + "step": 16715 + }, + { + "epoch": 13.162268609688853, + "grad_norm": 0.29561445116996765, + "learning_rate": 2.7763333333333332e-05, + "loss": 0.0201, + "step": 16716 + }, + { + "epoch": 13.163056321386373, + "grad_norm": 0.2903476357460022, + "learning_rate": 2.7763e-05, + "loss": 0.0167, + "step": 16717 + }, + { + "epoch": 13.16384403308389, + "grad_norm": 0.2229166030883789, + "learning_rate": 2.7762666666666668e-05, + "loss": 0.0109, + "step": 16718 + }, + { + "epoch": 13.16463174478141, + "grad_norm": 0.1843031495809555, + "learning_rate": 2.7762333333333334e-05, + "loss": 0.008, + "step": 16719 + }, + { + "epoch": 13.16541945647893, + "grad_norm": 0.18379613757133484, + "learning_rate": 2.7762e-05, + "loss": 0.0107, + "step": 16720 + }, + { + "epoch": 13.166207168176447, + "grad_norm": 0.11775198578834534, + "learning_rate": 2.776166666666667e-05, + "loss": 0.0076, + "step": 16721 + }, + { + "epoch": 13.166994879873966, + "grad_norm": 0.3990599811077118, + "learning_rate": 2.776133333333333e-05, + "loss": 0.0108, + "step": 16722 + }, + { + "epoch": 13.167782591571484, + "grad_norm": 0.32608848810195923, + "learning_rate": 2.7761e-05, + "loss": 0.0051, + "step": 16723 + }, + { + "epoch": 13.168570303269004, + "grad_norm": 0.3450544774532318, + "learning_rate": 2.776066666666667e-05, + "loss": 0.0144, + "step": 16724 + }, + { + "epoch": 13.169358014966523, + "grad_norm": 0.22218401730060577, + "learning_rate": 2.7760333333333333e-05, + "loss": 0.0095, + "step": 16725 + }, + { + "epoch": 13.17014572666404, + "grad_norm": 0.20252399146556854, + "learning_rate": 2.7760000000000002e-05, + "loss": 0.0094, + "step": 16726 + }, + { + "epoch": 13.17093343836156, + "grad_norm": 0.33768031001091003, + "learning_rate": 2.7759666666666668e-05, + "loss": 0.0097, + "step": 16727 + }, + { + "epoch": 13.171721150059078, + "grad_norm": 0.10934248566627502, + "learning_rate": 2.7759333333333334e-05, + "loss": 0.0062, + "step": 16728 + }, + { + "epoch": 13.172508861756597, + "grad_norm": 0.1834275722503662, + "learning_rate": 2.7759e-05, + "loss": 0.0086, + "step": 16729 + }, + { + "epoch": 13.173296573454115, + "grad_norm": 0.4582841396331787, + "learning_rate": 2.775866666666667e-05, + "loss": 0.0109, + "step": 16730 + }, + { + "epoch": 13.174084285151634, + "grad_norm": 0.22843800485134125, + "learning_rate": 2.7758333333333335e-05, + "loss": 0.0045, + "step": 16731 + }, + { + "epoch": 13.174871996849154, + "grad_norm": 0.10279589891433716, + "learning_rate": 2.7758e-05, + "loss": 0.0059, + "step": 16732 + }, + { + "epoch": 13.175659708546672, + "grad_norm": 0.19742794334888458, + "learning_rate": 2.775766666666667e-05, + "loss": 0.0121, + "step": 16733 + }, + { + "epoch": 13.176447420244191, + "grad_norm": 0.2503509819507599, + "learning_rate": 2.7757333333333333e-05, + "loss": 0.0136, + "step": 16734 + }, + { + "epoch": 13.177235131941709, + "grad_norm": 0.20191562175750732, + "learning_rate": 2.7757000000000002e-05, + "loss": 0.0091, + "step": 16735 + }, + { + "epoch": 13.178022843639228, + "grad_norm": 0.1998571902513504, + "learning_rate": 2.7756666666666665e-05, + "loss": 0.0051, + "step": 16736 + }, + { + "epoch": 13.178810555336748, + "grad_norm": 0.23675569891929626, + "learning_rate": 2.7756333333333334e-05, + "loss": 0.004, + "step": 16737 + }, + { + "epoch": 13.179598267034265, + "grad_norm": 0.37941616773605347, + "learning_rate": 2.7756e-05, + "loss": 0.0157, + "step": 
16738 + }, + { + "epoch": 13.180385978731785, + "grad_norm": 0.23778551816940308, + "learning_rate": 2.7755666666666666e-05, + "loss": 0.0102, + "step": 16739 + }, + { + "epoch": 13.181173690429302, + "grad_norm": 0.13691000640392303, + "learning_rate": 2.7755333333333335e-05, + "loss": 0.0063, + "step": 16740 + }, + { + "epoch": 13.181961402126822, + "grad_norm": 0.0954703763127327, + "learning_rate": 2.7755e-05, + "loss": 0.0054, + "step": 16741 + }, + { + "epoch": 13.18274911382434, + "grad_norm": 0.2238231897354126, + "learning_rate": 2.7754666666666667e-05, + "loss": 0.0068, + "step": 16742 + }, + { + "epoch": 13.183536825521859, + "grad_norm": 0.13365112245082855, + "learning_rate": 2.7754333333333333e-05, + "loss": 0.0071, + "step": 16743 + }, + { + "epoch": 13.184324537219378, + "grad_norm": 0.16160224378108978, + "learning_rate": 2.7754000000000002e-05, + "loss": 0.0068, + "step": 16744 + }, + { + "epoch": 13.185112248916896, + "grad_norm": 0.23752281069755554, + "learning_rate": 2.7753666666666665e-05, + "loss": 0.0079, + "step": 16745 + }, + { + "epoch": 13.185899960614416, + "grad_norm": 0.4458152651786804, + "learning_rate": 2.7753333333333334e-05, + "loss": 0.0078, + "step": 16746 + }, + { + "epoch": 13.186687672311933, + "grad_norm": 0.06801706552505493, + "learning_rate": 2.7753e-05, + "loss": 0.0033, + "step": 16747 + }, + { + "epoch": 13.187475384009453, + "grad_norm": 0.11520078778266907, + "learning_rate": 2.7752666666666666e-05, + "loss": 0.0053, + "step": 16748 + }, + { + "epoch": 13.18826309570697, + "grad_norm": 0.33768555521965027, + "learning_rate": 2.7752333333333335e-05, + "loss": 0.0079, + "step": 16749 + }, + { + "epoch": 13.18905080740449, + "grad_norm": 0.17994555830955505, + "learning_rate": 2.7752e-05, + "loss": 0.0072, + "step": 16750 + }, + { + "epoch": 13.18983851910201, + "grad_norm": 0.12986770272254944, + "learning_rate": 2.7751666666666667e-05, + "loss": 0.0043, + "step": 16751 + }, + { + "epoch": 13.190626230799527, + "grad_norm": 0.37540310621261597, + "learning_rate": 2.7751333333333333e-05, + "loss": 0.0127, + "step": 16752 + }, + { + "epoch": 13.191413942497046, + "grad_norm": 0.43055063486099243, + "learning_rate": 2.7751000000000002e-05, + "loss": 0.0132, + "step": 16753 + }, + { + "epoch": 13.192201654194564, + "grad_norm": 1.3361663818359375, + "learning_rate": 2.7750666666666665e-05, + "loss": 0.0145, + "step": 16754 + }, + { + "epoch": 13.192989365892084, + "grad_norm": 0.2554013431072235, + "learning_rate": 2.7750333333333334e-05, + "loss": 0.0111, + "step": 16755 + }, + { + "epoch": 13.193777077589603, + "grad_norm": 0.28976428508758545, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.011, + "step": 16756 + }, + { + "epoch": 13.19456478928712, + "grad_norm": 0.4488524794578552, + "learning_rate": 2.7749666666666666e-05, + "loss": 0.0115, + "step": 16757 + }, + { + "epoch": 13.19535250098464, + "grad_norm": 0.18781965970993042, + "learning_rate": 2.7749333333333336e-05, + "loss": 0.0037, + "step": 16758 + }, + { + "epoch": 13.196140212682158, + "grad_norm": 0.17376524209976196, + "learning_rate": 2.7749e-05, + "loss": 0.0081, + "step": 16759 + }, + { + "epoch": 13.196927924379677, + "grad_norm": 0.4928583800792694, + "learning_rate": 2.7748666666666667e-05, + "loss": 0.0172, + "step": 16760 + }, + { + "epoch": 13.197715636077195, + "grad_norm": 0.648224949836731, + "learning_rate": 2.7748333333333333e-05, + "loss": 0.1721, + "step": 16761 + }, + { + "epoch": 13.198503347774714, + "grad_norm": 0.4723367393016815, + "learning_rate": 
2.7748000000000003e-05, + "loss": 0.148, + "step": 16762 + }, + { + "epoch": 13.199291059472234, + "grad_norm": 0.4680153727531433, + "learning_rate": 2.7747666666666665e-05, + "loss": 0.0956, + "step": 16763 + }, + { + "epoch": 13.200078771169752, + "grad_norm": 0.6127036809921265, + "learning_rate": 2.7747333333333335e-05, + "loss": 0.1247, + "step": 16764 + }, + { + "epoch": 13.200866482867271, + "grad_norm": 0.36135363578796387, + "learning_rate": 2.7747000000000004e-05, + "loss": 0.0665, + "step": 16765 + }, + { + "epoch": 13.201654194564789, + "grad_norm": 0.42764419317245483, + "learning_rate": 2.7746666666666666e-05, + "loss": 0.0549, + "step": 16766 + }, + { + "epoch": 13.202441906262308, + "grad_norm": 0.17957793176174164, + "learning_rate": 2.7746333333333336e-05, + "loss": 0.0236, + "step": 16767 + }, + { + "epoch": 13.203229617959826, + "grad_norm": 0.17739146947860718, + "learning_rate": 2.7745999999999998e-05, + "loss": 0.028, + "step": 16768 + }, + { + "epoch": 13.204017329657345, + "grad_norm": 0.16785474121570587, + "learning_rate": 2.7745666666666668e-05, + "loss": 0.0101, + "step": 16769 + }, + { + "epoch": 13.204805041354865, + "grad_norm": 0.1732395589351654, + "learning_rate": 2.7745333333333334e-05, + "loss": 0.014, + "step": 16770 + }, + { + "epoch": 13.205592753052382, + "grad_norm": 0.3287491202354431, + "learning_rate": 2.7745e-05, + "loss": 0.0159, + "step": 16771 + }, + { + "epoch": 13.206380464749902, + "grad_norm": 0.09014683216810226, + "learning_rate": 2.7744666666666665e-05, + "loss": 0.0053, + "step": 16772 + }, + { + "epoch": 13.20716817644742, + "grad_norm": 0.16067947447299957, + "learning_rate": 2.7744333333333335e-05, + "loss": 0.0073, + "step": 16773 + }, + { + "epoch": 13.207955888144939, + "grad_norm": 0.21609245240688324, + "learning_rate": 2.7744e-05, + "loss": 0.0071, + "step": 16774 + }, + { + "epoch": 13.208743599842458, + "grad_norm": 0.2326272428035736, + "learning_rate": 2.7743666666666667e-05, + "loss": 0.0118, + "step": 16775 + }, + { + "epoch": 13.209531311539976, + "grad_norm": 0.2847565710544586, + "learning_rate": 2.7743333333333336e-05, + "loss": 0.0157, + "step": 16776 + }, + { + "epoch": 13.210319023237496, + "grad_norm": 0.19731779396533966, + "learning_rate": 2.7743e-05, + "loss": 0.0089, + "step": 16777 + }, + { + "epoch": 13.211106734935013, + "grad_norm": 0.2395826131105423, + "learning_rate": 2.7742666666666668e-05, + "loss": 0.0098, + "step": 16778 + }, + { + "epoch": 13.211894446632533, + "grad_norm": 0.24945081770420074, + "learning_rate": 2.7742333333333334e-05, + "loss": 0.0095, + "step": 16779 + }, + { + "epoch": 13.21268215833005, + "grad_norm": 0.08348865807056427, + "learning_rate": 2.7742e-05, + "loss": 0.0029, + "step": 16780 + }, + { + "epoch": 13.21346987002757, + "grad_norm": 0.15015137195587158, + "learning_rate": 2.774166666666667e-05, + "loss": 0.007, + "step": 16781 + }, + { + "epoch": 13.21425758172509, + "grad_norm": 0.1741362065076828, + "learning_rate": 2.7741333333333335e-05, + "loss": 0.009, + "step": 16782 + }, + { + "epoch": 13.215045293422607, + "grad_norm": 0.09246259182691574, + "learning_rate": 2.7741e-05, + "loss": 0.004, + "step": 16783 + }, + { + "epoch": 13.215833005120126, + "grad_norm": 0.39113545417785645, + "learning_rate": 2.7740666666666667e-05, + "loss": 0.0083, + "step": 16784 + }, + { + "epoch": 13.216620716817644, + "grad_norm": 0.12034502625465393, + "learning_rate": 2.7740333333333336e-05, + "loss": 0.0044, + "step": 16785 + }, + { + "epoch": 13.217408428515164, + "grad_norm": 
0.18371379375457764, + "learning_rate": 2.774e-05, + "loss": 0.0062, + "step": 16786 + }, + { + "epoch": 13.218196140212681, + "grad_norm": 0.30826935172080994, + "learning_rate": 2.7739666666666668e-05, + "loss": 0.01, + "step": 16787 + }, + { + "epoch": 13.2189838519102, + "grad_norm": 0.16762469708919525, + "learning_rate": 2.7739333333333334e-05, + "loss": 0.007, + "step": 16788 + }, + { + "epoch": 13.21977156360772, + "grad_norm": 0.3943358063697815, + "learning_rate": 2.7739e-05, + "loss": 0.0106, + "step": 16789 + }, + { + "epoch": 13.220559275305238, + "grad_norm": 0.15814144909381866, + "learning_rate": 2.773866666666667e-05, + "loss": 0.0081, + "step": 16790 + }, + { + "epoch": 13.221346987002757, + "grad_norm": 0.2261282503604889, + "learning_rate": 2.7738333333333335e-05, + "loss": 0.0173, + "step": 16791 + }, + { + "epoch": 13.222134698700275, + "grad_norm": 0.1959981918334961, + "learning_rate": 2.7738e-05, + "loss": 0.0067, + "step": 16792 + }, + { + "epoch": 13.222922410397794, + "grad_norm": 0.2414405196905136, + "learning_rate": 2.7737666666666667e-05, + "loss": 0.0114, + "step": 16793 + }, + { + "epoch": 13.223710122095314, + "grad_norm": 0.18004082143306732, + "learning_rate": 2.7737333333333336e-05, + "loss": 0.0083, + "step": 16794 + }, + { + "epoch": 13.224497833792832, + "grad_norm": 0.12209928780794144, + "learning_rate": 2.7737e-05, + "loss": 0.0094, + "step": 16795 + }, + { + "epoch": 13.225285545490351, + "grad_norm": 0.14480435848236084, + "learning_rate": 2.7736666666666668e-05, + "loss": 0.0068, + "step": 16796 + }, + { + "epoch": 13.226073257187869, + "grad_norm": 0.8271424174308777, + "learning_rate": 2.773633333333333e-05, + "loss": 0.0105, + "step": 16797 + }, + { + "epoch": 13.226860968885388, + "grad_norm": 0.4908670485019684, + "learning_rate": 2.7736e-05, + "loss": 0.0117, + "step": 16798 + }, + { + "epoch": 13.227648680582906, + "grad_norm": 0.6328930258750916, + "learning_rate": 2.773566666666667e-05, + "loss": 0.0101, + "step": 16799 + }, + { + "epoch": 13.228436392280425, + "grad_norm": 0.49874332547187805, + "learning_rate": 2.7735333333333332e-05, + "loss": 0.0071, + "step": 16800 + }, + { + "epoch": 13.229224103977945, + "grad_norm": 0.49553361535072327, + "learning_rate": 2.7735e-05, + "loss": 0.0145, + "step": 16801 + }, + { + "epoch": 13.230011815675462, + "grad_norm": 0.31963643431663513, + "learning_rate": 2.7734666666666667e-05, + "loss": 0.012, + "step": 16802 + }, + { + "epoch": 13.230799527372982, + "grad_norm": 0.3915599584579468, + "learning_rate": 2.7734333333333333e-05, + "loss": 0.0054, + "step": 16803 + }, + { + "epoch": 13.2315872390705, + "grad_norm": 0.28731679916381836, + "learning_rate": 2.7734e-05, + "loss": 0.0055, + "step": 16804 + }, + { + "epoch": 13.232374950768019, + "grad_norm": 0.28747665882110596, + "learning_rate": 2.773366666666667e-05, + "loss": 0.0106, + "step": 16805 + }, + { + "epoch": 13.233162662465537, + "grad_norm": 0.346430242061615, + "learning_rate": 2.7733333333333334e-05, + "loss": 0.0109, + "step": 16806 + }, + { + "epoch": 13.233950374163056, + "grad_norm": 0.17237840592861176, + "learning_rate": 2.7733e-05, + "loss": 0.0065, + "step": 16807 + }, + { + "epoch": 13.234738085860576, + "grad_norm": 0.9900192618370056, + "learning_rate": 2.773266666666667e-05, + "loss": 0.0079, + "step": 16808 + }, + { + "epoch": 13.235525797558093, + "grad_norm": 0.25121620297431946, + "learning_rate": 2.7732333333333332e-05, + "loss": 0.0083, + "step": 16809 + }, + { + "epoch": 13.236313509255613, + "grad_norm": 
0.19055965542793274, + "learning_rate": 2.7732e-05, + "loss": 0.0078, + "step": 16810 + }, + { + "epoch": 13.23710122095313, + "grad_norm": 0.5851781368255615, + "learning_rate": 2.7731666666666667e-05, + "loss": 0.2142, + "step": 16811 + }, + { + "epoch": 13.23788893265065, + "grad_norm": 0.4582171142101288, + "learning_rate": 2.7731333333333333e-05, + "loss": 0.1553, + "step": 16812 + }, + { + "epoch": 13.23867664434817, + "grad_norm": 0.3506561815738678, + "learning_rate": 2.7731e-05, + "loss": 0.0688, + "step": 16813 + }, + { + "epoch": 13.239464356045687, + "grad_norm": 0.489289253950119, + "learning_rate": 2.773066666666667e-05, + "loss": 0.0789, + "step": 16814 + }, + { + "epoch": 13.240252067743207, + "grad_norm": 0.48093822598457336, + "learning_rate": 2.7730333333333335e-05, + "loss": 0.0499, + "step": 16815 + }, + { + "epoch": 13.241039779440724, + "grad_norm": 0.5078047513961792, + "learning_rate": 2.773e-05, + "loss": 0.0358, + "step": 16816 + }, + { + "epoch": 13.241827491138244, + "grad_norm": 0.2348308265209198, + "learning_rate": 2.772966666666667e-05, + "loss": 0.0125, + "step": 16817 + }, + { + "epoch": 13.242615202835761, + "grad_norm": 0.20761816203594208, + "learning_rate": 2.7729333333333332e-05, + "loss": 0.0353, + "step": 16818 + }, + { + "epoch": 13.24340291453328, + "grad_norm": 0.17736147344112396, + "learning_rate": 2.7729e-05, + "loss": 0.0172, + "step": 16819 + }, + { + "epoch": 13.2441906262308, + "grad_norm": 0.14299947023391724, + "learning_rate": 2.7728666666666668e-05, + "loss": 0.0079, + "step": 16820 + }, + { + "epoch": 13.244978337928318, + "grad_norm": 0.2668461203575134, + "learning_rate": 2.7728333333333334e-05, + "loss": 0.0077, + "step": 16821 + }, + { + "epoch": 13.245766049625837, + "grad_norm": 0.33676815032958984, + "learning_rate": 2.7728e-05, + "loss": 0.0152, + "step": 16822 + }, + { + "epoch": 13.246553761323355, + "grad_norm": 0.39805567264556885, + "learning_rate": 2.772766666666667e-05, + "loss": 0.0165, + "step": 16823 + }, + { + "epoch": 13.247341473020875, + "grad_norm": 0.12365767359733582, + "learning_rate": 2.7727333333333335e-05, + "loss": 0.0065, + "step": 16824 + }, + { + "epoch": 13.248129184718394, + "grad_norm": 0.16206876933574677, + "learning_rate": 2.7727e-05, + "loss": 0.0065, + "step": 16825 + }, + { + "epoch": 13.248916896415912, + "grad_norm": 0.161003977060318, + "learning_rate": 2.7726666666666667e-05, + "loss": 0.0107, + "step": 16826 + }, + { + "epoch": 13.249704608113431, + "grad_norm": 0.07105123996734619, + "learning_rate": 2.7726333333333333e-05, + "loss": 0.0034, + "step": 16827 + }, + { + "epoch": 13.250492319810949, + "grad_norm": 0.09755827486515045, + "learning_rate": 2.7726000000000002e-05, + "loss": 0.0085, + "step": 16828 + }, + { + "epoch": 13.251280031508468, + "grad_norm": 3.1346356868743896, + "learning_rate": 2.7725666666666664e-05, + "loss": 0.0174, + "step": 16829 + }, + { + "epoch": 13.252067743205986, + "grad_norm": 0.08400630950927734, + "learning_rate": 2.7725333333333334e-05, + "loss": 0.0056, + "step": 16830 + }, + { + "epoch": 13.252855454903505, + "grad_norm": 0.20412220060825348, + "learning_rate": 2.7725000000000003e-05, + "loss": 0.0072, + "step": 16831 + }, + { + "epoch": 13.253643166601025, + "grad_norm": 0.20008538663387299, + "learning_rate": 2.7724666666666666e-05, + "loss": 0.0093, + "step": 16832 + }, + { + "epoch": 13.254430878298542, + "grad_norm": 0.10279351472854614, + "learning_rate": 2.7724333333333335e-05, + "loss": 0.0018, + "step": 16833 + }, + { + "epoch": 
13.255218589996062, + "grad_norm": 0.1014234870672226, + "learning_rate": 2.7724e-05, + "loss": 0.0047, + "step": 16834 + }, + { + "epoch": 13.25600630169358, + "grad_norm": 0.28778111934661865, + "learning_rate": 2.7723666666666667e-05, + "loss": 0.0142, + "step": 16835 + }, + { + "epoch": 13.256794013391099, + "grad_norm": 0.1234869584441185, + "learning_rate": 2.7723333333333333e-05, + "loss": 0.0033, + "step": 16836 + }, + { + "epoch": 13.257581725088617, + "grad_norm": 0.1899653971195221, + "learning_rate": 2.7723000000000002e-05, + "loss": 0.0114, + "step": 16837 + }, + { + "epoch": 13.258369436786136, + "grad_norm": 0.24368301033973694, + "learning_rate": 2.7722666666666665e-05, + "loss": 0.0058, + "step": 16838 + }, + { + "epoch": 13.259157148483656, + "grad_norm": 0.23669855296611786, + "learning_rate": 2.7722333333333334e-05, + "loss": 0.0073, + "step": 16839 + }, + { + "epoch": 13.259944860181173, + "grad_norm": 0.10516829043626785, + "learning_rate": 2.7722000000000003e-05, + "loss": 0.007, + "step": 16840 + }, + { + "epoch": 13.260732571878693, + "grad_norm": 0.1487560123205185, + "learning_rate": 2.7721666666666666e-05, + "loss": 0.0076, + "step": 16841 + }, + { + "epoch": 13.26152028357621, + "grad_norm": 0.25662434101104736, + "learning_rate": 2.7721333333333335e-05, + "loss": 0.0108, + "step": 16842 + }, + { + "epoch": 13.26230799527373, + "grad_norm": 0.6028914451599121, + "learning_rate": 2.7721e-05, + "loss": 0.0126, + "step": 16843 + }, + { + "epoch": 13.26309570697125, + "grad_norm": 0.33793267607688904, + "learning_rate": 2.7720666666666667e-05, + "loss": 0.0048, + "step": 16844 + }, + { + "epoch": 13.263883418668767, + "grad_norm": 0.8137744665145874, + "learning_rate": 2.7720333333333333e-05, + "loss": 0.0191, + "step": 16845 + }, + { + "epoch": 13.264671130366287, + "grad_norm": 0.20261085033416748, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.0085, + "step": 16846 + }, + { + "epoch": 13.265458842063804, + "grad_norm": 0.11092564463615417, + "learning_rate": 2.7719666666666665e-05, + "loss": 0.009, + "step": 16847 + }, + { + "epoch": 13.266246553761324, + "grad_norm": 0.15010878443717957, + "learning_rate": 2.7719333333333334e-05, + "loss": 0.0097, + "step": 16848 + }, + { + "epoch": 13.267034265458841, + "grad_norm": 0.4624651074409485, + "learning_rate": 2.7719000000000003e-05, + "loss": 0.0181, + "step": 16849 + }, + { + "epoch": 13.26782197715636, + "grad_norm": 0.27693137526512146, + "learning_rate": 2.7718666666666666e-05, + "loss": 0.0094, + "step": 16850 + }, + { + "epoch": 13.26860968885388, + "grad_norm": 0.15543407201766968, + "learning_rate": 2.7718333333333335e-05, + "loss": 0.0109, + "step": 16851 + }, + { + "epoch": 13.269397400551398, + "grad_norm": 0.4372671842575073, + "learning_rate": 2.7718e-05, + "loss": 0.0157, + "step": 16852 + }, + { + "epoch": 13.270185112248917, + "grad_norm": 0.10723023116588593, + "learning_rate": 2.7717666666666667e-05, + "loss": 0.0041, + "step": 16853 + }, + { + "epoch": 13.270972823946435, + "grad_norm": 0.21520431339740753, + "learning_rate": 2.7717333333333333e-05, + "loss": 0.0088, + "step": 16854 + }, + { + "epoch": 13.271760535643955, + "grad_norm": 0.17179171741008759, + "learning_rate": 2.7717000000000002e-05, + "loss": 0.0084, + "step": 16855 + }, + { + "epoch": 13.272548247341472, + "grad_norm": 0.28037822246551514, + "learning_rate": 2.771666666666667e-05, + "loss": 0.0111, + "step": 16856 + }, + { + "epoch": 13.273335959038992, + "grad_norm": 0.15240083634853363, + "learning_rate": 
2.7716333333333334e-05, + "loss": 0.0083, + "step": 16857 + }, + { + "epoch": 13.274123670736511, + "grad_norm": 0.2822010815143585, + "learning_rate": 2.7716e-05, + "loss": 0.011, + "step": 16858 + }, + { + "epoch": 13.274911382434029, + "grad_norm": 0.23034843802452087, + "learning_rate": 2.7715666666666666e-05, + "loss": 0.0073, + "step": 16859 + }, + { + "epoch": 13.275699094131548, + "grad_norm": 0.48713937401771545, + "learning_rate": 2.7715333333333336e-05, + "loss": 0.0083, + "step": 16860 + }, + { + "epoch": 13.276486805829066, + "grad_norm": 0.5933040976524353, + "learning_rate": 2.7714999999999998e-05, + "loss": 0.1528, + "step": 16861 + }, + { + "epoch": 13.277274517526585, + "grad_norm": 0.7382811903953552, + "learning_rate": 2.7714666666666667e-05, + "loss": 0.1152, + "step": 16862 + }, + { + "epoch": 13.278062229224105, + "grad_norm": 0.3540996313095093, + "learning_rate": 2.7714333333333333e-05, + "loss": 0.0699, + "step": 16863 + }, + { + "epoch": 13.278849940921623, + "grad_norm": 0.2530514895915985, + "learning_rate": 2.7714e-05, + "loss": 0.0491, + "step": 16864 + }, + { + "epoch": 13.279637652619142, + "grad_norm": 0.5557464361190796, + "learning_rate": 2.771366666666667e-05, + "loss": 0.0734, + "step": 16865 + }, + { + "epoch": 13.28042536431666, + "grad_norm": 0.20512913167476654, + "learning_rate": 2.7713333333333335e-05, + "loss": 0.018, + "step": 16866 + }, + { + "epoch": 13.281213076014179, + "grad_norm": 0.17942403256893158, + "learning_rate": 2.7713e-05, + "loss": 0.0108, + "step": 16867 + }, + { + "epoch": 13.282000787711697, + "grad_norm": 0.12929603457450867, + "learning_rate": 2.7712666666666666e-05, + "loss": 0.0117, + "step": 16868 + }, + { + "epoch": 13.282788499409216, + "grad_norm": 0.2534550726413727, + "learning_rate": 2.7712333333333336e-05, + "loss": 0.0201, + "step": 16869 + }, + { + "epoch": 13.283576211106736, + "grad_norm": 0.6518918871879578, + "learning_rate": 2.7711999999999998e-05, + "loss": 0.0131, + "step": 16870 + }, + { + "epoch": 13.284363922804253, + "grad_norm": 0.10764201730489731, + "learning_rate": 2.7711666666666668e-05, + "loss": 0.0061, + "step": 16871 + }, + { + "epoch": 13.285151634501773, + "grad_norm": 1.5747857093811035, + "learning_rate": 2.7711333333333334e-05, + "loss": 0.0309, + "step": 16872 + }, + { + "epoch": 13.28593934619929, + "grad_norm": 0.1323213428258896, + "learning_rate": 2.7711e-05, + "loss": 0.0075, + "step": 16873 + }, + { + "epoch": 13.28672705789681, + "grad_norm": 0.15064828097820282, + "learning_rate": 2.771066666666667e-05, + "loss": 0.008, + "step": 16874 + }, + { + "epoch": 13.287514769594328, + "grad_norm": 0.45428961515426636, + "learning_rate": 2.7710333333333335e-05, + "loss": 0.034, + "step": 16875 + }, + { + "epoch": 13.288302481291847, + "grad_norm": 0.26231783628463745, + "learning_rate": 2.771e-05, + "loss": 0.0105, + "step": 16876 + }, + { + "epoch": 13.289090192989367, + "grad_norm": 0.0841664969921112, + "learning_rate": 2.7709666666666667e-05, + "loss": 0.0056, + "step": 16877 + }, + { + "epoch": 13.289877904686884, + "grad_norm": 0.18584153056144714, + "learning_rate": 2.7709333333333336e-05, + "loss": 0.0106, + "step": 16878 + }, + { + "epoch": 13.290665616384404, + "grad_norm": 0.2977110743522644, + "learning_rate": 2.7709e-05, + "loss": 0.0145, + "step": 16879 + }, + { + "epoch": 13.291453328081921, + "grad_norm": 0.2383059859275818, + "learning_rate": 2.7708666666666668e-05, + "loss": 0.007, + "step": 16880 + }, + { + "epoch": 13.29224103977944, + "grad_norm": 0.3750036060810089, 
+ "learning_rate": 2.7708333333333337e-05, + "loss": 0.0128, + "step": 16881 + }, + { + "epoch": 13.29302875147696, + "grad_norm": 0.1851661056280136, + "learning_rate": 2.7708e-05, + "loss": 0.0083, + "step": 16882 + }, + { + "epoch": 13.293816463174478, + "grad_norm": 0.08176232129335403, + "learning_rate": 2.770766666666667e-05, + "loss": 0.0041, + "step": 16883 + }, + { + "epoch": 13.294604174871997, + "grad_norm": 0.2266503870487213, + "learning_rate": 2.7707333333333335e-05, + "loss": 0.008, + "step": 16884 + }, + { + "epoch": 13.295391886569515, + "grad_norm": 0.12521933019161224, + "learning_rate": 2.7707e-05, + "loss": 0.0074, + "step": 16885 + }, + { + "epoch": 13.296179598267035, + "grad_norm": 0.14376148581504822, + "learning_rate": 2.7706666666666667e-05, + "loss": 0.0084, + "step": 16886 + }, + { + "epoch": 13.296967309964552, + "grad_norm": 0.20577190816402435, + "learning_rate": 2.7706333333333333e-05, + "loss": 0.0097, + "step": 16887 + }, + { + "epoch": 13.297755021662072, + "grad_norm": 0.13604186475276947, + "learning_rate": 2.7706e-05, + "loss": 0.0067, + "step": 16888 + }, + { + "epoch": 13.298542733359591, + "grad_norm": 0.08086086809635162, + "learning_rate": 2.7705666666666668e-05, + "loss": 0.0035, + "step": 16889 + }, + { + "epoch": 13.299330445057109, + "grad_norm": 0.25837239623069763, + "learning_rate": 2.7705333333333334e-05, + "loss": 0.008, + "step": 16890 + }, + { + "epoch": 13.300118156754628, + "grad_norm": 0.2898922860622406, + "learning_rate": 2.7705e-05, + "loss": 0.0163, + "step": 16891 + }, + { + "epoch": 13.300905868452146, + "grad_norm": 0.18525850772857666, + "learning_rate": 2.770466666666667e-05, + "loss": 0.0049, + "step": 16892 + }, + { + "epoch": 13.301693580149665, + "grad_norm": 0.18435311317443848, + "learning_rate": 2.7704333333333332e-05, + "loss": 0.0093, + "step": 16893 + }, + { + "epoch": 13.302481291847183, + "grad_norm": 0.0982818678021431, + "learning_rate": 2.7704e-05, + "loss": 0.0062, + "step": 16894 + }, + { + "epoch": 13.303269003544703, + "grad_norm": 0.12976473569869995, + "learning_rate": 2.7703666666666667e-05, + "loss": 0.0026, + "step": 16895 + }, + { + "epoch": 13.304056715242222, + "grad_norm": 0.08449582010507584, + "learning_rate": 2.7703333333333333e-05, + "loss": 0.0025, + "step": 16896 + }, + { + "epoch": 13.30484442693974, + "grad_norm": 0.3530782163143158, + "learning_rate": 2.7703e-05, + "loss": 0.0125, + "step": 16897 + }, + { + "epoch": 13.30563213863726, + "grad_norm": 0.21907678246498108, + "learning_rate": 2.7702666666666668e-05, + "loss": 0.0066, + "step": 16898 + }, + { + "epoch": 13.306419850334777, + "grad_norm": 0.2361854612827301, + "learning_rate": 2.7702333333333334e-05, + "loss": 0.0091, + "step": 16899 + }, + { + "epoch": 13.307207562032296, + "grad_norm": 0.124134361743927, + "learning_rate": 2.7702e-05, + "loss": 0.0091, + "step": 16900 + }, + { + "epoch": 13.307995273729816, + "grad_norm": 0.4581025540828705, + "learning_rate": 2.770166666666667e-05, + "loss": 0.011, + "step": 16901 + }, + { + "epoch": 13.308782985427333, + "grad_norm": 0.2236609309911728, + "learning_rate": 2.7701333333333332e-05, + "loss": 0.0168, + "step": 16902 + }, + { + "epoch": 13.309570697124853, + "grad_norm": 0.422862708568573, + "learning_rate": 2.7701e-05, + "loss": 0.0148, + "step": 16903 + }, + { + "epoch": 13.31035840882237, + "grad_norm": 0.2672017812728882, + "learning_rate": 2.7700666666666667e-05, + "loss": 0.0079, + "step": 16904 + }, + { + "epoch": 13.31114612051989, + "grad_norm": 0.13343508541584015, + 
"learning_rate": 2.7700333333333333e-05, + "loss": 0.0072, + "step": 16905 + }, + { + "epoch": 13.311933832217408, + "grad_norm": 0.1973676085472107, + "learning_rate": 2.7700000000000002e-05, + "loss": 0.0044, + "step": 16906 + }, + { + "epoch": 13.312721543914927, + "grad_norm": 0.30184441804885864, + "learning_rate": 2.769966666666667e-05, + "loss": 0.0067, + "step": 16907 + }, + { + "epoch": 13.313509255612447, + "grad_norm": 0.19408707320690155, + "learning_rate": 2.7699333333333334e-05, + "loss": 0.0074, + "step": 16908 + }, + { + "epoch": 13.314296967309964, + "grad_norm": 0.2175338715314865, + "learning_rate": 2.7699e-05, + "loss": 0.0098, + "step": 16909 + }, + { + "epoch": 13.315084679007484, + "grad_norm": 0.8204931616783142, + "learning_rate": 2.769866666666667e-05, + "loss": 0.01, + "step": 16910 + }, + { + "epoch": 13.315872390705001, + "grad_norm": 0.5676400065422058, + "learning_rate": 2.7698333333333332e-05, + "loss": 0.1589, + "step": 16911 + }, + { + "epoch": 13.31666010240252, + "grad_norm": 0.5279447436332703, + "learning_rate": 2.7698e-05, + "loss": 0.1265, + "step": 16912 + }, + { + "epoch": 13.317447814100039, + "grad_norm": 0.3444136679172516, + "learning_rate": 2.7697666666666667e-05, + "loss": 0.0921, + "step": 16913 + }, + { + "epoch": 13.318235525797558, + "grad_norm": 0.4585370123386383, + "learning_rate": 2.7697333333333333e-05, + "loss": 0.0805, + "step": 16914 + }, + { + "epoch": 13.319023237495077, + "grad_norm": 0.45366188883781433, + "learning_rate": 2.7697000000000003e-05, + "loss": 0.056, + "step": 16915 + }, + { + "epoch": 13.319810949192595, + "grad_norm": 0.241108238697052, + "learning_rate": 2.769666666666667e-05, + "loss": 0.0232, + "step": 16916 + }, + { + "epoch": 13.320598660890115, + "grad_norm": 0.16884498298168182, + "learning_rate": 2.7696333333333335e-05, + "loss": 0.0226, + "step": 16917 + }, + { + "epoch": 13.321386372587632, + "grad_norm": 0.2224399298429489, + "learning_rate": 2.7696e-05, + "loss": 0.0147, + "step": 16918 + }, + { + "epoch": 13.322174084285152, + "grad_norm": 0.17428739368915558, + "learning_rate": 2.7695666666666666e-05, + "loss": 0.0176, + "step": 16919 + }, + { + "epoch": 13.322961795982671, + "grad_norm": 0.20596106350421906, + "learning_rate": 2.7695333333333332e-05, + "loss": 0.0115, + "step": 16920 + }, + { + "epoch": 13.323749507680189, + "grad_norm": 0.38617250323295593, + "learning_rate": 2.7695e-05, + "loss": 0.0132, + "step": 16921 + }, + { + "epoch": 13.324537219377708, + "grad_norm": 0.3101242780685425, + "learning_rate": 2.7694666666666668e-05, + "loss": 0.0059, + "step": 16922 + }, + { + "epoch": 13.325324931075226, + "grad_norm": 0.36365625262260437, + "learning_rate": 2.7694333333333334e-05, + "loss": 0.0382, + "step": 16923 + }, + { + "epoch": 13.326112642772745, + "grad_norm": 0.1142684668302536, + "learning_rate": 2.7694000000000003e-05, + "loss": 0.0053, + "step": 16924 + }, + { + "epoch": 13.326900354470263, + "grad_norm": 0.33948373794555664, + "learning_rate": 2.7693666666666665e-05, + "loss": 0.0063, + "step": 16925 + }, + { + "epoch": 13.327688066167783, + "grad_norm": 0.1314591020345688, + "learning_rate": 2.7693333333333335e-05, + "loss": 0.0076, + "step": 16926 + }, + { + "epoch": 13.328475777865302, + "grad_norm": 0.25174859166145325, + "learning_rate": 2.7693e-05, + "loss": 0.0105, + "step": 16927 + }, + { + "epoch": 13.32926348956282, + "grad_norm": 0.4796903133392334, + "learning_rate": 2.7692666666666667e-05, + "loss": 0.0132, + "step": 16928 + }, + { + "epoch": 13.33005120126034, + 
"grad_norm": 0.14842717349529266, + "learning_rate": 2.7692333333333333e-05, + "loss": 0.0069, + "step": 16929 + }, + { + "epoch": 13.330838912957857, + "grad_norm": 0.23375362157821655, + "learning_rate": 2.7692000000000002e-05, + "loss": 0.0036, + "step": 16930 + }, + { + "epoch": 13.331626624655376, + "grad_norm": 0.10589993745088577, + "learning_rate": 2.7691666666666668e-05, + "loss": 0.0045, + "step": 16931 + }, + { + "epoch": 13.332414336352894, + "grad_norm": 0.16388358175754547, + "learning_rate": 2.7691333333333334e-05, + "loss": 0.0075, + "step": 16932 + }, + { + "epoch": 13.333202048050413, + "grad_norm": 0.05593608319759369, + "learning_rate": 2.7691000000000003e-05, + "loss": 0.002, + "step": 16933 + }, + { + "epoch": 13.333989759747933, + "grad_norm": 0.17286787927150726, + "learning_rate": 2.7690666666666666e-05, + "loss": 0.0067, + "step": 16934 + }, + { + "epoch": 13.33477747144545, + "grad_norm": 0.3901391327381134, + "learning_rate": 2.7690333333333335e-05, + "loss": 0.0099, + "step": 16935 + }, + { + "epoch": 13.33556518314297, + "grad_norm": 0.18186591565608978, + "learning_rate": 2.769e-05, + "loss": 0.011, + "step": 16936 + }, + { + "epoch": 13.336352894840488, + "grad_norm": 0.29269710183143616, + "learning_rate": 2.7689666666666667e-05, + "loss": 0.0365, + "step": 16937 + }, + { + "epoch": 13.337140606538007, + "grad_norm": 0.3055838346481323, + "learning_rate": 2.7689333333333333e-05, + "loss": 0.014, + "step": 16938 + }, + { + "epoch": 13.337928318235527, + "grad_norm": 0.2590034306049347, + "learning_rate": 2.7689000000000002e-05, + "loss": 0.0107, + "step": 16939 + }, + { + "epoch": 13.338716029933044, + "grad_norm": 0.25808316469192505, + "learning_rate": 2.7688666666666668e-05, + "loss": 0.0105, + "step": 16940 + }, + { + "epoch": 13.339503741630564, + "grad_norm": 0.33894625306129456, + "learning_rate": 2.7688333333333334e-05, + "loss": 0.027, + "step": 16941 + }, + { + "epoch": 13.340291453328081, + "grad_norm": 0.3929676115512848, + "learning_rate": 2.7688000000000003e-05, + "loss": 0.0096, + "step": 16942 + }, + { + "epoch": 13.3410791650256, + "grad_norm": 0.28096818923950195, + "learning_rate": 2.7687666666666666e-05, + "loss": 0.0175, + "step": 16943 + }, + { + "epoch": 13.341866876723119, + "grad_norm": 0.16388653218746185, + "learning_rate": 2.7687333333333335e-05, + "loss": 0.0057, + "step": 16944 + }, + { + "epoch": 13.342654588420638, + "grad_norm": 0.18150568008422852, + "learning_rate": 2.7687e-05, + "loss": 0.0075, + "step": 16945 + }, + { + "epoch": 13.343442300118157, + "grad_norm": 0.13075244426727295, + "learning_rate": 2.7686666666666667e-05, + "loss": 0.0069, + "step": 16946 + }, + { + "epoch": 13.344230011815675, + "grad_norm": 0.40380051732063293, + "learning_rate": 2.7686333333333333e-05, + "loss": 0.01, + "step": 16947 + }, + { + "epoch": 13.345017723513195, + "grad_norm": 0.43306317925453186, + "learning_rate": 2.7686e-05, + "loss": 0.012, + "step": 16948 + }, + { + "epoch": 13.345805435210712, + "grad_norm": 0.15588924288749695, + "learning_rate": 2.7685666666666668e-05, + "loss": 0.0095, + "step": 16949 + }, + { + "epoch": 13.346593146908232, + "grad_norm": 0.14467565715312958, + "learning_rate": 2.7685333333333334e-05, + "loss": 0.0077, + "step": 16950 + }, + { + "epoch": 13.34738085860575, + "grad_norm": 1.2849382162094116, + "learning_rate": 2.7685e-05, + "loss": 0.0094, + "step": 16951 + }, + { + "epoch": 13.348168570303269, + "grad_norm": 0.2380582094192505, + "learning_rate": 2.7684666666666666e-05, + "loss": 0.012, + 
"step": 16952 + }, + { + "epoch": 13.348956282000788, + "grad_norm": 0.1655079573392868, + "learning_rate": 2.7684333333333335e-05, + "loss": 0.0061, + "step": 16953 + }, + { + "epoch": 13.349743993698306, + "grad_norm": 0.17016883194446564, + "learning_rate": 2.7683999999999998e-05, + "loss": 0.0111, + "step": 16954 + }, + { + "epoch": 13.350531705395825, + "grad_norm": 0.3090193271636963, + "learning_rate": 2.7683666666666667e-05, + "loss": 0.0077, + "step": 16955 + }, + { + "epoch": 13.351319417093343, + "grad_norm": 0.11221463233232498, + "learning_rate": 2.7683333333333337e-05, + "loss": 0.005, + "step": 16956 + }, + { + "epoch": 13.352107128790863, + "grad_norm": 0.3589436709880829, + "learning_rate": 2.7683e-05, + "loss": 0.0168, + "step": 16957 + }, + { + "epoch": 13.352894840488382, + "grad_norm": 0.25552302598953247, + "learning_rate": 2.768266666666667e-05, + "loss": 0.0053, + "step": 16958 + }, + { + "epoch": 13.3536825521859, + "grad_norm": 0.2041008025407791, + "learning_rate": 2.7682333333333334e-05, + "loss": 0.0055, + "step": 16959 + }, + { + "epoch": 13.35447026388342, + "grad_norm": 0.26743876934051514, + "learning_rate": 2.7682e-05, + "loss": 0.0098, + "step": 16960 + }, + { + "epoch": 13.355257975580937, + "grad_norm": 0.6281552314758301, + "learning_rate": 2.7681666666666666e-05, + "loss": 0.1441, + "step": 16961 + }, + { + "epoch": 13.356045687278456, + "grad_norm": 0.5216352939605713, + "learning_rate": 2.7681333333333336e-05, + "loss": 0.0989, + "step": 16962 + }, + { + "epoch": 13.356833398975974, + "grad_norm": 0.673001229763031, + "learning_rate": 2.7680999999999998e-05, + "loss": 0.1, + "step": 16963 + }, + { + "epoch": 13.357621110673493, + "grad_norm": 0.326188862323761, + "learning_rate": 2.7680666666666667e-05, + "loss": 0.0726, + "step": 16964 + }, + { + "epoch": 13.358408822371013, + "grad_norm": 0.5166916251182556, + "learning_rate": 2.7680333333333337e-05, + "loss": 0.0495, + "step": 16965 + }, + { + "epoch": 13.35919653406853, + "grad_norm": 1.2526555061340332, + "learning_rate": 2.768e-05, + "loss": 0.0221, + "step": 16966 + }, + { + "epoch": 13.35998424576605, + "grad_norm": 0.1803363561630249, + "learning_rate": 2.767966666666667e-05, + "loss": 0.0157, + "step": 16967 + }, + { + "epoch": 13.360771957463568, + "grad_norm": 0.2552536725997925, + "learning_rate": 2.7679333333333335e-05, + "loss": 0.0149, + "step": 16968 + }, + { + "epoch": 13.361559669161087, + "grad_norm": 0.2991780936717987, + "learning_rate": 2.7679e-05, + "loss": 0.0103, + "step": 16969 + }, + { + "epoch": 13.362347380858605, + "grad_norm": 0.18527567386627197, + "learning_rate": 2.7678666666666666e-05, + "loss": 0.0123, + "step": 16970 + }, + { + "epoch": 13.363135092556124, + "grad_norm": 0.21835315227508545, + "learning_rate": 2.7678333333333336e-05, + "loss": 0.0158, + "step": 16971 + }, + { + "epoch": 13.363922804253644, + "grad_norm": 0.2917240858078003, + "learning_rate": 2.7678e-05, + "loss": 0.0448, + "step": 16972 + }, + { + "epoch": 13.364710515951161, + "grad_norm": 0.22204868495464325, + "learning_rate": 2.7677666666666668e-05, + "loss": 0.0089, + "step": 16973 + }, + { + "epoch": 13.365498227648681, + "grad_norm": 0.16563312709331512, + "learning_rate": 2.7677333333333337e-05, + "loss": 0.0119, + "step": 16974 + }, + { + "epoch": 13.366285939346199, + "grad_norm": 0.2592363953590393, + "learning_rate": 2.7677e-05, + "loss": 0.0086, + "step": 16975 + }, + { + "epoch": 13.367073651043718, + "grad_norm": 0.20156601071357727, + "learning_rate": 2.767666666666667e-05, + 
"loss": 0.0057, + "step": 16976 + }, + { + "epoch": 13.367861362741237, + "grad_norm": 0.3728775978088379, + "learning_rate": 2.767633333333333e-05, + "loss": 0.0153, + "step": 16977 + }, + { + "epoch": 13.368649074438755, + "grad_norm": 0.15992063283920288, + "learning_rate": 2.7676e-05, + "loss": 0.0077, + "step": 16978 + }, + { + "epoch": 13.369436786136275, + "grad_norm": 0.2557348608970642, + "learning_rate": 2.7675666666666667e-05, + "loss": 0.0115, + "step": 16979 + }, + { + "epoch": 13.370224497833792, + "grad_norm": 0.3592653274536133, + "learning_rate": 2.7675333333333333e-05, + "loss": 0.0061, + "step": 16980 + }, + { + "epoch": 13.371012209531312, + "grad_norm": 0.29283568263053894, + "learning_rate": 2.7675000000000002e-05, + "loss": 0.0191, + "step": 16981 + }, + { + "epoch": 13.37179992122883, + "grad_norm": 0.14174264669418335, + "learning_rate": 2.7674666666666668e-05, + "loss": 0.0098, + "step": 16982 + }, + { + "epoch": 13.372587632926349, + "grad_norm": 0.3069620430469513, + "learning_rate": 2.7674333333333334e-05, + "loss": 0.0078, + "step": 16983 + }, + { + "epoch": 13.373375344623868, + "grad_norm": 0.14961116015911102, + "learning_rate": 2.7674e-05, + "loss": 0.0102, + "step": 16984 + }, + { + "epoch": 13.374163056321386, + "grad_norm": 0.17784351110458374, + "learning_rate": 2.767366666666667e-05, + "loss": 0.0055, + "step": 16985 + }, + { + "epoch": 13.374950768018905, + "grad_norm": 0.20914800465106964, + "learning_rate": 2.767333333333333e-05, + "loss": 0.013, + "step": 16986 + }, + { + "epoch": 13.375738479716423, + "grad_norm": 0.1375042051076889, + "learning_rate": 2.7673e-05, + "loss": 0.0059, + "step": 16987 + }, + { + "epoch": 13.376526191413943, + "grad_norm": 0.3890204429626465, + "learning_rate": 2.7672666666666667e-05, + "loss": 0.0167, + "step": 16988 + }, + { + "epoch": 13.37731390311146, + "grad_norm": 0.21056430041790009, + "learning_rate": 2.7672333333333333e-05, + "loss": 0.0061, + "step": 16989 + }, + { + "epoch": 13.37810161480898, + "grad_norm": 0.1320372223854065, + "learning_rate": 2.7672000000000002e-05, + "loss": 0.0047, + "step": 16990 + }, + { + "epoch": 13.3788893265065, + "grad_norm": 0.8257355093955994, + "learning_rate": 2.7671666666666668e-05, + "loss": 0.0116, + "step": 16991 + }, + { + "epoch": 13.379677038204017, + "grad_norm": 0.07628431171178818, + "learning_rate": 2.7671333333333334e-05, + "loss": 0.0043, + "step": 16992 + }, + { + "epoch": 13.380464749901536, + "grad_norm": 0.13419251143932343, + "learning_rate": 2.7671e-05, + "loss": 0.0063, + "step": 16993 + }, + { + "epoch": 13.381252461599054, + "grad_norm": 0.25341111421585083, + "learning_rate": 2.767066666666667e-05, + "loss": 0.0092, + "step": 16994 + }, + { + "epoch": 13.382040173296573, + "grad_norm": 0.31054145097732544, + "learning_rate": 2.7670333333333332e-05, + "loss": 0.0194, + "step": 16995 + }, + { + "epoch": 13.382827884994093, + "grad_norm": 0.343418151140213, + "learning_rate": 2.767e-05, + "loss": 0.0096, + "step": 16996 + }, + { + "epoch": 13.38361559669161, + "grad_norm": 0.15408505499362946, + "learning_rate": 2.7669666666666667e-05, + "loss": 0.0075, + "step": 16997 + }, + { + "epoch": 13.38440330838913, + "grad_norm": 0.18526601791381836, + "learning_rate": 2.7669333333333333e-05, + "loss": 0.0066, + "step": 16998 + }, + { + "epoch": 13.385191020086648, + "grad_norm": 0.09157869219779968, + "learning_rate": 2.7669000000000002e-05, + "loss": 0.0039, + "step": 16999 + }, + { + "epoch": 13.385978731784167, + "grad_norm": 0.13176481425762177, + 
"learning_rate": 2.7668666666666668e-05, + "loss": 0.007, + "step": 17000 + }, + { + "epoch": 13.385978731784167, + "eval_cer": 0.11136169220001556, + "eval_loss": 0.3254829943180084, + "eval_runtime": 16.1731, + "eval_samples_per_second": 18.797, + "eval_steps_per_second": 0.618, + "eval_wer": 0.37854950115118957, + "step": 17000 + }, + { + "epoch": 13.386766443481685, + "grad_norm": 0.2581745982170105, + "learning_rate": 2.7668333333333334e-05, + "loss": 0.0143, + "step": 17001 + }, + { + "epoch": 13.387554155179204, + "grad_norm": 0.13137167692184448, + "learning_rate": 2.7668e-05, + "loss": 0.0054, + "step": 17002 + }, + { + "epoch": 13.388341866876724, + "grad_norm": 0.16175217926502228, + "learning_rate": 2.766766666666667e-05, + "loss": 0.0067, + "step": 17003 + }, + { + "epoch": 13.389129578574241, + "grad_norm": 0.6500137448310852, + "learning_rate": 2.7667333333333332e-05, + "loss": 0.0128, + "step": 17004 + }, + { + "epoch": 13.389917290271761, + "grad_norm": 0.2779584228992462, + "learning_rate": 2.7667e-05, + "loss": 0.0116, + "step": 17005 + }, + { + "epoch": 13.390705001969279, + "grad_norm": 0.10002639889717102, + "learning_rate": 2.766666666666667e-05, + "loss": 0.0041, + "step": 17006 + }, + { + "epoch": 13.391492713666798, + "grad_norm": 0.5657243132591248, + "learning_rate": 2.7666333333333333e-05, + "loss": 0.0114, + "step": 17007 + }, + { + "epoch": 13.392280425364318, + "grad_norm": 0.22134442627429962, + "learning_rate": 2.7666000000000002e-05, + "loss": 0.0062, + "step": 17008 + }, + { + "epoch": 13.393068137061835, + "grad_norm": 0.43016377091407776, + "learning_rate": 2.7665666666666665e-05, + "loss": 0.0139, + "step": 17009 + }, + { + "epoch": 13.393855848759355, + "grad_norm": 0.38922134041786194, + "learning_rate": 2.7665333333333334e-05, + "loss": 0.0077, + "step": 17010 + }, + { + "epoch": 13.394643560456872, + "grad_norm": 0.5780421495437622, + "learning_rate": 2.7665e-05, + "loss": 0.2113, + "step": 17011 + }, + { + "epoch": 13.395431272154392, + "grad_norm": 0.4088199734687805, + "learning_rate": 2.7664666666666666e-05, + "loss": 0.1195, + "step": 17012 + }, + { + "epoch": 13.39621898385191, + "grad_norm": 0.48855793476104736, + "learning_rate": 2.7664333333333332e-05, + "loss": 0.079, + "step": 17013 + }, + { + "epoch": 13.397006695549429, + "grad_norm": 0.5898545384407043, + "learning_rate": 2.7664e-05, + "loss": 0.1285, + "step": 17014 + }, + { + "epoch": 13.397794407246948, + "grad_norm": 0.36148351430892944, + "learning_rate": 2.7663666666666667e-05, + "loss": 0.0699, + "step": 17015 + }, + { + "epoch": 13.398582118944466, + "grad_norm": 0.5266053676605225, + "learning_rate": 2.7663333333333333e-05, + "loss": 0.1125, + "step": 17016 + }, + { + "epoch": 13.399369830641986, + "grad_norm": 0.769117534160614, + "learning_rate": 2.7663000000000003e-05, + "loss": 0.0869, + "step": 17017 + }, + { + "epoch": 13.400157542339503, + "grad_norm": 0.1761850267648697, + "learning_rate": 2.7662666666666665e-05, + "loss": 0.0164, + "step": 17018 + }, + { + "epoch": 13.400945254037023, + "grad_norm": 0.17215964198112488, + "learning_rate": 2.7662333333333335e-05, + "loss": 0.0061, + "step": 17019 + }, + { + "epoch": 13.40173296573454, + "grad_norm": 0.16801342368125916, + "learning_rate": 2.7662e-05, + "loss": 0.0089, + "step": 17020 + }, + { + "epoch": 13.40252067743206, + "grad_norm": 0.2356347143650055, + "learning_rate": 2.7661666666666666e-05, + "loss": 0.0099, + "step": 17021 + }, + { + "epoch": 13.40330838912958, + "grad_norm": 0.22278034687042236, + 
"learning_rate": 2.7661333333333336e-05, + "loss": 0.0197, + "step": 17022 + }, + { + "epoch": 13.404096100827097, + "grad_norm": 0.2645297050476074, + "learning_rate": 2.7661e-05, + "loss": 0.0094, + "step": 17023 + }, + { + "epoch": 13.404883812524616, + "grad_norm": 0.20459724962711334, + "learning_rate": 2.7660666666666668e-05, + "loss": 0.0097, + "step": 17024 + }, + { + "epoch": 13.405671524222134, + "grad_norm": 0.18062692880630493, + "learning_rate": 2.7660333333333334e-05, + "loss": 0.0192, + "step": 17025 + }, + { + "epoch": 13.406459235919653, + "grad_norm": 0.34188181161880493, + "learning_rate": 2.7660000000000003e-05, + "loss": 0.0119, + "step": 17026 + }, + { + "epoch": 13.407246947617173, + "grad_norm": 0.3639182448387146, + "learning_rate": 2.7659666666666665e-05, + "loss": 0.0146, + "step": 17027 + }, + { + "epoch": 13.40803465931469, + "grad_norm": 0.24771346151828766, + "learning_rate": 2.7659333333333335e-05, + "loss": 0.0097, + "step": 17028 + }, + { + "epoch": 13.40882237101221, + "grad_norm": 0.17674851417541504, + "learning_rate": 2.7659e-05, + "loss": 0.0083, + "step": 17029 + }, + { + "epoch": 13.409610082709728, + "grad_norm": 0.18785801529884338, + "learning_rate": 2.7658666666666667e-05, + "loss": 0.0083, + "step": 17030 + }, + { + "epoch": 13.410397794407247, + "grad_norm": 0.14686988294124603, + "learning_rate": 2.7658333333333336e-05, + "loss": 0.0116, + "step": 17031 + }, + { + "epoch": 13.411185506104765, + "grad_norm": 0.11999647319316864, + "learning_rate": 2.7658000000000002e-05, + "loss": 0.0111, + "step": 17032 + }, + { + "epoch": 13.411973217802284, + "grad_norm": 0.14909376204013824, + "learning_rate": 2.7657666666666668e-05, + "loss": 0.0094, + "step": 17033 + }, + { + "epoch": 13.412760929499804, + "grad_norm": 0.19301067292690277, + "learning_rate": 2.7657333333333334e-05, + "loss": 0.0274, + "step": 17034 + }, + { + "epoch": 13.413548641197321, + "grad_norm": 0.10097382217645645, + "learning_rate": 2.7657000000000003e-05, + "loss": 0.0042, + "step": 17035 + }, + { + "epoch": 13.414336352894841, + "grad_norm": 0.14302046597003937, + "learning_rate": 2.7656666666666666e-05, + "loss": 0.0078, + "step": 17036 + }, + { + "epoch": 13.415124064592359, + "grad_norm": 0.07639936357736588, + "learning_rate": 2.7656333333333335e-05, + "loss": 0.003, + "step": 17037 + }, + { + "epoch": 13.415911776289878, + "grad_norm": 0.16698028147220612, + "learning_rate": 2.7655999999999997e-05, + "loss": 0.0091, + "step": 17038 + }, + { + "epoch": 13.416699487987396, + "grad_norm": 0.5625725388526917, + "learning_rate": 2.7655666666666667e-05, + "loss": 0.0096, + "step": 17039 + }, + { + "epoch": 13.417487199684915, + "grad_norm": 0.20657628774642944, + "learning_rate": 2.7655333333333336e-05, + "loss": 0.0071, + "step": 17040 + }, + { + "epoch": 13.418274911382435, + "grad_norm": 0.2516092360019684, + "learning_rate": 2.7655e-05, + "loss": 0.012, + "step": 17041 + }, + { + "epoch": 13.419062623079952, + "grad_norm": 0.15616776049137115, + "learning_rate": 2.7654666666666668e-05, + "loss": 0.0052, + "step": 17042 + }, + { + "epoch": 13.419850334777472, + "grad_norm": 0.1643180549144745, + "learning_rate": 2.7654333333333334e-05, + "loss": 0.0068, + "step": 17043 + }, + { + "epoch": 13.42063804647499, + "grad_norm": 0.291375070810318, + "learning_rate": 2.7654e-05, + "loss": 0.0073, + "step": 17044 + }, + { + "epoch": 13.421425758172509, + "grad_norm": 0.15902815759181976, + "learning_rate": 2.7653666666666666e-05, + "loss": 0.01, + "step": 17045 + }, + { + "epoch": 
13.422213469870028, + "grad_norm": 0.24806524813175201, + "learning_rate": 2.7653333333333335e-05, + "loss": 0.0108, + "step": 17046 + }, + { + "epoch": 13.423001181567546, + "grad_norm": 0.14397113025188446, + "learning_rate": 2.7653e-05, + "loss": 0.0098, + "step": 17047 + }, + { + "epoch": 13.423788893265066, + "grad_norm": 0.24162547290325165, + "learning_rate": 2.7652666666666667e-05, + "loss": 0.0078, + "step": 17048 + }, + { + "epoch": 13.424576604962583, + "grad_norm": 0.3183920085430145, + "learning_rate": 2.7652333333333336e-05, + "loss": 0.005, + "step": 17049 + }, + { + "epoch": 13.425364316660103, + "grad_norm": 0.1605731099843979, + "learning_rate": 2.7652e-05, + "loss": 0.0092, + "step": 17050 + }, + { + "epoch": 13.42615202835762, + "grad_norm": 0.1268622726202011, + "learning_rate": 2.7651666666666668e-05, + "loss": 0.0066, + "step": 17051 + }, + { + "epoch": 13.42693974005514, + "grad_norm": 0.13462688028812408, + "learning_rate": 2.7651333333333334e-05, + "loss": 0.0086, + "step": 17052 + }, + { + "epoch": 13.42772745175266, + "grad_norm": 0.31021904945373535, + "learning_rate": 2.7651e-05, + "loss": 0.0067, + "step": 17053 + }, + { + "epoch": 13.428515163450177, + "grad_norm": 0.6589981913566589, + "learning_rate": 2.7650666666666666e-05, + "loss": 0.0127, + "step": 17054 + }, + { + "epoch": 13.429302875147696, + "grad_norm": 0.11485300958156586, + "learning_rate": 2.7650333333333335e-05, + "loss": 0.0067, + "step": 17055 + }, + { + "epoch": 13.430090586845214, + "grad_norm": 0.52488774061203, + "learning_rate": 2.765e-05, + "loss": 0.025, + "step": 17056 + }, + { + "epoch": 13.430878298542734, + "grad_norm": 0.2907715439796448, + "learning_rate": 2.7649666666666667e-05, + "loss": 0.0091, + "step": 17057 + }, + { + "epoch": 13.431666010240253, + "grad_norm": 0.22392414510250092, + "learning_rate": 2.7649333333333336e-05, + "loss": 0.006, + "step": 17058 + }, + { + "epoch": 13.43245372193777, + "grad_norm": 0.21385949850082397, + "learning_rate": 2.7649e-05, + "loss": 0.0119, + "step": 17059 + }, + { + "epoch": 13.43324143363529, + "grad_norm": 0.29141315817832947, + "learning_rate": 2.764866666666667e-05, + "loss": 0.0071, + "step": 17060 + }, + { + "epoch": 13.434029145332808, + "grad_norm": 0.7782638669013977, + "learning_rate": 2.7648333333333334e-05, + "loss": 0.1598, + "step": 17061 + }, + { + "epoch": 13.434816857030327, + "grad_norm": 0.4937779903411865, + "learning_rate": 2.7648e-05, + "loss": 0.1551, + "step": 17062 + }, + { + "epoch": 13.435604568727845, + "grad_norm": 0.5082290768623352, + "learning_rate": 2.7647666666666666e-05, + "loss": 0.1372, + "step": 17063 + }, + { + "epoch": 13.436392280425364, + "grad_norm": 0.3315331041812897, + "learning_rate": 2.7647333333333335e-05, + "loss": 0.0588, + "step": 17064 + }, + { + "epoch": 13.437179992122884, + "grad_norm": 0.31424960494041443, + "learning_rate": 2.7647e-05, + "loss": 0.0416, + "step": 17065 + }, + { + "epoch": 13.437967703820402, + "grad_norm": 0.17113427817821503, + "learning_rate": 2.7646666666666667e-05, + "loss": 0.0144, + "step": 17066 + }, + { + "epoch": 13.438755415517921, + "grad_norm": 0.17027612030506134, + "learning_rate": 2.7646333333333337e-05, + "loss": 0.0071, + "step": 17067 + }, + { + "epoch": 13.439543127215439, + "grad_norm": 0.37421557307243347, + "learning_rate": 2.7646e-05, + "loss": 0.0293, + "step": 17068 + }, + { + "epoch": 13.440330838912958, + "grad_norm": 0.158660426735878, + "learning_rate": 2.764566666666667e-05, + "loss": 0.0058, + "step": 17069 + }, + { + "epoch": 
13.441118550610476, + "grad_norm": 0.2378500998020172, + "learning_rate": 2.764533333333333e-05, + "loss": 0.0232, + "step": 17070 + }, + { + "epoch": 13.441906262307995, + "grad_norm": 0.1239871084690094, + "learning_rate": 2.7645e-05, + "loss": 0.0092, + "step": 17071 + }, + { + "epoch": 13.442693974005515, + "grad_norm": 0.1694612354040146, + "learning_rate": 2.764466666666667e-05, + "loss": 0.0063, + "step": 17072 + }, + { + "epoch": 13.443481685703032, + "grad_norm": 0.30456000566482544, + "learning_rate": 2.7644333333333332e-05, + "loss": 0.0135, + "step": 17073 + }, + { + "epoch": 13.444269397400552, + "grad_norm": 0.267589271068573, + "learning_rate": 2.7644e-05, + "loss": 0.0061, + "step": 17074 + }, + { + "epoch": 13.44505710909807, + "grad_norm": 0.4845590889453888, + "learning_rate": 2.7643666666666668e-05, + "loss": 0.0089, + "step": 17075 + }, + { + "epoch": 13.445844820795589, + "grad_norm": 0.14487482607364655, + "learning_rate": 2.7643333333333334e-05, + "loss": 0.0078, + "step": 17076 + }, + { + "epoch": 13.446632532493108, + "grad_norm": 0.48826974630355835, + "learning_rate": 2.7643e-05, + "loss": 0.0107, + "step": 17077 + }, + { + "epoch": 13.447420244190626, + "grad_norm": 0.12114303559064865, + "learning_rate": 2.764266666666667e-05, + "loss": 0.0081, + "step": 17078 + }, + { + "epoch": 13.448207955888146, + "grad_norm": 0.22733032703399658, + "learning_rate": 2.764233333333333e-05, + "loss": 0.0276, + "step": 17079 + }, + { + "epoch": 13.448995667585663, + "grad_norm": 0.22015570104122162, + "learning_rate": 2.7642e-05, + "loss": 0.0102, + "step": 17080 + }, + { + "epoch": 13.449783379283183, + "grad_norm": 0.402704119682312, + "learning_rate": 2.764166666666667e-05, + "loss": 0.0145, + "step": 17081 + }, + { + "epoch": 13.4505710909807, + "grad_norm": 0.1679324358701706, + "learning_rate": 2.7641333333333333e-05, + "loss": 0.006, + "step": 17082 + }, + { + "epoch": 13.45135880267822, + "grad_norm": 0.3139515519142151, + "learning_rate": 2.7641000000000002e-05, + "loss": 0.0103, + "step": 17083 + }, + { + "epoch": 13.45214651437574, + "grad_norm": 0.17208223044872284, + "learning_rate": 2.7640666666666668e-05, + "loss": 0.0121, + "step": 17084 + }, + { + "epoch": 13.452934226073257, + "grad_norm": 0.50718092918396, + "learning_rate": 2.7640333333333334e-05, + "loss": 0.0557, + "step": 17085 + }, + { + "epoch": 13.453721937770776, + "grad_norm": 0.480058878660202, + "learning_rate": 2.764e-05, + "loss": 0.0128, + "step": 17086 + }, + { + "epoch": 13.454509649468294, + "grad_norm": 0.2070085108280182, + "learning_rate": 2.763966666666667e-05, + "loss": 0.0086, + "step": 17087 + }, + { + "epoch": 13.455297361165814, + "grad_norm": 0.14250825345516205, + "learning_rate": 2.763933333333333e-05, + "loss": 0.006, + "step": 17088 + }, + { + "epoch": 13.456085072863331, + "grad_norm": 0.12843644618988037, + "learning_rate": 2.7639e-05, + "loss": 0.0083, + "step": 17089 + }, + { + "epoch": 13.45687278456085, + "grad_norm": 0.18088334798812866, + "learning_rate": 2.763866666666667e-05, + "loss": 0.0079, + "step": 17090 + }, + { + "epoch": 13.45766049625837, + "grad_norm": 0.191002756357193, + "learning_rate": 2.7638333333333333e-05, + "loss": 0.0079, + "step": 17091 + }, + { + "epoch": 13.458448207955888, + "grad_norm": 0.19811773300170898, + "learning_rate": 2.7638000000000002e-05, + "loss": 0.0109, + "step": 17092 + }, + { + "epoch": 13.459235919653407, + "grad_norm": 0.1859740912914276, + "learning_rate": 2.7637666666666668e-05, + "loss": 0.0053, + "step": 17093 + }, + { + 
"epoch": 13.460023631350925, + "grad_norm": 0.18145570158958435, + "learning_rate": 2.7637333333333334e-05, + "loss": 0.0073, + "step": 17094 + }, + { + "epoch": 13.460811343048444, + "grad_norm": 0.21440495550632477, + "learning_rate": 2.7637e-05, + "loss": 0.0194, + "step": 17095 + }, + { + "epoch": 13.461599054745964, + "grad_norm": 0.1696215271949768, + "learning_rate": 2.763666666666667e-05, + "loss": 0.0035, + "step": 17096 + }, + { + "epoch": 13.462386766443482, + "grad_norm": 0.46584001183509827, + "learning_rate": 2.7636333333333335e-05, + "loss": 0.0129, + "step": 17097 + }, + { + "epoch": 13.463174478141001, + "grad_norm": 0.19680972397327423, + "learning_rate": 2.7636e-05, + "loss": 0.0093, + "step": 17098 + }, + { + "epoch": 13.463962189838519, + "grad_norm": 0.1309492588043213, + "learning_rate": 2.7635666666666667e-05, + "loss": 0.0073, + "step": 17099 + }, + { + "epoch": 13.464749901536038, + "grad_norm": 0.24934425950050354, + "learning_rate": 2.7635333333333333e-05, + "loss": 0.015, + "step": 17100 + }, + { + "epoch": 13.465537613233556, + "grad_norm": 0.28270652890205383, + "learning_rate": 2.7635000000000002e-05, + "loss": 0.0181, + "step": 17101 + }, + { + "epoch": 13.466325324931075, + "grad_norm": 0.30451735854148865, + "learning_rate": 2.7634666666666665e-05, + "loss": 0.0083, + "step": 17102 + }, + { + "epoch": 13.467113036628595, + "grad_norm": 0.11252909153699875, + "learning_rate": 2.7634333333333334e-05, + "loss": 0.0039, + "step": 17103 + }, + { + "epoch": 13.467900748326112, + "grad_norm": 0.24064429104328156, + "learning_rate": 2.7634e-05, + "loss": 0.0108, + "step": 17104 + }, + { + "epoch": 13.468688460023632, + "grad_norm": 0.24147473275661469, + "learning_rate": 2.7633666666666666e-05, + "loss": 0.0117, + "step": 17105 + }, + { + "epoch": 13.46947617172115, + "grad_norm": 0.3979370594024658, + "learning_rate": 2.7633333333333335e-05, + "loss": 0.0067, + "step": 17106 + }, + { + "epoch": 13.470263883418669, + "grad_norm": 0.10217513144016266, + "learning_rate": 2.7633e-05, + "loss": 0.003, + "step": 17107 + }, + { + "epoch": 13.471051595116187, + "grad_norm": 0.27948158979415894, + "learning_rate": 2.7632666666666667e-05, + "loss": 0.0098, + "step": 17108 + }, + { + "epoch": 13.471839306813706, + "grad_norm": 0.3218264877796173, + "learning_rate": 2.7632333333333333e-05, + "loss": 0.0149, + "step": 17109 + }, + { + "epoch": 13.472627018511226, + "grad_norm": 0.20227350294589996, + "learning_rate": 2.7632000000000002e-05, + "loss": 0.0071, + "step": 17110 + }, + { + "epoch": 13.473414730208743, + "grad_norm": 0.4433888792991638, + "learning_rate": 2.7631666666666665e-05, + "loss": 0.1297, + "step": 17111 + }, + { + "epoch": 13.474202441906263, + "grad_norm": 0.49733197689056396, + "learning_rate": 2.7631333333333334e-05, + "loss": 0.0876, + "step": 17112 + }, + { + "epoch": 13.47499015360378, + "grad_norm": 0.5396605134010315, + "learning_rate": 2.7631e-05, + "loss": 0.0754, + "step": 17113 + }, + { + "epoch": 13.4757778653013, + "grad_norm": 0.4300483465194702, + "learning_rate": 2.7630666666666666e-05, + "loss": 0.0726, + "step": 17114 + }, + { + "epoch": 13.47656557699882, + "grad_norm": 0.4671882092952728, + "learning_rate": 2.7630333333333335e-05, + "loss": 0.0352, + "step": 17115 + }, + { + "epoch": 13.477353288696337, + "grad_norm": 0.26000678539276123, + "learning_rate": 2.763e-05, + "loss": 0.0455, + "step": 17116 + }, + { + "epoch": 13.478141000393856, + "grad_norm": 0.2135070562362671, + "learning_rate": 2.7629666666666667e-05, + "loss": 0.0084, 
+ "step": 17117 + }, + { + "epoch": 13.478928712091374, + "grad_norm": 0.29081106185913086, + "learning_rate": 2.7629333333333333e-05, + "loss": 0.0113, + "step": 17118 + }, + { + "epoch": 13.479716423788894, + "grad_norm": 0.12576407194137573, + "learning_rate": 2.7629000000000003e-05, + "loss": 0.0137, + "step": 17119 + }, + { + "epoch": 13.480504135486411, + "grad_norm": 0.11343562602996826, + "learning_rate": 2.7628666666666665e-05, + "loss": 0.01, + "step": 17120 + }, + { + "epoch": 13.48129184718393, + "grad_norm": 0.1652580201625824, + "learning_rate": 2.7628333333333334e-05, + "loss": 0.0077, + "step": 17121 + }, + { + "epoch": 13.48207955888145, + "grad_norm": 0.15994496643543243, + "learning_rate": 2.7628000000000004e-05, + "loss": 0.0095, + "step": 17122 + }, + { + "epoch": 13.482867270578968, + "grad_norm": 0.08591383695602417, + "learning_rate": 2.7627666666666666e-05, + "loss": 0.0069, + "step": 17123 + }, + { + "epoch": 13.483654982276487, + "grad_norm": 0.13483957946300507, + "learning_rate": 2.7627333333333336e-05, + "loss": 0.0098, + "step": 17124 + }, + { + "epoch": 13.484442693974005, + "grad_norm": 0.09453523904085159, + "learning_rate": 2.7627e-05, + "loss": 0.0052, + "step": 17125 + }, + { + "epoch": 13.485230405671524, + "grad_norm": 0.1321735829114914, + "learning_rate": 2.7626666666666668e-05, + "loss": 0.0058, + "step": 17126 + }, + { + "epoch": 13.486018117369042, + "grad_norm": 0.1662929803133011, + "learning_rate": 2.7626333333333333e-05, + "loss": 0.0086, + "step": 17127 + }, + { + "epoch": 13.486805829066562, + "grad_norm": 0.14647690951824188, + "learning_rate": 2.7626e-05, + "loss": 0.0052, + "step": 17128 + }, + { + "epoch": 13.487593540764081, + "grad_norm": 0.12902534008026123, + "learning_rate": 2.7625666666666665e-05, + "loss": 0.0072, + "step": 17129 + }, + { + "epoch": 13.488381252461599, + "grad_norm": 0.2430253028869629, + "learning_rate": 2.7625333333333335e-05, + "loss": 0.0073, + "step": 17130 + }, + { + "epoch": 13.489168964159118, + "grad_norm": 0.2839612066745758, + "learning_rate": 2.7625e-05, + "loss": 0.0094, + "step": 17131 + }, + { + "epoch": 13.489956675856636, + "grad_norm": 0.11625406891107559, + "learning_rate": 2.7624666666666667e-05, + "loss": 0.0052, + "step": 17132 + }, + { + "epoch": 13.490744387554155, + "grad_norm": 0.18079957365989685, + "learning_rate": 2.7624333333333336e-05, + "loss": 0.0087, + "step": 17133 + }, + { + "epoch": 13.491532099251675, + "grad_norm": 0.10513236373662949, + "learning_rate": 2.7624e-05, + "loss": 0.0035, + "step": 17134 + }, + { + "epoch": 13.492319810949192, + "grad_norm": 0.2907065153121948, + "learning_rate": 2.7623666666666668e-05, + "loss": 0.0151, + "step": 17135 + }, + { + "epoch": 13.493107522646712, + "grad_norm": 0.5073875784873962, + "learning_rate": 2.7623333333333334e-05, + "loss": 0.0096, + "step": 17136 + }, + { + "epoch": 13.49389523434423, + "grad_norm": 0.15444877743721008, + "learning_rate": 2.7623e-05, + "loss": 0.0081, + "step": 17137 + }, + { + "epoch": 13.494682946041749, + "grad_norm": 0.1572672426700592, + "learning_rate": 2.7622666666666666e-05, + "loss": 0.0044, + "step": 17138 + }, + { + "epoch": 13.495470657739267, + "grad_norm": 0.18527406454086304, + "learning_rate": 2.7622333333333335e-05, + "loss": 0.0041, + "step": 17139 + }, + { + "epoch": 13.496258369436786, + "grad_norm": 0.13528019189834595, + "learning_rate": 2.7622e-05, + "loss": 0.0033, + "step": 17140 + }, + { + "epoch": 13.497046081134306, + "grad_norm": 0.24106158316135406, + "learning_rate": 
2.7621666666666667e-05, + "loss": 0.0097, + "step": 17141 + }, + { + "epoch": 13.497833792831823, + "grad_norm": 1.1266006231307983, + "learning_rate": 2.7621333333333336e-05, + "loss": 0.0087, + "step": 17142 + }, + { + "epoch": 13.498621504529343, + "grad_norm": 0.25607675313949585, + "learning_rate": 2.7621e-05, + "loss": 0.0058, + "step": 17143 + }, + { + "epoch": 13.49940921622686, + "grad_norm": 0.11381421238183975, + "learning_rate": 2.7620666666666668e-05, + "loss": 0.0073, + "step": 17144 + }, + { + "epoch": 13.50019692792438, + "grad_norm": 0.3032033443450928, + "learning_rate": 2.7620333333333334e-05, + "loss": 0.0089, + "step": 17145 + }, + { + "epoch": 13.500984639621898, + "grad_norm": 0.41549256443977356, + "learning_rate": 2.762e-05, + "loss": 0.0099, + "step": 17146 + }, + { + "epoch": 13.501772351319417, + "grad_norm": 0.17862921953201294, + "learning_rate": 2.761966666666667e-05, + "loss": 0.0031, + "step": 17147 + }, + { + "epoch": 13.502560063016936, + "grad_norm": 0.15109318494796753, + "learning_rate": 2.7619333333333335e-05, + "loss": 0.0114, + "step": 17148 + }, + { + "epoch": 13.503347774714454, + "grad_norm": 0.36115342378616333, + "learning_rate": 2.7619e-05, + "loss": 0.0086, + "step": 17149 + }, + { + "epoch": 13.504135486411974, + "grad_norm": 0.25290918350219727, + "learning_rate": 2.7618666666666667e-05, + "loss": 0.0098, + "step": 17150 + }, + { + "epoch": 13.504923198109491, + "grad_norm": 0.1766355335712433, + "learning_rate": 2.7618333333333336e-05, + "loss": 0.0055, + "step": 17151 + }, + { + "epoch": 13.50571090980701, + "grad_norm": 0.11328230053186417, + "learning_rate": 2.7618e-05, + "loss": 0.0077, + "step": 17152 + }, + { + "epoch": 13.50649862150453, + "grad_norm": 0.16982875764369965, + "learning_rate": 2.7617666666666668e-05, + "loss": 0.0067, + "step": 17153 + }, + { + "epoch": 13.507286333202048, + "grad_norm": 0.2920724153518677, + "learning_rate": 2.7617333333333334e-05, + "loss": 0.0085, + "step": 17154 + }, + { + "epoch": 13.508074044899567, + "grad_norm": 0.20781219005584717, + "learning_rate": 2.7617e-05, + "loss": 0.007, + "step": 17155 + }, + { + "epoch": 13.508861756597085, + "grad_norm": 0.20188039541244507, + "learning_rate": 2.761666666666667e-05, + "loss": 0.0077, + "step": 17156 + }, + { + "epoch": 13.509649468294604, + "grad_norm": 1.7494463920593262, + "learning_rate": 2.7616333333333335e-05, + "loss": 0.0103, + "step": 17157 + }, + { + "epoch": 13.510437179992122, + "grad_norm": 0.23753255605697632, + "learning_rate": 2.7616e-05, + "loss": 0.0077, + "step": 17158 + }, + { + "epoch": 13.511224891689642, + "grad_norm": 0.22819599509239197, + "learning_rate": 2.7615666666666667e-05, + "loss": 0.0066, + "step": 17159 + }, + { + "epoch": 13.512012603387161, + "grad_norm": 0.672355055809021, + "learning_rate": 2.7615333333333333e-05, + "loss": 0.0126, + "step": 17160 + }, + { + "epoch": 13.512800315084679, + "grad_norm": 0.6105953454971313, + "learning_rate": 2.7615e-05, + "loss": 0.2061, + "step": 17161 + }, + { + "epoch": 13.513588026782198, + "grad_norm": 0.7309384942054749, + "learning_rate": 2.761466666666667e-05, + "loss": 0.1667, + "step": 17162 + }, + { + "epoch": 13.514375738479716, + "grad_norm": 0.5080049633979797, + "learning_rate": 2.761433333333333e-05, + "loss": 0.16, + "step": 17163 + }, + { + "epoch": 13.515163450177235, + "grad_norm": 0.4387866258621216, + "learning_rate": 2.7614e-05, + "loss": 0.0825, + "step": 17164 + }, + { + "epoch": 13.515951161874753, + "grad_norm": 0.23224468529224396, + "learning_rate": 
2.761366666666667e-05, + "loss": 0.0415, + "step": 17165 + }, + { + "epoch": 13.516738873572272, + "grad_norm": 0.2851257920265198, + "learning_rate": 2.7613333333333332e-05, + "loss": 0.0304, + "step": 17166 + }, + { + "epoch": 13.517526585269792, + "grad_norm": 0.1670610010623932, + "learning_rate": 2.7613e-05, + "loss": 0.0497, + "step": 17167 + }, + { + "epoch": 13.51831429696731, + "grad_norm": 0.30760252475738525, + "learning_rate": 2.7612666666666667e-05, + "loss": 0.0197, + "step": 17168 + }, + { + "epoch": 13.519102008664829, + "grad_norm": 0.2088339924812317, + "learning_rate": 2.7612333333333333e-05, + "loss": 0.0262, + "step": 17169 + }, + { + "epoch": 13.519889720362347, + "grad_norm": 0.2429233342409134, + "learning_rate": 2.7612e-05, + "loss": 0.0158, + "step": 17170 + }, + { + "epoch": 13.520677432059866, + "grad_norm": 0.12321536988019943, + "learning_rate": 2.761166666666667e-05, + "loss": 0.008, + "step": 17171 + }, + { + "epoch": 13.521465143757386, + "grad_norm": 0.23039205372333527, + "learning_rate": 2.7611333333333334e-05, + "loss": 0.0354, + "step": 17172 + }, + { + "epoch": 13.522252855454903, + "grad_norm": 0.31467264890670776, + "learning_rate": 2.7611e-05, + "loss": 0.0061, + "step": 17173 + }, + { + "epoch": 13.523040567152423, + "grad_norm": 0.10418768227100372, + "learning_rate": 2.761066666666667e-05, + "loss": 0.007, + "step": 17174 + }, + { + "epoch": 13.52382827884994, + "grad_norm": 0.17706626653671265, + "learning_rate": 2.7610333333333332e-05, + "loss": 0.0082, + "step": 17175 + }, + { + "epoch": 13.52461599054746, + "grad_norm": 0.22397787868976593, + "learning_rate": 2.761e-05, + "loss": 0.0082, + "step": 17176 + }, + { + "epoch": 13.525403702244978, + "grad_norm": 0.19321665167808533, + "learning_rate": 2.7609666666666668e-05, + "loss": 0.0086, + "step": 17177 + }, + { + "epoch": 13.526191413942497, + "grad_norm": 0.25730186700820923, + "learning_rate": 2.7609333333333333e-05, + "loss": 0.009, + "step": 17178 + }, + { + "epoch": 13.526979125640016, + "grad_norm": 0.10206311196088791, + "learning_rate": 2.7609e-05, + "loss": 0.0061, + "step": 17179 + }, + { + "epoch": 13.527766837337534, + "grad_norm": 0.1661442369222641, + "learning_rate": 2.760866666666667e-05, + "loss": 0.0149, + "step": 17180 + }, + { + "epoch": 13.528554549035054, + "grad_norm": 0.10633624345064163, + "learning_rate": 2.7608333333333335e-05, + "loss": 0.0039, + "step": 17181 + }, + { + "epoch": 13.529342260732571, + "grad_norm": 0.29007065296173096, + "learning_rate": 2.7608e-05, + "loss": 0.0181, + "step": 17182 + }, + { + "epoch": 13.53012997243009, + "grad_norm": 0.2135895937681198, + "learning_rate": 2.760766666666667e-05, + "loss": 0.0417, + "step": 17183 + }, + { + "epoch": 13.530917684127608, + "grad_norm": 0.12995298206806183, + "learning_rate": 2.7607333333333332e-05, + "loss": 0.0062, + "step": 17184 + }, + { + "epoch": 13.531705395825128, + "grad_norm": 0.31061074137687683, + "learning_rate": 2.7607000000000002e-05, + "loss": 0.0093, + "step": 17185 + }, + { + "epoch": 13.532493107522647, + "grad_norm": 0.14384551346302032, + "learning_rate": 2.7606666666666668e-05, + "loss": 0.0058, + "step": 17186 + }, + { + "epoch": 13.533280819220165, + "grad_norm": 0.14085029065608978, + "learning_rate": 2.7606333333333334e-05, + "loss": 0.0084, + "step": 17187 + }, + { + "epoch": 13.534068530917684, + "grad_norm": 0.2766353487968445, + "learning_rate": 2.7606e-05, + "loss": 0.0168, + "step": 17188 + }, + { + "epoch": 13.534856242615202, + "grad_norm": 1.0115753412246704, + 
"learning_rate": 2.7605666666666666e-05, + "loss": 0.0066, + "step": 17189 + }, + { + "epoch": 13.535643954312722, + "grad_norm": 0.14406679570674896, + "learning_rate": 2.7605333333333335e-05, + "loss": 0.0074, + "step": 17190 + }, + { + "epoch": 13.536431666010241, + "grad_norm": 0.6690758466720581, + "learning_rate": 2.7605e-05, + "loss": 0.0165, + "step": 17191 + }, + { + "epoch": 13.537219377707759, + "grad_norm": 0.18887768685817719, + "learning_rate": 2.7604666666666667e-05, + "loss": 0.0068, + "step": 17192 + }, + { + "epoch": 13.538007089405278, + "grad_norm": 0.19688577950000763, + "learning_rate": 2.7604333333333333e-05, + "loss": 0.0072, + "step": 17193 + }, + { + "epoch": 13.538794801102796, + "grad_norm": 0.11564560234546661, + "learning_rate": 2.7604000000000002e-05, + "loss": 0.0046, + "step": 17194 + }, + { + "epoch": 13.539582512800315, + "grad_norm": 0.20471449196338654, + "learning_rate": 2.7603666666666665e-05, + "loss": 0.0092, + "step": 17195 + }, + { + "epoch": 13.540370224497833, + "grad_norm": 0.28885409235954285, + "learning_rate": 2.7603333333333334e-05, + "loss": 0.0162, + "step": 17196 + }, + { + "epoch": 13.541157936195352, + "grad_norm": 0.36017856001853943, + "learning_rate": 2.7603000000000003e-05, + "loss": 0.0097, + "step": 17197 + }, + { + "epoch": 13.541945647892872, + "grad_norm": 0.28687453269958496, + "learning_rate": 2.7602666666666666e-05, + "loss": 0.013, + "step": 17198 + }, + { + "epoch": 13.54273335959039, + "grad_norm": 0.2878970205783844, + "learning_rate": 2.7602333333333335e-05, + "loss": 0.0084, + "step": 17199 + }, + { + "epoch": 13.543521071287909, + "grad_norm": 0.2461247742176056, + "learning_rate": 2.7602e-05, + "loss": 0.0054, + "step": 17200 + }, + { + "epoch": 13.544308782985427, + "grad_norm": 0.39807119965553284, + "learning_rate": 2.7601666666666667e-05, + "loss": 0.0077, + "step": 17201 + }, + { + "epoch": 13.545096494682946, + "grad_norm": 0.24132519960403442, + "learning_rate": 2.7601333333333333e-05, + "loss": 0.0091, + "step": 17202 + }, + { + "epoch": 13.545884206380464, + "grad_norm": 0.22897978127002716, + "learning_rate": 2.7601000000000002e-05, + "loss": 0.0093, + "step": 17203 + }, + { + "epoch": 13.546671918077983, + "grad_norm": 0.7075797915458679, + "learning_rate": 2.7600666666666665e-05, + "loss": 0.01, + "step": 17204 + }, + { + "epoch": 13.547459629775503, + "grad_norm": 0.3075448274612427, + "learning_rate": 2.7600333333333334e-05, + "loss": 0.014, + "step": 17205 + }, + { + "epoch": 13.54824734147302, + "grad_norm": 0.2884611487388611, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.0164, + "step": 17206 + }, + { + "epoch": 13.54903505317054, + "grad_norm": 0.27564096450805664, + "learning_rate": 2.7599666666666666e-05, + "loss": 0.0126, + "step": 17207 + }, + { + "epoch": 13.549822764868058, + "grad_norm": 0.49031591415405273, + "learning_rate": 2.7599333333333335e-05, + "loss": 0.0256, + "step": 17208 + }, + { + "epoch": 13.550610476565577, + "grad_norm": 0.261651873588562, + "learning_rate": 2.7599e-05, + "loss": 0.0111, + "step": 17209 + }, + { + "epoch": 13.551398188263097, + "grad_norm": 0.2884368598461151, + "learning_rate": 2.7598666666666667e-05, + "loss": 0.0109, + "step": 17210 + }, + { + "epoch": 13.552185899960614, + "grad_norm": 0.9255011677742004, + "learning_rate": 2.7598333333333333e-05, + "loss": 0.1886, + "step": 17211 + }, + { + "epoch": 13.552973611658134, + "grad_norm": 0.7152432203292847, + "learning_rate": 2.7598000000000002e-05, + "loss": 0.0975, + "step": 17212 + }, + { + 
"epoch": 13.553761323355651, + "grad_norm": 0.49410685896873474, + "learning_rate": 2.7597666666666665e-05, + "loss": 0.1078, + "step": 17213 + }, + { + "epoch": 13.55454903505317, + "grad_norm": 0.3548724353313446, + "learning_rate": 2.7597333333333334e-05, + "loss": 0.0731, + "step": 17214 + }, + { + "epoch": 13.555336746750688, + "grad_norm": 0.3901941478252411, + "learning_rate": 2.7597000000000004e-05, + "loss": 0.0369, + "step": 17215 + }, + { + "epoch": 13.556124458448208, + "grad_norm": 0.1813647300004959, + "learning_rate": 2.7596666666666666e-05, + "loss": 0.0189, + "step": 17216 + }, + { + "epoch": 13.556912170145727, + "grad_norm": 0.19661864638328552, + "learning_rate": 2.7596333333333335e-05, + "loss": 0.0238, + "step": 17217 + }, + { + "epoch": 13.557699881843245, + "grad_norm": 0.2826048731803894, + "learning_rate": 2.7596e-05, + "loss": 0.0542, + "step": 17218 + }, + { + "epoch": 13.558487593540764, + "grad_norm": 0.17786559462547302, + "learning_rate": 2.7595666666666667e-05, + "loss": 0.0125, + "step": 17219 + }, + { + "epoch": 13.559275305238282, + "grad_norm": 0.22092445194721222, + "learning_rate": 2.7595333333333333e-05, + "loss": 0.0112, + "step": 17220 + }, + { + "epoch": 13.560063016935802, + "grad_norm": 0.25171375274658203, + "learning_rate": 2.7595e-05, + "loss": 0.0123, + "step": 17221 + }, + { + "epoch": 13.56085072863332, + "grad_norm": 0.32709524035453796, + "learning_rate": 2.759466666666667e-05, + "loss": 0.0141, + "step": 17222 + }, + { + "epoch": 13.561638440330839, + "grad_norm": 0.18739734590053558, + "learning_rate": 2.7594333333333334e-05, + "loss": 0.0074, + "step": 17223 + }, + { + "epoch": 13.562426152028358, + "grad_norm": 0.23465555906295776, + "learning_rate": 2.7594e-05, + "loss": 0.0442, + "step": 17224 + }, + { + "epoch": 13.563213863725876, + "grad_norm": 0.08655966073274612, + "learning_rate": 2.7593666666666666e-05, + "loss": 0.0058, + "step": 17225 + }, + { + "epoch": 13.564001575423395, + "grad_norm": 0.2902969717979431, + "learning_rate": 2.7593333333333336e-05, + "loss": 0.0093, + "step": 17226 + }, + { + "epoch": 13.564789287120913, + "grad_norm": 0.251508891582489, + "learning_rate": 2.7592999999999998e-05, + "loss": 0.0119, + "step": 17227 + }, + { + "epoch": 13.565576998818432, + "grad_norm": 0.07528044283390045, + "learning_rate": 2.7592666666666668e-05, + "loss": 0.0061, + "step": 17228 + }, + { + "epoch": 13.566364710515952, + "grad_norm": 0.26269087195396423, + "learning_rate": 2.7592333333333333e-05, + "loss": 0.0092, + "step": 17229 + }, + { + "epoch": 13.56715242221347, + "grad_norm": 0.20829567313194275, + "learning_rate": 2.7592e-05, + "loss": 0.0135, + "step": 17230 + }, + { + "epoch": 13.567940133910989, + "grad_norm": 0.15572692453861237, + "learning_rate": 2.759166666666667e-05, + "loss": 0.0076, + "step": 17231 + }, + { + "epoch": 13.568727845608507, + "grad_norm": 0.1033264547586441, + "learning_rate": 2.7591333333333335e-05, + "loss": 0.0053, + "step": 17232 + }, + { + "epoch": 13.569515557306026, + "grad_norm": 0.17904791235923767, + "learning_rate": 2.7591e-05, + "loss": 0.0069, + "step": 17233 + }, + { + "epoch": 13.570303269003544, + "grad_norm": 0.21964669227600098, + "learning_rate": 2.7590666666666667e-05, + "loss": 0.0063, + "step": 17234 + }, + { + "epoch": 13.571090980701063, + "grad_norm": 0.1053638607263565, + "learning_rate": 2.7590333333333336e-05, + "loss": 0.0062, + "step": 17235 + }, + { + "epoch": 13.571878692398583, + "grad_norm": 0.31908249855041504, + "learning_rate": 2.759e-05, + "loss": 
0.005, + "step": 17236 + }, + { + "epoch": 13.5726664040961, + "grad_norm": 0.0938085988163948, + "learning_rate": 2.7589666666666668e-05, + "loss": 0.0065, + "step": 17237 + }, + { + "epoch": 13.57345411579362, + "grad_norm": 0.07146665453910828, + "learning_rate": 2.7589333333333334e-05, + "loss": 0.0048, + "step": 17238 + }, + { + "epoch": 13.574241827491138, + "grad_norm": 0.13159193098545074, + "learning_rate": 2.7589e-05, + "loss": 0.0045, + "step": 17239 + }, + { + "epoch": 13.575029539188657, + "grad_norm": 0.17702822387218475, + "learning_rate": 2.758866666666667e-05, + "loss": 0.0121, + "step": 17240 + }, + { + "epoch": 13.575817250886175, + "grad_norm": 0.1784086674451828, + "learning_rate": 2.7588333333333335e-05, + "loss": 0.0059, + "step": 17241 + }, + { + "epoch": 13.576604962583694, + "grad_norm": 0.13375824689865112, + "learning_rate": 2.7588e-05, + "loss": 0.0076, + "step": 17242 + }, + { + "epoch": 13.577392674281214, + "grad_norm": 0.12379100173711777, + "learning_rate": 2.7587666666666667e-05, + "loss": 0.0066, + "step": 17243 + }, + { + "epoch": 13.578180385978731, + "grad_norm": 0.2013128399848938, + "learning_rate": 2.7587333333333336e-05, + "loss": 0.0061, + "step": 17244 + }, + { + "epoch": 13.57896809767625, + "grad_norm": 0.11666063964366913, + "learning_rate": 2.7587e-05, + "loss": 0.0052, + "step": 17245 + }, + { + "epoch": 13.579755809373768, + "grad_norm": 0.252899706363678, + "learning_rate": 2.7586666666666668e-05, + "loss": 0.0089, + "step": 17246 + }, + { + "epoch": 13.580543521071288, + "grad_norm": 0.2966325581073761, + "learning_rate": 2.7586333333333337e-05, + "loss": 0.0126, + "step": 17247 + }, + { + "epoch": 13.581331232768807, + "grad_norm": 0.09657847881317139, + "learning_rate": 2.7586e-05, + "loss": 0.003, + "step": 17248 + }, + { + "epoch": 13.582118944466325, + "grad_norm": 0.13012556731700897, + "learning_rate": 2.758566666666667e-05, + "loss": 0.0054, + "step": 17249 + }, + { + "epoch": 13.582906656163845, + "grad_norm": 0.1593552976846695, + "learning_rate": 2.758533333333333e-05, + "loss": 0.0047, + "step": 17250 + }, + { + "epoch": 13.583694367861362, + "grad_norm": 0.5596591830253601, + "learning_rate": 2.7585e-05, + "loss": 0.0156, + "step": 17251 + }, + { + "epoch": 13.584482079558882, + "grad_norm": 0.3905711770057678, + "learning_rate": 2.7584666666666667e-05, + "loss": 0.0142, + "step": 17252 + }, + { + "epoch": 13.5852697912564, + "grad_norm": 0.1611802577972412, + "learning_rate": 2.7584333333333333e-05, + "loss": 0.0068, + "step": 17253 + }, + { + "epoch": 13.586057502953919, + "grad_norm": 0.42491739988327026, + "learning_rate": 2.7584e-05, + "loss": 0.011, + "step": 17254 + }, + { + "epoch": 13.586845214651438, + "grad_norm": 0.2588317096233368, + "learning_rate": 2.7583666666666668e-05, + "loss": 0.0074, + "step": 17255 + }, + { + "epoch": 13.587632926348956, + "grad_norm": 0.15491998195648193, + "learning_rate": 2.7583333333333334e-05, + "loss": 0.0047, + "step": 17256 + }, + { + "epoch": 13.588420638046475, + "grad_norm": 0.16236373782157898, + "learning_rate": 2.7583e-05, + "loss": 0.0048, + "step": 17257 + }, + { + "epoch": 13.589208349743993, + "grad_norm": 0.5395763516426086, + "learning_rate": 2.758266666666667e-05, + "loss": 0.0165, + "step": 17258 + }, + { + "epoch": 13.589996061441513, + "grad_norm": 0.5430200099945068, + "learning_rate": 2.7582333333333332e-05, + "loss": 0.0169, + "step": 17259 + }, + { + "epoch": 13.59078377313903, + "grad_norm": 0.22034952044487, + "learning_rate": 2.7582e-05, + "loss": 0.0052, 
+ "step": 17260 + }, + { + "epoch": 13.59157148483655, + "grad_norm": 0.6535844802856445, + "learning_rate": 2.7581666666666667e-05, + "loss": 0.1846, + "step": 17261 + }, + { + "epoch": 13.592359196534069, + "grad_norm": 0.5432119965553284, + "learning_rate": 2.7581333333333333e-05, + "loss": 0.095, + "step": 17262 + }, + { + "epoch": 13.593146908231587, + "grad_norm": 0.4830615222454071, + "learning_rate": 2.7581e-05, + "loss": 0.0718, + "step": 17263 + }, + { + "epoch": 13.593934619929106, + "grad_norm": 0.3660663068294525, + "learning_rate": 2.758066666666667e-05, + "loss": 0.0479, + "step": 17264 + }, + { + "epoch": 13.594722331626624, + "grad_norm": 0.25128018856048584, + "learning_rate": 2.7580333333333334e-05, + "loss": 0.0294, + "step": 17265 + }, + { + "epoch": 13.595510043324143, + "grad_norm": 1.756528377532959, + "learning_rate": 2.758e-05, + "loss": 0.0848, + "step": 17266 + }, + { + "epoch": 13.596297755021663, + "grad_norm": 0.25434622168540955, + "learning_rate": 2.757966666666667e-05, + "loss": 0.0151, + "step": 17267 + }, + { + "epoch": 13.59708546671918, + "grad_norm": 0.18146809935569763, + "learning_rate": 2.7579333333333332e-05, + "loss": 0.0166, + "step": 17268 + }, + { + "epoch": 13.5978731784167, + "grad_norm": 0.32520854473114014, + "learning_rate": 2.7579e-05, + "loss": 0.0377, + "step": 17269 + }, + { + "epoch": 13.598660890114218, + "grad_norm": 0.19589108228683472, + "learning_rate": 2.7578666666666667e-05, + "loss": 0.0146, + "step": 17270 + }, + { + "epoch": 13.599448601811737, + "grad_norm": 0.13082419335842133, + "learning_rate": 2.7578333333333333e-05, + "loss": 0.0077, + "step": 17271 + }, + { + "epoch": 13.600236313509257, + "grad_norm": 0.16447541117668152, + "learning_rate": 2.7578000000000003e-05, + "loss": 0.0121, + "step": 17272 + }, + { + "epoch": 13.601024025206774, + "grad_norm": 0.14427460730075836, + "learning_rate": 2.757766666666667e-05, + "loss": 0.0072, + "step": 17273 + }, + { + "epoch": 13.601811736904294, + "grad_norm": 0.0885196179151535, + "learning_rate": 2.7577333333333334e-05, + "loss": 0.0051, + "step": 17274 + }, + { + "epoch": 13.602599448601811, + "grad_norm": 0.10385335981845856, + "learning_rate": 2.7577e-05, + "loss": 0.0069, + "step": 17275 + }, + { + "epoch": 13.60338716029933, + "grad_norm": 0.2020082026720047, + "learning_rate": 2.757666666666667e-05, + "loss": 0.0101, + "step": 17276 + }, + { + "epoch": 13.604174871996848, + "grad_norm": 0.09475422650575638, + "learning_rate": 2.7576333333333332e-05, + "loss": 0.0043, + "step": 17277 + }, + { + "epoch": 13.604962583694368, + "grad_norm": 0.16733413934707642, + "learning_rate": 2.7576e-05, + "loss": 0.0078, + "step": 17278 + }, + { + "epoch": 13.605750295391886, + "grad_norm": 0.22725172340869904, + "learning_rate": 2.7575666666666668e-05, + "loss": 0.0095, + "step": 17279 + }, + { + "epoch": 13.606538007089405, + "grad_norm": 0.12934817373752594, + "learning_rate": 2.7575333333333333e-05, + "loss": 0.0068, + "step": 17280 + }, + { + "epoch": 13.607325718786925, + "grad_norm": 0.0968584343791008, + "learning_rate": 2.7575000000000003e-05, + "loss": 0.0048, + "step": 17281 + }, + { + "epoch": 13.608113430484442, + "grad_norm": 0.13802562654018402, + "learning_rate": 2.7574666666666665e-05, + "loss": 0.0054, + "step": 17282 + }, + { + "epoch": 13.608901142181962, + "grad_norm": 0.41249948740005493, + "learning_rate": 2.7574333333333335e-05, + "loss": 0.0104, + "step": 17283 + }, + { + "epoch": 13.60968885387948, + "grad_norm": 0.3742504119873047, + "learning_rate": 
2.7574e-05, + "loss": 0.0115, + "step": 17284 + }, + { + "epoch": 13.610476565576999, + "grad_norm": 0.29116371273994446, + "learning_rate": 2.7573666666666667e-05, + "loss": 0.0089, + "step": 17285 + }, + { + "epoch": 13.611264277274518, + "grad_norm": 0.2087584286928177, + "learning_rate": 2.7573333333333332e-05, + "loss": 0.014, + "step": 17286 + }, + { + "epoch": 13.612051988972036, + "grad_norm": 0.1290629357099533, + "learning_rate": 2.7573000000000002e-05, + "loss": 0.0078, + "step": 17287 + }, + { + "epoch": 13.612839700669555, + "grad_norm": 0.17685773968696594, + "learning_rate": 2.7572666666666664e-05, + "loss": 0.0072, + "step": 17288 + }, + { + "epoch": 13.613627412367073, + "grad_norm": 0.20420783758163452, + "learning_rate": 2.7572333333333334e-05, + "loss": 0.0138, + "step": 17289 + }, + { + "epoch": 13.614415124064593, + "grad_norm": 0.16939009726047516, + "learning_rate": 2.7572000000000003e-05, + "loss": 0.0091, + "step": 17290 + }, + { + "epoch": 13.615202835762112, + "grad_norm": 0.16076123714447021, + "learning_rate": 2.7571666666666666e-05, + "loss": 0.0054, + "step": 17291 + }, + { + "epoch": 13.61599054745963, + "grad_norm": 0.10140927135944366, + "learning_rate": 2.7571333333333335e-05, + "loss": 0.0041, + "step": 17292 + }, + { + "epoch": 13.61677825915715, + "grad_norm": 0.10957474261522293, + "learning_rate": 2.7571e-05, + "loss": 0.0062, + "step": 17293 + }, + { + "epoch": 13.617565970854667, + "grad_norm": 0.26778659224510193, + "learning_rate": 2.7570666666666667e-05, + "loss": 0.0091, + "step": 17294 + }, + { + "epoch": 13.618353682552186, + "grad_norm": 0.46174582839012146, + "learning_rate": 2.7570333333333333e-05, + "loss": 0.0112, + "step": 17295 + }, + { + "epoch": 13.619141394249704, + "grad_norm": 0.10395587235689163, + "learning_rate": 2.7570000000000002e-05, + "loss": 0.0035, + "step": 17296 + }, + { + "epoch": 13.619929105947223, + "grad_norm": 0.09450868517160416, + "learning_rate": 2.7569666666666668e-05, + "loss": 0.0036, + "step": 17297 + }, + { + "epoch": 13.620716817644743, + "grad_norm": 0.26271942257881165, + "learning_rate": 2.7569333333333334e-05, + "loss": 0.0082, + "step": 17298 + }, + { + "epoch": 13.62150452934226, + "grad_norm": 0.18091799318790436, + "learning_rate": 2.7569000000000003e-05, + "loss": 0.0056, + "step": 17299 + }, + { + "epoch": 13.62229224103978, + "grad_norm": 0.3209778368473053, + "learning_rate": 2.7568666666666666e-05, + "loss": 0.0075, + "step": 17300 + }, + { + "epoch": 13.623079952737298, + "grad_norm": 0.09612836688756943, + "learning_rate": 2.7568333333333335e-05, + "loss": 0.0051, + "step": 17301 + }, + { + "epoch": 13.623867664434817, + "grad_norm": 0.2105482667684555, + "learning_rate": 2.7568e-05, + "loss": 0.008, + "step": 17302 + }, + { + "epoch": 13.624655376132335, + "grad_norm": 0.24086259305477142, + "learning_rate": 2.7567666666666667e-05, + "loss": 0.0041, + "step": 17303 + }, + { + "epoch": 13.625443087829854, + "grad_norm": 0.10223553329706192, + "learning_rate": 2.7567333333333333e-05, + "loss": 0.0041, + "step": 17304 + }, + { + "epoch": 13.626230799527374, + "grad_norm": 0.3605995178222656, + "learning_rate": 2.7567000000000002e-05, + "loss": 0.0105, + "step": 17305 + }, + { + "epoch": 13.627018511224891, + "grad_norm": 0.30473464727401733, + "learning_rate": 2.7566666666666668e-05, + "loss": 0.0151, + "step": 17306 + }, + { + "epoch": 13.62780622292241, + "grad_norm": 0.1876835823059082, + "learning_rate": 2.7566333333333334e-05, + "loss": 0.0041, + "step": 17307 + }, + { + "epoch": 
13.628593934619929, + "grad_norm": 0.24454620480537415, + "learning_rate": 2.7566000000000003e-05, + "loss": 0.0142, + "step": 17308 + }, + { + "epoch": 13.629381646317448, + "grad_norm": 0.6347468495368958, + "learning_rate": 2.7565666666666666e-05, + "loss": 0.0104, + "step": 17309 + }, + { + "epoch": 13.630169358014967, + "grad_norm": 0.1569562703371048, + "learning_rate": 2.7565333333333335e-05, + "loss": 0.0029, + "step": 17310 + }, + { + "epoch": 13.630957069712485, + "grad_norm": 0.625018835067749, + "learning_rate": 2.7564999999999998e-05, + "loss": 0.1454, + "step": 17311 + }, + { + "epoch": 13.631744781410005, + "grad_norm": 0.6514822840690613, + "learning_rate": 2.7564666666666667e-05, + "loss": 0.1211, + "step": 17312 + }, + { + "epoch": 13.632532493107522, + "grad_norm": 0.3016030192375183, + "learning_rate": 2.7564333333333333e-05, + "loss": 0.0941, + "step": 17313 + }, + { + "epoch": 13.633320204805042, + "grad_norm": 0.4451550543308258, + "learning_rate": 2.7564e-05, + "loss": 0.0518, + "step": 17314 + }, + { + "epoch": 13.63410791650256, + "grad_norm": 0.4599483907222748, + "learning_rate": 2.756366666666667e-05, + "loss": 0.0643, + "step": 17315 + }, + { + "epoch": 13.634895628200079, + "grad_norm": 0.7560158371925354, + "learning_rate": 2.7563333333333334e-05, + "loss": 0.0335, + "step": 17316 + }, + { + "epoch": 13.635683339897598, + "grad_norm": 0.2746305763721466, + "learning_rate": 2.7563e-05, + "loss": 0.0557, + "step": 17317 + }, + { + "epoch": 13.636471051595116, + "grad_norm": 0.222477987408638, + "learning_rate": 2.7562666666666666e-05, + "loss": 0.012, + "step": 17318 + }, + { + "epoch": 13.637258763292635, + "grad_norm": 0.2707042992115021, + "learning_rate": 2.7562333333333335e-05, + "loss": 0.0142, + "step": 17319 + }, + { + "epoch": 13.638046474990153, + "grad_norm": 0.3111862242221832, + "learning_rate": 2.7561999999999998e-05, + "loss": 0.0117, + "step": 17320 + }, + { + "epoch": 13.638834186687673, + "grad_norm": 0.12083521485328674, + "learning_rate": 2.7561666666666667e-05, + "loss": 0.0111, + "step": 17321 + }, + { + "epoch": 13.63962189838519, + "grad_norm": 0.24073924124240875, + "learning_rate": 2.7561333333333337e-05, + "loss": 0.0303, + "step": 17322 + }, + { + "epoch": 13.64040961008271, + "grad_norm": 0.15078069269657135, + "learning_rate": 2.7561e-05, + "loss": 0.0077, + "step": 17323 + }, + { + "epoch": 13.64119732178023, + "grad_norm": 0.7668417096138, + "learning_rate": 2.756066666666667e-05, + "loss": 0.0162, + "step": 17324 + }, + { + "epoch": 13.641985033477747, + "grad_norm": 0.17006923258304596, + "learning_rate": 2.7560333333333334e-05, + "loss": 0.0114, + "step": 17325 + }, + { + "epoch": 13.642772745175266, + "grad_norm": 0.25706779956817627, + "learning_rate": 2.756e-05, + "loss": 0.0134, + "step": 17326 + }, + { + "epoch": 13.643560456872784, + "grad_norm": 0.477575421333313, + "learning_rate": 2.7559666666666666e-05, + "loss": 0.0111, + "step": 17327 + }, + { + "epoch": 13.644348168570303, + "grad_norm": 0.22444549202919006, + "learning_rate": 2.7559333333333336e-05, + "loss": 0.0082, + "step": 17328 + }, + { + "epoch": 13.645135880267823, + "grad_norm": 0.29491499066352844, + "learning_rate": 2.7558999999999998e-05, + "loss": 0.0181, + "step": 17329 + }, + { + "epoch": 13.64592359196534, + "grad_norm": 0.2315109670162201, + "learning_rate": 2.7558666666666668e-05, + "loss": 0.0093, + "step": 17330 + }, + { + "epoch": 13.64671130366286, + "grad_norm": 0.30611225962638855, + "learning_rate": 2.7558333333333337e-05, + "loss": 
0.0103, + "step": 17331 + }, + { + "epoch": 13.647499015360378, + "grad_norm": 0.17118306457996368, + "learning_rate": 2.7558e-05, + "loss": 0.0097, + "step": 17332 + }, + { + "epoch": 13.648286727057897, + "grad_norm": 1.6850849390029907, + "learning_rate": 2.755766666666667e-05, + "loss": 0.0108, + "step": 17333 + }, + { + "epoch": 13.649074438755415, + "grad_norm": 0.1655188947916031, + "learning_rate": 2.7557333333333335e-05, + "loss": 0.0071, + "step": 17334 + }, + { + "epoch": 13.649862150452934, + "grad_norm": 0.14517635107040405, + "learning_rate": 2.7557e-05, + "loss": 0.0091, + "step": 17335 + }, + { + "epoch": 13.650649862150454, + "grad_norm": 0.0553760901093483, + "learning_rate": 2.7556666666666667e-05, + "loss": 0.0028, + "step": 17336 + }, + { + "epoch": 13.651437573847971, + "grad_norm": 0.07878680527210236, + "learning_rate": 2.7556333333333336e-05, + "loss": 0.0049, + "step": 17337 + }, + { + "epoch": 13.65222528554549, + "grad_norm": 0.21623218059539795, + "learning_rate": 2.7556e-05, + "loss": 0.0136, + "step": 17338 + }, + { + "epoch": 13.653012997243009, + "grad_norm": 1.1829575300216675, + "learning_rate": 2.7555666666666668e-05, + "loss": 0.0152, + "step": 17339 + }, + { + "epoch": 13.653800708940528, + "grad_norm": 0.7119274735450745, + "learning_rate": 2.7555333333333334e-05, + "loss": 0.0182, + "step": 17340 + }, + { + "epoch": 13.654588420638046, + "grad_norm": 0.12282679975032806, + "learning_rate": 2.7555e-05, + "loss": 0.0051, + "step": 17341 + }, + { + "epoch": 13.655376132335565, + "grad_norm": 0.14322198927402496, + "learning_rate": 2.755466666666667e-05, + "loss": 0.0055, + "step": 17342 + }, + { + "epoch": 13.656163844033085, + "grad_norm": 0.2440643012523651, + "learning_rate": 2.755433333333333e-05, + "loss": 0.0084, + "step": 17343 + }, + { + "epoch": 13.656951555730602, + "grad_norm": 0.23700320720672607, + "learning_rate": 2.7554e-05, + "loss": 0.0141, + "step": 17344 + }, + { + "epoch": 13.657739267428122, + "grad_norm": 0.3473334312438965, + "learning_rate": 2.7553666666666667e-05, + "loss": 0.0145, + "step": 17345 + }, + { + "epoch": 13.65852697912564, + "grad_norm": 0.38646399974823, + "learning_rate": 2.7553333333333333e-05, + "loss": 0.0151, + "step": 17346 + }, + { + "epoch": 13.659314690823159, + "grad_norm": 0.22191134095191956, + "learning_rate": 2.7553000000000002e-05, + "loss": 0.0088, + "step": 17347 + }, + { + "epoch": 13.660102402520678, + "grad_norm": 0.5844201445579529, + "learning_rate": 2.7552666666666668e-05, + "loss": 0.0216, + "step": 17348 + }, + { + "epoch": 13.660890114218196, + "grad_norm": 0.19807857275009155, + "learning_rate": 2.7552333333333334e-05, + "loss": 0.0086, + "step": 17349 + }, + { + "epoch": 13.661677825915715, + "grad_norm": 0.8499236106872559, + "learning_rate": 2.7552e-05, + "loss": 0.0218, + "step": 17350 + }, + { + "epoch": 13.662465537613233, + "grad_norm": 0.14841552078723907, + "learning_rate": 2.755166666666667e-05, + "loss": 0.0085, + "step": 17351 + }, + { + "epoch": 13.663253249310753, + "grad_norm": 0.09218233078718185, + "learning_rate": 2.755133333333333e-05, + "loss": 0.0042, + "step": 17352 + }, + { + "epoch": 13.66404096100827, + "grad_norm": 0.33655840158462524, + "learning_rate": 2.7551e-05, + "loss": 0.0093, + "step": 17353 + }, + { + "epoch": 13.66482867270579, + "grad_norm": 0.5773738026618958, + "learning_rate": 2.7550666666666667e-05, + "loss": 0.0308, + "step": 17354 + }, + { + "epoch": 13.66561638440331, + "grad_norm": 0.3184424936771393, + "learning_rate": 2.7550333333333333e-05, 
+ "loss": 0.035, + "step": 17355 + }, + { + "epoch": 13.666404096100827, + "grad_norm": 0.36192673444747925, + "learning_rate": 2.7550000000000002e-05, + "loss": 0.0111, + "step": 17356 + }, + { + "epoch": 13.667191807798346, + "grad_norm": 0.19594639539718628, + "learning_rate": 2.7549666666666668e-05, + "loss": 0.01, + "step": 17357 + }, + { + "epoch": 13.667979519495864, + "grad_norm": 0.11810939759016037, + "learning_rate": 2.7549333333333334e-05, + "loss": 0.0063, + "step": 17358 + }, + { + "epoch": 13.668767231193383, + "grad_norm": 0.6294525265693665, + "learning_rate": 2.7549e-05, + "loss": 0.0217, + "step": 17359 + }, + { + "epoch": 13.669554942890901, + "grad_norm": 0.389157772064209, + "learning_rate": 2.754866666666667e-05, + "loss": 0.0173, + "step": 17360 + }, + { + "epoch": 13.67034265458842, + "grad_norm": 0.6290896534919739, + "learning_rate": 2.7548333333333332e-05, + "loss": 0.1445, + "step": 17361 + }, + { + "epoch": 13.67113036628594, + "grad_norm": 0.7316772937774658, + "learning_rate": 2.7548e-05, + "loss": 0.1989, + "step": 17362 + }, + { + "epoch": 13.671918077983458, + "grad_norm": 0.390835165977478, + "learning_rate": 2.7547666666666667e-05, + "loss": 0.0785, + "step": 17363 + }, + { + "epoch": 13.672705789680977, + "grad_norm": 0.4241999685764313, + "learning_rate": 2.7547333333333333e-05, + "loss": 0.0814, + "step": 17364 + }, + { + "epoch": 13.673493501378495, + "grad_norm": 0.40199801325798035, + "learning_rate": 2.7547000000000002e-05, + "loss": 0.0347, + "step": 17365 + }, + { + "epoch": 13.674281213076014, + "grad_norm": 0.18746700882911682, + "learning_rate": 2.754666666666667e-05, + "loss": 0.0149, + "step": 17366 + }, + { + "epoch": 13.675068924773534, + "grad_norm": 0.2688598334789276, + "learning_rate": 2.7546333333333334e-05, + "loss": 0.0226, + "step": 17367 + }, + { + "epoch": 13.675856636471051, + "grad_norm": 0.22843113541603088, + "learning_rate": 2.7546e-05, + "loss": 0.0315, + "step": 17368 + }, + { + "epoch": 13.67664434816857, + "grad_norm": 0.16832458972930908, + "learning_rate": 2.754566666666667e-05, + "loss": 0.009, + "step": 17369 + }, + { + "epoch": 13.677432059866089, + "grad_norm": 0.5216951370239258, + "learning_rate": 2.7545333333333332e-05, + "loss": 0.0144, + "step": 17370 + }, + { + "epoch": 13.678219771563608, + "grad_norm": 0.578857958316803, + "learning_rate": 2.7545e-05, + "loss": 0.0174, + "step": 17371 + }, + { + "epoch": 13.679007483261126, + "grad_norm": 0.31252819299697876, + "learning_rate": 2.7544666666666667e-05, + "loss": 0.0105, + "step": 17372 + }, + { + "epoch": 13.679795194958645, + "grad_norm": 0.28069746494293213, + "learning_rate": 2.7544333333333333e-05, + "loss": 0.0113, + "step": 17373 + }, + { + "epoch": 13.680582906656165, + "grad_norm": 0.26851266622543335, + "learning_rate": 2.7544000000000003e-05, + "loss": 0.0202, + "step": 17374 + }, + { + "epoch": 13.681370618353682, + "grad_norm": 0.4552655518054962, + "learning_rate": 2.7543666666666665e-05, + "loss": 0.0154, + "step": 17375 + }, + { + "epoch": 13.682158330051202, + "grad_norm": 0.3385179042816162, + "learning_rate": 2.7543333333333334e-05, + "loss": 0.0085, + "step": 17376 + }, + { + "epoch": 13.68294604174872, + "grad_norm": 0.1517200469970703, + "learning_rate": 2.7543e-05, + "loss": 0.0079, + "step": 17377 + }, + { + "epoch": 13.683733753446239, + "grad_norm": 0.2380300611257553, + "learning_rate": 2.7542666666666666e-05, + "loss": 0.0079, + "step": 17378 + }, + { + "epoch": 13.684521465143757, + "grad_norm": 0.32042962312698364, + 
"learning_rate": 2.7542333333333332e-05, + "loss": 0.0161, + "step": 17379 + }, + { + "epoch": 13.685309176841276, + "grad_norm": 0.46325793862342834, + "learning_rate": 2.7542e-05, + "loss": 0.0084, + "step": 17380 + }, + { + "epoch": 13.686096888538795, + "grad_norm": 0.2951979637145996, + "learning_rate": 2.7541666666666668e-05, + "loss": 0.0147, + "step": 17381 + }, + { + "epoch": 13.686884600236313, + "grad_norm": 0.11322201788425446, + "learning_rate": 2.7541333333333333e-05, + "loss": 0.0058, + "step": 17382 + }, + { + "epoch": 13.687672311933833, + "grad_norm": 0.1958107203245163, + "learning_rate": 2.7541000000000003e-05, + "loss": 0.0071, + "step": 17383 + }, + { + "epoch": 13.68846002363135, + "grad_norm": 0.23233522474765778, + "learning_rate": 2.7540666666666665e-05, + "loss": 0.0063, + "step": 17384 + }, + { + "epoch": 13.68924773532887, + "grad_norm": 0.3943394422531128, + "learning_rate": 2.7540333333333335e-05, + "loss": 0.0198, + "step": 17385 + }, + { + "epoch": 13.69003544702639, + "grad_norm": 0.17281027138233185, + "learning_rate": 2.754e-05, + "loss": 0.0052, + "step": 17386 + }, + { + "epoch": 13.690823158723907, + "grad_norm": 0.4873805046081543, + "learning_rate": 2.7539666666666667e-05, + "loss": 0.0137, + "step": 17387 + }, + { + "epoch": 13.691610870421426, + "grad_norm": 0.23828300833702087, + "learning_rate": 2.7539333333333332e-05, + "loss": 0.0087, + "step": 17388 + }, + { + "epoch": 13.692398582118944, + "grad_norm": 0.4267806112766266, + "learning_rate": 2.7539000000000002e-05, + "loss": 0.0166, + "step": 17389 + }, + { + "epoch": 13.693186293816463, + "grad_norm": 0.2663990259170532, + "learning_rate": 2.7538666666666668e-05, + "loss": 0.0085, + "step": 17390 + }, + { + "epoch": 13.693974005513981, + "grad_norm": 0.22889600694179535, + "learning_rate": 2.7538333333333334e-05, + "loss": 0.0108, + "step": 17391 + }, + { + "epoch": 13.6947617172115, + "grad_norm": 0.17378868162631989, + "learning_rate": 2.7538000000000003e-05, + "loss": 0.0119, + "step": 17392 + }, + { + "epoch": 13.69554942890902, + "grad_norm": 0.2690667510032654, + "learning_rate": 2.7537666666666666e-05, + "loss": 0.0145, + "step": 17393 + }, + { + "epoch": 13.696337140606538, + "grad_norm": 0.3612821698188782, + "learning_rate": 2.7537333333333335e-05, + "loss": 0.0097, + "step": 17394 + }, + { + "epoch": 13.697124852304057, + "grad_norm": 0.3293192684650421, + "learning_rate": 2.7537e-05, + "loss": 0.0182, + "step": 17395 + }, + { + "epoch": 13.697912564001575, + "grad_norm": 0.18732766807079315, + "learning_rate": 2.7536666666666667e-05, + "loss": 0.0115, + "step": 17396 + }, + { + "epoch": 13.698700275699094, + "grad_norm": 0.17860420048236847, + "learning_rate": 2.7536333333333336e-05, + "loss": 0.0073, + "step": 17397 + }, + { + "epoch": 13.699487987396612, + "grad_norm": 0.15245065093040466, + "learning_rate": 2.7536000000000002e-05, + "loss": 0.006, + "step": 17398 + }, + { + "epoch": 13.700275699094131, + "grad_norm": 0.13443422317504883, + "learning_rate": 2.7535666666666668e-05, + "loss": 0.0032, + "step": 17399 + }, + { + "epoch": 13.701063410791651, + "grad_norm": 0.3578752279281616, + "learning_rate": 2.7535333333333334e-05, + "loss": 0.0161, + "step": 17400 + }, + { + "epoch": 13.701851122489169, + "grad_norm": 0.15521815419197083, + "learning_rate": 2.7535e-05, + "loss": 0.0107, + "step": 17401 + }, + { + "epoch": 13.702638834186688, + "grad_norm": 0.26644065976142883, + "learning_rate": 2.7534666666666666e-05, + "loss": 0.0073, + "step": 17402 + }, + { + "epoch": 
13.703426545884206, + "grad_norm": 0.27335458993911743, + "learning_rate": 2.7534333333333335e-05, + "loss": 0.0095, + "step": 17403 + }, + { + "epoch": 13.704214257581725, + "grad_norm": 0.12840305268764496, + "learning_rate": 2.7533999999999998e-05, + "loss": 0.0082, + "step": 17404 + }, + { + "epoch": 13.705001969279245, + "grad_norm": 0.2959338128566742, + "learning_rate": 2.7533666666666667e-05, + "loss": 0.0085, + "step": 17405 + }, + { + "epoch": 13.705789680976762, + "grad_norm": 0.18679293990135193, + "learning_rate": 2.7533333333333336e-05, + "loss": 0.0083, + "step": 17406 + }, + { + "epoch": 13.706577392674282, + "grad_norm": 0.350589394569397, + "learning_rate": 2.7533e-05, + "loss": 0.0122, + "step": 17407 + }, + { + "epoch": 13.7073651043718, + "grad_norm": 0.4112795889377594, + "learning_rate": 2.7532666666666668e-05, + "loss": 0.0079, + "step": 17408 + }, + { + "epoch": 13.708152816069319, + "grad_norm": 0.19348010420799255, + "learning_rate": 2.7532333333333334e-05, + "loss": 0.0082, + "step": 17409 + }, + { + "epoch": 13.708940527766837, + "grad_norm": 0.28088825941085815, + "learning_rate": 2.7532e-05, + "loss": 0.0067, + "step": 17410 + }, + { + "epoch": 13.709728239464356, + "grad_norm": 0.6742435693740845, + "learning_rate": 2.7531666666666666e-05, + "loss": 0.175, + "step": 17411 + }, + { + "epoch": 13.710515951161875, + "grad_norm": 0.5179862380027771, + "learning_rate": 2.7531333333333335e-05, + "loss": 0.1599, + "step": 17412 + }, + { + "epoch": 13.711303662859393, + "grad_norm": 0.5291458368301392, + "learning_rate": 2.7531e-05, + "loss": 0.1234, + "step": 17413 + }, + { + "epoch": 13.712091374556913, + "grad_norm": 0.5921914577484131, + "learning_rate": 2.7530666666666667e-05, + "loss": 0.0881, + "step": 17414 + }, + { + "epoch": 13.71287908625443, + "grad_norm": 0.31119126081466675, + "learning_rate": 2.7530333333333336e-05, + "loss": 0.0368, + "step": 17415 + }, + { + "epoch": 13.71366679795195, + "grad_norm": 0.47284218668937683, + "learning_rate": 2.753e-05, + "loss": 0.0396, + "step": 17416 + }, + { + "epoch": 13.714454509649467, + "grad_norm": 0.11211104691028595, + "learning_rate": 2.752966666666667e-05, + "loss": 0.0098, + "step": 17417 + }, + { + "epoch": 13.715242221346987, + "grad_norm": 0.2601950764656067, + "learning_rate": 2.7529333333333334e-05, + "loss": 0.0169, + "step": 17418 + }, + { + "epoch": 13.716029933044506, + "grad_norm": 0.08356798440217972, + "learning_rate": 2.7529e-05, + "loss": 0.0078, + "step": 17419 + }, + { + "epoch": 13.716817644742024, + "grad_norm": 0.27128833532333374, + "learning_rate": 2.7528666666666666e-05, + "loss": 0.0138, + "step": 17420 + }, + { + "epoch": 13.717605356439543, + "grad_norm": 0.2227376252412796, + "learning_rate": 2.7528333333333335e-05, + "loss": 0.0117, + "step": 17421 + }, + { + "epoch": 13.718393068137061, + "grad_norm": 0.12356076389551163, + "learning_rate": 2.7528e-05, + "loss": 0.0169, + "step": 17422 + }, + { + "epoch": 13.71918077983458, + "grad_norm": 0.19841425120830536, + "learning_rate": 2.7527666666666667e-05, + "loss": 0.0127, + "step": 17423 + }, + { + "epoch": 13.7199684915321, + "grad_norm": 0.12808744609355927, + "learning_rate": 2.7527333333333337e-05, + "loss": 0.0056, + "step": 17424 + }, + { + "epoch": 13.720756203229618, + "grad_norm": 0.17652249336242676, + "learning_rate": 2.7527e-05, + "loss": 0.0133, + "step": 17425 + }, + { + "epoch": 13.721543914927137, + "grad_norm": 0.18288259208202362, + "learning_rate": 2.752666666666667e-05, + "loss": 0.0076, + "step": 17426 + }, + 
{ + "epoch": 13.722331626624655, + "grad_norm": 0.1774977743625641, + "learning_rate": 2.7526333333333334e-05, + "loss": 0.0251, + "step": 17427 + }, + { + "epoch": 13.723119338322174, + "grad_norm": 0.15553796291351318, + "learning_rate": 2.7526e-05, + "loss": 0.0086, + "step": 17428 + }, + { + "epoch": 13.723907050019692, + "grad_norm": 0.09258756786584854, + "learning_rate": 2.7525666666666666e-05, + "loss": 0.0039, + "step": 17429 + }, + { + "epoch": 13.724694761717211, + "grad_norm": 0.3659302294254303, + "learning_rate": 2.7525333333333336e-05, + "loss": 0.0176, + "step": 17430 + }, + { + "epoch": 13.725482473414731, + "grad_norm": 0.2824702262878418, + "learning_rate": 2.7525e-05, + "loss": 0.0161, + "step": 17431 + }, + { + "epoch": 13.726270185112249, + "grad_norm": 0.13062497973442078, + "learning_rate": 2.7524666666666668e-05, + "loss": 0.0072, + "step": 17432 + }, + { + "epoch": 13.727057896809768, + "grad_norm": 0.2515506148338318, + "learning_rate": 2.7524333333333333e-05, + "loss": 0.0087, + "step": 17433 + }, + { + "epoch": 13.727845608507286, + "grad_norm": 0.19051755964756012, + "learning_rate": 2.7524e-05, + "loss": 0.01, + "step": 17434 + }, + { + "epoch": 13.728633320204805, + "grad_norm": 0.3867374360561371, + "learning_rate": 2.752366666666667e-05, + "loss": 0.0104, + "step": 17435 + }, + { + "epoch": 13.729421031902323, + "grad_norm": 0.22713381052017212, + "learning_rate": 2.752333333333333e-05, + "loss": 0.0087, + "step": 17436 + }, + { + "epoch": 13.730208743599842, + "grad_norm": 0.6062530279159546, + "learning_rate": 2.7523e-05, + "loss": 0.0172, + "step": 17437 + }, + { + "epoch": 13.730996455297362, + "grad_norm": 0.3648368716239929, + "learning_rate": 2.7522666666666667e-05, + "loss": 0.0097, + "step": 17438 + }, + { + "epoch": 13.73178416699488, + "grad_norm": 0.1715550422668457, + "learning_rate": 2.7522333333333332e-05, + "loss": 0.0056, + "step": 17439 + }, + { + "epoch": 13.732571878692399, + "grad_norm": 0.1432480365037918, + "learning_rate": 2.7522000000000002e-05, + "loss": 0.0107, + "step": 17440 + }, + { + "epoch": 13.733359590389917, + "grad_norm": 0.12683667242527008, + "learning_rate": 2.7521666666666668e-05, + "loss": 0.0073, + "step": 17441 + }, + { + "epoch": 13.734147302087436, + "grad_norm": 0.19718097150325775, + "learning_rate": 2.7521333333333334e-05, + "loss": 0.0078, + "step": 17442 + }, + { + "epoch": 13.734935013784956, + "grad_norm": 0.13282401859760284, + "learning_rate": 2.7521e-05, + "loss": 0.0067, + "step": 17443 + }, + { + "epoch": 13.735722725482473, + "grad_norm": 0.18525154888629913, + "learning_rate": 2.752066666666667e-05, + "loss": 0.0044, + "step": 17444 + }, + { + "epoch": 13.736510437179993, + "grad_norm": 0.2673681378364563, + "learning_rate": 2.752033333333333e-05, + "loss": 0.0063, + "step": 17445 + }, + { + "epoch": 13.73729814887751, + "grad_norm": 0.18840982019901276, + "learning_rate": 2.752e-05, + "loss": 0.0106, + "step": 17446 + }, + { + "epoch": 13.73808586057503, + "grad_norm": 0.47434160113334656, + "learning_rate": 2.751966666666667e-05, + "loss": 0.0152, + "step": 17447 + }, + { + "epoch": 13.738873572272547, + "grad_norm": 0.29845723509788513, + "learning_rate": 2.7519333333333333e-05, + "loss": 0.0151, + "step": 17448 + }, + { + "epoch": 13.739661283970067, + "grad_norm": 0.20951053500175476, + "learning_rate": 2.7519000000000002e-05, + "loss": 0.0078, + "step": 17449 + }, + { + "epoch": 13.740448995667586, + "grad_norm": 0.17065615952014923, + "learning_rate": 2.7518666666666668e-05, + "loss": 0.006, 
+ "step": 17450 + }, + { + "epoch": 13.741236707365104, + "grad_norm": 0.13883104920387268, + "learning_rate": 2.7518333333333334e-05, + "loss": 0.0047, + "step": 17451 + }, + { + "epoch": 13.742024419062624, + "grad_norm": 0.27181386947631836, + "learning_rate": 2.7518e-05, + "loss": 0.0112, + "step": 17452 + }, + { + "epoch": 13.742812130760141, + "grad_norm": 0.21762163937091827, + "learning_rate": 2.751766666666667e-05, + "loss": 0.0088, + "step": 17453 + }, + { + "epoch": 13.74359984245766, + "grad_norm": 0.23343877494335175, + "learning_rate": 2.751733333333333e-05, + "loss": 0.0117, + "step": 17454 + }, + { + "epoch": 13.744387554155178, + "grad_norm": 0.21848733723163605, + "learning_rate": 2.7517e-05, + "loss": 0.0065, + "step": 17455 + }, + { + "epoch": 13.745175265852698, + "grad_norm": 0.11768017709255219, + "learning_rate": 2.751666666666667e-05, + "loss": 0.0066, + "step": 17456 + }, + { + "epoch": 13.745962977550217, + "grad_norm": 0.06445752829313278, + "learning_rate": 2.7516333333333333e-05, + "loss": 0.0025, + "step": 17457 + }, + { + "epoch": 13.746750689247735, + "grad_norm": 0.2388114035129547, + "learning_rate": 2.7516000000000002e-05, + "loss": 0.0082, + "step": 17458 + }, + { + "epoch": 13.747538400945254, + "grad_norm": 0.17237533628940582, + "learning_rate": 2.7515666666666668e-05, + "loss": 0.0074, + "step": 17459 + }, + { + "epoch": 13.748326112642772, + "grad_norm": 0.4131735563278198, + "learning_rate": 2.7515333333333334e-05, + "loss": 0.0065, + "step": 17460 + }, + { + "epoch": 13.749113824340292, + "grad_norm": 0.5521437525749207, + "learning_rate": 2.7515e-05, + "loss": 0.1305, + "step": 17461 + }, + { + "epoch": 13.749901536037811, + "grad_norm": 0.6085600852966309, + "learning_rate": 2.7514666666666666e-05, + "loss": 0.1365, + "step": 17462 + }, + { + "epoch": 13.750689247735329, + "grad_norm": 0.43895095586776733, + "learning_rate": 2.7514333333333335e-05, + "loss": 0.0961, + "step": 17463 + }, + { + "epoch": 13.751476959432848, + "grad_norm": 0.5018057227134705, + "learning_rate": 2.7514e-05, + "loss": 0.076, + "step": 17464 + }, + { + "epoch": 13.752264671130366, + "grad_norm": 0.5372744798660278, + "learning_rate": 2.7513666666666667e-05, + "loss": 0.0682, + "step": 17465 + }, + { + "epoch": 13.753052382827885, + "grad_norm": 0.32860687375068665, + "learning_rate": 2.7513333333333333e-05, + "loss": 0.065, + "step": 17466 + }, + { + "epoch": 13.753840094525403, + "grad_norm": 0.6365111470222473, + "learning_rate": 2.7513000000000002e-05, + "loss": 0.0167, + "step": 17467 + }, + { + "epoch": 13.754627806222922, + "grad_norm": 0.22650481760501862, + "learning_rate": 2.7512666666666665e-05, + "loss": 0.0194, + "step": 17468 + }, + { + "epoch": 13.755415517920442, + "grad_norm": 0.15571478009223938, + "learning_rate": 2.7512333333333334e-05, + "loss": 0.0092, + "step": 17469 + }, + { + "epoch": 13.75620322961796, + "grad_norm": 0.13524781167507172, + "learning_rate": 2.7512e-05, + "loss": 0.0069, + "step": 17470 + }, + { + "epoch": 13.756990941315479, + "grad_norm": 0.6226984262466431, + "learning_rate": 2.7511666666666666e-05, + "loss": 0.0119, + "step": 17471 + }, + { + "epoch": 13.757778653012997, + "grad_norm": 0.2708067297935486, + "learning_rate": 2.7511333333333335e-05, + "loss": 0.0068, + "step": 17472 + }, + { + "epoch": 13.758566364710516, + "grad_norm": 0.10418872535228729, + "learning_rate": 2.7511e-05, + "loss": 0.0052, + "step": 17473 + }, + { + "epoch": 13.759354076408034, + "grad_norm": 0.11110301315784454, + "learning_rate": 
2.7510666666666667e-05, + "loss": 0.0021, + "step": 17474 + }, + { + "epoch": 13.760141788105553, + "grad_norm": 0.2802988588809967, + "learning_rate": 2.7510333333333333e-05, + "loss": 0.0139, + "step": 17475 + }, + { + "epoch": 13.760929499803073, + "grad_norm": 0.13012544810771942, + "learning_rate": 2.7510000000000003e-05, + "loss": 0.0081, + "step": 17476 + }, + { + "epoch": 13.76171721150059, + "grad_norm": 0.21256636083126068, + "learning_rate": 2.7509666666666665e-05, + "loss": 0.0076, + "step": 17477 + }, + { + "epoch": 13.76250492319811, + "grad_norm": 0.18718236684799194, + "learning_rate": 2.7509333333333334e-05, + "loss": 0.0072, + "step": 17478 + }, + { + "epoch": 13.763292634895627, + "grad_norm": 0.2098158746957779, + "learning_rate": 2.7509e-05, + "loss": 0.0108, + "step": 17479 + }, + { + "epoch": 13.764080346593147, + "grad_norm": 0.07885152101516724, + "learning_rate": 2.7508666666666666e-05, + "loss": 0.0029, + "step": 17480 + }, + { + "epoch": 13.764868058290666, + "grad_norm": 0.10840816795825958, + "learning_rate": 2.7508333333333336e-05, + "loss": 0.0069, + "step": 17481 + }, + { + "epoch": 13.765655769988184, + "grad_norm": 0.22291600704193115, + "learning_rate": 2.7508e-05, + "loss": 0.0072, + "step": 17482 + }, + { + "epoch": 13.766443481685704, + "grad_norm": 0.19498150050640106, + "learning_rate": 2.7507666666666668e-05, + "loss": 0.011, + "step": 17483 + }, + { + "epoch": 13.767231193383221, + "grad_norm": 0.3061565160751343, + "learning_rate": 2.7507333333333333e-05, + "loss": 0.0094, + "step": 17484 + }, + { + "epoch": 13.76801890508074, + "grad_norm": 0.15723226964473724, + "learning_rate": 2.7507000000000003e-05, + "loss": 0.0079, + "step": 17485 + }, + { + "epoch": 13.768806616778258, + "grad_norm": 0.12135210633277893, + "learning_rate": 2.7506666666666665e-05, + "loss": 0.0034, + "step": 17486 + }, + { + "epoch": 13.769594328475778, + "grad_norm": 0.27620837092399597, + "learning_rate": 2.7506333333333335e-05, + "loss": 0.0127, + "step": 17487 + }, + { + "epoch": 13.770382040173297, + "grad_norm": 0.07565804570913315, + "learning_rate": 2.7506e-05, + "loss": 0.0043, + "step": 17488 + }, + { + "epoch": 13.771169751870815, + "grad_norm": 0.2533246576786041, + "learning_rate": 2.7505666666666667e-05, + "loss": 0.0091, + "step": 17489 + }, + { + "epoch": 13.771957463568334, + "grad_norm": 0.1217937245965004, + "learning_rate": 2.7505333333333336e-05, + "loss": 0.0103, + "step": 17490 + }, + { + "epoch": 13.772745175265852, + "grad_norm": 0.4056093692779541, + "learning_rate": 2.7505e-05, + "loss": 0.013, + "step": 17491 + }, + { + "epoch": 13.773532886963372, + "grad_norm": 0.5898820161819458, + "learning_rate": 2.7504666666666668e-05, + "loss": 0.0138, + "step": 17492 + }, + { + "epoch": 13.77432059866089, + "grad_norm": 0.07901275902986526, + "learning_rate": 2.7504333333333334e-05, + "loss": 0.0055, + "step": 17493 + }, + { + "epoch": 13.775108310358409, + "grad_norm": 0.2168925702571869, + "learning_rate": 2.7504e-05, + "loss": 0.0093, + "step": 17494 + }, + { + "epoch": 13.775896022055928, + "grad_norm": 0.35919806361198425, + "learning_rate": 2.7503666666666666e-05, + "loss": 0.0058, + "step": 17495 + }, + { + "epoch": 13.776683733753446, + "grad_norm": 0.20261333882808685, + "learning_rate": 2.7503333333333335e-05, + "loss": 0.007, + "step": 17496 + }, + { + "epoch": 13.777471445450965, + "grad_norm": 0.20987533032894135, + "learning_rate": 2.7503e-05, + "loss": 0.0104, + "step": 17497 + }, + { + "epoch": 13.778259157148483, + "grad_norm": 
0.3224078118801117, + "learning_rate": 2.7502666666666667e-05, + "loss": 0.0099, + "step": 17498 + }, + { + "epoch": 13.779046868846002, + "grad_norm": 0.09121496975421906, + "learning_rate": 2.7502333333333336e-05, + "loss": 0.0048, + "step": 17499 + }, + { + "epoch": 13.779834580543522, + "grad_norm": 0.16357627511024475, + "learning_rate": 2.7502e-05, + "loss": 0.0075, + "step": 17500 + }, + { + "epoch": 13.78062229224104, + "grad_norm": 0.1402232050895691, + "learning_rate": 2.7501666666666668e-05, + "loss": 0.0068, + "step": 17501 + }, + { + "epoch": 13.781410003938559, + "grad_norm": 0.2195734828710556, + "learning_rate": 2.7501333333333334e-05, + "loss": 0.0059, + "step": 17502 + }, + { + "epoch": 13.782197715636077, + "grad_norm": 0.3483358323574066, + "learning_rate": 2.7501e-05, + "loss": 0.0128, + "step": 17503 + }, + { + "epoch": 13.782985427333596, + "grad_norm": 0.3920004367828369, + "learning_rate": 2.7500666666666666e-05, + "loss": 0.0089, + "step": 17504 + }, + { + "epoch": 13.783773139031114, + "grad_norm": 0.2621411383152008, + "learning_rate": 2.7500333333333335e-05, + "loss": 0.0118, + "step": 17505 + }, + { + "epoch": 13.784560850728633, + "grad_norm": 0.2167118638753891, + "learning_rate": 2.75e-05, + "loss": 0.0233, + "step": 17506 + }, + { + "epoch": 13.785348562426153, + "grad_norm": 0.2757869362831116, + "learning_rate": 2.7499666666666667e-05, + "loss": 0.026, + "step": 17507 + }, + { + "epoch": 13.78613627412367, + "grad_norm": 0.4063427448272705, + "learning_rate": 2.7499333333333336e-05, + "loss": 0.012, + "step": 17508 + }, + { + "epoch": 13.78692398582119, + "grad_norm": 0.3186163008213043, + "learning_rate": 2.7499e-05, + "loss": 0.0094, + "step": 17509 + }, + { + "epoch": 13.787711697518708, + "grad_norm": 0.33425986766815186, + "learning_rate": 2.7498666666666668e-05, + "loss": 0.0287, + "step": 17510 + }, + { + "epoch": 13.788499409216227, + "grad_norm": 0.6522340774536133, + "learning_rate": 2.7498333333333334e-05, + "loss": 0.1512, + "step": 17511 + }, + { + "epoch": 13.789287120913745, + "grad_norm": 0.4139358401298523, + "learning_rate": 2.7498e-05, + "loss": 0.1298, + "step": 17512 + }, + { + "epoch": 13.790074832611264, + "grad_norm": 0.3438662886619568, + "learning_rate": 2.749766666666667e-05, + "loss": 0.0799, + "step": 17513 + }, + { + "epoch": 13.790862544308784, + "grad_norm": 0.31665876507759094, + "learning_rate": 2.7497333333333335e-05, + "loss": 0.0541, + "step": 17514 + }, + { + "epoch": 13.791650256006301, + "grad_norm": 0.4090665280818939, + "learning_rate": 2.7497e-05, + "loss": 0.053, + "step": 17515 + }, + { + "epoch": 13.79243796770382, + "grad_norm": 0.37156471610069275, + "learning_rate": 2.7496666666666667e-05, + "loss": 0.0392, + "step": 17516 + }, + { + "epoch": 13.793225679401338, + "grad_norm": 0.39361000061035156, + "learning_rate": 2.7496333333333336e-05, + "loss": 0.0474, + "step": 17517 + }, + { + "epoch": 13.794013391098858, + "grad_norm": 0.23367559909820557, + "learning_rate": 2.7496e-05, + "loss": 0.0137, + "step": 17518 + }, + { + "epoch": 13.794801102796377, + "grad_norm": 0.18861813843250275, + "learning_rate": 2.749566666666667e-05, + "loss": 0.0091, + "step": 17519 + }, + { + "epoch": 13.795588814493895, + "grad_norm": 0.24344298243522644, + "learning_rate": 2.7495333333333334e-05, + "loss": 0.0122, + "step": 17520 + }, + { + "epoch": 13.796376526191414, + "grad_norm": 0.5664974451065063, + "learning_rate": 2.7495e-05, + "loss": 0.0119, + "step": 17521 + }, + { + "epoch": 13.797164237888932, + "grad_norm": 
0.21203553676605225, + "learning_rate": 2.749466666666667e-05, + "loss": 0.0294, + "step": 17522 + }, + { + "epoch": 13.797951949586452, + "grad_norm": 0.5497538447380066, + "learning_rate": 2.7494333333333332e-05, + "loss": 0.0079, + "step": 17523 + }, + { + "epoch": 13.798739661283971, + "grad_norm": 0.10815035551786423, + "learning_rate": 2.7494e-05, + "loss": 0.0057, + "step": 17524 + }, + { + "epoch": 13.799527372981489, + "grad_norm": 0.12374700605869293, + "learning_rate": 2.7493666666666667e-05, + "loss": 0.0064, + "step": 17525 + }, + { + "epoch": 13.800315084679008, + "grad_norm": 0.10019723325967789, + "learning_rate": 2.7493333333333333e-05, + "loss": 0.0044, + "step": 17526 + }, + { + "epoch": 13.801102796376526, + "grad_norm": 0.1460571438074112, + "learning_rate": 2.7493e-05, + "loss": 0.0108, + "step": 17527 + }, + { + "epoch": 13.801890508074045, + "grad_norm": 0.1829463243484497, + "learning_rate": 2.749266666666667e-05, + "loss": 0.0095, + "step": 17528 + }, + { + "epoch": 13.802678219771563, + "grad_norm": 0.10935243219137192, + "learning_rate": 2.749233333333333e-05, + "loss": 0.0052, + "step": 17529 + }, + { + "epoch": 13.803465931469082, + "grad_norm": 0.21302542090415955, + "learning_rate": 2.7492e-05, + "loss": 0.0092, + "step": 17530 + }, + { + "epoch": 13.8042536431666, + "grad_norm": 0.346024751663208, + "learning_rate": 2.749166666666667e-05, + "loss": 0.0116, + "step": 17531 + }, + { + "epoch": 13.80504135486412, + "grad_norm": 0.329683393239975, + "learning_rate": 2.7491333333333332e-05, + "loss": 0.0064, + "step": 17532 + }, + { + "epoch": 13.805829066561639, + "grad_norm": 0.20769891142845154, + "learning_rate": 2.7491e-05, + "loss": 0.0067, + "step": 17533 + }, + { + "epoch": 13.806616778259157, + "grad_norm": 0.24150171875953674, + "learning_rate": 2.7490666666666668e-05, + "loss": 0.0096, + "step": 17534 + }, + { + "epoch": 13.807404489956676, + "grad_norm": 0.19081082940101624, + "learning_rate": 2.7490333333333333e-05, + "loss": 0.0086, + "step": 17535 + }, + { + "epoch": 13.808192201654194, + "grad_norm": 0.38794320821762085, + "learning_rate": 2.749e-05, + "loss": 0.0412, + "step": 17536 + }, + { + "epoch": 13.808979913351713, + "grad_norm": 0.22021691501140594, + "learning_rate": 2.748966666666667e-05, + "loss": 0.0064, + "step": 17537 + }, + { + "epoch": 13.809767625049233, + "grad_norm": 0.14202353358268738, + "learning_rate": 2.7489333333333335e-05, + "loss": 0.0116, + "step": 17538 + }, + { + "epoch": 13.81055533674675, + "grad_norm": 0.1601739525794983, + "learning_rate": 2.7489e-05, + "loss": 0.0052, + "step": 17539 + }, + { + "epoch": 13.81134304844427, + "grad_norm": 0.24480848014354706, + "learning_rate": 2.748866666666667e-05, + "loss": 0.0108, + "step": 17540 + }, + { + "epoch": 13.812130760141788, + "grad_norm": 0.23634114861488342, + "learning_rate": 2.7488333333333332e-05, + "loss": 0.0135, + "step": 17541 + }, + { + "epoch": 13.812918471839307, + "grad_norm": 0.3249950110912323, + "learning_rate": 2.7488000000000002e-05, + "loss": 0.0083, + "step": 17542 + }, + { + "epoch": 13.813706183536826, + "grad_norm": 0.17806664109230042, + "learning_rate": 2.7487666666666668e-05, + "loss": 0.0069, + "step": 17543 + }, + { + "epoch": 13.814493895234344, + "grad_norm": 0.265520304441452, + "learning_rate": 2.7487333333333334e-05, + "loss": 0.0113, + "step": 17544 + }, + { + "epoch": 13.815281606931864, + "grad_norm": 0.2677291929721832, + "learning_rate": 2.7487e-05, + "loss": 0.0079, + "step": 17545 + }, + { + "epoch": 13.816069318629381, + 
"grad_norm": 0.3145609200000763, + "learning_rate": 2.748666666666667e-05, + "loss": 0.0093, + "step": 17546 + }, + { + "epoch": 13.8168570303269, + "grad_norm": 0.12023165076971054, + "learning_rate": 2.7486333333333335e-05, + "loss": 0.0077, + "step": 17547 + }, + { + "epoch": 13.817644742024418, + "grad_norm": 0.1926904022693634, + "learning_rate": 2.7486e-05, + "loss": 0.0114, + "step": 17548 + }, + { + "epoch": 13.818432453721938, + "grad_norm": 0.12276460230350494, + "learning_rate": 2.748566666666667e-05, + "loss": 0.0045, + "step": 17549 + }, + { + "epoch": 13.819220165419457, + "grad_norm": 0.18041573464870453, + "learning_rate": 2.7485333333333333e-05, + "loss": 0.0073, + "step": 17550 + }, + { + "epoch": 13.820007877116975, + "grad_norm": 0.18344646692276, + "learning_rate": 2.7485000000000002e-05, + "loss": 0.0054, + "step": 17551 + }, + { + "epoch": 13.820795588814494, + "grad_norm": 0.34200045466423035, + "learning_rate": 2.7484666666666665e-05, + "loss": 0.0113, + "step": 17552 + }, + { + "epoch": 13.821583300512012, + "grad_norm": 0.22606651484966278, + "learning_rate": 2.7484333333333334e-05, + "loss": 0.0202, + "step": 17553 + }, + { + "epoch": 13.822371012209532, + "grad_norm": 0.19355742633342743, + "learning_rate": 2.7484e-05, + "loss": 0.0054, + "step": 17554 + }, + { + "epoch": 13.82315872390705, + "grad_norm": 0.11703483015298843, + "learning_rate": 2.7483666666666666e-05, + "loss": 0.0075, + "step": 17555 + }, + { + "epoch": 13.823946435604569, + "grad_norm": 0.26528704166412354, + "learning_rate": 2.7483333333333335e-05, + "loss": 0.0086, + "step": 17556 + }, + { + "epoch": 13.824734147302088, + "grad_norm": 0.15522325038909912, + "learning_rate": 2.7483e-05, + "loss": 0.006, + "step": 17557 + }, + { + "epoch": 13.825521858999606, + "grad_norm": 0.13835583627223969, + "learning_rate": 2.7482666666666667e-05, + "loss": 0.0065, + "step": 17558 + }, + { + "epoch": 13.826309570697125, + "grad_norm": 0.6555100083351135, + "learning_rate": 2.7482333333333333e-05, + "loss": 0.0119, + "step": 17559 + }, + { + "epoch": 13.827097282394643, + "grad_norm": 0.29173609614372253, + "learning_rate": 2.7482000000000002e-05, + "loss": 0.0238, + "step": 17560 + }, + { + "epoch": 13.827884994092162, + "grad_norm": 0.5463745594024658, + "learning_rate": 2.7481666666666665e-05, + "loss": 0.2023, + "step": 17561 + }, + { + "epoch": 13.828672705789682, + "grad_norm": 0.32134920358657837, + "learning_rate": 2.7481333333333334e-05, + "loss": 0.0699, + "step": 17562 + }, + { + "epoch": 13.8294604174872, + "grad_norm": 0.5012906789779663, + "learning_rate": 2.7481000000000003e-05, + "loss": 0.0634, + "step": 17563 + }, + { + "epoch": 13.830248129184719, + "grad_norm": 0.33523568511009216, + "learning_rate": 2.7480666666666666e-05, + "loss": 0.0541, + "step": 17564 + }, + { + "epoch": 13.831035840882237, + "grad_norm": 0.4019499719142914, + "learning_rate": 2.7480333333333335e-05, + "loss": 0.0394, + "step": 17565 + }, + { + "epoch": 13.831823552579756, + "grad_norm": 0.6214007139205933, + "learning_rate": 2.748e-05, + "loss": 0.0411, + "step": 17566 + }, + { + "epoch": 13.832611264277274, + "grad_norm": 0.2489248514175415, + "learning_rate": 2.7479666666666667e-05, + "loss": 0.0171, + "step": 17567 + }, + { + "epoch": 13.833398975974793, + "grad_norm": 0.22281058132648468, + "learning_rate": 2.7479333333333333e-05, + "loss": 0.0185, + "step": 17568 + }, + { + "epoch": 13.834186687672313, + "grad_norm": 0.1194135770201683, + "learning_rate": 2.7479000000000002e-05, + "loss": 0.009, + "step": 
17569 + }, + { + "epoch": 13.83497439936983, + "grad_norm": 0.1397404670715332, + "learning_rate": 2.7478666666666665e-05, + "loss": 0.0077, + "step": 17570 + }, + { + "epoch": 13.83576211106735, + "grad_norm": 0.2677694857120514, + "learning_rate": 2.7478333333333334e-05, + "loss": 0.0086, + "step": 17571 + }, + { + "epoch": 13.836549822764868, + "grad_norm": 0.2749648988246918, + "learning_rate": 2.7478000000000004e-05, + "loss": 0.0112, + "step": 17572 + }, + { + "epoch": 13.837337534462387, + "grad_norm": 0.12994086742401123, + "learning_rate": 2.7477666666666666e-05, + "loss": 0.0081, + "step": 17573 + }, + { + "epoch": 13.838125246159905, + "grad_norm": 0.10259934514760971, + "learning_rate": 2.7477333333333335e-05, + "loss": 0.0038, + "step": 17574 + }, + { + "epoch": 13.838912957857424, + "grad_norm": 0.12196534126996994, + "learning_rate": 2.7477e-05, + "loss": 0.0078, + "step": 17575 + }, + { + "epoch": 13.839700669554944, + "grad_norm": 0.33304402232170105, + "learning_rate": 2.7476666666666667e-05, + "loss": 0.0234, + "step": 17576 + }, + { + "epoch": 13.840488381252461, + "grad_norm": 0.1580430567264557, + "learning_rate": 2.7476333333333333e-05, + "loss": 0.0082, + "step": 17577 + }, + { + "epoch": 13.84127609294998, + "grad_norm": 0.19211316108703613, + "learning_rate": 2.7476000000000003e-05, + "loss": 0.0091, + "step": 17578 + }, + { + "epoch": 13.842063804647498, + "grad_norm": 0.11462557315826416, + "learning_rate": 2.7475666666666665e-05, + "loss": 0.0062, + "step": 17579 + }, + { + "epoch": 13.842851516345018, + "grad_norm": 0.4852922260761261, + "learning_rate": 2.7475333333333334e-05, + "loss": 0.0057, + "step": 17580 + }, + { + "epoch": 13.843639228042537, + "grad_norm": 0.20754209160804749, + "learning_rate": 2.7475000000000004e-05, + "loss": 0.0087, + "step": 17581 + }, + { + "epoch": 13.844426939740055, + "grad_norm": 0.21799089014530182, + "learning_rate": 2.7474666666666666e-05, + "loss": 0.013, + "step": 17582 + }, + { + "epoch": 13.845214651437574, + "grad_norm": 0.3956426680088043, + "learning_rate": 2.7474333333333336e-05, + "loss": 0.0096, + "step": 17583 + }, + { + "epoch": 13.846002363135092, + "grad_norm": 0.30740731954574585, + "learning_rate": 2.7473999999999998e-05, + "loss": 0.0133, + "step": 17584 + }, + { + "epoch": 13.846790074832612, + "grad_norm": 0.16578951478004456, + "learning_rate": 2.7473666666666668e-05, + "loss": 0.0088, + "step": 17585 + }, + { + "epoch": 13.84757778653013, + "grad_norm": 0.31573760509490967, + "learning_rate": 2.7473333333333333e-05, + "loss": 0.0125, + "step": 17586 + }, + { + "epoch": 13.848365498227649, + "grad_norm": 0.0670086070895195, + "learning_rate": 2.7473e-05, + "loss": 0.0034, + "step": 17587 + }, + { + "epoch": 13.849153209925168, + "grad_norm": 0.5329708456993103, + "learning_rate": 2.747266666666667e-05, + "loss": 0.0132, + "step": 17588 + }, + { + "epoch": 13.849940921622686, + "grad_norm": 0.1305714249610901, + "learning_rate": 2.7472333333333335e-05, + "loss": 0.0054, + "step": 17589 + }, + { + "epoch": 13.850728633320205, + "grad_norm": 0.647648811340332, + "learning_rate": 2.7472e-05, + "loss": 0.0332, + "step": 17590 + }, + { + "epoch": 13.851516345017723, + "grad_norm": 0.37186741828918457, + "learning_rate": 2.7471666666666667e-05, + "loss": 0.0148, + "step": 17591 + }, + { + "epoch": 13.852304056715242, + "grad_norm": 0.08482073992490768, + "learning_rate": 2.7471333333333336e-05, + "loss": 0.0045, + "step": 17592 + }, + { + "epoch": 13.85309176841276, + "grad_norm": 0.08855034410953522, + 
"learning_rate": 2.7471e-05, + "loss": 0.0045, + "step": 17593 + }, + { + "epoch": 13.85387948011028, + "grad_norm": 0.15558087825775146, + "learning_rate": 2.7470666666666668e-05, + "loss": 0.0049, + "step": 17594 + }, + { + "epoch": 13.854667191807799, + "grad_norm": 0.2075604945421219, + "learning_rate": 2.7470333333333334e-05, + "loss": 0.0078, + "step": 17595 + }, + { + "epoch": 13.855454903505317, + "grad_norm": 0.0764869675040245, + "learning_rate": 2.747e-05, + "loss": 0.0034, + "step": 17596 + }, + { + "epoch": 13.856242615202836, + "grad_norm": 0.1898677796125412, + "learning_rate": 2.746966666666667e-05, + "loss": 0.0081, + "step": 17597 + }, + { + "epoch": 13.857030326900354, + "grad_norm": 0.3238963782787323, + "learning_rate": 2.7469333333333335e-05, + "loss": 0.0078, + "step": 17598 + }, + { + "epoch": 13.857818038597873, + "grad_norm": 0.6968814134597778, + "learning_rate": 2.7469e-05, + "loss": 0.0129, + "step": 17599 + }, + { + "epoch": 13.858605750295393, + "grad_norm": 0.21608534455299377, + "learning_rate": 2.7468666666666667e-05, + "loss": 0.0053, + "step": 17600 + }, + { + "epoch": 13.85939346199291, + "grad_norm": 0.3949313163757324, + "learning_rate": 2.7468333333333336e-05, + "loss": 0.0115, + "step": 17601 + }, + { + "epoch": 13.86018117369043, + "grad_norm": 0.22247712314128876, + "learning_rate": 2.7468e-05, + "loss": 0.0108, + "step": 17602 + }, + { + "epoch": 13.860968885387948, + "grad_norm": 0.2616165578365326, + "learning_rate": 2.7467666666666668e-05, + "loss": 0.0078, + "step": 17603 + }, + { + "epoch": 13.861756597085467, + "grad_norm": 0.1609996259212494, + "learning_rate": 2.7467333333333334e-05, + "loss": 0.0068, + "step": 17604 + }, + { + "epoch": 13.862544308782985, + "grad_norm": 0.4275757074356079, + "learning_rate": 2.7467e-05, + "loss": 0.0107, + "step": 17605 + }, + { + "epoch": 13.863332020480504, + "grad_norm": 0.24094006419181824, + "learning_rate": 2.746666666666667e-05, + "loss": 0.0104, + "step": 17606 + }, + { + "epoch": 13.864119732178024, + "grad_norm": 0.22852587699890137, + "learning_rate": 2.7466333333333335e-05, + "loss": 0.0069, + "step": 17607 + }, + { + "epoch": 13.864907443875541, + "grad_norm": 0.24231314659118652, + "learning_rate": 2.7466e-05, + "loss": 0.0065, + "step": 17608 + }, + { + "epoch": 13.86569515557306, + "grad_norm": 1.3649463653564453, + "learning_rate": 2.7465666666666667e-05, + "loss": 0.0111, + "step": 17609 + }, + { + "epoch": 13.866482867270578, + "grad_norm": 0.21877922117710114, + "learning_rate": 2.7465333333333336e-05, + "loss": 0.0071, + "step": 17610 + }, + { + "epoch": 13.867270578968098, + "grad_norm": 0.36630216240882874, + "learning_rate": 2.7465e-05, + "loss": 0.157, + "step": 17611 + }, + { + "epoch": 13.868058290665616, + "grad_norm": 0.5010067224502563, + "learning_rate": 2.7464666666666668e-05, + "loss": 0.0943, + "step": 17612 + }, + { + "epoch": 13.868846002363135, + "grad_norm": 0.4788311719894409, + "learning_rate": 2.7464333333333334e-05, + "loss": 0.0959, + "step": 17613 + }, + { + "epoch": 13.869633714060654, + "grad_norm": 0.4220115542411804, + "learning_rate": 2.7464e-05, + "loss": 0.0546, + "step": 17614 + }, + { + "epoch": 13.870421425758172, + "grad_norm": 0.37715694308280945, + "learning_rate": 2.746366666666667e-05, + "loss": 0.0601, + "step": 17615 + }, + { + "epoch": 13.871209137455692, + "grad_norm": 0.2568596303462982, + "learning_rate": 2.7463333333333332e-05, + "loss": 0.0295, + "step": 17616 + }, + { + "epoch": 13.87199684915321, + "grad_norm": 0.12864701449871063, + 
"learning_rate": 2.7463e-05, + "loss": 0.0084, + "step": 17617 + }, + { + "epoch": 13.872784560850729, + "grad_norm": 0.1724494844675064, + "learning_rate": 2.7462666666666667e-05, + "loss": 0.0102, + "step": 17618 + }, + { + "epoch": 13.873572272548248, + "grad_norm": 0.18444865942001343, + "learning_rate": 2.7462333333333333e-05, + "loss": 0.035, + "step": 17619 + }, + { + "epoch": 13.874359984245766, + "grad_norm": 0.3588055670261383, + "learning_rate": 2.7462e-05, + "loss": 0.0172, + "step": 17620 + }, + { + "epoch": 13.875147695943285, + "grad_norm": 0.1520662009716034, + "learning_rate": 2.7461666666666668e-05, + "loss": 0.0105, + "step": 17621 + }, + { + "epoch": 13.875935407640803, + "grad_norm": 0.1974637508392334, + "learning_rate": 2.7461333333333334e-05, + "loss": 0.0112, + "step": 17622 + }, + { + "epoch": 13.876723119338322, + "grad_norm": 0.16576099395751953, + "learning_rate": 2.7461e-05, + "loss": 0.0096, + "step": 17623 + }, + { + "epoch": 13.87751083103584, + "grad_norm": 0.17858517169952393, + "learning_rate": 2.746066666666667e-05, + "loss": 0.0086, + "step": 17624 + }, + { + "epoch": 13.87829854273336, + "grad_norm": 0.21901191771030426, + "learning_rate": 2.7460333333333332e-05, + "loss": 0.0081, + "step": 17625 + }, + { + "epoch": 13.879086254430879, + "grad_norm": 0.283873975276947, + "learning_rate": 2.746e-05, + "loss": 0.0122, + "step": 17626 + }, + { + "epoch": 13.879873966128397, + "grad_norm": 0.3398366868495941, + "learning_rate": 2.7459666666666667e-05, + "loss": 0.0134, + "step": 17627 + }, + { + "epoch": 13.880661677825916, + "grad_norm": 0.3765575885772705, + "learning_rate": 2.7459333333333333e-05, + "loss": 0.0122, + "step": 17628 + }, + { + "epoch": 13.881449389523434, + "grad_norm": 0.20822808146476746, + "learning_rate": 2.7459e-05, + "loss": 0.0126, + "step": 17629 + }, + { + "epoch": 13.882237101220953, + "grad_norm": 0.36583343148231506, + "learning_rate": 2.745866666666667e-05, + "loss": 0.0104, + "step": 17630 + }, + { + "epoch": 13.883024812918471, + "grad_norm": 0.4157744348049164, + "learning_rate": 2.7458333333333334e-05, + "loss": 0.0081, + "step": 17631 + }, + { + "epoch": 13.88381252461599, + "grad_norm": 0.1749645620584488, + "learning_rate": 2.7458e-05, + "loss": 0.0038, + "step": 17632 + }, + { + "epoch": 13.88460023631351, + "grad_norm": 0.33443763852119446, + "learning_rate": 2.745766666666667e-05, + "loss": 0.0137, + "step": 17633 + }, + { + "epoch": 13.885387948011028, + "grad_norm": 0.2575473189353943, + "learning_rate": 2.7457333333333332e-05, + "loss": 0.0104, + "step": 17634 + }, + { + "epoch": 13.886175659708547, + "grad_norm": 0.2679041624069214, + "learning_rate": 2.7457e-05, + "loss": 0.0054, + "step": 17635 + }, + { + "epoch": 13.886963371406065, + "grad_norm": 0.3660401701927185, + "learning_rate": 2.7456666666666667e-05, + "loss": 0.0137, + "step": 17636 + }, + { + "epoch": 13.887751083103584, + "grad_norm": 0.2983311414718628, + "learning_rate": 2.7456333333333333e-05, + "loss": 0.02, + "step": 17637 + }, + { + "epoch": 13.888538794801104, + "grad_norm": 0.44855761528015137, + "learning_rate": 2.7456000000000003e-05, + "loss": 0.0346, + "step": 17638 + }, + { + "epoch": 13.889326506498621, + "grad_norm": 0.23904818296432495, + "learning_rate": 2.745566666666667e-05, + "loss": 0.0099, + "step": 17639 + }, + { + "epoch": 13.89011421819614, + "grad_norm": 0.16033323109149933, + "learning_rate": 2.7455333333333335e-05, + "loss": 0.0087, + "step": 17640 + }, + { + "epoch": 13.890901929893658, + "grad_norm": 
0.20698195695877075, + "learning_rate": 2.7455e-05, + "loss": 0.0121, + "step": 17641 + }, + { + "epoch": 13.891689641591178, + "grad_norm": 0.1498750001192093, + "learning_rate": 2.745466666666667e-05, + "loss": 0.007, + "step": 17642 + }, + { + "epoch": 13.892477353288696, + "grad_norm": 0.15395405888557434, + "learning_rate": 2.7454333333333332e-05, + "loss": 0.0063, + "step": 17643 + }, + { + "epoch": 13.893265064986215, + "grad_norm": 0.2887342870235443, + "learning_rate": 2.7454000000000002e-05, + "loss": 0.0122, + "step": 17644 + }, + { + "epoch": 13.894052776683735, + "grad_norm": 0.19555474817752838, + "learning_rate": 2.7453666666666664e-05, + "loss": 0.0074, + "step": 17645 + }, + { + "epoch": 13.894840488381252, + "grad_norm": 0.26334649324417114, + "learning_rate": 2.7453333333333334e-05, + "loss": 0.0105, + "step": 17646 + }, + { + "epoch": 13.895628200078772, + "grad_norm": 0.15684457123279572, + "learning_rate": 2.7453000000000003e-05, + "loss": 0.0095, + "step": 17647 + }, + { + "epoch": 13.89641591177629, + "grad_norm": 0.14483444392681122, + "learning_rate": 2.7452666666666666e-05, + "loss": 0.0042, + "step": 17648 + }, + { + "epoch": 13.897203623473809, + "grad_norm": 0.407234787940979, + "learning_rate": 2.7452333333333335e-05, + "loss": 0.0107, + "step": 17649 + }, + { + "epoch": 13.897991335171326, + "grad_norm": 0.13846556842327118, + "learning_rate": 2.7452e-05, + "loss": 0.0069, + "step": 17650 + }, + { + "epoch": 13.898779046868846, + "grad_norm": 1.082948923110962, + "learning_rate": 2.7451666666666667e-05, + "loss": 0.0179, + "step": 17651 + }, + { + "epoch": 13.899566758566365, + "grad_norm": 0.14451630413532257, + "learning_rate": 2.7451333333333333e-05, + "loss": 0.0044, + "step": 17652 + }, + { + "epoch": 13.900354470263883, + "grad_norm": 0.12465376406908035, + "learning_rate": 2.7451000000000002e-05, + "loss": 0.0084, + "step": 17653 + }, + { + "epoch": 13.901142181961402, + "grad_norm": 0.1742006242275238, + "learning_rate": 2.7450666666666665e-05, + "loss": 0.0067, + "step": 17654 + }, + { + "epoch": 13.90192989365892, + "grad_norm": 0.22947126626968384, + "learning_rate": 2.7450333333333334e-05, + "loss": 0.0134, + "step": 17655 + }, + { + "epoch": 13.90271760535644, + "grad_norm": 0.433074027299881, + "learning_rate": 2.7450000000000003e-05, + "loss": 0.0135, + "step": 17656 + }, + { + "epoch": 13.903505317053959, + "grad_norm": 0.10684944689273834, + "learning_rate": 2.7449666666666666e-05, + "loss": 0.007, + "step": 17657 + }, + { + "epoch": 13.904293028751477, + "grad_norm": 0.27544277906417847, + "learning_rate": 2.7449333333333335e-05, + "loss": 0.0118, + "step": 17658 + }, + { + "epoch": 13.905080740448996, + "grad_norm": 0.25720909237861633, + "learning_rate": 2.7449e-05, + "loss": 0.0094, + "step": 17659 + }, + { + "epoch": 13.905868452146514, + "grad_norm": 0.17430385947227478, + "learning_rate": 2.7448666666666667e-05, + "loss": 0.0075, + "step": 17660 + }, + { + "epoch": 13.906656163844033, + "grad_norm": 0.8275476098060608, + "learning_rate": 2.7448333333333333e-05, + "loss": 0.2254, + "step": 17661 + }, + { + "epoch": 13.907443875541551, + "grad_norm": 0.7271108627319336, + "learning_rate": 2.7448000000000002e-05, + "loss": 0.1824, + "step": 17662 + }, + { + "epoch": 13.90823158723907, + "grad_norm": 0.6149405837059021, + "learning_rate": 2.7447666666666668e-05, + "loss": 0.0863, + "step": 17663 + }, + { + "epoch": 13.90901929893659, + "grad_norm": 0.2976894676685333, + "learning_rate": 2.7447333333333334e-05, + "loss": 0.0578, + "step": 
17664 + }, + { + "epoch": 13.909807010634108, + "grad_norm": 0.4462434947490692, + "learning_rate": 2.7447000000000003e-05, + "loss": 0.071, + "step": 17665 + }, + { + "epoch": 13.910594722331627, + "grad_norm": 0.2575850486755371, + "learning_rate": 2.7446666666666666e-05, + "loss": 0.0303, + "step": 17666 + }, + { + "epoch": 13.911382434029145, + "grad_norm": 0.23020310699939728, + "learning_rate": 2.7446333333333335e-05, + "loss": 0.021, + "step": 17667 + }, + { + "epoch": 13.912170145726664, + "grad_norm": 0.1971084028482437, + "learning_rate": 2.7446e-05, + "loss": 0.0199, + "step": 17668 + }, + { + "epoch": 13.912957857424182, + "grad_norm": 0.12137340754270554, + "learning_rate": 2.7445666666666667e-05, + "loss": 0.0069, + "step": 17669 + }, + { + "epoch": 13.913745569121701, + "grad_norm": 0.21622921526432037, + "learning_rate": 2.7445333333333333e-05, + "loss": 0.0298, + "step": 17670 + }, + { + "epoch": 13.91453328081922, + "grad_norm": 0.23124396800994873, + "learning_rate": 2.7445000000000002e-05, + "loss": 0.0062, + "step": 17671 + }, + { + "epoch": 13.915320992516738, + "grad_norm": 0.12444692850112915, + "learning_rate": 2.7444666666666668e-05, + "loss": 0.0043, + "step": 17672 + }, + { + "epoch": 13.916108704214258, + "grad_norm": 0.22618936002254486, + "learning_rate": 2.7444333333333334e-05, + "loss": 0.0097, + "step": 17673 + }, + { + "epoch": 13.916896415911776, + "grad_norm": 0.27596819400787354, + "learning_rate": 2.7444e-05, + "loss": 0.0092, + "step": 17674 + }, + { + "epoch": 13.917684127609295, + "grad_norm": 0.1420181393623352, + "learning_rate": 2.7443666666666666e-05, + "loss": 0.0074, + "step": 17675 + }, + { + "epoch": 13.918471839306815, + "grad_norm": 0.2779257893562317, + "learning_rate": 2.7443333333333335e-05, + "loss": 0.0137, + "step": 17676 + }, + { + "epoch": 13.919259551004332, + "grad_norm": 0.14343267679214478, + "learning_rate": 2.7442999999999998e-05, + "loss": 0.007, + "step": 17677 + }, + { + "epoch": 13.920047262701852, + "grad_norm": 0.2754994034767151, + "learning_rate": 2.7442666666666667e-05, + "loss": 0.0109, + "step": 17678 + }, + { + "epoch": 13.92083497439937, + "grad_norm": 0.28061816096305847, + "learning_rate": 2.7442333333333333e-05, + "loss": 0.0116, + "step": 17679 + }, + { + "epoch": 13.921622686096889, + "grad_norm": 0.17047645151615143, + "learning_rate": 2.7442e-05, + "loss": 0.0117, + "step": 17680 + }, + { + "epoch": 13.922410397794406, + "grad_norm": 0.15422137081623077, + "learning_rate": 2.744166666666667e-05, + "loss": 0.0086, + "step": 17681 + }, + { + "epoch": 13.923198109491926, + "grad_norm": 0.24928665161132812, + "learning_rate": 2.7441333333333334e-05, + "loss": 0.0117, + "step": 17682 + }, + { + "epoch": 13.923985821189445, + "grad_norm": 0.27737507224082947, + "learning_rate": 2.7441e-05, + "loss": 0.0067, + "step": 17683 + }, + { + "epoch": 13.924773532886963, + "grad_norm": 0.14113318920135498, + "learning_rate": 2.7440666666666666e-05, + "loss": 0.0053, + "step": 17684 + }, + { + "epoch": 13.925561244584483, + "grad_norm": 0.162947878241539, + "learning_rate": 2.7440333333333336e-05, + "loss": 0.0093, + "step": 17685 + }, + { + "epoch": 13.926348956282, + "grad_norm": 0.15924322605133057, + "learning_rate": 2.7439999999999998e-05, + "loss": 0.0043, + "step": 17686 + }, + { + "epoch": 13.92713666797952, + "grad_norm": 0.15092213451862335, + "learning_rate": 2.7439666666666667e-05, + "loss": 0.005, + "step": 17687 + }, + { + "epoch": 13.927924379677037, + "grad_norm": 0.15469969809055328, + "learning_rate": 
2.7439333333333337e-05, + "loss": 0.0021, + "step": 17688 + }, + { + "epoch": 13.928712091374557, + "grad_norm": 0.1393464356660843, + "learning_rate": 2.7439e-05, + "loss": 0.0085, + "step": 17689 + }, + { + "epoch": 13.929499803072076, + "grad_norm": 0.2691240906715393, + "learning_rate": 2.743866666666667e-05, + "loss": 0.0131, + "step": 17690 + }, + { + "epoch": 13.930287514769594, + "grad_norm": 0.2293030023574829, + "learning_rate": 2.7438333333333335e-05, + "loss": 0.0066, + "step": 17691 + }, + { + "epoch": 13.931075226467113, + "grad_norm": 0.562030553817749, + "learning_rate": 2.7438e-05, + "loss": 0.0087, + "step": 17692 + }, + { + "epoch": 13.931862938164631, + "grad_norm": 0.17034535109996796, + "learning_rate": 2.7437666666666666e-05, + "loss": 0.0046, + "step": 17693 + }, + { + "epoch": 13.93265064986215, + "grad_norm": 0.1999777853488922, + "learning_rate": 2.7437333333333336e-05, + "loss": 0.0133, + "step": 17694 + }, + { + "epoch": 13.93343836155967, + "grad_norm": 0.2935571074485779, + "learning_rate": 2.7437e-05, + "loss": 0.0104, + "step": 17695 + }, + { + "epoch": 13.934226073257188, + "grad_norm": 0.326509028673172, + "learning_rate": 2.7436666666666668e-05, + "loss": 0.0111, + "step": 17696 + }, + { + "epoch": 13.935013784954707, + "grad_norm": 0.18035158514976501, + "learning_rate": 2.7436333333333337e-05, + "loss": 0.0068, + "step": 17697 + }, + { + "epoch": 13.935801496652225, + "grad_norm": 0.10020837187767029, + "learning_rate": 2.7436e-05, + "loss": 0.0039, + "step": 17698 + }, + { + "epoch": 13.936589208349744, + "grad_norm": 0.1859595775604248, + "learning_rate": 2.743566666666667e-05, + "loss": 0.0078, + "step": 17699 + }, + { + "epoch": 13.937376920047262, + "grad_norm": 0.12311124056577682, + "learning_rate": 2.7435333333333335e-05, + "loss": 0.0066, + "step": 17700 + }, + { + "epoch": 13.938164631744781, + "grad_norm": 0.3346538543701172, + "learning_rate": 2.7435e-05, + "loss": 0.0092, + "step": 17701 + }, + { + "epoch": 13.9389523434423, + "grad_norm": 0.26403507590293884, + "learning_rate": 2.7434666666666667e-05, + "loss": 0.0122, + "step": 17702 + }, + { + "epoch": 13.939740055139819, + "grad_norm": 0.10601070523262024, + "learning_rate": 2.7434333333333333e-05, + "loss": 0.004, + "step": 17703 + }, + { + "epoch": 13.940527766837338, + "grad_norm": 0.31559473276138306, + "learning_rate": 2.7434e-05, + "loss": 0.0088, + "step": 17704 + }, + { + "epoch": 13.941315478534856, + "grad_norm": 0.1289602816104889, + "learning_rate": 2.7433666666666668e-05, + "loss": 0.0074, + "step": 17705 + }, + { + "epoch": 13.942103190232375, + "grad_norm": 0.2424381822347641, + "learning_rate": 2.7433333333333334e-05, + "loss": 0.0109, + "step": 17706 + }, + { + "epoch": 13.942890901929893, + "grad_norm": 0.44814997911453247, + "learning_rate": 2.7433e-05, + "loss": 0.0134, + "step": 17707 + }, + { + "epoch": 13.943678613627412, + "grad_norm": 0.4819667637348175, + "learning_rate": 2.743266666666667e-05, + "loss": 0.0135, + "step": 17708 + }, + { + "epoch": 13.944466325324932, + "grad_norm": 0.39847683906555176, + "learning_rate": 2.743233333333333e-05, + "loss": 0.0149, + "step": 17709 + }, + { + "epoch": 13.94525403702245, + "grad_norm": 0.2794108986854553, + "learning_rate": 2.7432e-05, + "loss": 0.0111, + "step": 17710 + }, + { + "epoch": 13.946041748719969, + "grad_norm": 0.512064516544342, + "learning_rate": 2.7431666666666667e-05, + "loss": 0.2068, + "step": 17711 + }, + { + "epoch": 13.946829460417487, + "grad_norm": 0.3846120536327362, + "learning_rate": 
2.7431333333333333e-05, + "loss": 0.1127, + "step": 17712 + }, + { + "epoch": 13.947617172115006, + "grad_norm": 0.5186195969581604, + "learning_rate": 2.7431000000000002e-05, + "loss": 0.0903, + "step": 17713 + }, + { + "epoch": 13.948404883812525, + "grad_norm": 0.6448810696601868, + "learning_rate": 2.7430666666666668e-05, + "loss": 0.0949, + "step": 17714 + }, + { + "epoch": 13.949192595510043, + "grad_norm": 0.4318353235721588, + "learning_rate": 2.7430333333333334e-05, + "loss": 0.07, + "step": 17715 + }, + { + "epoch": 13.949980307207563, + "grad_norm": 0.6695376038551331, + "learning_rate": 2.743e-05, + "loss": 0.0767, + "step": 17716 + }, + { + "epoch": 13.95076801890508, + "grad_norm": 0.1462799608707428, + "learning_rate": 2.742966666666667e-05, + "loss": 0.0184, + "step": 17717 + }, + { + "epoch": 13.9515557306026, + "grad_norm": 0.20450830459594727, + "learning_rate": 2.7429333333333332e-05, + "loss": 0.0113, + "step": 17718 + }, + { + "epoch": 13.952343442300117, + "grad_norm": 0.3482944369316101, + "learning_rate": 2.7429e-05, + "loss": 0.0235, + "step": 17719 + }, + { + "epoch": 13.953131153997637, + "grad_norm": 0.35106611251831055, + "learning_rate": 2.7428666666666667e-05, + "loss": 0.0087, + "step": 17720 + }, + { + "epoch": 13.953918865695156, + "grad_norm": 0.22776640951633453, + "learning_rate": 2.7428333333333333e-05, + "loss": 0.0118, + "step": 17721 + }, + { + "epoch": 13.954706577392674, + "grad_norm": 0.3829660713672638, + "learning_rate": 2.7428000000000002e-05, + "loss": 0.0075, + "step": 17722 + }, + { + "epoch": 13.955494289090193, + "grad_norm": 0.17869655787944794, + "learning_rate": 2.7427666666666668e-05, + "loss": 0.009, + "step": 17723 + }, + { + "epoch": 13.956282000787711, + "grad_norm": 0.27785539627075195, + "learning_rate": 2.7427333333333334e-05, + "loss": 0.0179, + "step": 17724 + }, + { + "epoch": 13.95706971248523, + "grad_norm": 0.15202076733112335, + "learning_rate": 2.7427e-05, + "loss": 0.0086, + "step": 17725 + }, + { + "epoch": 13.957857424182748, + "grad_norm": 0.25870370864868164, + "learning_rate": 2.742666666666667e-05, + "loss": 0.0096, + "step": 17726 + }, + { + "epoch": 13.958645135880268, + "grad_norm": 0.8286091685295105, + "learning_rate": 2.7426333333333332e-05, + "loss": 0.0155, + "step": 17727 + }, + { + "epoch": 13.959432847577787, + "grad_norm": 0.13098026812076569, + "learning_rate": 2.7426e-05, + "loss": 0.0076, + "step": 17728 + }, + { + "epoch": 13.960220559275305, + "grad_norm": 0.10352195054292679, + "learning_rate": 2.7425666666666667e-05, + "loss": 0.0071, + "step": 17729 + }, + { + "epoch": 13.961008270972824, + "grad_norm": 0.20932720601558685, + "learning_rate": 2.7425333333333333e-05, + "loss": 0.0126, + "step": 17730 + }, + { + "epoch": 13.961795982670342, + "grad_norm": 0.17612555623054504, + "learning_rate": 2.7425000000000003e-05, + "loss": 0.0124, + "step": 17731 + }, + { + "epoch": 13.962583694367861, + "grad_norm": 0.19614268839359283, + "learning_rate": 2.742466666666667e-05, + "loss": 0.0131, + "step": 17732 + }, + { + "epoch": 13.96337140606538, + "grad_norm": 0.23428672552108765, + "learning_rate": 2.7424333333333334e-05, + "loss": 0.009, + "step": 17733 + }, + { + "epoch": 13.964159117762899, + "grad_norm": 0.21088863909244537, + "learning_rate": 2.7424e-05, + "loss": 0.0185, + "step": 17734 + }, + { + "epoch": 13.964946829460418, + "grad_norm": 0.1013948991894722, + "learning_rate": 2.7423666666666666e-05, + "loss": 0.0051, + "step": 17735 + }, + { + "epoch": 13.965734541157936, + "grad_norm": 
0.4945618808269501, + "learning_rate": 2.7423333333333332e-05, + "loss": 0.0222, + "step": 17736 + }, + { + "epoch": 13.966522252855455, + "grad_norm": 0.2976462244987488, + "learning_rate": 2.7423e-05, + "loss": 0.0111, + "step": 17737 + }, + { + "epoch": 13.967309964552973, + "grad_norm": 1.6275948286056519, + "learning_rate": 2.7422666666666667e-05, + "loss": 0.009, + "step": 17738 + }, + { + "epoch": 13.968097676250492, + "grad_norm": 0.22635366022586823, + "learning_rate": 2.7422333333333333e-05, + "loss": 0.0076, + "step": 17739 + }, + { + "epoch": 13.968885387948012, + "grad_norm": 0.13342693448066711, + "learning_rate": 2.7422000000000003e-05, + "loss": 0.0084, + "step": 17740 + }, + { + "epoch": 13.96967309964553, + "grad_norm": 0.13087931275367737, + "learning_rate": 2.7421666666666665e-05, + "loss": 0.011, + "step": 17741 + }, + { + "epoch": 13.970460811343049, + "grad_norm": 0.23112615942955017, + "learning_rate": 2.7421333333333335e-05, + "loss": 0.013, + "step": 17742 + }, + { + "epoch": 13.971248523040567, + "grad_norm": 0.1499493420124054, + "learning_rate": 2.7421e-05, + "loss": 0.0067, + "step": 17743 + }, + { + "epoch": 13.972036234738086, + "grad_norm": 0.13040198385715485, + "learning_rate": 2.7420666666666666e-05, + "loss": 0.0084, + "step": 17744 + }, + { + "epoch": 13.972823946435604, + "grad_norm": 0.21554379165172577, + "learning_rate": 2.7420333333333332e-05, + "loss": 0.0064, + "step": 17745 + }, + { + "epoch": 13.973611658133123, + "grad_norm": 0.18095624446868896, + "learning_rate": 2.7420000000000002e-05, + "loss": 0.0084, + "step": 17746 + }, + { + "epoch": 13.974399369830643, + "grad_norm": 0.09303389489650726, + "learning_rate": 2.7419666666666668e-05, + "loss": 0.004, + "step": 17747 + }, + { + "epoch": 13.97518708152816, + "grad_norm": 0.223928302526474, + "learning_rate": 2.7419333333333334e-05, + "loss": 0.018, + "step": 17748 + }, + { + "epoch": 13.97597479322568, + "grad_norm": 0.5750905871391296, + "learning_rate": 2.7419000000000003e-05, + "loss": 0.0147, + "step": 17749 + }, + { + "epoch": 13.976762504923197, + "grad_norm": 0.28112465143203735, + "learning_rate": 2.7418666666666665e-05, + "loss": 0.0121, + "step": 17750 + }, + { + "epoch": 13.977550216620717, + "grad_norm": 0.16221016645431519, + "learning_rate": 2.7418333333333335e-05, + "loss": 0.0067, + "step": 17751 + }, + { + "epoch": 13.978337928318236, + "grad_norm": 0.12280064076185226, + "learning_rate": 2.7418e-05, + "loss": 0.0037, + "step": 17752 + }, + { + "epoch": 13.979125640015754, + "grad_norm": 0.15942655503749847, + "learning_rate": 2.7417666666666667e-05, + "loss": 0.0073, + "step": 17753 + }, + { + "epoch": 13.979913351713273, + "grad_norm": 0.24162636697292328, + "learning_rate": 2.7417333333333333e-05, + "loss": 0.0091, + "step": 17754 + }, + { + "epoch": 13.980701063410791, + "grad_norm": 0.23176652193069458, + "learning_rate": 2.7417000000000002e-05, + "loss": 0.0134, + "step": 17755 + }, + { + "epoch": 13.98148877510831, + "grad_norm": 0.11159543693065643, + "learning_rate": 2.7416666666666668e-05, + "loss": 0.006, + "step": 17756 + }, + { + "epoch": 13.982276486805828, + "grad_norm": 0.3238244950771332, + "learning_rate": 2.7416333333333334e-05, + "loss": 0.0107, + "step": 17757 + }, + { + "epoch": 13.983064198503348, + "grad_norm": 0.3138583302497864, + "learning_rate": 2.7416000000000003e-05, + "loss": 0.0106, + "step": 17758 + }, + { + "epoch": 13.983851910200867, + "grad_norm": 0.3828473687171936, + "learning_rate": 2.7415666666666666e-05, + "loss": 0.014, + "step": 
17759 + }, + { + "epoch": 13.984639621898385, + "grad_norm": 0.5352763533592224, + "learning_rate": 2.7415333333333335e-05, + "loss": 0.0085, + "step": 17760 + }, + { + "epoch": 13.985427333595904, + "grad_norm": 0.515697181224823, + "learning_rate": 2.7415e-05, + "loss": 0.1069, + "step": 17761 + }, + { + "epoch": 13.986215045293422, + "grad_norm": 0.899907648563385, + "learning_rate": 2.7414666666666667e-05, + "loss": 0.0902, + "step": 17762 + }, + { + "epoch": 13.987002756990941, + "grad_norm": 0.3493831157684326, + "learning_rate": 2.7414333333333336e-05, + "loss": 0.0203, + "step": 17763 + }, + { + "epoch": 13.987790468688459, + "grad_norm": 0.13175341486930847, + "learning_rate": 2.7414e-05, + "loss": 0.0095, + "step": 17764 + }, + { + "epoch": 13.988578180385979, + "grad_norm": 0.12418647110462189, + "learning_rate": 2.7413666666666668e-05, + "loss": 0.0091, + "step": 17765 + }, + { + "epoch": 13.989365892083498, + "grad_norm": 0.22025275230407715, + "learning_rate": 2.7413333333333334e-05, + "loss": 0.017, + "step": 17766 + }, + { + "epoch": 13.990153603781016, + "grad_norm": 0.21682782471179962, + "learning_rate": 2.7413e-05, + "loss": 0.011, + "step": 17767 + }, + { + "epoch": 13.990941315478535, + "grad_norm": 0.18734556436538696, + "learning_rate": 2.7412666666666666e-05, + "loss": 0.0094, + "step": 17768 + }, + { + "epoch": 13.991729027176053, + "grad_norm": 0.09338715672492981, + "learning_rate": 2.7412333333333335e-05, + "loss": 0.0056, + "step": 17769 + }, + { + "epoch": 13.992516738873572, + "grad_norm": 0.46548837423324585, + "learning_rate": 2.7411999999999998e-05, + "loss": 0.0104, + "step": 17770 + }, + { + "epoch": 13.993304450571092, + "grad_norm": 0.12342150509357452, + "learning_rate": 2.7411666666666667e-05, + "loss": 0.0074, + "step": 17771 + }, + { + "epoch": 13.99409216226861, + "grad_norm": 0.19330625236034393, + "learning_rate": 2.7411333333333336e-05, + "loss": 0.0071, + "step": 17772 + }, + { + "epoch": 13.994879873966129, + "grad_norm": 0.16530220210552216, + "learning_rate": 2.7411e-05, + "loss": 0.009, + "step": 17773 + }, + { + "epoch": 13.995667585663647, + "grad_norm": 0.17896327376365662, + "learning_rate": 2.7410666666666668e-05, + "loss": 0.0142, + "step": 17774 + }, + { + "epoch": 13.996455297361166, + "grad_norm": 0.4294185936450958, + "learning_rate": 2.7410333333333334e-05, + "loss": 0.0077, + "step": 17775 + }, + { + "epoch": 13.997243009058685, + "grad_norm": 0.2250540554523468, + "learning_rate": 2.741e-05, + "loss": 0.0085, + "step": 17776 + }, + { + "epoch": 13.998030720756203, + "grad_norm": 0.18401406705379486, + "learning_rate": 2.7409666666666666e-05, + "loss": 0.0084, + "step": 17777 + }, + { + "epoch": 13.998818432453723, + "grad_norm": 0.11120627820491791, + "learning_rate": 2.7409333333333335e-05, + "loss": 0.0058, + "step": 17778 + }, + { + "epoch": 13.99960614415124, + "grad_norm": 0.19495677947998047, + "learning_rate": 2.7408999999999998e-05, + "loss": 0.0062, + "step": 17779 + }, + { + "epoch": 14.0, + "grad_norm": 0.6040757298469543, + "learning_rate": 2.7408666666666667e-05, + "loss": 0.006, + "step": 17780 + }, + { + "epoch": 14.00078771169752, + "grad_norm": 0.5546537637710571, + "learning_rate": 2.7408333333333337e-05, + "loss": 0.1519, + "step": 17781 + }, + { + "epoch": 14.001575423395037, + "grad_norm": 0.4696548283100128, + "learning_rate": 2.7408e-05, + "loss": 0.0981, + "step": 17782 + }, + { + "epoch": 14.002363135092557, + "grad_norm": 0.5080746412277222, + "learning_rate": 2.740766666666667e-05, + "loss": 0.049, 
+ "step": 17783 + }, + { + "epoch": 14.003150846790074, + "grad_norm": 0.34486913681030273, + "learning_rate": 2.7407333333333334e-05, + "loss": 0.0525, + "step": 17784 + }, + { + "epoch": 14.003938558487594, + "grad_norm": 0.25077304244041443, + "learning_rate": 2.7407e-05, + "loss": 0.0333, + "step": 17785 + }, + { + "epoch": 14.004726270185111, + "grad_norm": 0.6072329878807068, + "learning_rate": 2.7406666666666666e-05, + "loss": 0.0191, + "step": 17786 + }, + { + "epoch": 14.00551398188263, + "grad_norm": 0.2602481245994568, + "learning_rate": 2.7406333333333336e-05, + "loss": 0.0138, + "step": 17787 + }, + { + "epoch": 14.00630169358015, + "grad_norm": 0.21863296627998352, + "learning_rate": 2.7406e-05, + "loss": 0.0082, + "step": 17788 + }, + { + "epoch": 14.007089405277668, + "grad_norm": 0.16074177622795105, + "learning_rate": 2.7405666666666667e-05, + "loss": 0.0092, + "step": 17789 + }, + { + "epoch": 14.007877116975187, + "grad_norm": 0.16820988059043884, + "learning_rate": 2.7405333333333337e-05, + "loss": 0.0071, + "step": 17790 + }, + { + "epoch": 14.008664828672705, + "grad_norm": 0.1828070431947708, + "learning_rate": 2.7405e-05, + "loss": 0.0051, + "step": 17791 + }, + { + "epoch": 14.009452540370225, + "grad_norm": 0.2514386773109436, + "learning_rate": 2.740466666666667e-05, + "loss": 0.0091, + "step": 17792 + }, + { + "epoch": 14.010240252067744, + "grad_norm": 0.34659305214881897, + "learning_rate": 2.7404333333333335e-05, + "loss": 0.0107, + "step": 17793 + }, + { + "epoch": 14.011027963765262, + "grad_norm": 0.11964552104473114, + "learning_rate": 2.7404e-05, + "loss": 0.0056, + "step": 17794 + }, + { + "epoch": 14.011815675462781, + "grad_norm": 0.18444380164146423, + "learning_rate": 2.7403666666666666e-05, + "loss": 0.0058, + "step": 17795 + }, + { + "epoch": 14.012603387160299, + "grad_norm": 0.24464194476604462, + "learning_rate": 2.7403333333333332e-05, + "loss": 0.0121, + "step": 17796 + }, + { + "epoch": 14.013391098857818, + "grad_norm": 0.13717325031757355, + "learning_rate": 2.7403000000000002e-05, + "loss": 0.0053, + "step": 17797 + }, + { + "epoch": 14.014178810555336, + "grad_norm": 0.19987474381923676, + "learning_rate": 2.7402666666666668e-05, + "loss": 0.0104, + "step": 17798 + }, + { + "epoch": 14.014966522252855, + "grad_norm": 0.20189814269542694, + "learning_rate": 2.7402333333333334e-05, + "loss": 0.0088, + "step": 17799 + }, + { + "epoch": 14.015754233950375, + "grad_norm": 0.10164224356412888, + "learning_rate": 2.7402e-05, + "loss": 0.0049, + "step": 17800 + }, + { + "epoch": 14.016541945647893, + "grad_norm": 0.23336471617221832, + "learning_rate": 2.740166666666667e-05, + "loss": 0.0065, + "step": 17801 + }, + { + "epoch": 14.017329657345412, + "grad_norm": 0.15014122426509857, + "learning_rate": 2.740133333333333e-05, + "loss": 0.0063, + "step": 17802 + }, + { + "epoch": 14.01811736904293, + "grad_norm": 0.1431579738855362, + "learning_rate": 2.7401e-05, + "loss": 0.0084, + "step": 17803 + }, + { + "epoch": 14.01890508074045, + "grad_norm": 0.08354826271533966, + "learning_rate": 2.7400666666666667e-05, + "loss": 0.004, + "step": 17804 + }, + { + "epoch": 14.019692792437967, + "grad_norm": 0.2415834218263626, + "learning_rate": 2.7400333333333333e-05, + "loss": 0.0064, + "step": 17805 + }, + { + "epoch": 14.020480504135486, + "grad_norm": 0.1620343178510666, + "learning_rate": 2.7400000000000002e-05, + "loss": 0.005, + "step": 17806 + }, + { + "epoch": 14.021268215833006, + "grad_norm": 0.26065510511398315, + "learning_rate": 
2.7399666666666668e-05, + "loss": 0.0077, + "step": 17807 + }, + { + "epoch": 14.022055927530523, + "grad_norm": 0.37625762820243835, + "learning_rate": 2.7399333333333334e-05, + "loss": 0.0143, + "step": 17808 + }, + { + "epoch": 14.022843639228043, + "grad_norm": 0.18840135633945465, + "learning_rate": 2.7399e-05, + "loss": 0.0057, + "step": 17809 + }, + { + "epoch": 14.02363135092556, + "grad_norm": 0.181730717420578, + "learning_rate": 2.739866666666667e-05, + "loss": 0.0061, + "step": 17810 + }, + { + "epoch": 14.02441906262308, + "grad_norm": 0.27064573764801025, + "learning_rate": 2.739833333333333e-05, + "loss": 0.0118, + "step": 17811 + }, + { + "epoch": 14.0252067743206, + "grad_norm": 0.13918940722942352, + "learning_rate": 2.7398e-05, + "loss": 0.0022, + "step": 17812 + }, + { + "epoch": 14.025994486018117, + "grad_norm": 0.1911163628101349, + "learning_rate": 2.739766666666667e-05, + "loss": 0.0097, + "step": 17813 + }, + { + "epoch": 14.026782197715637, + "grad_norm": 0.148702472448349, + "learning_rate": 2.7397333333333333e-05, + "loss": 0.0082, + "step": 17814 + }, + { + "epoch": 14.027569909413154, + "grad_norm": 0.24183768033981323, + "learning_rate": 2.7397000000000002e-05, + "loss": 0.0179, + "step": 17815 + }, + { + "epoch": 14.028357621110674, + "grad_norm": 0.20235402882099152, + "learning_rate": 2.7396666666666668e-05, + "loss": 0.007, + "step": 17816 + }, + { + "epoch": 14.029145332808191, + "grad_norm": 0.11949028819799423, + "learning_rate": 2.7396333333333334e-05, + "loss": 0.0039, + "step": 17817 + }, + { + "epoch": 14.02993304450571, + "grad_norm": 0.1338866949081421, + "learning_rate": 2.7396e-05, + "loss": 0.0083, + "step": 17818 + }, + { + "epoch": 14.03072075620323, + "grad_norm": 0.16387012600898743, + "learning_rate": 2.739566666666667e-05, + "loss": 0.0056, + "step": 17819 + }, + { + "epoch": 14.031508467900748, + "grad_norm": 0.2128230631351471, + "learning_rate": 2.7395333333333332e-05, + "loss": 0.0055, + "step": 17820 + }, + { + "epoch": 14.032296179598267, + "grad_norm": 0.2811996042728424, + "learning_rate": 2.7395e-05, + "loss": 0.0101, + "step": 17821 + }, + { + "epoch": 14.033083891295785, + "grad_norm": 0.1397954821586609, + "learning_rate": 2.739466666666667e-05, + "loss": 0.0064, + "step": 17822 + }, + { + "epoch": 14.033871602993305, + "grad_norm": 0.13766105473041534, + "learning_rate": 2.7394333333333333e-05, + "loss": 0.0046, + "step": 17823 + }, + { + "epoch": 14.034659314690822, + "grad_norm": 0.14202480018138885, + "learning_rate": 2.7394000000000002e-05, + "loss": 0.0082, + "step": 17824 + }, + { + "epoch": 14.035447026388342, + "grad_norm": 0.22750554978847504, + "learning_rate": 2.7393666666666665e-05, + "loss": 0.0125, + "step": 17825 + }, + { + "epoch": 14.036234738085861, + "grad_norm": 0.20593632757663727, + "learning_rate": 2.7393333333333334e-05, + "loss": 0.0064, + "step": 17826 + }, + { + "epoch": 14.037022449783379, + "grad_norm": 0.1984787881374359, + "learning_rate": 2.7393e-05, + "loss": 0.0053, + "step": 17827 + }, + { + "epoch": 14.037810161480898, + "grad_norm": 0.19206024706363678, + "learning_rate": 2.7392666666666666e-05, + "loss": 0.0102, + "step": 17828 + }, + { + "epoch": 14.038597873178416, + "grad_norm": 0.37064027786254883, + "learning_rate": 2.7392333333333332e-05, + "loss": 0.0059, + "step": 17829 + }, + { + "epoch": 14.039385584875935, + "grad_norm": 0.13288098573684692, + "learning_rate": 2.7392e-05, + "loss": 0.0046, + "step": 17830 + }, + { + "epoch": 14.040173296573455, + "grad_norm": 
0.5798410773277283, + "learning_rate": 2.7391666666666667e-05, + "loss": 0.1256, + "step": 17831 + }, + { + "epoch": 14.040961008270973, + "grad_norm": 0.5838444828987122, + "learning_rate": 2.7391333333333333e-05, + "loss": 0.1022, + "step": 17832 + }, + { + "epoch": 14.041748719968492, + "grad_norm": 0.4544867277145386, + "learning_rate": 2.7391000000000003e-05, + "loss": 0.1162, + "step": 17833 + }, + { + "epoch": 14.04253643166601, + "grad_norm": 0.6153704524040222, + "learning_rate": 2.7390666666666665e-05, + "loss": 0.0477, + "step": 17834 + }, + { + "epoch": 14.04332414336353, + "grad_norm": 0.4174545407295227, + "learning_rate": 2.7390333333333334e-05, + "loss": 0.0547, + "step": 17835 + }, + { + "epoch": 14.044111855061047, + "grad_norm": 0.16536052525043488, + "learning_rate": 2.739e-05, + "loss": 0.0444, + "step": 17836 + }, + { + "epoch": 14.044899566758566, + "grad_norm": 0.19947175681591034, + "learning_rate": 2.7389666666666666e-05, + "loss": 0.0128, + "step": 17837 + }, + { + "epoch": 14.045687278456086, + "grad_norm": 0.2820470631122589, + "learning_rate": 2.7389333333333336e-05, + "loss": 0.0225, + "step": 17838 + }, + { + "epoch": 14.046474990153603, + "grad_norm": 0.22703130543231964, + "learning_rate": 2.7389e-05, + "loss": 0.0193, + "step": 17839 + }, + { + "epoch": 14.047262701851123, + "grad_norm": 0.19983568787574768, + "learning_rate": 2.7388666666666667e-05, + "loss": 0.0086, + "step": 17840 + }, + { + "epoch": 14.04805041354864, + "grad_norm": 0.2630940079689026, + "learning_rate": 2.7388333333333333e-05, + "loss": 0.0079, + "step": 17841 + }, + { + "epoch": 14.04883812524616, + "grad_norm": 0.4438660740852356, + "learning_rate": 2.7388000000000003e-05, + "loss": 0.0078, + "step": 17842 + }, + { + "epoch": 14.04962583694368, + "grad_norm": 0.16988074779510498, + "learning_rate": 2.7387666666666665e-05, + "loss": 0.0063, + "step": 17843 + }, + { + "epoch": 14.050413548641197, + "grad_norm": 0.6476714611053467, + "learning_rate": 2.7387333333333335e-05, + "loss": 0.0067, + "step": 17844 + }, + { + "epoch": 14.051201260338717, + "grad_norm": 0.15734454989433289, + "learning_rate": 2.7387e-05, + "loss": 0.0077, + "step": 17845 + }, + { + "epoch": 14.051988972036234, + "grad_norm": 0.0930037721991539, + "learning_rate": 2.7386666666666666e-05, + "loss": 0.0036, + "step": 17846 + }, + { + "epoch": 14.052776683733754, + "grad_norm": 0.33581289649009705, + "learning_rate": 2.7386333333333336e-05, + "loss": 0.0124, + "step": 17847 + }, + { + "epoch": 14.053564395431271, + "grad_norm": 0.1912253051996231, + "learning_rate": 2.7386000000000002e-05, + "loss": 0.0052, + "step": 17848 + }, + { + "epoch": 14.054352107128791, + "grad_norm": 0.1652211993932724, + "learning_rate": 2.7385666666666668e-05, + "loss": 0.022, + "step": 17849 + }, + { + "epoch": 14.05513981882631, + "grad_norm": 0.09651920199394226, + "learning_rate": 2.7385333333333334e-05, + "loss": 0.0038, + "step": 17850 + }, + { + "epoch": 14.055927530523828, + "grad_norm": 0.3739010989665985, + "learning_rate": 2.7385000000000003e-05, + "loss": 0.0223, + "step": 17851 + }, + { + "epoch": 14.056715242221347, + "grad_norm": 0.481778621673584, + "learning_rate": 2.7384666666666665e-05, + "loss": 0.0058, + "step": 17852 + }, + { + "epoch": 14.057502953918865, + "grad_norm": 0.1515519917011261, + "learning_rate": 2.7384333333333335e-05, + "loss": 0.0055, + "step": 17853 + }, + { + "epoch": 14.058290665616385, + "grad_norm": 0.15752796828746796, + "learning_rate": 2.7383999999999997e-05, + "loss": 0.0077, + "step": 
17854 + }, + { + "epoch": 14.059078377313902, + "grad_norm": 0.15462030470371246, + "learning_rate": 2.7383666666666667e-05, + "loss": 0.0053, + "step": 17855 + }, + { + "epoch": 14.059866089011422, + "grad_norm": 0.1326991468667984, + "learning_rate": 2.7383333333333336e-05, + "loss": 0.0092, + "step": 17856 + }, + { + "epoch": 14.060653800708941, + "grad_norm": 0.39810165762901306, + "learning_rate": 2.7383e-05, + "loss": 0.015, + "step": 17857 + }, + { + "epoch": 14.061441512406459, + "grad_norm": 0.19985859096050262, + "learning_rate": 2.7382666666666668e-05, + "loss": 0.0131, + "step": 17858 + }, + { + "epoch": 14.062229224103978, + "grad_norm": 0.21104080975055695, + "learning_rate": 2.7382333333333334e-05, + "loss": 0.0115, + "step": 17859 + }, + { + "epoch": 14.063016935801496, + "grad_norm": 0.21212096512317657, + "learning_rate": 2.7382e-05, + "loss": 0.0055, + "step": 17860 + }, + { + "epoch": 14.063804647499015, + "grad_norm": 0.820324718952179, + "learning_rate": 2.7381666666666666e-05, + "loss": 0.0068, + "step": 17861 + }, + { + "epoch": 14.064592359196535, + "grad_norm": 0.05253696069121361, + "learning_rate": 2.7381333333333335e-05, + "loss": 0.0033, + "step": 17862 + }, + { + "epoch": 14.065380070894053, + "grad_norm": 1.2117754220962524, + "learning_rate": 2.7381e-05, + "loss": 0.0094, + "step": 17863 + }, + { + "epoch": 14.066167782591572, + "grad_norm": 0.29759731888771057, + "learning_rate": 2.7380666666666667e-05, + "loss": 0.0078, + "step": 17864 + }, + { + "epoch": 14.06695549428909, + "grad_norm": 0.24196867644786835, + "learning_rate": 2.7380333333333336e-05, + "loss": 0.0068, + "step": 17865 + }, + { + "epoch": 14.06774320598661, + "grad_norm": 0.12845590710639954, + "learning_rate": 2.738e-05, + "loss": 0.0079, + "step": 17866 + }, + { + "epoch": 14.068530917684127, + "grad_norm": 0.124026358127594, + "learning_rate": 2.7379666666666668e-05, + "loss": 0.0066, + "step": 17867 + }, + { + "epoch": 14.069318629381646, + "grad_norm": 0.1498960554599762, + "learning_rate": 2.7379333333333334e-05, + "loss": 0.0061, + "step": 17868 + }, + { + "epoch": 14.070106341079166, + "grad_norm": 0.14717914164066315, + "learning_rate": 2.7379e-05, + "loss": 0.0055, + "step": 17869 + }, + { + "epoch": 14.070894052776683, + "grad_norm": 0.23794306814670563, + "learning_rate": 2.7378666666666666e-05, + "loss": 0.0121, + "step": 17870 + }, + { + "epoch": 14.071681764474203, + "grad_norm": 0.14471958577632904, + "learning_rate": 2.7378333333333335e-05, + "loss": 0.0112, + "step": 17871 + }, + { + "epoch": 14.07246947617172, + "grad_norm": 0.16537626087665558, + "learning_rate": 2.7378e-05, + "loss": 0.0111, + "step": 17872 + }, + { + "epoch": 14.07325718786924, + "grad_norm": 0.2944079339504242, + "learning_rate": 2.7377666666666667e-05, + "loss": 0.0041, + "step": 17873 + }, + { + "epoch": 14.074044899566758, + "grad_norm": 0.49037206172943115, + "learning_rate": 2.7377333333333336e-05, + "loss": 0.0132, + "step": 17874 + }, + { + "epoch": 14.074832611264277, + "grad_norm": 0.24914208054542542, + "learning_rate": 2.7377e-05, + "loss": 0.0054, + "step": 17875 + }, + { + "epoch": 14.075620322961797, + "grad_norm": 0.07448166608810425, + "learning_rate": 2.7376666666666668e-05, + "loss": 0.0038, + "step": 17876 + }, + { + "epoch": 14.076408034659314, + "grad_norm": 0.17337806522846222, + "learning_rate": 2.7376333333333334e-05, + "loss": 0.0085, + "step": 17877 + }, + { + "epoch": 14.077195746356834, + "grad_norm": 0.2607441842556, + "learning_rate": 2.7376e-05, + "loss": 0.0078, + 
"step": 17878 + }, + { + "epoch": 14.077983458054351, + "grad_norm": 0.37865784764289856, + "learning_rate": 2.7375666666666666e-05, + "loss": 0.0118, + "step": 17879 + }, + { + "epoch": 14.078771169751871, + "grad_norm": 0.26861143112182617, + "learning_rate": 2.7375333333333335e-05, + "loss": 0.007, + "step": 17880 + }, + { + "epoch": 14.07955888144939, + "grad_norm": 0.7186662554740906, + "learning_rate": 2.7375e-05, + "loss": 0.1303, + "step": 17881 + }, + { + "epoch": 14.080346593146908, + "grad_norm": 0.5512067079544067, + "learning_rate": 2.7374666666666667e-05, + "loss": 0.1495, + "step": 17882 + }, + { + "epoch": 14.081134304844428, + "grad_norm": 0.5301699042320251, + "learning_rate": 2.7374333333333337e-05, + "loss": 0.0928, + "step": 17883 + }, + { + "epoch": 14.081922016541945, + "grad_norm": 0.3989224135875702, + "learning_rate": 2.7374e-05, + "loss": 0.057, + "step": 17884 + }, + { + "epoch": 14.082709728239465, + "grad_norm": 0.33404117822647095, + "learning_rate": 2.737366666666667e-05, + "loss": 0.0583, + "step": 17885 + }, + { + "epoch": 14.083497439936982, + "grad_norm": 0.11045482754707336, + "learning_rate": 2.737333333333333e-05, + "loss": 0.0146, + "step": 17886 + }, + { + "epoch": 14.084285151634502, + "grad_norm": 0.16525693237781525, + "learning_rate": 2.7373e-05, + "loss": 0.0344, + "step": 17887 + }, + { + "epoch": 14.085072863332021, + "grad_norm": 0.437897652387619, + "learning_rate": 2.737266666666667e-05, + "loss": 0.0084, + "step": 17888 + }, + { + "epoch": 14.085860575029539, + "grad_norm": 0.13471746444702148, + "learning_rate": 2.7372333333333332e-05, + "loss": 0.0063, + "step": 17889 + }, + { + "epoch": 14.086648286727058, + "grad_norm": 0.12376592308282852, + "learning_rate": 2.7372e-05, + "loss": 0.0092, + "step": 17890 + }, + { + "epoch": 14.087435998424576, + "grad_norm": 0.1373528689146042, + "learning_rate": 2.7371666666666667e-05, + "loss": 0.0211, + "step": 17891 + }, + { + "epoch": 14.088223710122096, + "grad_norm": 0.20443753898143768, + "learning_rate": 2.7371333333333333e-05, + "loss": 0.0095, + "step": 17892 + }, + { + "epoch": 14.089011421819613, + "grad_norm": 0.44018781185150146, + "learning_rate": 2.7371e-05, + "loss": 0.0079, + "step": 17893 + }, + { + "epoch": 14.089799133517133, + "grad_norm": 0.2597666084766388, + "learning_rate": 2.737066666666667e-05, + "loss": 0.0085, + "step": 17894 + }, + { + "epoch": 14.090586845214652, + "grad_norm": 0.1853959709405899, + "learning_rate": 2.737033333333333e-05, + "loss": 0.0068, + "step": 17895 + }, + { + "epoch": 14.09137455691217, + "grad_norm": 0.1585877537727356, + "learning_rate": 2.737e-05, + "loss": 0.0085, + "step": 17896 + }, + { + "epoch": 14.09216226860969, + "grad_norm": 0.08548685908317566, + "learning_rate": 2.736966666666667e-05, + "loss": 0.0044, + "step": 17897 + }, + { + "epoch": 14.092949980307207, + "grad_norm": 0.4026268720626831, + "learning_rate": 2.7369333333333332e-05, + "loss": 0.0089, + "step": 17898 + }, + { + "epoch": 14.093737692004726, + "grad_norm": 0.10558972507715225, + "learning_rate": 2.7369000000000002e-05, + "loss": 0.0064, + "step": 17899 + }, + { + "epoch": 14.094525403702246, + "grad_norm": 0.13323284685611725, + "learning_rate": 2.7368666666666668e-05, + "loss": 0.0038, + "step": 17900 + }, + { + "epoch": 14.095313115399764, + "grad_norm": 0.12094221264123917, + "learning_rate": 2.7368333333333334e-05, + "loss": 0.005, + "step": 17901 + }, + { + "epoch": 14.096100827097283, + "grad_norm": 0.10654368251562119, + "learning_rate": 2.7368e-05, + "loss": 
0.0039, + "step": 17902 + }, + { + "epoch": 14.0968885387948, + "grad_norm": 0.6747168898582458, + "learning_rate": 2.736766666666667e-05, + "loss": 0.01, + "step": 17903 + }, + { + "epoch": 14.09767625049232, + "grad_norm": 0.36157503724098206, + "learning_rate": 2.736733333333333e-05, + "loss": 0.0135, + "step": 17904 + }, + { + "epoch": 14.098463962189838, + "grad_norm": 0.18548262119293213, + "learning_rate": 2.7367e-05, + "loss": 0.0046, + "step": 17905 + }, + { + "epoch": 14.099251673887357, + "grad_norm": 0.2056744247674942, + "learning_rate": 2.736666666666667e-05, + "loss": 0.0068, + "step": 17906 + }, + { + "epoch": 14.100039385584877, + "grad_norm": 0.21119743585586548, + "learning_rate": 2.7366333333333333e-05, + "loss": 0.0108, + "step": 17907 + }, + { + "epoch": 14.100827097282394, + "grad_norm": 0.19919580221176147, + "learning_rate": 2.7366000000000002e-05, + "loss": 0.0043, + "step": 17908 + }, + { + "epoch": 14.101614808979914, + "grad_norm": 0.18818779289722443, + "learning_rate": 2.7365666666666668e-05, + "loss": 0.0109, + "step": 17909 + }, + { + "epoch": 14.102402520677431, + "grad_norm": 0.22057457268238068, + "learning_rate": 2.7365333333333334e-05, + "loss": 0.0083, + "step": 17910 + }, + { + "epoch": 14.103190232374951, + "grad_norm": 0.1380000114440918, + "learning_rate": 2.7365e-05, + "loss": 0.0035, + "step": 17911 + }, + { + "epoch": 14.103977944072469, + "grad_norm": 0.25043749809265137, + "learning_rate": 2.736466666666667e-05, + "loss": 0.0062, + "step": 17912 + }, + { + "epoch": 14.104765655769988, + "grad_norm": 0.13537967205047607, + "learning_rate": 2.7364333333333335e-05, + "loss": 0.0052, + "step": 17913 + }, + { + "epoch": 14.105553367467508, + "grad_norm": 0.21243037283420563, + "learning_rate": 2.7364e-05, + "loss": 0.0081, + "step": 17914 + }, + { + "epoch": 14.106341079165025, + "grad_norm": 0.19726696610450745, + "learning_rate": 2.7363666666666667e-05, + "loss": 0.0065, + "step": 17915 + }, + { + "epoch": 14.107128790862545, + "grad_norm": 0.255327433347702, + "learning_rate": 2.7363333333333333e-05, + "loss": 0.0058, + "step": 17916 + }, + { + "epoch": 14.107916502560062, + "grad_norm": 0.8238925337791443, + "learning_rate": 2.7363000000000002e-05, + "loss": 0.0183, + "step": 17917 + }, + { + "epoch": 14.108704214257582, + "grad_norm": 0.18158884346485138, + "learning_rate": 2.7362666666666665e-05, + "loss": 0.0048, + "step": 17918 + }, + { + "epoch": 14.109491925955101, + "grad_norm": 0.23695972561836243, + "learning_rate": 2.7362333333333334e-05, + "loss": 0.0077, + "step": 17919 + }, + { + "epoch": 14.110279637652619, + "grad_norm": 0.29866158962249756, + "learning_rate": 2.7362e-05, + "loss": 0.0138, + "step": 17920 + }, + { + "epoch": 14.111067349350138, + "grad_norm": 0.13867850601673126, + "learning_rate": 2.7361666666666666e-05, + "loss": 0.006, + "step": 17921 + }, + { + "epoch": 14.111855061047656, + "grad_norm": 0.2514412999153137, + "learning_rate": 2.7361333333333335e-05, + "loss": 0.0084, + "step": 17922 + }, + { + "epoch": 14.112642772745176, + "grad_norm": 0.48531270027160645, + "learning_rate": 2.7361e-05, + "loss": 0.0111, + "step": 17923 + }, + { + "epoch": 14.113430484442693, + "grad_norm": 0.09981058537960052, + "learning_rate": 2.7360666666666667e-05, + "loss": 0.0048, + "step": 17924 + }, + { + "epoch": 14.114218196140213, + "grad_norm": 0.7928016185760498, + "learning_rate": 2.7360333333333333e-05, + "loss": 0.0137, + "step": 17925 + }, + { + "epoch": 14.115005907837732, + "grad_norm": 0.2476978302001953, + 
"learning_rate": 2.7360000000000002e-05, + "loss": 0.0086, + "step": 17926 + }, + { + "epoch": 14.11579361953525, + "grad_norm": 0.11563019454479218, + "learning_rate": 2.7359666666666665e-05, + "loss": 0.005, + "step": 17927 + }, + { + "epoch": 14.11658133123277, + "grad_norm": 0.219149649143219, + "learning_rate": 2.7359333333333334e-05, + "loss": 0.0064, + "step": 17928 + }, + { + "epoch": 14.117369042930287, + "grad_norm": 0.2340647131204605, + "learning_rate": 2.7359e-05, + "loss": 0.0097, + "step": 17929 + }, + { + "epoch": 14.118156754627806, + "grad_norm": 3.0377233028411865, + "learning_rate": 2.7358666666666666e-05, + "loss": 0.0208, + "step": 17930 + }, + { + "epoch": 14.118944466325324, + "grad_norm": 0.7669278383255005, + "learning_rate": 2.7358333333333335e-05, + "loss": 0.1691, + "step": 17931 + }, + { + "epoch": 14.119732178022844, + "grad_norm": 0.490710973739624, + "learning_rate": 2.7358e-05, + "loss": 0.1095, + "step": 17932 + }, + { + "epoch": 14.120519889720363, + "grad_norm": 0.8230147957801819, + "learning_rate": 2.7357666666666667e-05, + "loss": 0.0832, + "step": 17933 + }, + { + "epoch": 14.12130760141788, + "grad_norm": 0.29583990573883057, + "learning_rate": 2.7357333333333333e-05, + "loss": 0.0619, + "step": 17934 + }, + { + "epoch": 14.1220953131154, + "grad_norm": 0.28955474495887756, + "learning_rate": 2.7357000000000003e-05, + "loss": 0.0434, + "step": 17935 + }, + { + "epoch": 14.122883024812918, + "grad_norm": 0.2548077702522278, + "learning_rate": 2.7356666666666665e-05, + "loss": 0.0205, + "step": 17936 + }, + { + "epoch": 14.123670736510437, + "grad_norm": 0.17614097893238068, + "learning_rate": 2.7356333333333334e-05, + "loss": 0.009, + "step": 17937 + }, + { + "epoch": 14.124458448207957, + "grad_norm": 0.198359876871109, + "learning_rate": 2.7356000000000004e-05, + "loss": 0.0128, + "step": 17938 + }, + { + "epoch": 14.125246159905474, + "grad_norm": 0.2110402137041092, + "learning_rate": 2.7355666666666666e-05, + "loss": 0.0165, + "step": 17939 + }, + { + "epoch": 14.126033871602994, + "grad_norm": 0.13715140521526337, + "learning_rate": 2.7355333333333336e-05, + "loss": 0.0075, + "step": 17940 + }, + { + "epoch": 14.126821583300512, + "grad_norm": 0.46350932121276855, + "learning_rate": 2.7355e-05, + "loss": 0.0102, + "step": 17941 + }, + { + "epoch": 14.127609294998031, + "grad_norm": 0.2106034904718399, + "learning_rate": 2.7354666666666667e-05, + "loss": 0.0093, + "step": 17942 + }, + { + "epoch": 14.128397006695549, + "grad_norm": 0.25003141164779663, + "learning_rate": 2.7354333333333333e-05, + "loss": 0.0183, + "step": 17943 + }, + { + "epoch": 14.129184718393068, + "grad_norm": 0.3266333341598511, + "learning_rate": 2.7354000000000003e-05, + "loss": 0.0115, + "step": 17944 + }, + { + "epoch": 14.129972430090588, + "grad_norm": 0.08940689265727997, + "learning_rate": 2.7353666666666665e-05, + "loss": 0.0061, + "step": 17945 + }, + { + "epoch": 14.130760141788105, + "grad_norm": 0.5303405523300171, + "learning_rate": 2.7353333333333335e-05, + "loss": 0.0085, + "step": 17946 + }, + { + "epoch": 14.131547853485625, + "grad_norm": 0.23305916786193848, + "learning_rate": 2.7353e-05, + "loss": 0.0074, + "step": 17947 + }, + { + "epoch": 14.132335565183142, + "grad_norm": 0.08265204727649689, + "learning_rate": 2.7352666666666666e-05, + "loss": 0.0049, + "step": 17948 + }, + { + "epoch": 14.133123276880662, + "grad_norm": 0.2512916326522827, + "learning_rate": 2.7352333333333336e-05, + "loss": 0.0073, + "step": 17949 + }, + { + "epoch": 
14.13391098857818, + "grad_norm": 0.25163140892982483, + "learning_rate": 2.7352e-05, + "loss": 0.0097, + "step": 17950 + }, + { + "epoch": 14.134698700275699, + "grad_norm": 0.30548596382141113, + "learning_rate": 2.7351666666666668e-05, + "loss": 0.0091, + "step": 17951 + }, + { + "epoch": 14.135486411973218, + "grad_norm": 0.5345240831375122, + "learning_rate": 2.7351333333333334e-05, + "loss": 0.0113, + "step": 17952 + }, + { + "epoch": 14.136274123670736, + "grad_norm": 0.22533375024795532, + "learning_rate": 2.7351e-05, + "loss": 0.0067, + "step": 17953 + }, + { + "epoch": 14.137061835368256, + "grad_norm": 0.1398773044347763, + "learning_rate": 2.7350666666666665e-05, + "loss": 0.0061, + "step": 17954 + }, + { + "epoch": 14.137849547065773, + "grad_norm": 0.20554423332214355, + "learning_rate": 2.7350333333333335e-05, + "loss": 0.008, + "step": 17955 + }, + { + "epoch": 14.138637258763293, + "grad_norm": 0.21121907234191895, + "learning_rate": 2.735e-05, + "loss": 0.0129, + "step": 17956 + }, + { + "epoch": 14.139424970460812, + "grad_norm": 0.17196357250213623, + "learning_rate": 2.7349666666666667e-05, + "loss": 0.0114, + "step": 17957 + }, + { + "epoch": 14.14021268215833, + "grad_norm": 0.3797248601913452, + "learning_rate": 2.7349333333333336e-05, + "loss": 0.0136, + "step": 17958 + }, + { + "epoch": 14.14100039385585, + "grad_norm": 0.19262748956680298, + "learning_rate": 2.7349e-05, + "loss": 0.0081, + "step": 17959 + }, + { + "epoch": 14.141788105553367, + "grad_norm": 0.23630547523498535, + "learning_rate": 2.7348666666666668e-05, + "loss": 0.0085, + "step": 17960 + }, + { + "epoch": 14.142575817250886, + "grad_norm": 0.15978246927261353, + "learning_rate": 2.7348333333333334e-05, + "loss": 0.0105, + "step": 17961 + }, + { + "epoch": 14.143363528948404, + "grad_norm": 0.29129260778427124, + "learning_rate": 2.7348e-05, + "loss": 0.009, + "step": 17962 + }, + { + "epoch": 14.144151240645924, + "grad_norm": 0.13145500421524048, + "learning_rate": 2.734766666666667e-05, + "loss": 0.0062, + "step": 17963 + }, + { + "epoch": 14.144938952343443, + "grad_norm": 0.09434239566326141, + "learning_rate": 2.7347333333333335e-05, + "loss": 0.0038, + "step": 17964 + }, + { + "epoch": 14.14572666404096, + "grad_norm": 0.10074865818023682, + "learning_rate": 2.7347e-05, + "loss": 0.0051, + "step": 17965 + }, + { + "epoch": 14.14651437573848, + "grad_norm": 0.13839463889598846, + "learning_rate": 2.7346666666666667e-05, + "loss": 0.0031, + "step": 17966 + }, + { + "epoch": 14.147302087435998, + "grad_norm": 0.20936517417430878, + "learning_rate": 2.7346333333333336e-05, + "loss": 0.0144, + "step": 17967 + }, + { + "epoch": 14.148089799133517, + "grad_norm": 0.594183087348938, + "learning_rate": 2.7346e-05, + "loss": 0.0192, + "step": 17968 + }, + { + "epoch": 14.148877510831035, + "grad_norm": 0.1655697226524353, + "learning_rate": 2.7345666666666668e-05, + "loss": 0.0061, + "step": 17969 + }, + { + "epoch": 14.149665222528554, + "grad_norm": 0.20860852301120758, + "learning_rate": 2.7345333333333334e-05, + "loss": 0.0065, + "step": 17970 + }, + { + "epoch": 14.150452934226074, + "grad_norm": 0.24377048015594482, + "learning_rate": 2.7345e-05, + "loss": 0.01, + "step": 17971 + }, + { + "epoch": 14.151240645923592, + "grad_norm": 0.2136966735124588, + "learning_rate": 2.734466666666667e-05, + "loss": 0.0122, + "step": 17972 + }, + { + "epoch": 14.152028357621111, + "grad_norm": 0.1641939878463745, + "learning_rate": 2.7344333333333335e-05, + "loss": 0.0074, + "step": 17973 + }, + { + 
"epoch": 14.152816069318629, + "grad_norm": 0.12338040769100189, + "learning_rate": 2.7344e-05, + "loss": 0.0042, + "step": 17974 + }, + { + "epoch": 14.153603781016148, + "grad_norm": 0.24263814091682434, + "learning_rate": 2.7343666666666667e-05, + "loss": 0.0089, + "step": 17975 + }, + { + "epoch": 14.154391492713668, + "grad_norm": 0.2565319538116455, + "learning_rate": 2.7343333333333333e-05, + "loss": 0.0123, + "step": 17976 + }, + { + "epoch": 14.155179204411185, + "grad_norm": 0.14357398450374603, + "learning_rate": 2.7343e-05, + "loss": 0.0066, + "step": 17977 + }, + { + "epoch": 14.155966916108705, + "grad_norm": 0.24079959094524384, + "learning_rate": 2.7342666666666668e-05, + "loss": 0.0121, + "step": 17978 + }, + { + "epoch": 14.156754627806222, + "grad_norm": 0.3946249485015869, + "learning_rate": 2.7342333333333334e-05, + "loss": 0.0104, + "step": 17979 + }, + { + "epoch": 14.157542339503742, + "grad_norm": 0.25901105999946594, + "learning_rate": 2.7342e-05, + "loss": 0.0072, + "step": 17980 + }, + { + "epoch": 14.15833005120126, + "grad_norm": 0.5509277582168579, + "learning_rate": 2.734166666666667e-05, + "loss": 0.1474, + "step": 17981 + }, + { + "epoch": 14.159117762898779, + "grad_norm": 0.39281535148620605, + "learning_rate": 2.7341333333333332e-05, + "loss": 0.0922, + "step": 17982 + }, + { + "epoch": 14.159905474596298, + "grad_norm": 0.47843343019485474, + "learning_rate": 2.7341e-05, + "loss": 0.0808, + "step": 17983 + }, + { + "epoch": 14.160693186293816, + "grad_norm": 0.4316960275173187, + "learning_rate": 2.7340666666666667e-05, + "loss": 0.0401, + "step": 17984 + }, + { + "epoch": 14.161480897991336, + "grad_norm": 0.24425141513347626, + "learning_rate": 2.7340333333333333e-05, + "loss": 0.0289, + "step": 17985 + }, + { + "epoch": 14.162268609688853, + "grad_norm": 0.24119031429290771, + "learning_rate": 2.734e-05, + "loss": 0.0661, + "step": 17986 + }, + { + "epoch": 14.163056321386373, + "grad_norm": 0.17838551104068756, + "learning_rate": 2.733966666666667e-05, + "loss": 0.0092, + "step": 17987 + }, + { + "epoch": 14.16384403308389, + "grad_norm": 0.16442659497261047, + "learning_rate": 2.7339333333333334e-05, + "loss": 0.0207, + "step": 17988 + }, + { + "epoch": 14.16463174478141, + "grad_norm": 0.16315427422523499, + "learning_rate": 2.7339e-05, + "loss": 0.0122, + "step": 17989 + }, + { + "epoch": 14.16541945647893, + "grad_norm": 0.06836215406656265, + "learning_rate": 2.733866666666667e-05, + "loss": 0.0027, + "step": 17990 + }, + { + "epoch": 14.166207168176447, + "grad_norm": 0.3158721923828125, + "learning_rate": 2.7338333333333332e-05, + "loss": 0.0117, + "step": 17991 + }, + { + "epoch": 14.166994879873966, + "grad_norm": 0.17493444681167603, + "learning_rate": 2.7338e-05, + "loss": 0.0115, + "step": 17992 + }, + { + "epoch": 14.167782591571484, + "grad_norm": 0.15845713019371033, + "learning_rate": 2.7337666666666667e-05, + "loss": 0.0082, + "step": 17993 + }, + { + "epoch": 14.168570303269004, + "grad_norm": 0.08029942959547043, + "learning_rate": 2.7337333333333333e-05, + "loss": 0.0042, + "step": 17994 + }, + { + "epoch": 14.169358014966523, + "grad_norm": 0.23089176416397095, + "learning_rate": 2.7337e-05, + "loss": 0.0058, + "step": 17995 + }, + { + "epoch": 14.17014572666404, + "grad_norm": 0.3622674345970154, + "learning_rate": 2.733666666666667e-05, + "loss": 0.0089, + "step": 17996 + }, + { + "epoch": 14.17093343836156, + "grad_norm": 0.25143784284591675, + "learning_rate": 2.7336333333333335e-05, + "loss": 0.0095, + "step": 17997 + }, + 
{ + "epoch": 14.171721150059078, + "grad_norm": 0.1470479518175125, + "learning_rate": 2.7336e-05, + "loss": 0.0072, + "step": 17998 + }, + { + "epoch": 14.172508861756597, + "grad_norm": 0.3320884108543396, + "learning_rate": 2.733566666666667e-05, + "loss": 0.0139, + "step": 17999 + }, + { + "epoch": 14.173296573454115, + "grad_norm": 0.17978593707084656, + "learning_rate": 2.7335333333333332e-05, + "loss": 0.0108, + "step": 18000 + }, + { + "epoch": 14.173296573454115, + "eval_cer": 0.1152240972600254, + "eval_loss": 0.33926066756248474, + "eval_runtime": 17.0464, + "eval_samples_per_second": 17.834, + "eval_steps_per_second": 0.587, + "eval_wer": 0.4044512663085188, + "step": 18000 + }, + { + "epoch": 14.173296573454115, + "step": 18000, + "total_flos": 2.0753874958738276e+20, + "train_loss": 0.2817888858753981, + "train_runtime": 29183.5878, + "train_samples_per_second": 219.301, + "train_steps_per_second": 3.427 + } + ], + "logging_steps": 1.0, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 79, + "save_steps": 1000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 5 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0753874958738276e+20, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}