{ "best_metric": 0.26057168841362, "best_model_checkpoint": "./w2v-bert-2.0-luo_19_19h/checkpoint-5000", "epoch": 24.27217496962333, "eval_steps": 1000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002430133657351154, "grad_norm": 31.72887420654297, "learning_rate": 3.0000000000000004e-09, "loss": 11.1485, "step": 1 }, { "epoch": 0.004860267314702308, "grad_norm": 26.410093307495117, "learning_rate": 6.000000000000001e-09, "loss": 9.2601, "step": 2 }, { "epoch": 0.007290400972053463, "grad_norm": 27.016387939453125, "learning_rate": 9e-09, "loss": 9.4432, "step": 3 }, { "epoch": 0.009720534629404616, "grad_norm": 24.092432022094727, "learning_rate": 1.2000000000000002e-08, "loss": 8.8841, "step": 4 }, { "epoch": 0.012150668286755772, "grad_norm": 30.00749969482422, "learning_rate": 1.5000000000000002e-08, "loss": 10.4295, "step": 5 }, { "epoch": 0.014580801944106925, "grad_norm": 25.669897079467773, "learning_rate": 1.8e-08, "loss": 9.1977, "step": 6 }, { "epoch": 0.01701093560145808, "grad_norm": 27.44280433654785, "learning_rate": 2.1e-08, "loss": 9.4543, "step": 7 }, { "epoch": 0.019441069258809233, "grad_norm": 26.839799880981445, "learning_rate": 2.4000000000000003e-08, "loss": 9.409, "step": 8 }, { "epoch": 0.02187120291616039, "grad_norm": 25.294570922851562, "learning_rate": 2.7e-08, "loss": 8.8186, "step": 9 }, { "epoch": 0.024301336573511544, "grad_norm": 23.73955726623535, "learning_rate": 3.0000000000000004e-08, "loss": 8.4984, "step": 10 }, { "epoch": 0.026731470230862697, "grad_norm": 24.945449829101562, "learning_rate": 3.3000000000000004e-08, "loss": 8.6996, "step": 11 }, { "epoch": 0.02916160388821385, "grad_norm": 25.395112991333008, "learning_rate": 3.6e-08, "loss": 8.7921, "step": 12 }, { "epoch": 0.031591737545565005, "grad_norm": 25.967538833618164, "learning_rate": 3.9e-08, "loss": 8.9295, "step": 13 }, { "epoch": 0.03402187120291616, 
"grad_norm": 27.518543243408203, "learning_rate": 4.2e-08, "loss": 9.1406, "step": 14 }, { "epoch": 0.03645200486026731, "grad_norm": 24.751224517822266, "learning_rate": 4.5e-08, "loss": 8.6674, "step": 15 }, { "epoch": 0.038882138517618466, "grad_norm": 23.62750244140625, "learning_rate": 4.8000000000000006e-08, "loss": 8.1317, "step": 16 }, { "epoch": 0.041312272174969626, "grad_norm": 24.097095489501953, "learning_rate": 5.1e-08, "loss": 8.3597, "step": 17 }, { "epoch": 0.04374240583232078, "grad_norm": 25.882150650024414, "learning_rate": 5.4e-08, "loss": 8.46, "step": 18 }, { "epoch": 0.046172539489671933, "grad_norm": 22.797712326049805, "learning_rate": 5.7e-08, "loss": 7.7899, "step": 19 }, { "epoch": 0.04860267314702309, "grad_norm": 23.5137882232666, "learning_rate": 6.000000000000001e-08, "loss": 8.1402, "step": 20 }, { "epoch": 0.05103280680437424, "grad_norm": 25.550765991210938, "learning_rate": 6.3e-08, "loss": 8.6786, "step": 21 }, { "epoch": 0.053462940461725394, "grad_norm": 23.73699188232422, "learning_rate": 6.600000000000001e-08, "loss": 8.2318, "step": 22 }, { "epoch": 0.05589307411907655, "grad_norm": 24.600170135498047, "learning_rate": 6.9e-08, "loss": 8.3269, "step": 23 }, { "epoch": 0.0583232077764277, "grad_norm": 24.161409378051758, "learning_rate": 7.2e-08, "loss": 8.2753, "step": 24 }, { "epoch": 0.060753341433778855, "grad_norm": 22.61728286743164, "learning_rate": 7.500000000000001e-08, "loss": 7.8175, "step": 25 }, { "epoch": 0.06318347509113001, "grad_norm": 23.49274253845215, "learning_rate": 7.8e-08, "loss": 8.0296, "step": 26 }, { "epoch": 0.06561360874848117, "grad_norm": 23.7640380859375, "learning_rate": 8.100000000000001e-08, "loss": 8.2326, "step": 27 }, { "epoch": 0.06804374240583232, "grad_norm": 22.87200355529785, "learning_rate": 8.4e-08, "loss": 7.8413, "step": 28 }, { "epoch": 0.07047387606318348, "grad_norm": 24.368379592895508, "learning_rate": 8.7e-08, "loss": 8.298, "step": 29 }, { "epoch": 0.07290400972053462, 
"grad_norm": 21.276613235473633, "learning_rate": 9e-08, "loss": 7.4231, "step": 30 }, { "epoch": 0.07533414337788578, "grad_norm": 24.502164840698242, "learning_rate": 9.3e-08, "loss": 8.2438, "step": 31 }, { "epoch": 0.07776427703523693, "grad_norm": 23.914932250976562, "learning_rate": 9.600000000000001e-08, "loss": 8.032, "step": 32 }, { "epoch": 0.08019441069258809, "grad_norm": 25.51205825805664, "learning_rate": 9.9e-08, "loss": 8.4247, "step": 33 }, { "epoch": 0.08262454434993925, "grad_norm": 27.83383560180664, "learning_rate": 1.02e-07, "loss": 8.8456, "step": 34 }, { "epoch": 0.0850546780072904, "grad_norm": 22.238880157470703, "learning_rate": 1.05e-07, "loss": 7.6928, "step": 35 }, { "epoch": 0.08748481166464156, "grad_norm": 27.02587890625, "learning_rate": 1.08e-07, "loss": 8.7762, "step": 36 }, { "epoch": 0.0899149453219927, "grad_norm": 24.28791618347168, "learning_rate": 1.11e-07, "loss": 7.9067, "step": 37 }, { "epoch": 0.09234507897934387, "grad_norm": 23.083635330200195, "learning_rate": 1.14e-07, "loss": 7.6347, "step": 38 }, { "epoch": 0.09477521263669501, "grad_norm": 26.075321197509766, "learning_rate": 1.17e-07, "loss": 8.5562, "step": 39 }, { "epoch": 0.09720534629404617, "grad_norm": 24.965587615966797, "learning_rate": 1.2000000000000002e-07, "loss": 8.0468, "step": 40 }, { "epoch": 0.09963547995139732, "grad_norm": 26.401365280151367, "learning_rate": 1.23e-07, "loss": 8.4633, "step": 41 }, { "epoch": 0.10206561360874848, "grad_norm": 26.267797470092773, "learning_rate": 1.26e-07, "loss": 8.5233, "step": 42 }, { "epoch": 0.10449574726609964, "grad_norm": 29.41322898864746, "learning_rate": 1.29e-07, "loss": 8.772, "step": 43 }, { "epoch": 0.10692588092345079, "grad_norm": 24.533571243286133, "learning_rate": 1.3200000000000002e-07, "loss": 7.89, "step": 44 }, { "epoch": 0.10935601458080195, "grad_norm": 28.708637237548828, "learning_rate": 1.35e-07, "loss": 8.7874, "step": 45 }, { "epoch": 0.1117861482381531, "grad_norm": 
28.792644500732422, "learning_rate": 1.38e-07, "loss": 8.7954, "step": 46 }, { "epoch": 0.11421628189550426, "grad_norm": 27.861797332763672, "learning_rate": 1.41e-07, "loss": 8.7854, "step": 47 }, { "epoch": 0.1166464155528554, "grad_norm": 30.90754508972168, "learning_rate": 1.44e-07, "loss": 9.2711, "step": 48 }, { "epoch": 0.11907654921020656, "grad_norm": 28.8869686126709, "learning_rate": 1.47e-07, "loss": 8.9342, "step": 49 }, { "epoch": 0.12150668286755771, "grad_norm": null, "learning_rate": 1.47e-07, "loss": 10.0409, "step": 50 }, { "epoch": 0.12393681652490887, "grad_norm": 32.96950912475586, "learning_rate": 1.5000000000000002e-07, "loss": 10.5459, "step": 51 }, { "epoch": 0.12636695018226002, "grad_norm": 27.806352615356445, "learning_rate": 1.53e-07, "loss": 9.3425, "step": 52 }, { "epoch": 0.12879708383961117, "grad_norm": 25.09990119934082, "learning_rate": 1.56e-07, "loss": 8.3718, "step": 53 }, { "epoch": 0.13122721749696234, "grad_norm": 29.019784927368164, "learning_rate": 1.59e-07, "loss": 9.3419, "step": 54 }, { "epoch": 0.1336573511543135, "grad_norm": 31.131988525390625, "learning_rate": 1.6200000000000002e-07, "loss": 9.7479, "step": 55 }, { "epoch": 0.13608748481166463, "grad_norm": 30.606874465942383, "learning_rate": 1.6499999999999998e-07, "loss": 9.5902, "step": 56 }, { "epoch": 0.1385176184690158, "grad_norm": 29.999305725097656, "learning_rate": 1.68e-07, "loss": 9.0907, "step": 57 }, { "epoch": 0.14094775212636695, "grad_norm": 27.282432556152344, "learning_rate": 1.71e-07, "loss": 8.4994, "step": 58 }, { "epoch": 0.1433778857837181, "grad_norm": 28.001203536987305, "learning_rate": 1.74e-07, "loss": 8.637, "step": 59 }, { "epoch": 0.14580801944106925, "grad_norm": 26.76766586303711, "learning_rate": 1.77e-07, "loss": 8.3985, "step": 60 }, { "epoch": 0.14823815309842042, "grad_norm": 25.868179321289062, "learning_rate": 1.8e-07, "loss": 8.0081, "step": 61 }, { "epoch": 0.15066828675577157, "grad_norm": 26.6855411529541,
"learning_rate": 1.83e-07, "loss": 8.1588, "step": 62 }, { "epoch": 0.15309842041312272, "grad_norm": 28.025657653808594, "learning_rate": 1.86e-07, "loss": 8.3384, "step": 63 }, { "epoch": 0.15552855407047386, "grad_norm": 27.84844970703125, "learning_rate": 1.89e-07, "loss": 8.2488, "step": 64 }, { "epoch": 0.15795868772782504, "grad_norm": 26.57767677307129, "learning_rate": 1.9200000000000003e-07, "loss": 7.9735, "step": 65 }, { "epoch": 0.16038882138517618, "grad_norm": 26.14347267150879, "learning_rate": 1.9499999999999999e-07, "loss": 7.9128, "step": 66 }, { "epoch": 0.16281895504252733, "grad_norm": 27.442689895629883, "learning_rate": 1.98e-07, "loss": 8.0991, "step": 67 }, { "epoch": 0.1652490886998785, "grad_norm": 25.239824295043945, "learning_rate": 2.01e-07, "loss": 7.6216, "step": 68 }, { "epoch": 0.16767922235722965, "grad_norm": 26.322359085083008, "learning_rate": 2.04e-07, "loss": 7.689, "step": 69 }, { "epoch": 0.1701093560145808, "grad_norm": 26.81964683532715, "learning_rate": 2.0700000000000001e-07, "loss": 7.8505, "step": 70 }, { "epoch": 0.17253948967193194, "grad_norm": 25.071407318115234, "learning_rate": 2.1e-07, "loss": 7.6306, "step": 71 }, { "epoch": 0.17496962332928312, "grad_norm": 26.770170211791992, "learning_rate": 2.1300000000000001e-07, "loss": 7.8837, "step": 72 }, { "epoch": 0.17739975698663427, "grad_norm": 26.514434814453125, "learning_rate": 2.16e-07, "loss": 7.7924, "step": 73 }, { "epoch": 0.1798298906439854, "grad_norm": 25.59735107421875, "learning_rate": 2.1900000000000002e-07, "loss": 7.51, "step": 74 }, { "epoch": 0.1822600243013366, "grad_norm": 26.402545928955078, "learning_rate": 2.22e-07, "loss": 7.7768, "step": 75 }, { "epoch": 0.18469015795868773, "grad_norm": 25.93678855895996, "learning_rate": 2.25e-07, "loss": 7.489, "step": 76 }, { "epoch": 0.18712029161603888, "grad_norm": 25.759546279907227, "learning_rate": 2.28e-07, "loss": 7.2838, "step": 77 }, { "epoch": 0.18955042527339003, "grad_norm": 
27.10691261291504, "learning_rate": 2.3100000000000002e-07, "loss": 7.5397, "step": 78 }, { "epoch": 0.1919805589307412, "grad_norm": 30.198894500732422, "learning_rate": 2.34e-07, "loss": 7.9881, "step": 79 }, { "epoch": 0.19441069258809235, "grad_norm": 25.996177673339844, "learning_rate": 2.3700000000000002e-07, "loss": 7.4603, "step": 80 }, { "epoch": 0.1968408262454435, "grad_norm": 25.258569717407227, "learning_rate": 2.4000000000000003e-07, "loss": 7.1544, "step": 81 }, { "epoch": 0.19927095990279464, "grad_norm": 36.78901290893555, "learning_rate": 2.43e-07, "loss": 8.617, "step": 82 }, { "epoch": 0.20170109356014582, "grad_norm": 27.105087280273438, "learning_rate": 2.46e-07, "loss": 7.3417, "step": 83 }, { "epoch": 0.20413122721749696, "grad_norm": 26.987415313720703, "learning_rate": 2.49e-07, "loss": 7.3498, "step": 84 }, { "epoch": 0.2065613608748481, "grad_norm": 33.67708206176758, "learning_rate": 2.52e-07, "loss": 8.2004, "step": 85 }, { "epoch": 0.20899149453219928, "grad_norm": 28.876110076904297, "learning_rate": 2.5500000000000005e-07, "loss": 7.5498, "step": 86 }, { "epoch": 0.21142162818955043, "grad_norm": 25.259580612182617, "learning_rate": 2.58e-07, "loss": 6.9011, "step": 87 }, { "epoch": 0.21385176184690158, "grad_norm": 34.15337371826172, "learning_rate": 2.6099999999999997e-07, "loss": 8.1252, "step": 88 }, { "epoch": 0.21628189550425272, "grad_norm": 27.657987594604492, "learning_rate": 2.6400000000000003e-07, "loss": 7.1193, "step": 89 }, { "epoch": 0.2187120291616039, "grad_norm": 31.236072540283203, "learning_rate": 2.67e-07, "loss": 7.5621, "step": 90 }, { "epoch": 0.22114216281895505, "grad_norm": 31.13542938232422, "learning_rate": 2.7e-07, "loss": 7.5237, "step": 91 }, { "epoch": 0.2235722964763062, "grad_norm": 33.754764556884766, "learning_rate": 2.73e-07, "loss": 7.7868, "step": 92 }, { "epoch": 0.22600243013365734, "grad_norm": 31.57640838623047, "learning_rate": 2.76e-07, "loss": 7.3906, "step": 93 }, { "epoch": 
0.2284325637910085, "grad_norm": 33.09754943847656, "learning_rate": 2.79e-07, "loss": 7.5435, "step": 94 }, { "epoch": 0.23086269744835966, "grad_norm": 32.6729850769043, "learning_rate": 2.82e-07, "loss": 7.6354, "step": 95 }, { "epoch": 0.2332928311057108, "grad_norm": 49.0784912109375, "learning_rate": 2.85e-07, "loss": 9.3337, "step": 96 }, { "epoch": 0.23572296476306198, "grad_norm": 35.38929748535156, "learning_rate": 2.88e-07, "loss": 7.832, "step": 97 }, { "epoch": 0.23815309842041313, "grad_norm": 50.730220794677734, "learning_rate": 2.91e-07, "loss": 9.1967, "step": 98 }, { "epoch": 0.24058323207776428, "grad_norm": 47.69318389892578, "learning_rate": 2.94e-07, "loss": 9.0184, "step": 99 }, { "epoch": 0.24301336573511542, "grad_norm": 53.41519546508789, "learning_rate": 2.97e-07, "loss": 9.888, "step": 100 }, { "epoch": 0.2454434993924666, "grad_norm": 44.970069885253906, "learning_rate": 3.0000000000000004e-07, "loss": 9.4436, "step": 101 }, { "epoch": 0.24787363304981774, "grad_norm": 35.39624786376953, "learning_rate": 3.03e-07, "loss": 8.1508, "step": 102 }, { "epoch": 0.2503037667071689, "grad_norm": 42.964569091796875, "learning_rate": 3.06e-07, "loss": 8.5109, "step": 103 }, { "epoch": 0.25273390036452004, "grad_norm": 44.49302291870117, "learning_rate": 3.0900000000000003e-07, "loss": 8.5681, "step": 104 }, { "epoch": 0.2551640340218712, "grad_norm": 47.64183044433594, "learning_rate": 3.12e-07, "loss": 9.0951, "step": 105 }, { "epoch": 0.25759416767922233, "grad_norm": 46.741050720214844, "learning_rate": 3.15e-07, "loss": 8.6282, "step": 106 }, { "epoch": 0.2600243013365735, "grad_norm": 41.436859130859375, "learning_rate": 3.18e-07, "loss": 8.0727, "step": 107 }, { "epoch": 0.2624544349939247, "grad_norm": 39.95646667480469, "learning_rate": 3.21e-07, "loss": 7.6291, "step": 108 }, { "epoch": 0.2648845686512758, "grad_norm": 39.34744644165039, "learning_rate": 3.2400000000000004e-07, "loss": 7.6461, "step": 109 }, { "epoch": 0.267314702308627, 
"grad_norm": 37.7058219909668, "learning_rate": 3.27e-07, "loss": 7.2743, "step": 110 }, { "epoch": 0.26974483596597815, "grad_norm": 42.549842834472656, "learning_rate": 3.2999999999999996e-07, "loss": 7.5539, "step": 111 }, { "epoch": 0.27217496962332927, "grad_norm": 43.079383850097656, "learning_rate": 3.3300000000000003e-07, "loss": 7.5834, "step": 112 }, { "epoch": 0.27460510328068044, "grad_norm": 39.086143493652344, "learning_rate": 3.36e-07, "loss": 7.3111, "step": 113 }, { "epoch": 0.2770352369380316, "grad_norm": 36.140838623046875, "learning_rate": 3.39e-07, "loss": 6.9477, "step": 114 }, { "epoch": 0.27946537059538273, "grad_norm": 42.023983001708984, "learning_rate": 3.42e-07, "loss": 7.2852, "step": 115 }, { "epoch": 0.2818955042527339, "grad_norm": 41.586463928222656, "learning_rate": 3.45e-07, "loss": 7.2114, "step": 116 }, { "epoch": 0.284325637910085, "grad_norm": 37.130245208740234, "learning_rate": 3.48e-07, "loss": 6.8281, "step": 117 }, { "epoch": 0.2867557715674362, "grad_norm": 41.961368560791016, "learning_rate": 3.51e-07, "loss": 7.0201, "step": 118 }, { "epoch": 0.2891859052247874, "grad_norm": 43.75677490234375, "learning_rate": 3.54e-07, "loss": 7.1409, "step": 119 }, { "epoch": 0.2916160388821385, "grad_norm": 37.992271423339844, "learning_rate": 3.5700000000000003e-07, "loss": 6.6535, "step": 120 }, { "epoch": 0.29404617253948967, "grad_norm": 44.849239349365234, "learning_rate": 3.6e-07, "loss": 7.0332, "step": 121 }, { "epoch": 0.29647630619684084, "grad_norm": 38.23017883300781, "learning_rate": 3.63e-07, "loss": 6.5173, "step": 122 }, { "epoch": 0.29890643985419196, "grad_norm": 40.541358947753906, "learning_rate": 3.66e-07, "loss": 6.7486, "step": 123 }, { "epoch": 0.30133657351154314, "grad_norm": 40.93120193481445, "learning_rate": 3.6900000000000004e-07, "loss": 6.591, "step": 124 }, { "epoch": 0.3037667071688943, "grad_norm": 51.918880462646484, "learning_rate": 3.72e-07, "loss": 6.8413, "step": 125 }, { "epoch": 
0.30619684082624543, "grad_norm": 37.86509704589844, "learning_rate": 3.75e-07, "loss": 6.3359, "step": 126 }, { "epoch": 0.3086269744835966, "grad_norm": 36.46135330200195, "learning_rate": 3.78e-07, "loss": 6.1276, "step": 127 }, { "epoch": 0.3110571081409477, "grad_norm": 35.06586456298828, "learning_rate": 3.81e-07, "loss": 6.1362, "step": 128 }, { "epoch": 0.3134872417982989, "grad_norm": 47.91879653930664, "learning_rate": 3.8400000000000005e-07, "loss": 6.9091, "step": 129 }, { "epoch": 0.3159173754556501, "grad_norm": 32.735633850097656, "learning_rate": 3.87e-07, "loss": 5.7525, "step": 130 }, { "epoch": 0.3183475091130012, "grad_norm": 33.911231994628906, "learning_rate": 3.8999999999999997e-07, "loss": 5.8027, "step": 131 }, { "epoch": 0.32077764277035237, "grad_norm": 43.872371673583984, "learning_rate": 3.9300000000000004e-07, "loss": 6.4643, "step": 132 }, { "epoch": 0.32320777642770354, "grad_norm": 39.0894889831543, "learning_rate": 3.96e-07, "loss": 6.1076, "step": 133 }, { "epoch": 0.32563791008505466, "grad_norm": 38.104736328125, "learning_rate": 3.99e-07, "loss": 5.9401, "step": 134 }, { "epoch": 0.32806804374240583, "grad_norm": 40.68254470825195, "learning_rate": 4.02e-07, "loss": 6.2083, "step": 135 }, { "epoch": 0.330498177399757, "grad_norm": 41.93454360961914, "learning_rate": 4.05e-07, "loss": 6.035, "step": 136 }, { "epoch": 0.33292831105710813, "grad_norm": 36.31924057006836, "learning_rate": 4.08e-07, "loss": 5.7286, "step": 137 }, { "epoch": 0.3353584447144593, "grad_norm": 53.77237319946289, "learning_rate": 4.11e-07, "loss": 6.7197, "step": 138 }, { "epoch": 0.3377885783718105, "grad_norm": 44.848636627197266, "learning_rate": 4.1400000000000003e-07, "loss": 6.042, "step": 139 }, { "epoch": 0.3402187120291616, "grad_norm": null, "learning_rate": 4.1400000000000003e-07, "loss": 6.0479, "step": 140 }, { "epoch": 0.34264884568651277, "grad_norm": 44.843082427978516, "learning_rate": 4.17e-07, "loss": 6.1293, "step": 141 }, {
"epoch": 0.3450789793438639, "grad_norm": 38.639896392822266, "learning_rate": 4.2e-07, "loss": 5.736, "step": 142 }, { "epoch": 0.34750911300121506, "grad_norm": 48.88438415527344, "learning_rate": 4.23e-07, "loss": 6.2073, "step": 143 }, { "epoch": 0.34993924665856624, "grad_norm": 46.436553955078125, "learning_rate": 4.2600000000000003e-07, "loss": 5.9965, "step": 144 }, { "epoch": 0.35236938031591736, "grad_norm": 51.612266540527344, "learning_rate": 4.2900000000000004e-07, "loss": 6.2878, "step": 145 }, { "epoch": 0.35479951397326853, "grad_norm": 42.59443664550781, "learning_rate": 4.32e-07, "loss": 5.7129, "step": 146 }, { "epoch": 0.3572296476306197, "grad_norm": 63.876827239990234, "learning_rate": 4.35e-07, "loss": 6.8393, "step": 147 }, { "epoch": 0.3596597812879708, "grad_norm": 55.2116813659668, "learning_rate": 4.3800000000000003e-07, "loss": 6.2789, "step": 148 }, { "epoch": 0.362089914945322, "grad_norm": 55.10670852661133, "learning_rate": 4.41e-07, "loss": 6.2275, "step": 149 }, { "epoch": 0.3645200486026732, "grad_norm": 65.96805572509766, "learning_rate": 4.44e-07, "loss": 6.9135, "step": 150 }, { "epoch": 0.3669501822600243, "grad_norm": 73.25938415527344, "learning_rate": 4.47e-07, "loss": 7.1323, "step": 151 }, { "epoch": 0.36938031591737547, "grad_norm": 52.81471252441406, "learning_rate": 4.5e-07, "loss": 6.0905, "step": 152 }, { "epoch": 0.3718104495747266, "grad_norm": 43.14121627807617, "learning_rate": 4.5300000000000005e-07, "loss": 5.6465, "step": 153 }, { "epoch": 0.37424058323207776, "grad_norm": 49.83356475830078, "learning_rate": 4.56e-07, "loss": 5.9232, "step": 154 }, { "epoch": 0.37667071688942894, "grad_norm": 58.16476058959961, "learning_rate": 4.5899999999999997e-07, "loss": 6.1543, "step": 155 }, { "epoch": 0.37910085054678005, "grad_norm": 50.15607452392578, "learning_rate": 4.6200000000000003e-07, "loss": 5.7792, "step": 156 }, { "epoch": 0.38153098420413123, "grad_norm": 42.7601432800293, "learning_rate": 4.65e-07, 
"loss": 5.3846, "step": 157 }, { "epoch": 0.3839611178614824, "grad_norm": 40.56768798828125, "learning_rate": 4.68e-07, "loss": 5.2678, "step": 158 }, { "epoch": 0.3863912515188335, "grad_norm": 46.411746978759766, "learning_rate": 4.7099999999999997e-07, "loss": 5.4773, "step": 159 }, { "epoch": 0.3888213851761847, "grad_norm": 45.369544982910156, "learning_rate": 4.7400000000000004e-07, "loss": 5.3627, "step": 160 }, { "epoch": 0.39125151883353587, "grad_norm": 45.28466033935547, "learning_rate": 4.77e-07, "loss": 5.3339, "step": 161 }, { "epoch": 0.393681652490887, "grad_norm": 37.966548919677734, "learning_rate": 4.800000000000001e-07, "loss": 5.1061, "step": 162 }, { "epoch": 0.39611178614823817, "grad_norm": 34.310951232910156, "learning_rate": 4.83e-07, "loss": 4.8953, "step": 163 }, { "epoch": 0.3985419198055893, "grad_norm": 34.235008239746094, "learning_rate": 4.86e-07, "loss": 4.8806, "step": 164 }, { "epoch": 0.40097205346294046, "grad_norm": 33.294189453125, "learning_rate": 4.89e-07, "loss": 4.8382, "step": 165 }, { "epoch": 0.40340218712029163, "grad_norm": 31.02336311340332, "learning_rate": 4.92e-07, "loss": 4.7773, "step": 166 }, { "epoch": 0.40583232077764275, "grad_norm": 33.854068756103516, "learning_rate": 4.95e-07, "loss": 4.8642, "step": 167 }, { "epoch": 0.4082624544349939, "grad_norm": 27.540771484375, "learning_rate": 4.98e-07, "loss": 4.6453, "step": 168 }, { "epoch": 0.4106925880923451, "grad_norm": 30.874906539916992, "learning_rate": 5.01e-07, "loss": 4.764, "step": 169 }, { "epoch": 0.4131227217496962, "grad_norm": 24.567358016967773, "learning_rate": 5.04e-07, "loss": 4.5528, "step": 170 }, { "epoch": 0.4155528554070474, "grad_norm": 24.321304321289062, "learning_rate": 5.07e-07, "loss": 4.568, "step": 171 }, { "epoch": 0.41798298906439857, "grad_norm": 22.9201602935791, "learning_rate": 5.100000000000001e-07, "loss": 4.5349, "step": 172 }, { "epoch": 0.4204131227217497, "grad_norm": 22.42011070251465, "learning_rate": 5.13e-07, 
"loss": 4.4898, "step": 173 }, { "epoch": 0.42284325637910086, "grad_norm": 19.751916885375977, "learning_rate": 5.16e-07, "loss": 4.4573, "step": 174 }, { "epoch": 0.425273390036452, "grad_norm": 15.41208267211914, "learning_rate": 5.19e-07, "loss": 4.3015, "step": 175 }, { "epoch": 0.42770352369380316, "grad_norm": 20.904817581176758, "learning_rate": 5.219999999999999e-07, "loss": 4.5293, "step": 176 }, { "epoch": 0.43013365735115433, "grad_norm": 14.373517036437988, "learning_rate": 5.250000000000001e-07, "loss": 4.3211, "step": 177 }, { "epoch": 0.43256379100850545, "grad_norm": 15.047904968261719, "learning_rate": 5.280000000000001e-07, "loss": 4.3745, "step": 178 }, { "epoch": 0.4349939246658566, "grad_norm": 15.578954696655273, "learning_rate": 5.31e-07, "loss": 4.379, "step": 179 }, { "epoch": 0.4374240583232078, "grad_norm": 12.744380950927734, "learning_rate": 5.34e-07, "loss": 4.3319, "step": 180 }, { "epoch": 0.4398541919805589, "grad_norm": 10.698866844177246, "learning_rate": 5.37e-07, "loss": 4.2711, "step": 181 }, { "epoch": 0.4422843256379101, "grad_norm": 10.808382987976074, "learning_rate": 5.4e-07, "loss": 4.3137, "step": 182 }, { "epoch": 0.44471445929526127, "grad_norm": 7.862553596496582, "learning_rate": 5.43e-07, "loss": 4.1986, "step": 183 }, { "epoch": 0.4471445929526124, "grad_norm": 8.61621379852295, "learning_rate": 5.46e-07, "loss": 4.2187, "step": 184 }, { "epoch": 0.44957472660996356, "grad_norm": 7.474984645843506, "learning_rate": 5.490000000000001e-07, "loss": 4.1192, "step": 185 }, { "epoch": 0.4520048602673147, "grad_norm": 7.322343826293945, "learning_rate": 5.52e-07, "loss": 4.221, "step": 186 }, { "epoch": 0.45443499392466585, "grad_norm": 7.767692565917969, "learning_rate": 5.55e-07, "loss": 4.2457, "step": 187 }, { "epoch": 0.456865127582017, "grad_norm": 7.211567401885986, "learning_rate": 5.58e-07, "loss": 4.1967, "step": 188 }, { "epoch": 0.45929526123936815, "grad_norm": 7.388667106628418, "learning_rate": 5.61e-07, 
"loss": 4.1548, "step": 189 }, { "epoch": 0.4617253948967193, "grad_norm": 7.411800861358643, "learning_rate": 5.64e-07, "loss": 4.2998, "step": 190 }, { "epoch": 0.4641555285540705, "grad_norm": 7.391948223114014, "learning_rate": 5.67e-07, "loss": 4.1523, "step": 191 }, { "epoch": 0.4665856622114216, "grad_norm": 8.18923568725586, "learning_rate": 5.7e-07, "loss": 4.095, "step": 192 }, { "epoch": 0.4690157958687728, "grad_norm": 8.078570365905762, "learning_rate": 5.73e-07, "loss": 4.136, "step": 193 }, { "epoch": 0.47144592952612396, "grad_norm": 8.016390800476074, "learning_rate": 5.76e-07, "loss": 4.1554, "step": 194 }, { "epoch": 0.4738760631834751, "grad_norm": 7.508421897888184, "learning_rate": 5.790000000000001e-07, "loss": 4.2093, "step": 195 }, { "epoch": 0.47630619684082626, "grad_norm": 6.901839256286621, "learning_rate": 5.82e-07, "loss": 4.2358, "step": 196 }, { "epoch": 0.4787363304981774, "grad_norm": 8.33242416381836, "learning_rate": 5.85e-07, "loss": 4.2572, "step": 197 }, { "epoch": 0.48116646415552855, "grad_norm": 7.100293159484863, "learning_rate": 5.88e-07, "loss": 4.2163, "step": 198 }, { "epoch": 0.4835965978128797, "grad_norm": 7.152838230133057, "learning_rate": 5.909999999999999e-07, "loss": 4.2871, "step": 199 }, { "epoch": 0.48602673147023084, "grad_norm": 10.067346572875977, "learning_rate": 5.94e-07, "loss": 4.5498, "step": 200 }, { "epoch": 0.488456865127582, "grad_norm": 12.213619232177734, "learning_rate": 5.970000000000001e-07, "loss": 4.215, "step": 201 }, { "epoch": 0.4908869987849332, "grad_norm": 6.637013912200928, "learning_rate": 6.000000000000001e-07, "loss": 4.0533, "step": 202 }, { "epoch": 0.4933171324422843, "grad_norm": 6.686540126800537, "learning_rate": 6.03e-07, "loss": 4.0771, "step": 203 }, { "epoch": 0.4957472660996355, "grad_norm": 6.878912448883057, "learning_rate": 6.06e-07, "loss": 4.1006, "step": 204 }, { "epoch": 0.49817739975698666, "grad_norm": 6.72047233581543, "learning_rate": 6.09e-07, "loss": 
4.0735, "step": 205 }, { "epoch": 0.5006075334143378, "grad_norm": 6.586817264556885, "learning_rate": 6.12e-07, "loss": 4.0791, "step": 206 }, { "epoch": 0.503037667071689, "grad_norm": 7.593733310699463, "learning_rate": 6.15e-07, "loss": 3.9737, "step": 207 }, { "epoch": 0.5054678007290401, "grad_norm": 8.392916679382324, "learning_rate": 6.180000000000001e-07, "loss": 3.9106, "step": 208 }, { "epoch": 0.5078979343863913, "grad_norm": 7.031166076660156, "learning_rate": 6.21e-07, "loss": 3.9827, "step": 209 }, { "epoch": 0.5103280680437424, "grad_norm": 8.26637077331543, "learning_rate": 6.24e-07, "loss": 3.8974, "step": 210 }, { "epoch": 0.5127582017010935, "grad_norm": 8.92978572845459, "learning_rate": 6.27e-07, "loss": 3.8765, "step": 211 }, { "epoch": 0.5151883353584447, "grad_norm": 8.423067092895508, "learning_rate": 6.3e-07, "loss": 3.9005, "step": 212 }, { "epoch": 0.5176184690157959, "grad_norm": 7.520168781280518, "learning_rate": 6.33e-07, "loss": 3.9161, "step": 213 }, { "epoch": 0.520048602673147, "grad_norm": 7.175937652587891, "learning_rate": 6.36e-07, "loss": 3.8461, "step": 214 }, { "epoch": 0.5224787363304981, "grad_norm": 8.293469429016113, "learning_rate": 6.39e-07, "loss": 3.8275, "step": 215 }, { "epoch": 0.5249088699878494, "grad_norm": 5.786047458648682, "learning_rate": 6.42e-07, "loss": 3.8186, "step": 216 }, { "epoch": 0.5273390036452005, "grad_norm": 6.012807369232178, "learning_rate": 6.45e-07, "loss": 3.8092, "step": 217 }, { "epoch": 0.5297691373025516, "grad_norm": 6.0639328956604, "learning_rate": 6.480000000000001e-07, "loss": 3.792, "step": 218 }, { "epoch": 0.5321992709599028, "grad_norm": 6.5306501388549805, "learning_rate": 6.51e-07, "loss": 3.7503, "step": 219 }, { "epoch": 0.534629404617254, "grad_norm": 5.338498115539551, "learning_rate": 6.54e-07, "loss": 3.797, "step": 220 }, { "epoch": 0.5370595382746051, "grad_norm": 5.34122371673584, "learning_rate": 6.57e-07, "loss": 3.7742, "step": 221 }, { "epoch": 
0.5394896719319563, "grad_norm": 6.205730438232422, "learning_rate": 6.599999999999999e-07, "loss": 3.8542, "step": 222 }, { "epoch": 0.5419198055893074, "grad_norm": 5.933345794677734, "learning_rate": 6.63e-07, "loss": 3.7582, "step": 223 }, { "epoch": 0.5443499392466585, "grad_norm": 5.296053886413574, "learning_rate": 6.660000000000001e-07, "loss": 3.7603, "step": 224 }, { "epoch": 0.5467800729040098, "grad_norm": 5.648688793182373, "learning_rate": 6.690000000000001e-07, "loss": 3.744, "step": 225 }, { "epoch": 0.5492102065613609, "grad_norm": 5.209010601043701, "learning_rate": 6.72e-07, "loss": 3.7363, "step": 226 }, { "epoch": 0.551640340218712, "grad_norm": 4.754680156707764, "learning_rate": 6.75e-07, "loss": 3.7339, "step": 227 }, { "epoch": 0.5540704738760632, "grad_norm": 7.906564712524414, "learning_rate": 6.78e-07, "loss": 3.8363, "step": 228 }, { "epoch": 0.5565006075334143, "grad_norm": 4.795210838317871, "learning_rate": 6.81e-07, "loss": 3.7396, "step": 229 }, { "epoch": 0.5589307411907655, "grad_norm": 4.764222621917725, "learning_rate": 6.84e-07, "loss": 3.5916, "step": 230 }, { "epoch": 0.5613608748481167, "grad_norm": 5.063666343688965, "learning_rate": 6.87e-07, "loss": 3.7226, "step": 231 }, { "epoch": 0.5637910085054678, "grad_norm": 4.502518653869629, "learning_rate": 6.9e-07, "loss": 3.6281, "step": 232 }, { "epoch": 0.5662211421628189, "grad_norm": 4.430773735046387, "learning_rate": 6.93e-07, "loss": 3.6816, "step": 233 }, { "epoch": 0.56865127582017, "grad_norm": 5.058294773101807, "learning_rate": 6.96e-07, "loss": 3.7273, "step": 234 }, { "epoch": 0.5710814094775213, "grad_norm": 4.25796365737915, "learning_rate": 6.990000000000001e-07, "loss": 3.6367, "step": 235 }, { "epoch": 0.5735115431348724, "grad_norm": 5.936837196350098, "learning_rate": 7.02e-07, "loss": 3.5414, "step": 236 }, { "epoch": 0.5759416767922235, "grad_norm": 6.639903545379639, "learning_rate": 7.05e-07, "loss": 3.633, "step": 237 }, { "epoch": 
0.5783718104495748, "grad_norm": 4.416826248168945, "learning_rate": 7.08e-07, "loss": 3.5731, "step": 238 }, { "epoch": 0.5808019441069259, "grad_norm": 5.199351787567139, "learning_rate": 7.11e-07, "loss": 3.5728, "step": 239 }, { "epoch": 0.583232077764277, "grad_norm": 4.145137786865234, "learning_rate": 7.140000000000001e-07, "loss": 3.6243, "step": 240 }, { "epoch": 0.5856622114216282, "grad_norm": 4.0052490234375, "learning_rate": 7.170000000000001e-07, "loss": 3.5703, "step": 241 }, { "epoch": 0.5880923450789793, "grad_norm": 4.154129981994629, "learning_rate": 7.2e-07, "loss": 3.6309, "step": 242 }, { "epoch": 0.5905224787363305, "grad_norm": 3.827484369277954, "learning_rate": 7.23e-07, "loss": 3.5365, "step": 243 }, { "epoch": 0.5929526123936817, "grad_norm": 5.347609519958496, "learning_rate": 7.26e-07, "loss": 3.6797, "step": 244 }, { "epoch": 0.5953827460510328, "grad_norm": 4.691897869110107, "learning_rate": 7.29e-07, "loss": 3.5933, "step": 245 }, { "epoch": 0.5978128797083839, "grad_norm": 3.546259641647339, "learning_rate": 7.32e-07, "loss": 3.5727, "step": 246 }, { "epoch": 0.6002430133657352, "grad_norm": 8.971769332885742, "learning_rate": 7.350000000000001e-07, "loss": 3.5627, "step": 247 }, { "epoch": 0.6026731470230863, "grad_norm": 6.243602752685547, "learning_rate": 7.380000000000001e-07, "loss": 3.6965, "step": 248 }, { "epoch": 0.6051032806804374, "grad_norm": 10.85800838470459, "learning_rate": 7.41e-07, "loss": 3.707, "step": 249 }, { "epoch": 0.6075334143377886, "grad_norm": 4.077145099639893, "learning_rate": 7.44e-07, "loss": 3.6492, "step": 250 }, { "epoch": 0.6099635479951397, "grad_norm": 13.693325996398926, "learning_rate": 7.47e-07, "loss": 3.6745, "step": 251 }, { "epoch": 0.6123936816524909, "grad_norm": 4.915408611297607, "learning_rate": 7.5e-07, "loss": 3.4928, "step": 252 }, { "epoch": 0.6148238153098421, "grad_norm": 6.315511226654053, "learning_rate": 7.53e-07, "loss": 3.5176, "step": 253 }, { "epoch": 
0.6172539489671932, "grad_norm": 4.0585618019104, "learning_rate": 7.56e-07, "loss": 3.5942, "step": 254 }, { "epoch": 0.6196840826245443, "grad_norm": 5.094327926635742, "learning_rate": 7.59e-07, "loss": 3.5568, "step": 255 }, { "epoch": 0.6221142162818954, "grad_norm": 7.397136211395264, "learning_rate": 7.62e-07, "loss": 3.4361, "step": 256 }, { "epoch": 0.6245443499392467, "grad_norm": 8.550902366638184, "learning_rate": 7.65e-07, "loss": 3.4432, "step": 257 }, { "epoch": 0.6269744835965978, "grad_norm": 6.88322639465332, "learning_rate": 7.680000000000001e-07, "loss": 3.4328, "step": 258 }, { "epoch": 0.6294046172539489, "grad_norm": 7.070106029510498, "learning_rate": 7.71e-07, "loss": 3.4087, "step": 259 }, { "epoch": 0.6318347509113001, "grad_norm": 5.725564002990723, "learning_rate": 7.74e-07, "loss": 3.396, "step": 260 }, { "epoch": 0.6342648845686513, "grad_norm": 3.899899482727051, "learning_rate": 7.77e-07, "loss": 3.3481, "step": 261 }, { "epoch": 0.6366950182260024, "grad_norm": 3.075965642929077, "learning_rate": 7.799999999999999e-07, "loss": 3.3465, "step": 262 }, { "epoch": 0.6391251518833536, "grad_norm": 2.8951144218444824, "learning_rate": 7.830000000000001e-07, "loss": 3.3391, "step": 263 }, { "epoch": 0.6415552855407047, "grad_norm": 3.8436195850372314, "learning_rate": 7.860000000000001e-07, "loss": 3.3693, "step": 264 }, { "epoch": 0.6439854191980559, "grad_norm": 3.5079572200775146, "learning_rate": 7.89e-07, "loss": 3.316, "step": 265 }, { "epoch": 0.6464155528554071, "grad_norm": 4.349302768707275, "learning_rate": 7.92e-07, "loss": 3.319, "step": 266 }, { "epoch": 0.6488456865127582, "grad_norm": 5.482784748077393, "learning_rate": 7.95e-07, "loss": 3.33, "step": 267 }, { "epoch": 0.6512758201701093, "grad_norm": 2.9848546981811523, "learning_rate": 7.98e-07, "loss": 3.3164, "step": 268 }, { "epoch": 0.6537059538274606, "grad_norm": 3.968324899673462, "learning_rate": 8.01e-07, "loss": 3.295, "step": 269 }, { "epoch": 
0.6561360874848117, "grad_norm": 3.4573776721954346, "learning_rate": 8.04e-07, "loss": 3.3364, "step": 270 }, { "epoch": 0.6585662211421628, "grad_norm": 2.8220999240875244, "learning_rate": 8.070000000000001e-07, "loss": 3.314, "step": 271 }, { "epoch": 0.660996354799514, "grad_norm": 5.295152187347412, "learning_rate": 8.1e-07, "loss": 3.2823, "step": 272 }, { "epoch": 0.6634264884568651, "grad_norm": 3.59615421295166, "learning_rate": 8.13e-07, "loss": 3.2917, "step": 273 }, { "epoch": 0.6658566221142163, "grad_norm": 4.092174053192139, "learning_rate": 8.16e-07, "loss": 3.2219, "step": 274 }, { "epoch": 0.6682867557715675, "grad_norm": 4.155817985534668, "learning_rate": 8.19e-07, "loss": 3.2875, "step": 275 }, { "epoch": 0.6707168894289186, "grad_norm": 5.311927795410156, "learning_rate": 8.22e-07, "loss": 3.2199, "step": 276 }, { "epoch": 0.6731470230862697, "grad_norm": 2.360161781311035, "learning_rate": 8.25e-07, "loss": 3.2166, "step": 277 }, { "epoch": 0.675577156743621, "grad_norm": 3.189689874649048, "learning_rate": 8.280000000000001e-07, "loss": 3.2805, "step": 278 }, { "epoch": 0.6780072904009721, "grad_norm": 3.0500330924987793, "learning_rate": 8.31e-07, "loss": 3.1776, "step": 279 }, { "epoch": 0.6804374240583232, "grad_norm": 2.815340757369995, "learning_rate": 8.34e-07, "loss": 3.1613, "step": 280 }, { "epoch": 0.6828675577156743, "grad_norm": 6.214359283447266, "learning_rate": 8.370000000000001e-07, "loss": 3.2798, "step": 281 }, { "epoch": 0.6852976913730255, "grad_norm": 4.067070007324219, "learning_rate": 8.4e-07, "loss": 3.1585, "step": 282 }, { "epoch": 0.6877278250303767, "grad_norm": 3.082977056503296, "learning_rate": 8.43e-07, "loss": 3.1916, "step": 283 }, { "epoch": 0.6901579586877278, "grad_norm": 4.276469707489014, "learning_rate": 8.46e-07, "loss": 3.2071, "step": 284 }, { "epoch": 0.692588092345079, "grad_norm": 3.223588466644287, "learning_rate": 8.489999999999999e-07, "loss": 3.1852, "step": 285 }, { "epoch": 
0.6950182260024301, "grad_norm": 3.5014028549194336, "learning_rate": 8.520000000000001e-07, "loss": 3.2316, "step": 286 }, { "epoch": 0.6974483596597812, "grad_norm": 5.73889684677124, "learning_rate": 8.550000000000001e-07, "loss": 3.1505, "step": 287 }, { "epoch": 0.6998784933171325, "grad_norm": 3.960982322692871, "learning_rate": 8.580000000000001e-07, "loss": 3.1666, "step": 288 }, { "epoch": 0.7023086269744836, "grad_norm": 2.8662755489349365, "learning_rate": 8.61e-07, "loss": 3.1755, "step": 289 }, { "epoch": 0.7047387606318347, "grad_norm": 5.222265243530273, "learning_rate": 8.64e-07, "loss": 3.1273, "step": 290 }, { "epoch": 0.707168894289186, "grad_norm": 5.047396183013916, "learning_rate": 8.67e-07, "loss": 3.1987, "step": 291 }, { "epoch": 0.7095990279465371, "grad_norm": 4.044116497039795, "learning_rate": 8.7e-07, "loss": 3.1481, "step": 292 }, { "epoch": 0.7120291616038882, "grad_norm": 17.266653060913086, "learning_rate": 8.73e-07, "loss": 3.2015, "step": 293 }, { "epoch": 0.7144592952612394, "grad_norm": 3.728804349899292, "learning_rate": 8.760000000000001e-07, "loss": 3.1443, "step": 294 }, { "epoch": 0.7168894289185905, "grad_norm": 4.664876461029053, "learning_rate": 8.79e-07, "loss": 3.1422, "step": 295 }, { "epoch": 0.7193195625759417, "grad_norm": 4.272023677825928, "learning_rate": 8.82e-07, "loss": 3.1815, "step": 296 }, { "epoch": 0.7217496962332929, "grad_norm": 12.563604354858398, "learning_rate": 8.85e-07, "loss": 3.2102, "step": 297 }, { "epoch": 0.724179829890644, "grad_norm": 8.746769905090332, "learning_rate": 8.88e-07, "loss": 3.2684, "step": 298 }, { "epoch": 0.7266099635479951, "grad_norm": 5.020985126495361, "learning_rate": 8.91e-07, "loss": 3.1788, "step": 299 }, { "epoch": 0.7290400972053463, "grad_norm": 5.6469292640686035, "learning_rate": 8.94e-07, "loss": 3.3027, "step": 300 }, { "epoch": 0.7314702308626975, "grad_norm": 19.54851722717285, "learning_rate": 8.97e-07, "loss": 3.3091, "step": 301 }, { "epoch": 
0.7339003645200486, "grad_norm": 6.044022083282471, "learning_rate": 9e-07, "loss": 3.126, "step": 302 }, { "epoch": 0.7363304981773997, "grad_norm": 5.269871234893799, "learning_rate": 9.03e-07, "loss": 3.1444, "step": 303 }, { "epoch": 0.7387606318347509, "grad_norm": 3.2705559730529785, "learning_rate": 9.060000000000001e-07, "loss": 3.0913, "step": 304 }, { "epoch": 0.741190765492102, "grad_norm": 7.828869342803955, "learning_rate": 9.09e-07, "loss": 3.1032, "step": 305 }, { "epoch": 0.7436208991494532, "grad_norm": 7.164411544799805, "learning_rate": 9.12e-07, "loss": 3.0937, "step": 306 }, { "epoch": 0.7460510328068044, "grad_norm": 7.705827713012695, "learning_rate": 9.15e-07, "loss": 3.0811, "step": 307 }, { "epoch": 0.7484811664641555, "grad_norm": 5.523608207702637, "learning_rate": 9.179999999999999e-07, "loss": 3.0427, "step": 308 }, { "epoch": 0.7509113001215066, "grad_norm": 3.7685329914093018, "learning_rate": 9.210000000000001e-07, "loss": 3.0345, "step": 309 }, { "epoch": 0.7533414337788579, "grad_norm": 3.588013172149658, "learning_rate": 9.240000000000001e-07, "loss": 3.0072, "step": 310 }, { "epoch": 0.755771567436209, "grad_norm": 2.0635666847229004, "learning_rate": 9.270000000000001e-07, "loss": 2.9984, "step": 311 }, { "epoch": 0.7582017010935601, "grad_norm": 4.697166919708252, "learning_rate": 9.3e-07, "loss": 3.0527, "step": 312 }, { "epoch": 0.7606318347509113, "grad_norm": 5.149069309234619, "learning_rate": 9.33e-07, "loss": 3.0191, "step": 313 }, { "epoch": 0.7630619684082625, "grad_norm": 7.530883312225342, "learning_rate": 9.36e-07, "loss": 3.068, "step": 314 }, { "epoch": 0.7654921020656136, "grad_norm": 5.692478656768799, "learning_rate": 9.39e-07, "loss": 3.0515, "step": 315 }, { "epoch": 0.7679222357229648, "grad_norm": 4.097743988037109, "learning_rate": 9.419999999999999e-07, "loss": 3.0125, "step": 316 }, { "epoch": 0.7703523693803159, "grad_norm": 5.472975254058838, "learning_rate": 9.450000000000001e-07, "loss": 3.0586, 
"step": 317 }, { "epoch": 0.772782503037667, "grad_norm": 2.9595370292663574, "learning_rate": 9.480000000000001e-07, "loss": 2.9776, "step": 318 }, { "epoch": 0.7752126366950183, "grad_norm": 3.9548182487487793, "learning_rate": 9.51e-07, "loss": 3.0427, "step": 319 }, { "epoch": 0.7776427703523694, "grad_norm": 3.499312162399292, "learning_rate": 9.54e-07, "loss": 3.0691, "step": 320 }, { "epoch": 0.7800729040097205, "grad_norm": 10.322419166564941, "learning_rate": 9.57e-07, "loss": 3.0298, "step": 321 }, { "epoch": 0.7825030376670717, "grad_norm": 5.1080002784729, "learning_rate": 9.600000000000001e-07, "loss": 3.0225, "step": 322 }, { "epoch": 0.7849331713244229, "grad_norm": 4.119999408721924, "learning_rate": 9.63e-07, "loss": 3.0133, "step": 323 }, { "epoch": 0.787363304981774, "grad_norm": 3.0090909004211426, "learning_rate": 9.66e-07, "loss": 2.9831, "step": 324 }, { "epoch": 0.7897934386391251, "grad_norm": 3.8169403076171875, "learning_rate": 9.690000000000002e-07, "loss": 3.0266, "step": 325 }, { "epoch": 0.7922235722964763, "grad_norm": 9.150503158569336, "learning_rate": 9.72e-07, "loss": 2.9771, "step": 326 }, { "epoch": 0.7946537059538274, "grad_norm": 3.1400256156921387, "learning_rate": 9.75e-07, "loss": 2.999, "step": 327 }, { "epoch": 0.7970838396111786, "grad_norm": 4.511935710906982, "learning_rate": 9.78e-07, "loss": 2.9787, "step": 328 }, { "epoch": 0.7995139732685298, "grad_norm": 3.6198339462280273, "learning_rate": 9.81e-07, "loss": 2.9542, "step": 329 }, { "epoch": 0.8019441069258809, "grad_norm": 8.847219467163086, "learning_rate": 9.84e-07, "loss": 2.9637, "step": 330 }, { "epoch": 0.804374240583232, "grad_norm": 5.78687047958374, "learning_rate": 9.87e-07, "loss": 3.0029, "step": 331 }, { "epoch": 0.8068043742405833, "grad_norm": 7.134451866149902, "learning_rate": 9.9e-07, "loss": 2.9963, "step": 332 }, { "epoch": 0.8092345078979344, "grad_norm": 3.1396076679229736, "learning_rate": 9.929999999999999e-07, "loss": 3.0218, "step": 333 
}, { "epoch": 0.8116646415552855, "grad_norm": 4.346477508544922, "learning_rate": 9.96e-07, "loss": 3.0605, "step": 334 }, { "epoch": 0.8140947752126367, "grad_norm": 2.707158327102661, "learning_rate": 9.99e-07, "loss": 2.9914, "step": 335 }, { "epoch": 0.8165249088699879, "grad_norm": 5.81843900680542, "learning_rate": 1.002e-06, "loss": 2.9524, "step": 336 }, { "epoch": 0.818955042527339, "grad_norm": 7.961120128631592, "learning_rate": 1.0050000000000001e-06, "loss": 3.0191, "step": 337 }, { "epoch": 0.8213851761846902, "grad_norm": 5.183945178985596, "learning_rate": 1.008e-06, "loss": 2.9669, "step": 338 }, { "epoch": 0.8238153098420413, "grad_norm": 5.009942054748535, "learning_rate": 1.0110000000000001e-06, "loss": 2.9527, "step": 339 }, { "epoch": 0.8262454434993924, "grad_norm": 3.2630765438079834, "learning_rate": 1.014e-06, "loss": 3.0073, "step": 340 }, { "epoch": 0.8286755771567437, "grad_norm": 3.4544050693511963, "learning_rate": 1.017e-06, "loss": 2.9709, "step": 341 }, { "epoch": 0.8311057108140948, "grad_norm": 2.496800422668457, "learning_rate": 1.0200000000000002e-06, "loss": 3.0064, "step": 342 }, { "epoch": 0.8335358444714459, "grad_norm": 7.276801109313965, "learning_rate": 1.023e-06, "loss": 3.0524, "step": 343 }, { "epoch": 0.8359659781287971, "grad_norm": 9.192217826843262, "learning_rate": 1.026e-06, "loss": 2.9903, "step": 344 }, { "epoch": 0.8383961117861483, "grad_norm": 4.889023780822754, "learning_rate": 1.029e-06, "loss": 3.0234, "step": 345 }, { "epoch": 0.8408262454434994, "grad_norm": 4.13486909866333, "learning_rate": 1.032e-06, "loss": 3.0074, "step": 346 }, { "epoch": 0.8432563791008505, "grad_norm": 6.263684272766113, "learning_rate": 1.035e-06, "loss": 3.0242, "step": 347 }, { "epoch": 0.8456865127582017, "grad_norm": 5.342067241668701, "learning_rate": 1.038e-06, "loss": 3.055, "step": 348 }, { "epoch": 0.8481166464155528, "grad_norm": 7.007871150970459, "learning_rate": 1.041e-06, "loss": 2.9289, "step": 349 }, { 
"epoch": 0.850546780072904, "grad_norm": 11.489191055297852, "learning_rate": 1.0439999999999999e-06, "loss": 3.0851, "step": 350 }, { "epoch": 0.8529769137302552, "grad_norm": 13.787400245666504, "learning_rate": 1.047e-06, "loss": 3.137, "step": 351 }, { "epoch": 0.8554070473876063, "grad_norm": 5.9705681800842285, "learning_rate": 1.0500000000000001e-06, "loss": 3.0118, "step": 352 }, { "epoch": 0.8578371810449574, "grad_norm": 4.081809997558594, "learning_rate": 1.053e-06, "loss": 2.9872, "step": 353 }, { "epoch": 0.8602673147023087, "grad_norm": 2.4096293449401855, "learning_rate": 1.0560000000000001e-06, "loss": 3.0044, "step": 354 }, { "epoch": 0.8626974483596598, "grad_norm": 5.293358325958252, "learning_rate": 1.059e-06, "loss": 2.9627, "step": 355 }, { "epoch": 0.8651275820170109, "grad_norm": 9.046375274658203, "learning_rate": 1.062e-06, "loss": 2.9825, "step": 356 }, { "epoch": 0.8675577156743621, "grad_norm": 11.698064804077148, "learning_rate": 1.065e-06, "loss": 2.9323, "step": 357 }, { "epoch": 0.8699878493317132, "grad_norm": 9.591734886169434, "learning_rate": 1.068e-06, "loss": 2.9557, "step": 358 }, { "epoch": 0.8724179829890644, "grad_norm": 11.94154167175293, "learning_rate": 1.0710000000000002e-06, "loss": 2.9737, "step": 359 }, { "epoch": 0.8748481166464156, "grad_norm": 5.025158882141113, "learning_rate": 1.074e-06, "loss": 2.9263, "step": 360 }, { "epoch": 0.8772782503037667, "grad_norm": 3.219633102416992, "learning_rate": 1.077e-06, "loss": 2.904, "step": 361 }, { "epoch": 0.8797083839611178, "grad_norm": 3.885911464691162, "learning_rate": 1.08e-06, "loss": 2.883, "step": 362 }, { "epoch": 0.8821385176184691, "grad_norm": 5.015427112579346, "learning_rate": 1.083e-06, "loss": 2.9063, "step": 363 }, { "epoch": 0.8845686512758202, "grad_norm": 6.546631813049316, "learning_rate": 1.086e-06, "loss": 2.9141, "step": 364 }, { "epoch": 0.8869987849331713, "grad_norm": 7.829056262969971, "learning_rate": 1.089e-06, "loss": 2.9432, "step": 365 
}, { "epoch": 0.8894289185905225, "grad_norm": 4.387656211853027, "learning_rate": 1.092e-06, "loss": 2.9407, "step": 366 }, { "epoch": 0.8918590522478737, "grad_norm": 2.658281087875366, "learning_rate": 1.0949999999999999e-06, "loss": 2.9137, "step": 367 }, { "epoch": 0.8942891859052248, "grad_norm": 3.974809169769287, "learning_rate": 1.0980000000000001e-06, "loss": 2.9076, "step": 368 }, { "epoch": 0.8967193195625759, "grad_norm": 3.0229134559631348, "learning_rate": 1.1010000000000001e-06, "loss": 2.9129, "step": 369 }, { "epoch": 0.8991494532199271, "grad_norm": 6.396419525146484, "learning_rate": 1.104e-06, "loss": 2.9456, "step": 370 }, { "epoch": 0.9015795868772782, "grad_norm": 3.050353527069092, "learning_rate": 1.1070000000000002e-06, "loss": 2.9372, "step": 371 }, { "epoch": 0.9040097205346294, "grad_norm": 3.933783531188965, "learning_rate": 1.11e-06, "loss": 2.9156, "step": 372 }, { "epoch": 0.9064398541919806, "grad_norm": 5.817261219024658, "learning_rate": 1.113e-06, "loss": 2.9383, "step": 373 }, { "epoch": 0.9088699878493317, "grad_norm": 2.7727599143981934, "learning_rate": 1.116e-06, "loss": 2.9303, "step": 374 }, { "epoch": 0.9113001215066828, "grad_norm": 2.4177491664886475, "learning_rate": 1.119e-06, "loss": 2.8858, "step": 375 }, { "epoch": 0.913730255164034, "grad_norm": 5.094928741455078, "learning_rate": 1.122e-06, "loss": 2.9392, "step": 376 }, { "epoch": 0.9161603888213852, "grad_norm": 4.837106227874756, "learning_rate": 1.125e-06, "loss": 2.9295, "step": 377 }, { "epoch": 0.9185905224787363, "grad_norm": 6.71516752243042, "learning_rate": 1.128e-06, "loss": 2.8955, "step": 378 }, { "epoch": 0.9210206561360875, "grad_norm": 4.150357246398926, "learning_rate": 1.131e-06, "loss": 2.9274, "step": 379 }, { "epoch": 0.9234507897934386, "grad_norm": 4.886255741119385, "learning_rate": 1.134e-06, "loss": 2.8897, "step": 380 }, { "epoch": 0.9258809234507898, "grad_norm": 4.781455993652344, "learning_rate": 1.137e-06, "loss": 2.954, "step": 
381 }, { "epoch": 0.928311057108141, "grad_norm": 3.272282123565674, "learning_rate": 1.14e-06, "loss": 2.9081, "step": 382 }, { "epoch": 0.9307411907654921, "grad_norm": 3.1398160457611084, "learning_rate": 1.1430000000000001e-06, "loss": 2.8978, "step": 383 }, { "epoch": 0.9331713244228432, "grad_norm": 4.982151985168457, "learning_rate": 1.146e-06, "loss": 2.9364, "step": 384 }, { "epoch": 0.9356014580801945, "grad_norm": 4.438499927520752, "learning_rate": 1.1490000000000001e-06, "loss": 2.9365, "step": 385 }, { "epoch": 0.9380315917375456, "grad_norm": 4.119975566864014, "learning_rate": 1.152e-06, "loss": 2.9315, "step": 386 }, { "epoch": 0.9404617253948967, "grad_norm": 2.6801373958587646, "learning_rate": 1.155e-06, "loss": 2.9242, "step": 387 }, { "epoch": 0.9428918590522479, "grad_norm": 3.922069787979126, "learning_rate": 1.1580000000000002e-06, "loss": 2.9124, "step": 388 }, { "epoch": 0.945321992709599, "grad_norm": 8.835030555725098, "learning_rate": 1.161e-06, "loss": 2.8726, "step": 389 }, { "epoch": 0.9477521263669502, "grad_norm": 8.11300277709961, "learning_rate": 1.164e-06, "loss": 2.9732, "step": 390 }, { "epoch": 0.9501822600243013, "grad_norm": 4.204689979553223, "learning_rate": 1.167e-06, "loss": 2.9432, "step": 391 }, { "epoch": 0.9526123936816525, "grad_norm": 3.7626848220825195, "learning_rate": 1.17e-06, "loss": 2.9293, "step": 392 }, { "epoch": 0.9550425273390036, "grad_norm": 5.216957092285156, "learning_rate": 1.173e-06, "loss": 2.8659, "step": 393 }, { "epoch": 0.9574726609963548, "grad_norm": 5.89713191986084, "learning_rate": 1.176e-06, "loss": 2.9601, "step": 394 }, { "epoch": 0.959902794653706, "grad_norm": 7.95246696472168, "learning_rate": 1.179e-06, "loss": 2.9979, "step": 395 }, { "epoch": 0.9623329283110571, "grad_norm": 5.265660285949707, "learning_rate": 1.1819999999999999e-06, "loss": 2.9229, "step": 396 }, { "epoch": 0.9647630619684082, "grad_norm": 9.593534469604492, "learning_rate": 1.185e-06, "loss": 2.8789, "step": 
397 }, { "epoch": 0.9671931956257594, "grad_norm": 4.806003093719482, "learning_rate": 1.188e-06, "loss": 2.9869, "step": 398 }, { "epoch": 0.9696233292831106, "grad_norm": 4.263002872467041, "learning_rate": 1.191e-06, "loss": 2.924, "step": 399 }, { "epoch": 0.9720534629404617, "grad_norm": 7.018017292022705, "learning_rate": 1.1940000000000001e-06, "loss": 3.0194, "step": 400 }, { "epoch": 0.9744835965978129, "grad_norm": 8.685270309448242, "learning_rate": 1.197e-06, "loss": 2.9978, "step": 401 }, { "epoch": 0.976913730255164, "grad_norm": 3.853455066680908, "learning_rate": 1.2000000000000002e-06, "loss": 2.89, "step": 402 }, { "epoch": 0.9793438639125152, "grad_norm": 2.5147430896759033, "learning_rate": 1.203e-06, "loss": 2.8895, "step": 403 }, { "epoch": 0.9817739975698664, "grad_norm": 5.4434638023376465, "learning_rate": 1.206e-06, "loss": 2.8837, "step": 404 }, { "epoch": 0.9842041312272175, "grad_norm": 6.393418788909912, "learning_rate": 1.2090000000000002e-06, "loss": 2.8664, "step": 405 }, { "epoch": 0.9866342648845686, "grad_norm": 5.64236307144165, "learning_rate": 1.212e-06, "loss": 2.8883, "step": 406 }, { "epoch": 0.9890643985419199, "grad_norm": 5.99589204788208, "learning_rate": 1.215e-06, "loss": 2.8871, "step": 407 }, { "epoch": 0.991494532199271, "grad_norm": 2.9953370094299316, "learning_rate": 1.218e-06, "loss": 2.8317, "step": 408 }, { "epoch": 0.9939246658566221, "grad_norm": 4.168422698974609, "learning_rate": 1.221e-06, "loss": 2.9359, "step": 409 }, { "epoch": 0.9963547995139733, "grad_norm": 5.053891181945801, "learning_rate": 1.224e-06, "loss": 2.9158, "step": 410 }, { "epoch": 0.9987849331713244, "grad_norm": 5.467897415161133, "learning_rate": 1.227e-06, "loss": 2.9519, "step": 411 }, { "epoch": 1.0, "grad_norm": 8.152799606323242, "learning_rate": 1.23e-06, "loss": 1.472, "step": 412 }, { "epoch": 1.0024301336573511, "grad_norm": 15.006340026855469, "learning_rate": 1.2329999999999999e-06, "loss": 3.1046, "step": 413 }, { 
"epoch": 1.0048602673147022, "grad_norm": 8.346914291381836, "learning_rate": 1.2360000000000001e-06, "loss": 2.9502, "step": 414 }, { "epoch": 1.0072904009720534, "grad_norm": 6.0951828956604, "learning_rate": 1.2390000000000001e-06, "loss": 2.9066, "step": 415 }, { "epoch": 1.0097205346294047, "grad_norm": 2.9484267234802246, "learning_rate": 1.242e-06, "loss": 2.9009, "step": 416 }, { "epoch": 1.0121506682867558, "grad_norm": 3.41139554977417, "learning_rate": 1.2450000000000002e-06, "loss": 2.9606, "step": 417 }, { "epoch": 1.014580801944107, "grad_norm": 8.172654151916504, "learning_rate": 1.248e-06, "loss": 2.9068, "step": 418 }, { "epoch": 1.017010935601458, "grad_norm": 11.49866008758545, "learning_rate": 1.251e-06, "loss": 2.9361, "step": 419 }, { "epoch": 1.0194410692588092, "grad_norm": 10.0910062789917, "learning_rate": 1.254e-06, "loss": 2.9084, "step": 420 }, { "epoch": 1.0218712029161603, "grad_norm": 15.45237922668457, "learning_rate": 1.257e-06, "loss": 2.8509, "step": 421 }, { "epoch": 1.0243013365735116, "grad_norm": 6.361912250518799, "learning_rate": 1.26e-06, "loss": 2.8478, "step": 422 }, { "epoch": 1.0267314702308628, "grad_norm": 3.842728614807129, "learning_rate": 1.263e-06, "loss": 2.8665, "step": 423 }, { "epoch": 1.0291616038882139, "grad_norm": 5.201762676239014, "learning_rate": 1.266e-06, "loss": 2.8512, "step": 424 }, { "epoch": 1.031591737545565, "grad_norm": 4.795050144195557, "learning_rate": 1.269e-06, "loss": 2.8733, "step": 425 }, { "epoch": 1.034021871202916, "grad_norm": 7.89229679107666, "learning_rate": 1.272e-06, "loss": 2.8946, "step": 426 }, { "epoch": 1.0364520048602672, "grad_norm": 4.565535068511963, "learning_rate": 1.275e-06, "loss": 2.8521, "step": 427 }, { "epoch": 1.0388821385176186, "grad_norm": 5.015725612640381, "learning_rate": 1.278e-06, "loss": 2.8493, "step": 428 }, { "epoch": 1.0413122721749697, "grad_norm": 4.489656448364258, "learning_rate": 1.281e-06, "loss": 2.8594, "step": 429 }, { "epoch": 
1.0437424058323208, "grad_norm": 2.9980838298797607, "learning_rate": 1.284e-06, "loss": 2.836, "step": 430 }, { "epoch": 1.046172539489672, "grad_norm": 3.9521656036376953, "learning_rate": 1.2870000000000001e-06, "loss": 2.8192, "step": 431 }, { "epoch": 1.048602673147023, "grad_norm": 5.235060214996338, "learning_rate": 1.29e-06, "loss": 2.8555, "step": 432 }, { "epoch": 1.0510328068043742, "grad_norm": 6.007364749908447, "learning_rate": 1.293e-06, "loss": 2.8711, "step": 433 }, { "epoch": 1.0534629404617255, "grad_norm": 4.031667709350586, "learning_rate": 1.2960000000000002e-06, "loss": 2.848, "step": 434 }, { "epoch": 1.0558930741190766, "grad_norm": 4.215724468231201, "learning_rate": 1.299e-06, "loss": 2.868, "step": 435 }, { "epoch": 1.0583232077764277, "grad_norm": 2.5593512058258057, "learning_rate": 1.302e-06, "loss": 2.8343, "step": 436 }, { "epoch": 1.0607533414337789, "grad_norm": 4.611675262451172, "learning_rate": 1.305e-06, "loss": 2.8825, "step": 437 }, { "epoch": 1.06318347509113, "grad_norm": 3.3319525718688965, "learning_rate": 1.308e-06, "loss": 2.8719, "step": 438 }, { "epoch": 1.065613608748481, "grad_norm": 11.382694244384766, "learning_rate": 1.311e-06, "loss": 2.8685, "step": 439 }, { "epoch": 1.0680437424058322, "grad_norm": 4.378936290740967, "learning_rate": 1.314e-06, "loss": 2.8523, "step": 440 }, { "epoch": 1.0704738760631836, "grad_norm": 14.721598625183105, "learning_rate": 1.317e-06, "loss": 2.8763, "step": 441 }, { "epoch": 1.0729040097205347, "grad_norm": 4.731357097625732, "learning_rate": 1.3199999999999999e-06, "loss": 2.8353, "step": 442 }, { "epoch": 1.0753341433778858, "grad_norm": 4.34326171875, "learning_rate": 1.323e-06, "loss": 2.8502, "step": 443 }, { "epoch": 1.077764277035237, "grad_norm": 4.761924743652344, "learning_rate": 1.326e-06, "loss": 2.9118, "step": 444 }, { "epoch": 1.080194410692588, "grad_norm": 9.197234153747559, "learning_rate": 1.3290000000000001e-06, "loss": 2.8428, "step": 445 }, { "epoch": 
1.0826245443499392, "grad_norm": 4.313076496124268, "learning_rate": 1.3320000000000001e-06, "loss": 2.8382, "step": 446 }, { "epoch": 1.0850546780072905, "grad_norm": 2.643549919128418, "learning_rate": 1.335e-06, "loss": 2.8601, "step": 447 }, { "epoch": 1.0874848116646416, "grad_norm": 7.945528984069824, "learning_rate": 1.3380000000000001e-06, "loss": 2.8013, "step": 448 }, { "epoch": 1.0899149453219927, "grad_norm": 2.4594085216522217, "learning_rate": 1.341e-06, "loss": 2.8162, "step": 449 }, { "epoch": 1.0923450789793439, "grad_norm": 3.360072612762451, "learning_rate": 1.344e-06, "loss": 2.8574, "step": 450 }, { "epoch": 1.094775212636695, "grad_norm": 3.9171745777130127, "learning_rate": 1.3470000000000002e-06, "loss": 2.8425, "step": 451 }, { "epoch": 1.097205346294046, "grad_norm": 4.890247344970703, "learning_rate": 1.35e-06, "loss": 2.8632, "step": 452 }, { "epoch": 1.0996354799513974, "grad_norm": 4.380356788635254, "learning_rate": 1.353e-06, "loss": 2.8316, "step": 453 }, { "epoch": 1.1020656136087486, "grad_norm": 3.3724515438079834, "learning_rate": 1.356e-06, "loss": 2.855, "step": 454 }, { "epoch": 1.1044957472660997, "grad_norm": 6.068344593048096, "learning_rate": 1.359e-06, "loss": 2.8559, "step": 455 }, { "epoch": 1.1069258809234508, "grad_norm": 6.111365795135498, "learning_rate": 1.362e-06, "loss": 2.8865, "step": 456 }, { "epoch": 1.109356014580802, "grad_norm": 9.085060119628906, "learning_rate": 1.365e-06, "loss": 2.859, "step": 457 }, { "epoch": 1.111786148238153, "grad_norm": 7.320844650268555, "learning_rate": 1.368e-06, "loss": 2.891, "step": 458 }, { "epoch": 1.1142162818955041, "grad_norm": 5.448301792144775, "learning_rate": 1.3709999999999999e-06, "loss": 2.8401, "step": 459 }, { "epoch": 1.1166464155528555, "grad_norm": NaN, "learning_rate": 1.3709999999999999e-06, "loss": 2.8809, "step": 460 }, { "epoch": 1.1190765492102066, "grad_norm": 12.237930297851562, "learning_rate": 1.374e-06, "loss": 2.8819, "step": 461 }, { "epoch": 
1.1215066828675577, "grad_norm": 14.549925804138184, "learning_rate": 1.3770000000000001e-06, "loss": 3.0127, "step": 462 }, { "epoch": 1.1239368165249088, "grad_norm": 9.64478874206543, "learning_rate": 1.38e-06, "loss": 2.9716, "step": 463 }, { "epoch": 1.12636695018226, "grad_norm": 4.59730339050293, "learning_rate": 1.3830000000000001e-06, "loss": 2.9145, "step": 464 }, { "epoch": 1.128797083839611, "grad_norm": 2.5641891956329346, "learning_rate": 1.386e-06, "loss": 2.8542, "step": 465 }, { "epoch": 1.1312272174969624, "grad_norm": 3.4826552867889404, "learning_rate": 1.389e-06, "loss": 2.8682, "step": 466 }, { "epoch": 1.1336573511543135, "grad_norm": 5.72088098526001, "learning_rate": 1.392e-06, "loss": 2.869, "step": 467 }, { "epoch": 1.1360874848116647, "grad_norm": 7.856710910797119, "learning_rate": 1.395e-06, "loss": 2.8453, "step": 468 }, { "epoch": 1.1385176184690158, "grad_norm": 5.188879489898682, "learning_rate": 1.3980000000000002e-06, "loss": 2.8551, "step": 469 }, { "epoch": 1.140947752126367, "grad_norm": 3.778571605682373, "learning_rate": 1.401e-06, "loss": 2.8284, "step": 470 }, { "epoch": 1.143377885783718, "grad_norm": 6.491325378417969, "learning_rate": 1.404e-06, "loss": 2.8208, "step": 471 }, { "epoch": 1.1458080194410694, "grad_norm": 7.279517650604248, "learning_rate": 1.407e-06, "loss": 2.8382, "step": 472 }, { "epoch": 1.1482381530984205, "grad_norm": 4.834375858306885, "learning_rate": 1.41e-06, "loss": 2.8464, "step": 473 }, { "epoch": 1.1506682867557716, "grad_norm": 3.8952813148498535, "learning_rate": 1.413e-06, "loss": 2.822, "step": 474 }, { "epoch": 1.1530984204131227, "grad_norm": 3.8332488536834717, "learning_rate": 1.416e-06, "loss": 2.788, "step": 475 }, { "epoch": 1.1555285540704738, "grad_norm": 3.7866549491882324, "learning_rate": 1.419e-06, "loss": 2.799, "step": 476 }, { "epoch": 1.157958687727825, "grad_norm": 4.237229347229004, "learning_rate": 1.422e-06, "loss": 2.81, "step": 477 }, { "epoch": 1.160388821385176, 
"grad_norm": 4.966311931610107, "learning_rate": 1.4250000000000001e-06, "loss": 2.8041, "step": 478 }, { "epoch": 1.1628189550425274, "grad_norm": 4.5229010581970215, "learning_rate": 1.4280000000000001e-06, "loss": 2.8249, "step": 479 }, { "epoch": 1.1652490886998785, "grad_norm": 7.936843395233154, "learning_rate": 1.431e-06, "loss": 2.8282, "step": 480 }, { "epoch": 1.1676792223572297, "grad_norm": 6.619509220123291, "learning_rate": 1.4340000000000002e-06, "loss": 2.8204, "step": 481 }, { "epoch": 1.1701093560145808, "grad_norm": 6.51749324798584, "learning_rate": 1.437e-06, "loss": 2.8423, "step": 482 }, { "epoch": 1.172539489671932, "grad_norm": 7.348910808563232, "learning_rate": 1.44e-06, "loss": 2.7842, "step": 483 }, { "epoch": 1.1749696233292832, "grad_norm": 4.493994235992432, "learning_rate": 1.443e-06, "loss": 2.7687, "step": 484 }, { "epoch": 1.1773997569866343, "grad_norm": 5.384476184844971, "learning_rate": 1.446e-06, "loss": 2.8105, "step": 485 }, { "epoch": 1.1798298906439855, "grad_norm": 17.75827980041504, "learning_rate": 1.449e-06, "loss": 2.7834, "step": 486 }, { "epoch": 1.1822600243013366, "grad_norm": 3.5806820392608643, "learning_rate": 1.452e-06, "loss": 2.8247, "step": 487 }, { "epoch": 1.1846901579586877, "grad_norm": 6.88260555267334, "learning_rate": 1.455e-06, "loss": 2.8195, "step": 488 }, { "epoch": 1.1871202916160388, "grad_norm": 2.749821901321411, "learning_rate": 1.458e-06, "loss": 2.7746, "step": 489 }, { "epoch": 1.18955042527339, "grad_norm": 4.4666948318481445, "learning_rate": 1.461e-06, "loss": 2.7979, "step": 490 }, { "epoch": 1.1919805589307413, "grad_norm": 3.1026198863983154, "learning_rate": 1.464e-06, "loss": 2.7549, "step": 491 }, { "epoch": 1.1944106925880924, "grad_norm": 4.506851673126221, "learning_rate": 1.467e-06, "loss": 2.7779, "step": 492 }, { "epoch": 1.1968408262454435, "grad_norm": 4.351664066314697, "learning_rate": 1.4700000000000001e-06, "loss": 2.7865, "step": 493 }, { "epoch": 
1.1992709599027946, "grad_norm": 7.507403373718262, "learning_rate": 1.473e-06, "loss": 2.7835, "step": 494 }, { "epoch": 1.2017010935601458, "grad_norm": 30.74132537841797, "learning_rate": 1.4760000000000001e-06, "loss": 2.7804, "step": 495 }, { "epoch": 1.2041312272174969, "grad_norm": 6.045531272888184, "learning_rate": 1.479e-06, "loss": 2.8238, "step": 496 }, { "epoch": 1.206561360874848, "grad_norm": 4.282325267791748, "learning_rate": 1.482e-06, "loss": 2.7851, "step": 497 }, { "epoch": 1.2089914945321993, "grad_norm": 3.025688409805298, "learning_rate": 1.4850000000000002e-06, "loss": 2.7853, "step": 498 }, { "epoch": 1.2114216281895505, "grad_norm": 7.6886467933654785, "learning_rate": 1.488e-06, "loss": 2.776, "step": 499 }, { "epoch": 1.2138517618469016, "grad_norm": 3.571652412414551, "learning_rate": 1.491e-06, "loss": 2.7921, "step": 500 }, { "epoch": 1.2162818955042527, "grad_norm": 7.486672878265381, "learning_rate": 1.494e-06, "loss": 2.7758, "step": 501 }, { "epoch": 1.2187120291616038, "grad_norm": 3.8430023193359375, "learning_rate": 1.497e-06, "loss": 2.7796, "step": 502 }, { "epoch": 1.2211421628189552, "grad_norm": 6.7286787033081055, "learning_rate": 1.5e-06, "loss": 2.7834, "step": 503 }, { "epoch": 1.2235722964763063, "grad_norm": 14.185094833374023, "learning_rate": 1.503e-06, "loss": 2.7693, "step": 504 }, { "epoch": 1.2260024301336574, "grad_norm": 4.372721195220947, "learning_rate": 1.506e-06, "loss": 2.778, "step": 505 }, { "epoch": 1.2284325637910085, "grad_norm": 3.7360386848449707, "learning_rate": 1.5089999999999999e-06, "loss": 2.7989, "step": 506 }, { "epoch": 1.2308626974483596, "grad_norm": 11.453518867492676, "learning_rate": 1.512e-06, "loss": 2.804, "step": 507 }, { "epoch": 1.2332928311057108, "grad_norm": 7.237366676330566, "learning_rate": 1.5150000000000001e-06, "loss": 2.8267, "step": 508 }, { "epoch": 1.2357229647630619, "grad_norm": 7.972693920135498, "learning_rate": 1.518e-06, "loss": 2.7724, "step": 509 }, { 
"epoch": 1.2381530984204132, "grad_norm": 7.772054195404053, "learning_rate": 1.5210000000000001e-06, "loss": 2.8091, "step": 510 }, { "epoch": 1.2405832320777643, "grad_norm": 8.711593627929688, "learning_rate": 1.524e-06, "loss": 2.793, "step": 511 }, { "epoch": 1.2430133657351154, "grad_norm": 7.541114807128906, "learning_rate": 1.5270000000000002e-06, "loss": 2.8182, "step": 512 }, { "epoch": 1.2454434993924666, "grad_norm": 11.77699089050293, "learning_rate": 1.53e-06, "loss": 2.8859, "step": 513 }, { "epoch": 1.2478736330498177, "grad_norm": 4.479485511779785, "learning_rate": 1.533e-06, "loss": 2.8065, "step": 514 }, { "epoch": 1.250303766707169, "grad_norm": 3.545567750930786, "learning_rate": 1.5360000000000002e-06, "loss": 2.804, "step": 515 }, { "epoch": 1.25273390036452, "grad_norm": 2.4853012561798096, "learning_rate": 1.539e-06, "loss": 2.7859, "step": 516 }, { "epoch": 1.2551640340218713, "grad_norm": 5.361724853515625, "learning_rate": 1.542e-06, "loss": 2.7826, "step": 517 }, { "epoch": 1.2575941676792224, "grad_norm": 7.928747653961182, "learning_rate": 1.545e-06, "loss": 2.7382, "step": 518 }, { "epoch": 1.2600243013365735, "grad_norm": 6.702937602996826, "learning_rate": 1.548e-06, "loss": 2.7379, "step": 519 }, { "epoch": 1.2624544349939246, "grad_norm": 4.902796745300293, "learning_rate": 1.551e-06, "loss": 2.7018, "step": 520 }, { "epoch": 1.2648845686512757, "grad_norm": 5.6517863273620605, "learning_rate": 1.554e-06, "loss": 2.7511, "step": 521 }, { "epoch": 1.267314702308627, "grad_norm": 4.860466003417969, "learning_rate": 1.557e-06, "loss": 2.7233, "step": 522 }, { "epoch": 1.2697448359659782, "grad_norm": 5.5633063316345215, "learning_rate": 1.5599999999999999e-06, "loss": 2.7443, "step": 523 }, { "epoch": 1.2721749696233293, "grad_norm": 4.407516956329346, "learning_rate": 1.5630000000000001e-06, "loss": 2.7367, "step": 524 }, { "epoch": 1.2746051032806804, "grad_norm": 4.788374423980713, "learning_rate": 1.5660000000000001e-06, 
"loss": 2.7288, "step": 525 }, { "epoch": 1.2770352369380316, "grad_norm": 2.253481388092041, "learning_rate": 1.569e-06, "loss": 2.6881, "step": 526 }, { "epoch": 1.2794653705953827, "grad_norm": 5.29470157623291, "learning_rate": 1.5720000000000002e-06, "loss": 2.6975, "step": 527 }, { "epoch": 1.2818955042527338, "grad_norm": 4.809932708740234, "learning_rate": 1.575e-06, "loss": 2.724, "step": 528 }, { "epoch": 1.2843256379100851, "grad_norm": 4.387966632843018, "learning_rate": 1.578e-06, "loss": 2.776, "step": 529 }, { "epoch": 1.2867557715674363, "grad_norm": 2.7207391262054443, "learning_rate": 1.581e-06, "loss": 2.7033, "step": 530 }, { "epoch": 1.2891859052247874, "grad_norm": 2.7622718811035156, "learning_rate": 1.584e-06, "loss": 2.7074, "step": 531 }, { "epoch": 1.2916160388821385, "grad_norm": 3.79521107673645, "learning_rate": 1.5870000000000002e-06, "loss": 2.6866, "step": 532 }, { "epoch": 1.2940461725394896, "grad_norm": 5.901496887207031, "learning_rate": 1.59e-06, "loss": 2.8357, "step": 533 }, { "epoch": 1.296476306196841, "grad_norm": 6.230663299560547, "learning_rate": 1.593e-06, "loss": 2.6847, "step": 534 }, { "epoch": 1.2989064398541919, "grad_norm": 8.117265701293945, "learning_rate": 1.596e-06, "loss": 2.6868, "step": 535 }, { "epoch": 1.3013365735115432, "grad_norm": 5.459497928619385, "learning_rate": 1.599e-06, "loss": 2.6606, "step": 536 }, { "epoch": 1.3037667071688943, "grad_norm": 3.462874412536621, "learning_rate": 1.602e-06, "loss": 2.6581, "step": 537 }, { "epoch": 1.3061968408262454, "grad_norm": 3.4003734588623047, "learning_rate": 1.605e-06, "loss": 2.6487, "step": 538 }, { "epoch": 1.3086269744835966, "grad_norm": 3.6501619815826416, "learning_rate": 1.608e-06, "loss": 2.6804, "step": 539 }, { "epoch": 1.3110571081409477, "grad_norm": 3.087853193283081, "learning_rate": 1.611e-06, "loss": 2.653, "step": 540 }, { "epoch": 1.313487241798299, "grad_norm": 4.6285505294799805, "learning_rate": 1.6140000000000001e-06, "loss": 
2.6452, "step": 541 }, { "epoch": 1.3159173754556501, "grad_norm": 3.3217051029205322, "learning_rate": 1.6170000000000001e-06, "loss": 2.6415, "step": 542 }, { "epoch": 1.3183475091130012, "grad_norm": 5.716796398162842, "learning_rate": 1.62e-06, "loss": 2.6273, "step": 543 }, { "epoch": 1.3207776427703524, "grad_norm": 4.824670314788818, "learning_rate": 1.6230000000000002e-06, "loss": 2.5884, "step": 544 }, { "epoch": 1.3232077764277035, "grad_norm": 11.822954177856445, "learning_rate": 1.626e-06, "loss": 2.6452, "step": 545 }, { "epoch": 1.3256379100850546, "grad_norm": 4.308380603790283, "learning_rate": 1.629e-06, "loss": 2.6509, "step": 546 }, { "epoch": 1.3280680437424057, "grad_norm": 3.5699973106384277, "learning_rate": 1.632e-06, "loss": 2.661, "step": 547 }, { "epoch": 1.330498177399757, "grad_norm": 6.269909381866455, "learning_rate": 1.635e-06, "loss": 2.667, "step": 548 }, { "epoch": 1.3329283110571082, "grad_norm": 3.974316358566284, "learning_rate": 1.638e-06, "loss": 2.6358, "step": 549 }, { "epoch": 1.3353584447144593, "grad_norm": 9.780369758605957, "learning_rate": 1.641e-06, "loss": 2.6412, "step": 550 }, { "epoch": 1.3377885783718104, "grad_norm": 6.229178428649902, "learning_rate": 1.644e-06, "loss": 2.6415, "step": 551 }, { "epoch": 1.3402187120291615, "grad_norm": 4.771795749664307, "learning_rate": 1.6469999999999999e-06, "loss": 2.6429, "step": 552 }, { "epoch": 1.3426488456865129, "grad_norm": 4.707674503326416, "learning_rate": 1.65e-06, "loss": 2.5817, "step": 553 }, { "epoch": 1.3450789793438638, "grad_norm": 4.209589958190918, "learning_rate": 1.653e-06, "loss": 2.6071, "step": 554 }, { "epoch": 1.3475091130012151, "grad_norm": 4.542924880981445, "learning_rate": 1.6560000000000001e-06, "loss": 2.5905, "step": 555 }, { "epoch": 1.3499392466585662, "grad_norm": 3.0187361240386963, "learning_rate": 1.6590000000000001e-06, "loss": 2.61, "step": 556 }, { "epoch": 1.3523693803159174, "grad_norm": 10.546207427978516, "learning_rate": 
1.662e-06, "loss": 2.6471, "step": 557 }, { "epoch": 1.3547995139732685, "grad_norm": 5.794422626495361, "learning_rate": 1.6650000000000002e-06, "loss": 2.6736, "step": 558 }, { "epoch": 1.3572296476306196, "grad_norm": 9.366691589355469, "learning_rate": 1.668e-06, "loss": 2.6024, "step": 559 }, { "epoch": 1.359659781287971, "grad_norm": 3.8943889141082764, "learning_rate": 1.671e-06, "loss": 2.6674, "step": 560 }, { "epoch": 1.362089914945322, "grad_norm": 11.1276273727417, "learning_rate": 1.6740000000000002e-06, "loss": 2.6699, "step": 561 }, { "epoch": 1.3645200486026732, "grad_norm": 5.943089485168457, "learning_rate": 1.677e-06, "loss": 2.705, "step": 562 }, { "epoch": 1.3669501822600243, "grad_norm": 6.737440586090088, "learning_rate": 1.68e-06, "loss": 2.7281, "step": 563 }, { "epoch": 1.3693803159173754, "grad_norm": 4.765307426452637, "learning_rate": 1.683e-06, "loss": 2.6584, "step": 564 }, { "epoch": 1.3718104495747265, "grad_norm": 4.548864364624023, "learning_rate": 1.686e-06, "loss": 2.6447, "step": 565 }, { "epoch": 1.3742405832320777, "grad_norm": 4.981193542480469, "learning_rate": 1.689e-06, "loss": 2.581, "step": 566 }, { "epoch": 1.376670716889429, "grad_norm": 5.476633548736572, "learning_rate": 1.692e-06, "loss": 2.5485, "step": 567 }, { "epoch": 1.37910085054678, "grad_norm": 4.760281562805176, "learning_rate": 1.695e-06, "loss": 2.5543, "step": 568 }, { "epoch": 1.3815309842041312, "grad_norm": 3.9952354431152344, "learning_rate": 1.6979999999999999e-06, "loss": 2.5383, "step": 569 }, { "epoch": 1.3839611178614823, "grad_norm": 4.966131210327148, "learning_rate": 1.701e-06, "loss": 2.5203, "step": 570 }, { "epoch": 1.3863912515188335, "grad_norm": 8.824623107910156, "learning_rate": 1.7040000000000001e-06, "loss": 2.5172, "step": 571 }, { "epoch": 1.3888213851761848, "grad_norm": 5.57030725479126, "learning_rate": 1.707e-06, "loss": 2.4853, "step": 572 }, { "epoch": 1.391251518833536, "grad_norm": 4.850300312042236, "learning_rate": 
1.7100000000000001e-06, "loss": 2.4829, "step": 573 }, { "epoch": 1.393681652490887, "grad_norm": 5.225827217102051, "learning_rate": 1.713e-06, "loss": 2.5114, "step": 574 }, { "epoch": 1.3961117861482382, "grad_norm": 3.4709434509277344, "learning_rate": 1.7160000000000002e-06, "loss": 2.4823, "step": 575 }, { "epoch": 1.3985419198055893, "grad_norm": 4.28768253326416, "learning_rate": 1.719e-06, "loss": 2.4808, "step": 576 }, { "epoch": 1.4009720534629404, "grad_norm": 16.353775024414062, "learning_rate": 1.722e-06, "loss": 2.452, "step": 577 }, { "epoch": 1.4034021871202915, "grad_norm": 4.566389560699463, "learning_rate": 1.7250000000000002e-06, "loss": 2.4361, "step": 578 }, { "epoch": 1.4058323207776429, "grad_norm": 4.84018611907959, "learning_rate": 1.728e-06, "loss": 2.447, "step": 579 }, { "epoch": 1.408262454434994, "grad_norm": 5.83809232711792, "learning_rate": 1.731e-06, "loss": 2.386, "step": 580 }, { "epoch": 1.410692588092345, "grad_norm": 4.28434419631958, "learning_rate": 1.734e-06, "loss": 2.4236, "step": 581 }, { "epoch": 1.4131227217496962, "grad_norm": 3.404315948486328, "learning_rate": 1.737e-06, "loss": 2.4324, "step": 582 }, { "epoch": 1.4155528554070473, "grad_norm": 3.951995372772217, "learning_rate": 1.74e-06, "loss": 2.3966, "step": 583 }, { "epoch": 1.4179829890643987, "grad_norm": 4.9742536544799805, "learning_rate": 1.743e-06, "loss": 2.458, "step": 584 }, { "epoch": 1.4204131227217496, "grad_norm": 4.221017360687256, "learning_rate": 1.746e-06, "loss": 2.3679, "step": 585 }, { "epoch": 1.422843256379101, "grad_norm": 3.980243682861328, "learning_rate": 1.749e-06, "loss": 2.3605, "step": 586 }, { "epoch": 1.425273390036452, "grad_norm": 4.432951927185059, "learning_rate": 1.7520000000000001e-06, "loss": 2.3604, "step": 587 }, { "epoch": 1.4277035236938032, "grad_norm": 6.78403902053833, "learning_rate": 1.7550000000000001e-06, "loss": 2.3637, "step": 588 }, { "epoch": 1.4301336573511543, "grad_norm": 4.818192481994629, 
"learning_rate": 1.758e-06, "loss": 2.3776, "step": 589 }, { "epoch": 1.4325637910085054, "grad_norm": 5.941251277923584, "learning_rate": 1.7610000000000002e-06, "loss": 2.4006, "step": 590 }, { "epoch": 1.4349939246658567, "grad_norm": 4.761614799499512, "learning_rate": 1.764e-06, "loss": 2.3561, "step": 591 }, { "epoch": 1.4374240583232079, "grad_norm": 4.5895915031433105, "learning_rate": 1.767e-06, "loss": 2.3944, "step": 592 }, { "epoch": 1.439854191980559, "grad_norm": 7.0216383934021, "learning_rate": 1.77e-06, "loss": 2.3421, "step": 593 }, { "epoch": 1.44228432563791, "grad_norm": 7.950050354003906, "learning_rate": 1.773e-06, "loss": 2.3839, "step": 594 }, { "epoch": 1.4447144592952612, "grad_norm": 11.923834800720215, "learning_rate": 1.776e-06, "loss": 2.2719, "step": 595 }, { "epoch": 1.4471445929526123, "grad_norm": 6.813778400421143, "learning_rate": 1.779e-06, "loss": 2.3675, "step": 596 }, { "epoch": 1.4495747266099634, "grad_norm": 5.882168769836426, "learning_rate": 1.782e-06, "loss": 2.3424, "step": 597 }, { "epoch": 1.4520048602673148, "grad_norm": 5.646364688873291, "learning_rate": 1.785e-06, "loss": 2.3199, "step": 598 }, { "epoch": 1.454434993924666, "grad_norm": 4.352991104125977, "learning_rate": 1.788e-06, "loss": 2.3072, "step": 599 }, { "epoch": 1.456865127582017, "grad_norm": 4.417640209197998, "learning_rate": 1.791e-06, "loss": 2.308, "step": 600 }, { "epoch": 1.4592952612393681, "grad_norm": 5.026256561279297, "learning_rate": 1.794e-06, "loss": 2.2724, "step": 601 }, { "epoch": 1.4617253948967193, "grad_norm": 5.659675598144531, "learning_rate": 1.7970000000000001e-06, "loss": 2.3085, "step": 602 }, { "epoch": 1.4641555285540706, "grad_norm": 4.5922136306762695, "learning_rate": 1.8e-06, "loss": 2.2371, "step": 603 }, { "epoch": 1.4665856622114215, "grad_norm": 10.518667221069336, "learning_rate": 1.8030000000000001e-06, "loss": 2.2834, "step": 604 }, { "epoch": 1.4690157958687728, "grad_norm": 5.272966384887695, 
"learning_rate": 1.806e-06, "loss": 2.4017, "step": 605 }, { "epoch": 1.471445929526124, "grad_norm": 5.3595428466796875, "learning_rate": 1.809e-06, "loss": 2.3738, "step": 606 }, { "epoch": 1.473876063183475, "grad_norm": 90.29997253417969, "learning_rate": 1.8120000000000002e-06, "loss": 2.2709, "step": 607 }, { "epoch": 1.4763061968408262, "grad_norm": 6.0421247482299805, "learning_rate": 1.815e-06, "loss": 2.3385, "step": 608 }, { "epoch": 1.4787363304981773, "grad_norm": 21.04892921447754, "learning_rate": 1.818e-06, "loss": 2.3676, "step": 609 }, { "epoch": 1.4811664641555287, "grad_norm": 6.54754114151001, "learning_rate": 1.821e-06, "loss": 2.3583, "step": 610 }, { "epoch": 1.4835965978128798, "grad_norm": 13.983354568481445, "learning_rate": 1.824e-06, "loss": 2.5338, "step": 611 }, { "epoch": 1.486026731470231, "grad_norm": 20.545310974121094, "learning_rate": 1.827e-06, "loss": 2.5243, "step": 612 }, { "epoch": 1.488456865127582, "grad_norm": 7.613363265991211, "learning_rate": 1.83e-06, "loss": 2.5131, "step": 613 }, { "epoch": 1.4908869987849331, "grad_norm": 5.62204647064209, "learning_rate": 1.833e-06, "loss": 2.3875, "step": 614 }, { "epoch": 1.4933171324422843, "grad_norm": 4.182187557220459, "learning_rate": 1.8359999999999999e-06, "loss": 2.3011, "step": 615 }, { "epoch": 1.4957472660996354, "grad_norm": 4.2289628982543945, "learning_rate": 1.839e-06, "loss": 2.3527, "step": 616 }, { "epoch": 1.4981773997569867, "grad_norm": 6.146172046661377, "learning_rate": 1.8420000000000001e-06, "loss": 2.2854, "step": 617 }, { "epoch": 1.5006075334143378, "grad_norm": 7.0756659507751465, "learning_rate": 1.8450000000000001e-06, "loss": 2.2618, "step": 618 }, { "epoch": 1.503037667071689, "grad_norm": 6.491519927978516, "learning_rate": 1.8480000000000001e-06, "loss": 2.237, "step": 619 }, { "epoch": 1.50546780072904, "grad_norm": 8.260353088378906, "learning_rate": 1.851e-06, "loss": 2.1858, "step": 620 }, { "epoch": 1.5078979343863912, "grad_norm": 
6.155652046203613, "learning_rate": 1.8540000000000002e-06, "loss": 2.1262, "step": 621 }, { "epoch": 1.5103280680437425, "grad_norm": 11.169551849365234, "learning_rate": 1.857e-06, "loss": 2.1337, "step": 622 }, { "epoch": 1.5127582017010934, "grad_norm": 4.799984455108643, "learning_rate": 1.86e-06, "loss": 2.1065, "step": 623 }, { "epoch": 1.5151883353584448, "grad_norm": 7.203545570373535, "learning_rate": 1.8630000000000002e-06, "loss": 2.1032, "step": 624 }, { "epoch": 1.517618469015796, "grad_norm": 5.207713603973389, "learning_rate": 1.866e-06, "loss": 2.1376, "step": 625 }, { "epoch": 1.520048602673147, "grad_norm": 5.502042293548584, "learning_rate": 1.869e-06, "loss": 2.0424, "step": 626 }, { "epoch": 1.5224787363304981, "grad_norm": 9.310318946838379, "learning_rate": 1.872e-06, "loss": 2.0961, "step": 627 }, { "epoch": 1.5249088699878492, "grad_norm": 7.11378288269043, "learning_rate": 1.875e-06, "loss": 2.0653, "step": 628 }, { "epoch": 1.5273390036452006, "grad_norm": 14.375036239624023, "learning_rate": 1.878e-06, "loss": 2.0392, "step": 629 }, { "epoch": 1.5297691373025515, "grad_norm": 5.345547199249268, "learning_rate": 1.8810000000000003e-06, "loss": 2.0261, "step": 630 }, { "epoch": 1.5321992709599028, "grad_norm": 5.5373215675354, "learning_rate": 1.8839999999999999e-06, "loss": 2.0449, "step": 631 }, { "epoch": 1.534629404617254, "grad_norm": 14.996706008911133, "learning_rate": 1.8869999999999999e-06, "loss": 2.0613, "step": 632 }, { "epoch": 1.537059538274605, "grad_norm": 4.867535591125488, "learning_rate": 1.8900000000000001e-06, "loss": 2.0637, "step": 633 }, { "epoch": 1.5394896719319564, "grad_norm": 5.51106595993042, "learning_rate": 1.8930000000000001e-06, "loss": 2.1048, "step": 634 }, { "epoch": 1.5419198055893073, "grad_norm": 5.426146030426025, "learning_rate": 1.8960000000000001e-06, "loss": 2.0406, "step": 635 }, { "epoch": 1.5443499392466586, "grad_norm": 3.7424564361572266, "learning_rate": 1.899e-06, "loss": 1.9765, "step": 
636 }, { "epoch": 1.5467800729040098, "grad_norm": 5.565145015716553, "learning_rate": 1.902e-06, "loss": 2.0995, "step": 637 }, { "epoch": 1.5492102065613609, "grad_norm": 3.098925828933716, "learning_rate": 1.905e-06, "loss": 1.9824, "step": 638 }, { "epoch": 1.551640340218712, "grad_norm": 4.565895080566406, "learning_rate": 1.908e-06, "loss": 1.884, "step": 639 }, { "epoch": 1.5540704738760631, "grad_norm": 10.303536415100098, "learning_rate": 1.9110000000000004e-06, "loss": 1.9939, "step": 640 }, { "epoch": 1.5565006075334145, "grad_norm": 15.254600524902344, "learning_rate": 1.914e-06, "loss": 1.8944, "step": 641 }, { "epoch": 1.5589307411907654, "grad_norm": 3.5996413230895996, "learning_rate": 1.917e-06, "loss": 1.8939, "step": 642 }, { "epoch": 1.5613608748481167, "grad_norm": 4.349140644073486, "learning_rate": 1.9200000000000003e-06, "loss": 1.9354, "step": 643 }, { "epoch": 1.5637910085054678, "grad_norm": 7.423696994781494, "learning_rate": 1.923e-06, "loss": 1.9492, "step": 644 }, { "epoch": 1.566221142162819, "grad_norm": 5.396579265594482, "learning_rate": 1.926e-06, "loss": 1.9187, "step": 645 }, { "epoch": 1.56865127582017, "grad_norm": 5.147340297698975, "learning_rate": 1.929e-06, "loss": 1.8997, "step": 646 }, { "epoch": 1.5710814094775212, "grad_norm": 5.474572658538818, "learning_rate": 1.932e-06, "loss": 1.9455, "step": 647 }, { "epoch": 1.5735115431348725, "grad_norm": 10.256887435913086, "learning_rate": 1.935e-06, "loss": 1.8909, "step": 648 }, { "epoch": 1.5759416767922234, "grad_norm": 10.422914505004883, "learning_rate": 1.9380000000000003e-06, "loss": 1.9038, "step": 649 }, { "epoch": 1.5783718104495748, "grad_norm": 4.420162677764893, "learning_rate": 1.9409999999999997e-06, "loss": 1.8911, "step": 650 }, { "epoch": 1.5808019441069259, "grad_norm": 10.167625427246094, "learning_rate": 1.944e-06, "loss": 1.9162, "step": 651 }, { "epoch": 1.583232077764277, "grad_norm": 9.351463317871094, "learning_rate": 1.947e-06, "loss": 2.0555, 
"step": 652 }, { "epoch": 1.5856622114216283, "grad_norm": 8.84534740447998, "learning_rate": 1.95e-06, "loss": 1.9705, "step": 653 }, { "epoch": 1.5880923450789792, "grad_norm": 4.16977596282959, "learning_rate": 1.953e-06, "loss": 1.9457, "step": 654 }, { "epoch": 1.5905224787363306, "grad_norm": 6.517801761627197, "learning_rate": 1.956e-06, "loss": 1.8607, "step": 655 }, { "epoch": 1.5929526123936817, "grad_norm": 10.125398635864258, "learning_rate": 1.959e-06, "loss": 1.9551, "step": 656 }, { "epoch": 1.5953827460510328, "grad_norm": 12.314558029174805, "learning_rate": 1.962e-06, "loss": 1.9594, "step": 657 }, { "epoch": 1.597812879708384, "grad_norm": 16.677112579345703, "learning_rate": 1.9650000000000002e-06, "loss": 2.1407, "step": 658 }, { "epoch": 1.600243013365735, "grad_norm": 9.072488784790039, "learning_rate": 1.968e-06, "loss": 1.9949, "step": 659 }, { "epoch": 1.6026731470230864, "grad_norm": 15.166990280151367, "learning_rate": 1.971e-06, "loss": 1.9606, "step": 660 }, { "epoch": 1.6051032806804373, "grad_norm": 7.715949535369873, "learning_rate": 1.974e-06, "loss": 1.9987, "step": 661 }, { "epoch": 1.6075334143377886, "grad_norm": 10.718607902526855, "learning_rate": 1.977e-06, "loss": 2.1255, "step": 662 }, { "epoch": 1.6099635479951397, "grad_norm": 18.82831382751465, "learning_rate": 1.98e-06, "loss": 2.2518, "step": 663 }, { "epoch": 1.6123936816524909, "grad_norm": 7.167763710021973, "learning_rate": 1.9830000000000003e-06, "loss": 1.9825, "step": 664 }, { "epoch": 1.6148238153098422, "grad_norm": 6.464595794677734, "learning_rate": 1.9859999999999997e-06, "loss": 1.9212, "step": 665 }, { "epoch": 1.617253948967193, "grad_norm": 7.014903545379639, "learning_rate": 1.989e-06, "loss": 1.8748, "step": 666 }, { "epoch": 1.6196840826245444, "grad_norm": 4.129557132720947, "learning_rate": 1.992e-06, "loss": 1.9317, "step": 667 }, { "epoch": 1.6221142162818953, "grad_norm": 10.163619995117188, "learning_rate": 1.995e-06, "loss": 1.7415, "step": 
668 }, { "epoch": 1.6245443499392467, "grad_norm": 6.647574424743652, "learning_rate": 1.998e-06, "loss": 1.684, "step": 669 }, { "epoch": 1.6269744835965978, "grad_norm": 6.073408126831055, "learning_rate": 2.001e-06, "loss": 1.7155, "step": 670 }, { "epoch": 1.629404617253949, "grad_norm": 5.115166187286377, "learning_rate": 2.004e-06, "loss": 1.6897, "step": 671 }, { "epoch": 1.6318347509113003, "grad_norm": 4.851500034332275, "learning_rate": 2.007e-06, "loss": 1.6662, "step": 672 }, { "epoch": 1.6342648845686512, "grad_norm": 13.657958030700684, "learning_rate": 2.0100000000000002e-06, "loss": 1.6838, "step": 673 }, { "epoch": 1.6366950182260025, "grad_norm": 4.661623001098633, "learning_rate": 2.0130000000000005e-06, "loss": 1.6031, "step": 674 }, { "epoch": 1.6391251518833536, "grad_norm": 15.389811515808105, "learning_rate": 2.016e-06, "loss": 1.5455, "step": 675 }, { "epoch": 1.6415552855407047, "grad_norm": 4.262889385223389, "learning_rate": 2.019e-06, "loss": 1.5709, "step": 676 }, { "epoch": 1.6439854191980559, "grad_norm": 42.747398376464844, "learning_rate": 2.0220000000000003e-06, "loss": 1.5708, "step": 677 }, { "epoch": 1.646415552855407, "grad_norm": 4.109449863433838, "learning_rate": 2.025e-06, "loss": 1.5322, "step": 678 }, { "epoch": 1.6488456865127583, "grad_norm": 4.88274621963501, "learning_rate": 2.028e-06, "loss": 1.5231, "step": 679 }, { "epoch": 1.6512758201701092, "grad_norm": 7.230599403381348, "learning_rate": 2.031e-06, "loss": 1.5614, "step": 680 }, { "epoch": 1.6537059538274606, "grad_norm": 4.180158615112305, "learning_rate": 2.034e-06, "loss": 1.5228, "step": 681 }, { "epoch": 1.6561360874848117, "grad_norm": 6.689671993255615, "learning_rate": 2.037e-06, "loss": 1.5533, "step": 682 }, { "epoch": 1.6585662211421628, "grad_norm": 4.846935272216797, "learning_rate": 2.0400000000000004e-06, "loss": 1.474, "step": 683 }, { "epoch": 1.6609963547995141, "grad_norm": 5.428877353668213, "learning_rate": 2.0429999999999998e-06, "loss": 
1.4786, "step": 684 }, { "epoch": 1.663426488456865, "grad_norm": 5.865189075469971, "learning_rate": 2.046e-06, "loss": 1.4488, "step": 685 }, { "epoch": 1.6658566221142164, "grad_norm": 5.106090545654297, "learning_rate": 2.049e-06, "loss": 1.4352, "step": 686 }, { "epoch": 1.6682867557715675, "grad_norm": 7.091668605804443, "learning_rate": 2.052e-06, "loss": 1.4597, "step": 687 }, { "epoch": 1.6707168894289186, "grad_norm": 2.748375654220581, "learning_rate": 2.0550000000000002e-06, "loss": 1.4103, "step": 688 }, { "epoch": 1.6731470230862697, "grad_norm": 15.94963264465332, "learning_rate": 2.058e-06, "loss": 1.487, "step": 689 }, { "epoch": 1.6755771567436208, "grad_norm": 4.101191997528076, "learning_rate": 2.061e-06, "loss": 1.4109, "step": 690 }, { "epoch": 1.6780072904009722, "grad_norm": 5.5668158531188965, "learning_rate": 2.064e-06, "loss": 1.3659, "step": 691 }, { "epoch": 1.680437424058323, "grad_norm": 6.150637626647949, "learning_rate": 2.0670000000000003e-06, "loss": 1.4078, "step": 692 }, { "epoch": 1.6828675577156744, "grad_norm": 7.709277153015137, "learning_rate": 2.07e-06, "loss": 1.4776, "step": 693 }, { "epoch": 1.6852976913730255, "grad_norm": 8.927399635314941, "learning_rate": 2.073e-06, "loss": 1.3876, "step": 694 }, { "epoch": 1.6877278250303767, "grad_norm": 9.519058227539062, "learning_rate": 2.076e-06, "loss": 1.3991, "step": 695 }, { "epoch": 1.6901579586877278, "grad_norm": 8.277839660644531, "learning_rate": 2.079e-06, "loss": 1.3737, "step": 696 }, { "epoch": 1.692588092345079, "grad_norm": 3.7640533447265625, "learning_rate": 2.082e-06, "loss": 1.2833, "step": 697 }, { "epoch": 1.6950182260024302, "grad_norm": 5.460485458374023, "learning_rate": 2.0850000000000004e-06, "loss": 1.327, "step": 698 }, { "epoch": 1.6974483596597811, "grad_norm": 6.180414199829102, "learning_rate": 2.0879999999999997e-06, "loss": 1.3406, "step": 699 }, { "epoch": 1.6998784933171325, "grad_norm": 4.211923122406006, "learning_rate": 2.091e-06, "loss": 
1.3025, "step": 700 }, { "epoch": 1.7023086269744836, "grad_norm": 14.485774040222168, "learning_rate": 2.094e-06, "loss": 1.2952, "step": 701 }, { "epoch": 1.7047387606318347, "grad_norm": 14.066312789916992, "learning_rate": 2.097e-06, "loss": 1.3961, "step": 702 }, { "epoch": 1.707168894289186, "grad_norm": 6.3851728439331055, "learning_rate": 2.1000000000000002e-06, "loss": 1.3208, "step": 703 }, { "epoch": 1.709599027946537, "grad_norm": 6.277082443237305, "learning_rate": 2.103e-06, "loss": 1.3708, "step": 704 }, { "epoch": 1.7120291616038883, "grad_norm": 5.149957180023193, "learning_rate": 2.106e-06, "loss": 1.2635, "step": 705 }, { "epoch": 1.7144592952612394, "grad_norm": 7.392581939697266, "learning_rate": 2.109e-06, "loss": 1.341, "step": 706 }, { "epoch": 1.7168894289185905, "grad_norm": 4.491787433624268, "learning_rate": 2.1120000000000003e-06, "loss": 1.3294, "step": 707 }, { "epoch": 1.7193195625759417, "grad_norm": 15.18033504486084, "learning_rate": 2.1149999999999997e-06, "loss": 1.3355, "step": 708 }, { "epoch": 1.7217496962332928, "grad_norm": 7.974854469299316, "learning_rate": 2.118e-06, "loss": 1.3289, "step": 709 }, { "epoch": 1.7241798298906441, "grad_norm": 4.6180315017700195, "learning_rate": 2.121e-06, "loss": 1.3782, "step": 710 }, { "epoch": 1.726609963547995, "grad_norm": 6.86355447769165, "learning_rate": 2.124e-06, "loss": 1.2859, "step": 711 }, { "epoch": 1.7290400972053463, "grad_norm": 6.873224258422852, "learning_rate": 2.127e-06, "loss": 1.5053, "step": 712 }, { "epoch": 1.7314702308626975, "grad_norm": 12.862412452697754, "learning_rate": 2.13e-06, "loss": 1.8251, "step": 713 }, { "epoch": 1.7339003645200486, "grad_norm": 10.434954643249512, "learning_rate": 2.133e-06, "loss": 1.7839, "step": 714 }, { "epoch": 1.7363304981773997, "grad_norm": 5.556496620178223, "learning_rate": 2.136e-06, "loss": 1.4638, "step": 715 }, { "epoch": 1.7387606318347508, "grad_norm": 5.085179328918457, "learning_rate": 2.139e-06, "loss": 1.3499, 
"step": 716 }, { "epoch": 1.7411907654921022, "grad_norm": 7.628553867340088, "learning_rate": 2.1420000000000004e-06, "loss": 1.3219, "step": 717 }, { "epoch": 1.743620899149453, "grad_norm": 5.06912088394165, "learning_rate": 2.145e-06, "loss": 1.2248, "step": 718 }, { "epoch": 1.7460510328068044, "grad_norm": 5.780307769775391, "learning_rate": 2.148e-06, "loss": 1.2236, "step": 719 }, { "epoch": 1.7484811664641555, "grad_norm": 5.051016330718994, "learning_rate": 2.1510000000000002e-06, "loss": 1.2472, "step": 720 }, { "epoch": 1.7509113001215066, "grad_norm": 4.991174221038818, "learning_rate": 2.154e-06, "loss": 1.0962, "step": 721 }, { "epoch": 1.753341433778858, "grad_norm": 4.604220867156982, "learning_rate": 2.1570000000000003e-06, "loss": 1.1731, "step": 722 }, { "epoch": 1.7557715674362089, "grad_norm": 11.908672332763672, "learning_rate": 2.16e-06, "loss": 1.1167, "step": 723 }, { "epoch": 1.7582017010935602, "grad_norm": 33.52266311645508, "learning_rate": 2.163e-06, "loss": 1.1395, "step": 724 }, { "epoch": 1.7606318347509113, "grad_norm": 6.060308456420898, "learning_rate": 2.166e-06, "loss": 1.0436, "step": 725 }, { "epoch": 1.7630619684082625, "grad_norm": 7.43856143951416, "learning_rate": 2.1690000000000003e-06, "loss": 1.0486, "step": 726 }, { "epoch": 1.7654921020656136, "grad_norm": 5.236306190490723, "learning_rate": 2.172e-06, "loss": 1.0647, "step": 727 }, { "epoch": 1.7679222357229647, "grad_norm": 10.533977508544922, "learning_rate": 2.175e-06, "loss": 1.0052, "step": 728 }, { "epoch": 1.770352369380316, "grad_norm": 5.658529281616211, "learning_rate": 2.178e-06, "loss": 1.0698, "step": 729 }, { "epoch": 1.772782503037667, "grad_norm": 5.382141590118408, "learning_rate": 2.181e-06, "loss": 1.0447, "step": 730 }, { "epoch": 1.7752126366950183, "grad_norm": 4.3465800285339355, "learning_rate": 2.184e-06, "loss": 1.0451, "step": 731 }, { "epoch": 1.7776427703523694, "grad_norm": 3.389500856399536, "learning_rate": 2.1870000000000004e-06, 
"loss": 1.0314, "step": 732 }, { "epoch": 1.7800729040097205, "grad_norm": 4.861445903778076, "learning_rate": 2.1899999999999998e-06, "loss": 0.9871, "step": 733 }, { "epoch": 1.7825030376670719, "grad_norm": 7.9872307777404785, "learning_rate": 2.193e-06, "loss": 1.0257, "step": 734 }, { "epoch": 1.7849331713244228, "grad_norm": 2.823728322982788, "learning_rate": 2.1960000000000002e-06, "loss": 1.0272, "step": 735 }, { "epoch": 1.787363304981774, "grad_norm": 4.565240859985352, "learning_rate": 2.199e-06, "loss": 0.9691, "step": 736 }, { "epoch": 1.789793438639125, "grad_norm": 3.7180333137512207, "learning_rate": 2.2020000000000003e-06, "loss": 1.0091, "step": 737 }, { "epoch": 1.7922235722964763, "grad_norm": 6.74357795715332, "learning_rate": 2.205e-06, "loss": 1.0425, "step": 738 }, { "epoch": 1.7946537059538274, "grad_norm": 3.1347434520721436, "learning_rate": 2.208e-06, "loss": 1.0133, "step": 739 }, { "epoch": 1.7970838396111786, "grad_norm": 4.844738960266113, "learning_rate": 2.211e-06, "loss": 0.9873, "step": 740 }, { "epoch": 1.79951397326853, "grad_norm": 3.9623124599456787, "learning_rate": 2.2140000000000003e-06, "loss": 1.0128, "step": 741 }, { "epoch": 1.8019441069258808, "grad_norm": 12.217218399047852, "learning_rate": 2.2169999999999997e-06, "loss": 0.9272, "step": 742 }, { "epoch": 1.8043742405832321, "grad_norm": 4.845147132873535, "learning_rate": 2.22e-06, "loss": 0.9164, "step": 743 }, { "epoch": 1.8068043742405833, "grad_norm": 3.788195848464966, "learning_rate": 2.223e-06, "loss": 0.981, "step": 744 }, { "epoch": 1.8092345078979344, "grad_norm": 13.96584415435791, "learning_rate": 2.226e-06, "loss": 0.9389, "step": 745 }, { "epoch": 1.8116646415552855, "grad_norm": 2.5958080291748047, "learning_rate": 2.229e-06, "loss": 0.9143, "step": 746 }, { "epoch": 1.8140947752126366, "grad_norm": 12.140559196472168, "learning_rate": 2.232e-06, "loss": 0.9809, "step": 747 }, { "epoch": 1.816524908869988, "grad_norm": 4.834804058074951, 
"learning_rate": 2.2349999999999998e-06, "loss": 0.9475, "step": 748 }, { "epoch": 1.8189550425273389, "grad_norm": 5.581211566925049, "learning_rate": 2.238e-06, "loss": 0.901, "step": 749 }, { "epoch": 1.8213851761846902, "grad_norm": 21.87607192993164, "learning_rate": 2.2410000000000002e-06, "loss": 1.07, "step": 750 }, { "epoch": 1.8238153098420413, "grad_norm": 3.083911180496216, "learning_rate": 2.244e-06, "loss": 0.8767, "step": 751 }, { "epoch": 1.8262454434993924, "grad_norm": 3.539313554763794, "learning_rate": 2.247e-06, "loss": 0.9345, "step": 752 }, { "epoch": 1.8286755771567438, "grad_norm": 3.707838535308838, "learning_rate": 2.25e-06, "loss": 0.8898, "step": 753 }, { "epoch": 1.8311057108140947, "grad_norm": 14.506610870361328, "learning_rate": 2.253e-06, "loss": 0.9404, "step": 754 }, { "epoch": 1.833535844471446, "grad_norm": 4.04967737197876, "learning_rate": 2.256e-06, "loss": 0.8446, "step": 755 }, { "epoch": 1.8359659781287971, "grad_norm": 4.632888317108154, "learning_rate": 2.2590000000000003e-06, "loss": 0.8938, "step": 756 }, { "epoch": 1.8383961117861483, "grad_norm": 4.298732757568359, "learning_rate": 2.262e-06, "loss": 0.9226, "step": 757 }, { "epoch": 1.8408262454434994, "grad_norm": 8.76998233795166, "learning_rate": 2.265e-06, "loss": 1.0089, "step": 758 }, { "epoch": 1.8432563791008505, "grad_norm": 6.10404109954834, "learning_rate": 2.268e-06, "loss": 0.9816, "step": 759 }, { "epoch": 1.8456865127582018, "grad_norm": 5.368403911590576, "learning_rate": 2.2710000000000004e-06, "loss": 0.9661, "step": 760 }, { "epoch": 1.8481166464155527, "grad_norm": 6.429357528686523, "learning_rate": 2.274e-06, "loss": 1.0038, "step": 761 }, { "epoch": 1.850546780072904, "grad_norm": 7.713373184204102, "learning_rate": 2.277e-06, "loss": 1.1659, "step": 762 }, { "epoch": 1.8529769137302552, "grad_norm": 10.561009407043457, "learning_rate": 2.28e-06, "loss": 1.4581, "step": 763 }, { "epoch": 1.8554070473876063, "grad_norm": 6.183310508728027, 
"learning_rate": 2.283e-06, "loss": 1.1336, "step": 764 }, { "epoch": 1.8578371810449574, "grad_norm": 3.868321180343628, "learning_rate": 2.2860000000000002e-06, "loss": 0.9471, "step": 765 }, { "epoch": 1.8602673147023085, "grad_norm": 4.371630668640137, "learning_rate": 2.2890000000000004e-06, "loss": 1.069, "step": 766 }, { "epoch": 1.86269744835966, "grad_norm": 6.562849521636963, "learning_rate": 2.292e-06, "loss": 0.8467, "step": 767 }, { "epoch": 1.8651275820170108, "grad_norm": 3.998115301132202, "learning_rate": 2.295e-06, "loss": 0.7784, "step": 768 }, { "epoch": 1.8675577156743621, "grad_norm": 3.4796667098999023, "learning_rate": 2.2980000000000003e-06, "loss": 0.8935, "step": 769 }, { "epoch": 1.8699878493317132, "grad_norm": 3.401324987411499, "learning_rate": 2.301e-06, "loss": 0.8609, "step": 770 }, { "epoch": 1.8724179829890644, "grad_norm": 3.2404568195343018, "learning_rate": 2.304e-06, "loss": 0.7698, "step": 771 }, { "epoch": 1.8748481166464157, "grad_norm": 2.9399781227111816, "learning_rate": 2.307e-06, "loss": 0.7731, "step": 772 }, { "epoch": 1.8772782503037666, "grad_norm": 2.8002188205718994, "learning_rate": 2.31e-06, "loss": 0.8053, "step": 773 }, { "epoch": 1.879708383961118, "grad_norm": 3.547079563140869, "learning_rate": 2.313e-06, "loss": 0.8117, "step": 774 }, { "epoch": 1.882138517618469, "grad_norm": 10.204996109008789, "learning_rate": 2.3160000000000004e-06, "loss": 0.7633, "step": 775 }, { "epoch": 1.8845686512758202, "grad_norm": 2.8857924938201904, "learning_rate": 2.3189999999999997e-06, "loss": 0.8013, "step": 776 }, { "epoch": 1.8869987849331713, "grad_norm": 15.732056617736816, "learning_rate": 2.322e-06, "loss": 0.757, "step": 777 }, { "epoch": 1.8894289185905224, "grad_norm": 4.79892110824585, "learning_rate": 2.325e-06, "loss": 0.7572, "step": 778 }, { "epoch": 1.8918590522478738, "grad_norm": 2.907451629638672, "learning_rate": 2.328e-06, "loss": 0.7385, "step": 779 }, { "epoch": 1.8942891859052247, "grad_norm": 
4.335346698760986, "learning_rate": 2.3310000000000002e-06, "loss": 0.7252, "step": 780 }, { "epoch": 1.896719319562576, "grad_norm": 3.397822141647339, "learning_rate": 2.334e-06, "loss": 0.7149, "step": 781 }, { "epoch": 1.8991494532199271, "grad_norm": 3.6916708946228027, "learning_rate": 2.337e-06, "loss": 0.7193, "step": 782 }, { "epoch": 1.9015795868772782, "grad_norm": 2.296602964401245, "learning_rate": 2.34e-06, "loss": 0.7238, "step": 783 }, { "epoch": 1.9040097205346294, "grad_norm": 2.7498011589050293, "learning_rate": 2.3430000000000003e-06, "loss": 0.7279, "step": 784 }, { "epoch": 1.9064398541919805, "grad_norm": 14.292098999023438, "learning_rate": 2.346e-06, "loss": 0.7061, "step": 785 }, { "epoch": 1.9088699878493318, "grad_norm": 3.1998982429504395, "learning_rate": 2.349e-06, "loss": 0.7315, "step": 786 }, { "epoch": 1.9113001215066827, "grad_norm": 2.7436795234680176, "learning_rate": 2.352e-06, "loss": 0.7274, "step": 787 }, { "epoch": 1.913730255164034, "grad_norm": 3.9602432250976562, "learning_rate": 2.355e-06, "loss": 0.8358, "step": 788 }, { "epoch": 1.9161603888213852, "grad_norm": 3.794820547103882, "learning_rate": 2.358e-06, "loss": 0.7402, "step": 789 }, { "epoch": 1.9185905224787363, "grad_norm": 3.4426488876342773, "learning_rate": 2.3610000000000003e-06, "loss": 0.7906, "step": 790 }, { "epoch": 1.9210206561360876, "grad_norm": 3.5540738105773926, "learning_rate": 2.3639999999999997e-06, "loss": 0.6613, "step": 791 }, { "epoch": 1.9234507897934385, "grad_norm": 5.762589454650879, "learning_rate": 2.367e-06, "loss": 0.7522, "step": 792 }, { "epoch": 1.9258809234507899, "grad_norm": 3.4329633712768555, "learning_rate": 2.37e-06, "loss": 0.7533, "step": 793 }, { "epoch": 1.928311057108141, "grad_norm": 2.5704185962677, "learning_rate": 2.373e-06, "loss": 0.7031, "step": 794 }, { "epoch": 1.930741190765492, "grad_norm": 3.056565046310425, "learning_rate": 2.376e-06, "loss": 0.6743, "step": 795 }, { "epoch": 1.9331713244228432, 
"grad_norm": 3.8814802169799805, "learning_rate": 2.379e-06, "loss": 0.7355, "step": 796 }, { "epoch": 1.9356014580801943, "grad_norm": 6.883727073669434, "learning_rate": 2.382e-06, "loss": 0.8024, "step": 797 }, { "epoch": 1.9380315917375457, "grad_norm": 5.164430141448975, "learning_rate": 2.385e-06, "loss": 0.6759, "step": 798 }, { "epoch": 1.9404617253948966, "grad_norm": 3.5896027088165283, "learning_rate": 2.3880000000000003e-06, "loss": 0.6468, "step": 799 }, { "epoch": 1.942891859052248, "grad_norm": 2.940779685974121, "learning_rate": 2.391e-06, "loss": 0.687, "step": 800 }, { "epoch": 1.945321992709599, "grad_norm": 3.7467076778411865, "learning_rate": 2.394e-06, "loss": 0.6463, "step": 801 }, { "epoch": 1.9477521263669502, "grad_norm": 5.177464962005615, "learning_rate": 2.397e-06, "loss": 0.663, "step": 802 }, { "epoch": 1.9501822600243013, "grad_norm": 4.515866756439209, "learning_rate": 2.4000000000000003e-06, "loss": 0.7179, "step": 803 }, { "epoch": 1.9526123936816524, "grad_norm": 4.266791343688965, "learning_rate": 2.403e-06, "loss": 0.6684, "step": 804 }, { "epoch": 1.9550425273390037, "grad_norm": 4.405402660369873, "learning_rate": 2.406e-06, "loss": 0.7095, "step": 805 }, { "epoch": 1.9574726609963546, "grad_norm": 4.859936237335205, "learning_rate": 2.409e-06, "loss": 0.6737, "step": 806 }, { "epoch": 1.959902794653706, "grad_norm": 4.073750972747803, "learning_rate": 2.412e-06, "loss": 0.7564, "step": 807 }, { "epoch": 1.962332928311057, "grad_norm": 4.612713813781738, "learning_rate": 2.415e-06, "loss": 0.7162, "step": 808 }, { "epoch": 1.9647630619684082, "grad_norm": 3.712541103363037, "learning_rate": 2.4180000000000004e-06, "loss": 0.7148, "step": 809 }, { "epoch": 1.9671931956257596, "grad_norm": 5.989622116088867, "learning_rate": 2.4209999999999998e-06, "loss": 0.7867, "step": 810 }, { "epoch": 1.9696233292831105, "grad_norm": 9.14027214050293, "learning_rate": 2.424e-06, "loss": 0.7717, "step": 811 }, { "epoch": 1.9720534629404618, 
"grad_norm": 7.128634452819824, "learning_rate": 2.4270000000000002e-06, "loss": 0.9021, "step": 812 }, { "epoch": 1.974483596597813, "grad_norm": 7.919500827789307, "learning_rate": 2.43e-06, "loss": 1.0978, "step": 813 }, { "epoch": 1.976913730255164, "grad_norm": 3.6138720512390137, "learning_rate": 2.4330000000000003e-06, "loss": 0.8114, "step": 814 }, { "epoch": 1.9793438639125152, "grad_norm": 3.5672717094421387, "learning_rate": 2.436e-06, "loss": 0.6551, "step": 815 }, { "epoch": 1.9817739975698663, "grad_norm": 5.117175579071045, "learning_rate": 2.439e-06, "loss": 0.6168, "step": 816 }, { "epoch": 1.9842041312272176, "grad_norm": 3.5084221363067627, "learning_rate": 2.442e-06, "loss": 0.6025, "step": 817 }, { "epoch": 1.9866342648845685, "grad_norm": 4.990915298461914, "learning_rate": 2.4450000000000003e-06, "loss": 0.6138, "step": 818 }, { "epoch": 1.9890643985419199, "grad_norm": 3.4831228256225586, "learning_rate": 2.448e-06, "loss": 0.6875, "step": 819 }, { "epoch": 1.991494532199271, "grad_norm": 5.558478355407715, "learning_rate": 2.451e-06, "loss": 0.6514, "step": 820 }, { "epoch": 1.993924665856622, "grad_norm": 4.0786051750183105, "learning_rate": 2.454e-06, "loss": 0.6226, "step": 821 }, { "epoch": 1.9963547995139734, "grad_norm": 9.433616638183594, "learning_rate": 2.457e-06, "loss": 0.6742, "step": 822 }, { "epoch": 1.9987849331713243, "grad_norm": 7.177051544189453, "learning_rate": 2.46e-06, "loss": 0.692, "step": 823 }, { "epoch": 2.0, "grad_norm": 3.238696336746216, "learning_rate": 2.4630000000000004e-06, "loss": 0.4198, "step": 824 }, { "epoch": 2.0024301336573513, "grad_norm": 11.250473976135254, "learning_rate": 2.4659999999999998e-06, "loss": 1.4037, "step": 825 }, { "epoch": 2.0048602673147022, "grad_norm": 6.885899066925049, "learning_rate": 2.469e-06, "loss": 0.9875, "step": 826 }, { "epoch": 2.0072904009720536, "grad_norm": 5.102292060852051, "learning_rate": 2.4720000000000002e-06, "loss": 0.9746, "step": 827 }, { "epoch": 
2.0097205346294045, "grad_norm": 6.817753314971924, "learning_rate": 2.475e-06, "loss": 0.8824, "step": 828 }, { "epoch": 2.012150668286756, "grad_norm": 4.960653305053711, "learning_rate": 2.4780000000000002e-06, "loss": 0.8081, "step": 829 }, { "epoch": 2.0145808019441067, "grad_norm": 6.951458930969238, "learning_rate": 2.481e-06, "loss": 0.6951, "step": 830 }, { "epoch": 2.017010935601458, "grad_norm": 9.949361801147461, "learning_rate": 2.484e-06, "loss": 0.7953, "step": 831 }, { "epoch": 2.0194410692588094, "grad_norm": 6.487469673156738, "learning_rate": 2.487e-06, "loss": 0.6111, "step": 832 }, { "epoch": 2.0218712029161603, "grad_norm": 5.125680923461914, "learning_rate": 2.4900000000000003e-06, "loss": 0.6482, "step": 833 }, { "epoch": 2.0243013365735116, "grad_norm": 3.4573090076446533, "learning_rate": 2.4929999999999997e-06, "loss": 0.6065, "step": 834 }, { "epoch": 2.0267314702308625, "grad_norm": 5.3875203132629395, "learning_rate": 2.496e-06, "loss": 0.5866, "step": 835 }, { "epoch": 2.029161603888214, "grad_norm": 7.124242305755615, "learning_rate": 2.499e-06, "loss": 0.5753, "step": 836 }, { "epoch": 2.031591737545565, "grad_norm": 7.795366287231445, "learning_rate": 2.502e-06, "loss": 0.6348, "step": 837 }, { "epoch": 2.034021871202916, "grad_norm": 2.5310893058776855, "learning_rate": 2.505e-06, "loss": 0.5747, "step": 838 }, { "epoch": 2.0364520048602675, "grad_norm": 2.4720869064331055, "learning_rate": 2.508e-06, "loss": 0.5585, "step": 839 }, { "epoch": 2.0388821385176183, "grad_norm": 3.081408977508545, "learning_rate": 2.5109999999999998e-06, "loss": 0.599, "step": 840 }, { "epoch": 2.0413122721749697, "grad_norm": 6.893228054046631, "learning_rate": 2.514e-06, "loss": 0.5665, "step": 841 }, { "epoch": 2.0437424058323206, "grad_norm": 3.589545726776123, "learning_rate": 2.517e-06, "loss": 0.5722, "step": 842 }, { "epoch": 2.046172539489672, "grad_norm": 2.9531593322753906, "learning_rate": 2.52e-06, "loss": 0.526, "step": 843 }, { "epoch": 
2.0486026731470233, "grad_norm": 3.123610496520996, "learning_rate": 2.523e-06, "loss": 0.644, "step": 844 }, { "epoch": 2.051032806804374, "grad_norm": 3.4662842750549316, "learning_rate": 2.526e-06, "loss": 0.5235, "step": 845 }, { "epoch": 2.0534629404617255, "grad_norm": 7.513754844665527, "learning_rate": 2.5290000000000003e-06, "loss": 0.5989, "step": 846 }, { "epoch": 2.0558930741190764, "grad_norm": 4.457220554351807, "learning_rate": 2.532e-06, "loss": 0.6013, "step": 847 }, { "epoch": 2.0583232077764277, "grad_norm": 18.123184204101562, "learning_rate": 2.5350000000000003e-06, "loss": 0.5775, "step": 848 }, { "epoch": 2.0607533414337786, "grad_norm": 3.0328845977783203, "learning_rate": 2.538e-06, "loss": 0.6347, "step": 849 }, { "epoch": 2.06318347509113, "grad_norm": 3.341778039932251, "learning_rate": 2.541e-06, "loss": 0.5824, "step": 850 }, { "epoch": 2.0656136087484813, "grad_norm": 3.3516008853912354, "learning_rate": 2.544e-06, "loss": 0.5873, "step": 851 }, { "epoch": 2.068043742405832, "grad_norm": 3.7426531314849854, "learning_rate": 2.5470000000000003e-06, "loss": 0.5951, "step": 852 }, { "epoch": 2.0704738760631836, "grad_norm": 3.1536941528320312, "learning_rate": 2.55e-06, "loss": 0.5414, "step": 853 }, { "epoch": 2.0729040097205345, "grad_norm": 2.5540003776550293, "learning_rate": 2.553e-06, "loss": 0.5252, "step": 854 }, { "epoch": 2.075334143377886, "grad_norm": 3.9193694591522217, "learning_rate": 2.556e-06, "loss": 0.536, "step": 855 }, { "epoch": 2.077764277035237, "grad_norm": 2.868220329284668, "learning_rate": 2.559e-06, "loss": 0.5614, "step": 856 }, { "epoch": 2.080194410692588, "grad_norm": 3.4670217037200928, "learning_rate": 2.562e-06, "loss": 0.547, "step": 857 }, { "epoch": 2.0826245443499394, "grad_norm": 5.205864429473877, "learning_rate": 2.5650000000000004e-06, "loss": 0.5414, "step": 858 }, { "epoch": 2.0850546780072903, "grad_norm": 4.0570759773254395, "learning_rate": 2.568e-06, "loss": 0.5442, "step": 859 }, { 
"epoch": 2.0874848116646416, "grad_norm": 8.245138168334961, "learning_rate": 2.571e-06, "loss": 0.5419, "step": 860 }, { "epoch": 2.0899149453219925, "grad_norm": 5.74954080581665, "learning_rate": 2.5740000000000003e-06, "loss": 0.6016, "step": 861 }, { "epoch": 2.092345078979344, "grad_norm": 5.3258891105651855, "learning_rate": 2.577e-06, "loss": 0.6347, "step": 862 }, { "epoch": 2.094775212636695, "grad_norm": 4.383651256561279, "learning_rate": 2.58e-06, "loss": 0.6243, "step": 863 }, { "epoch": 2.097205346294046, "grad_norm": 3.89263916015625, "learning_rate": 2.583e-06, "loss": 0.5283, "step": 864 }, { "epoch": 2.0996354799513974, "grad_norm": 7.676142692565918, "learning_rate": 2.586e-06, "loss": 0.604, "step": 865 }, { "epoch": 2.1020656136087483, "grad_norm": 4.346503734588623, "learning_rate": 2.589e-06, "loss": 0.5999, "step": 866 }, { "epoch": 2.1044957472660997, "grad_norm": 5.87732458114624, "learning_rate": 2.5920000000000003e-06, "loss": 0.5462, "step": 867 }, { "epoch": 2.106925880923451, "grad_norm": 12.832000732421875, "learning_rate": 2.5949999999999997e-06, "loss": 0.597, "step": 868 }, { "epoch": 2.109356014580802, "grad_norm": 4.22803258895874, "learning_rate": 2.598e-06, "loss": 0.6131, "step": 869 }, { "epoch": 2.1117861482381532, "grad_norm": 4.015406131744385, "learning_rate": 2.601e-06, "loss": 0.6044, "step": 870 }, { "epoch": 2.114216281895504, "grad_norm": 4.157554626464844, "learning_rate": 2.604e-06, "loss": 0.6083, "step": 871 }, { "epoch": 2.1166464155528555, "grad_norm": 6.096071243286133, "learning_rate": 2.607e-06, "loss": 0.6066, "step": 872 }, { "epoch": 2.1190765492102064, "grad_norm": 8.83604621887207, "learning_rate": 2.61e-06, "loss": 0.7081, "step": 873 }, { "epoch": 2.1215066828675577, "grad_norm": 6.443227767944336, "learning_rate": 2.613e-06, "loss": 0.7104, "step": 874 }, { "epoch": 2.123936816524909, "grad_norm": 11.642943382263184, "learning_rate": 2.616e-06, "loss": 1.3211, "step": 875 }, { "epoch": 
2.12636695018226, "grad_norm": 7.302554607391357, "learning_rate": 2.6190000000000003e-06, "loss": 0.9668, "step": 876 }, { "epoch": 2.1287970838396113, "grad_norm": 3.389618396759033, "learning_rate": 2.622e-06, "loss": 0.7635, "step": 877 }, { "epoch": 2.131227217496962, "grad_norm": 5.583101272583008, "learning_rate": 2.625e-06, "loss": 1.0029, "step": 878 }, { "epoch": 2.1336573511543135, "grad_norm": 2.8969790935516357, "learning_rate": 2.628e-06, "loss": 0.705, "step": 879 }, { "epoch": 2.1360874848116644, "grad_norm": 9.254556655883789, "learning_rate": 2.631e-06, "loss": 0.6187, "step": 880 }, { "epoch": 2.138517618469016, "grad_norm": 4.637352466583252, "learning_rate": 2.634e-06, "loss": 0.6505, "step": 881 }, { "epoch": 2.140947752126367, "grad_norm": 4.1549835205078125, "learning_rate": 2.6370000000000003e-06, "loss": 0.601, "step": 882 }, { "epoch": 2.143377885783718, "grad_norm": 10.071443557739258, "learning_rate": 2.6399999999999997e-06, "loss": 0.6989, "step": 883 }, { "epoch": 2.1458080194410694, "grad_norm": 3.790875196456909, "learning_rate": 2.643e-06, "loss": 0.53, "step": 884 }, { "epoch": 2.1482381530984203, "grad_norm": 9.807236671447754, "learning_rate": 2.646e-06, "loss": 0.5325, "step": 885 }, { "epoch": 2.1506682867557716, "grad_norm": 1.9488499164581299, "learning_rate": 2.649e-06, "loss": 0.5069, "step": 886 }, { "epoch": 2.153098420413123, "grad_norm": 4.001325607299805, "learning_rate": 2.652e-06, "loss": 0.5486, "step": 887 }, { "epoch": 2.155528554070474, "grad_norm": 3.0900845527648926, "learning_rate": 2.655e-06, "loss": 0.5144, "step": 888 }, { "epoch": 2.157958687727825, "grad_norm": 2.1842591762542725, "learning_rate": 2.6580000000000002e-06, "loss": 0.5105, "step": 889 }, { "epoch": 2.160388821385176, "grad_norm": 2.5840537548065186, "learning_rate": 2.661e-06, "loss": 0.4812, "step": 890 }, { "epoch": 2.1628189550425274, "grad_norm": 3.032200574874878, "learning_rate": 2.6640000000000002e-06, "loss": 0.4701, "step": 891 }, 
{ "epoch": 2.1652490886998783, "grad_norm": 5.81608247756958, "learning_rate": 2.6670000000000005e-06, "loss": 0.5099, "step": 892 }, { "epoch": 2.1676792223572297, "grad_norm": 3.3536598682403564, "learning_rate": 2.67e-06, "loss": 0.5249, "step": 893 }, { "epoch": 2.170109356014581, "grad_norm": 2.6235575675964355, "learning_rate": 2.673e-06, "loss": 0.4736, "step": 894 }, { "epoch": 2.172539489671932, "grad_norm": 7.285075664520264, "learning_rate": 2.6760000000000003e-06, "loss": 0.5453, "step": 895 }, { "epoch": 2.1749696233292832, "grad_norm": 2.805095911026001, "learning_rate": 2.679e-06, "loss": 0.48, "step": 896 }, { "epoch": 2.177399756986634, "grad_norm": 3.503758668899536, "learning_rate": 2.682e-06, "loss": 0.5044, "step": 897 }, { "epoch": 2.1798298906439855, "grad_norm": 2.281181573867798, "learning_rate": 2.685e-06, "loss": 0.4625, "step": 898 }, { "epoch": 2.1822600243013364, "grad_norm": 5.516132831573486, "learning_rate": 2.688e-06, "loss": 0.5299, "step": 899 }, { "epoch": 2.1846901579586877, "grad_norm": 2.226823568344116, "learning_rate": 2.691e-06, "loss": 0.4844, "step": 900 }, { "epoch": 2.187120291616039, "grad_norm": 2.242321729660034, "learning_rate": 2.6940000000000004e-06, "loss": 0.5399, "step": 901 }, { "epoch": 2.18955042527339, "grad_norm": 2.163339138031006, "learning_rate": 2.6969999999999998e-06, "loss": 0.4939, "step": 902 }, { "epoch": 2.1919805589307413, "grad_norm": 3.076960325241089, "learning_rate": 2.7e-06, "loss": 0.474, "step": 903 }, { "epoch": 2.194410692588092, "grad_norm": 3.572941303253174, "learning_rate": 2.703e-06, "loss": 0.5233, "step": 904 }, { "epoch": 2.1968408262454435, "grad_norm": 10.969109535217285, "learning_rate": 2.706e-06, "loss": 0.5558, "step": 905 }, { "epoch": 2.199270959902795, "grad_norm": 4.146024703979492, "learning_rate": 2.7090000000000002e-06, "loss": 0.5213, "step": 906 }, { "epoch": 2.2017010935601458, "grad_norm": 2.7457730770111084, "learning_rate": 2.712e-06, "loss": 0.5147, "step": 
907 }, { "epoch": 2.204131227217497, "grad_norm": 2.7947912216186523, "learning_rate": 2.715e-06, "loss": 0.5307, "step": 908 }, { "epoch": 2.206561360874848, "grad_norm": 5.983397006988525, "learning_rate": 2.718e-06, "loss": 0.6368, "step": 909 }, { "epoch": 2.2089914945321993, "grad_norm": 6.3333845138549805, "learning_rate": 2.7210000000000003e-06, "loss": 0.523, "step": 910 }, { "epoch": 2.2114216281895502, "grad_norm": 2.856820583343506, "learning_rate": 2.724e-06, "loss": 0.4311, "step": 911 }, { "epoch": 2.2138517618469016, "grad_norm": 4.76262092590332, "learning_rate": 2.727e-06, "loss": 0.6109, "step": 912 }, { "epoch": 2.216281895504253, "grad_norm": 3.9213063716888428, "learning_rate": 2.73e-06, "loss": 0.4831, "step": 913 }, { "epoch": 2.218712029161604, "grad_norm": 6.559173583984375, "learning_rate": 2.733e-06, "loss": 0.5179, "step": 914 }, { "epoch": 2.221142162818955, "grad_norm": 5.526670932769775, "learning_rate": 2.736e-06, "loss": 0.5468, "step": 915 }, { "epoch": 2.223572296476306, "grad_norm": 4.020683288574219, "learning_rate": 2.7390000000000004e-06, "loss": 0.5057, "step": 916 }, { "epoch": 2.2260024301336574, "grad_norm": 5.4078545570373535, "learning_rate": 2.7419999999999998e-06, "loss": 0.5229, "step": 917 }, { "epoch": 2.2284325637910083, "grad_norm": 4.840234279632568, "learning_rate": 2.745e-06, "loss": 0.5077, "step": 918 }, { "epoch": 2.2308626974483596, "grad_norm": 5.006260395050049, "learning_rate": 2.748e-06, "loss": 0.5304, "step": 919 }, { "epoch": 2.233292831105711, "grad_norm": 4.206860065460205, "learning_rate": 2.751e-06, "loss": 0.5504, "step": 920 }, { "epoch": 2.235722964763062, "grad_norm": 4.576345920562744, "learning_rate": 2.7540000000000002e-06, "loss": 0.5324, "step": 921 }, { "epoch": 2.238153098420413, "grad_norm": 4.452962875366211, "learning_rate": 2.757e-06, "loss": 0.5901, "step": 922 }, { "epoch": 2.240583232077764, "grad_norm": 3.953855037689209, "learning_rate": 2.76e-06, "loss": 0.5736, "step": 923 
}, { "epoch": 2.2430133657351154, "grad_norm": 5.158233642578125, "learning_rate": 2.763e-06, "loss": 0.6062, "step": 924 }, { "epoch": 2.245443499392467, "grad_norm": 9.964811325073242, "learning_rate": 2.7660000000000003e-06, "loss": 1.3221, "step": 925 }, { "epoch": 2.2478736330498177, "grad_norm": 6.90237283706665, "learning_rate": 2.7689999999999997e-06, "loss": 1.0228, "step": 926 }, { "epoch": 2.250303766707169, "grad_norm": 9.699655532836914, "learning_rate": 2.772e-06, "loss": 0.9066, "step": 927 }, { "epoch": 2.25273390036452, "grad_norm": 3.2245190143585205, "learning_rate": 2.775e-06, "loss": 0.7098, "step": 928 }, { "epoch": 2.2551640340218713, "grad_norm": 4.761231899261475, "learning_rate": 2.778e-06, "loss": 0.6936, "step": 929 }, { "epoch": 2.257594167679222, "grad_norm": 3.542588233947754, "learning_rate": 2.781e-06, "loss": 0.7102, "step": 930 }, { "epoch": 2.2600243013365735, "grad_norm": 3.3549246788024902, "learning_rate": 2.784e-06, "loss": 0.5893, "step": 931 }, { "epoch": 2.262454434993925, "grad_norm": 2.550562858581543, "learning_rate": 2.787e-06, "loss": 0.5549, "step": 932 }, { "epoch": 2.2648845686512757, "grad_norm": 2.5339972972869873, "learning_rate": 2.79e-06, "loss": 0.5127, "step": 933 }, { "epoch": 2.267314702308627, "grad_norm": 3.044750213623047, "learning_rate": 2.793e-06, "loss": 0.5097, "step": 934 }, { "epoch": 2.269744835965978, "grad_norm": 3.116286039352417, "learning_rate": 2.7960000000000004e-06, "loss": 0.4598, "step": 935 }, { "epoch": 2.2721749696233293, "grad_norm": 4.100244522094727, "learning_rate": 2.799e-06, "loss": 0.5243, "step": 936 }, { "epoch": 2.27460510328068, "grad_norm": 4.244337558746338, "learning_rate": 2.802e-06, "loss": 0.4508, "step": 937 }, { "epoch": 2.2770352369380316, "grad_norm": 3.1715073585510254, "learning_rate": 2.8050000000000002e-06, "loss": 0.4497, "step": 938 }, { "epoch": 2.279465370595383, "grad_norm": 4.405244827270508, "learning_rate": 2.808e-06, "loss": 0.4714, "step": 939 }, { 
"epoch": 2.281895504252734, "grad_norm": 3.573509693145752, "learning_rate": 2.8110000000000003e-06, "loss": 0.4685, "step": 940 }, { "epoch": 2.284325637910085, "grad_norm": 2.6045894622802734, "learning_rate": 2.814e-06, "loss": 0.5146, "step": 941 }, { "epoch": 2.286755771567436, "grad_norm": 2.6302714347839355, "learning_rate": 2.817e-06, "loss": 0.4719, "step": 942 }, { "epoch": 2.2891859052247874, "grad_norm": 2.3109419345855713, "learning_rate": 2.82e-06, "loss": 0.4716, "step": 943 }, { "epoch": 2.2916160388821387, "grad_norm": 4.517823219299316, "learning_rate": 2.8230000000000003e-06, "loss": 0.4504, "step": 944 }, { "epoch": 2.2940461725394896, "grad_norm": 2.80881929397583, "learning_rate": 2.826e-06, "loss": 0.4881, "step": 945 }, { "epoch": 2.296476306196841, "grad_norm": 3.566917657852173, "learning_rate": 2.829e-06, "loss": 0.4509, "step": 946 }, { "epoch": 2.298906439854192, "grad_norm": 2.5469820499420166, "learning_rate": 2.832e-06, "loss": 0.422, "step": 947 }, { "epoch": 2.301336573511543, "grad_norm": 3.1923375129699707, "learning_rate": 2.835e-06, "loss": 0.4466, "step": 948 }, { "epoch": 2.3037667071688945, "grad_norm": 2.4189631938934326, "learning_rate": 2.838e-06, "loss": 0.4508, "step": 949 }, { "epoch": 2.3061968408262454, "grad_norm": 3.35990309715271, "learning_rate": 2.8410000000000004e-06, "loss": 0.4103, "step": 950 }, { "epoch": 2.3086269744835968, "grad_norm": 2.5451204776763916, "learning_rate": 2.844e-06, "loss": 0.4412, "step": 951 }, { "epoch": 2.3110571081409477, "grad_norm": 4.278253078460693, "learning_rate": 2.847e-06, "loss": 0.5045, "step": 952 }, { "epoch": 2.313487241798299, "grad_norm": 2.3095016479492188, "learning_rate": 2.8500000000000002e-06, "loss": 0.4487, "step": 953 }, { "epoch": 2.31591737545565, "grad_norm": 2.7947046756744385, "learning_rate": 2.853e-06, "loss": 0.4252, "step": 954 }, { "epoch": 2.3183475091130012, "grad_norm": 10.389890670776367, "learning_rate": 2.8560000000000003e-06, "loss": 0.4594, 
"step": 955 }, { "epoch": 2.320777642770352, "grad_norm": 3.5948009490966797, "learning_rate": 2.859e-06, "loss": 0.4907, "step": 956 }, { "epoch": 2.3232077764277035, "grad_norm": 3.6899967193603516, "learning_rate": 2.862e-06, "loss": 0.4107, "step": 957 }, { "epoch": 2.325637910085055, "grad_norm": 3.2929131984710693, "learning_rate": 2.865e-06, "loss": 0.5717, "step": 958 }, { "epoch": 2.3280680437424057, "grad_norm": 2.482360601425171, "learning_rate": 2.8680000000000003e-06, "loss": 0.4925, "step": 959 }, { "epoch": 2.330498177399757, "grad_norm": 3.624401330947876, "learning_rate": 2.8709999999999997e-06, "loss": 0.3971, "step": 960 }, { "epoch": 2.332928311057108, "grad_norm": 7.128837585449219, "learning_rate": 2.874e-06, "loss": 0.4353, "step": 961 }, { "epoch": 2.3353584447144593, "grad_norm": 5.022778511047363, "learning_rate": 2.877e-06, "loss": 0.4228, "step": 962 }, { "epoch": 2.3377885783718106, "grad_norm": 2.9282495975494385, "learning_rate": 2.88e-06, "loss": 0.4419, "step": 963 }, { "epoch": 2.3402187120291615, "grad_norm": 7.120631694793701, "learning_rate": 2.883e-06, "loss": 0.4864, "step": 964 }, { "epoch": 2.342648845686513, "grad_norm": 2.82422137260437, "learning_rate": 2.886e-06, "loss": 0.5007, "step": 965 }, { "epoch": 2.345078979343864, "grad_norm": 3.0093977451324463, "learning_rate": 2.8889999999999998e-06, "loss": 0.4292, "step": 966 }, { "epoch": 2.347509113001215, "grad_norm": 4.215259075164795, "learning_rate": 2.892e-06, "loss": 0.4493, "step": 967 }, { "epoch": 2.3499392466585665, "grad_norm": 8.690277099609375, "learning_rate": 2.8950000000000002e-06, "loss": 0.5145, "step": 968 }, { "epoch": 2.3523693803159174, "grad_norm": 5.085582733154297, "learning_rate": 2.898e-06, "loss": 0.551, "step": 969 }, { "epoch": 2.3547995139732687, "grad_norm": 15.8950777053833, "learning_rate": 2.901e-06, "loss": 0.6018, "step": 970 }, { "epoch": 2.3572296476306196, "grad_norm": 3.654742479324341, "learning_rate": 2.904e-06, "loss": 0.5286, 
"step": 971 }, { "epoch": 2.359659781287971, "grad_norm": 7.53311014175415, "learning_rate": 2.907e-06, "loss": 0.5676, "step": 972 }, { "epoch": 2.362089914945322, "grad_norm": 5.470914840698242, "learning_rate": 2.91e-06, "loss": 0.5234, "step": 973 }, { "epoch": 2.364520048602673, "grad_norm": 6.295107841491699, "learning_rate": 2.9130000000000003e-06, "loss": 0.7664, "step": 974 }, { "epoch": 2.366950182260024, "grad_norm": 10.699466705322266, "learning_rate": 2.916e-06, "loss": 1.2089, "step": 975 }, { "epoch": 2.3693803159173754, "grad_norm": 6.470330238342285, "learning_rate": 2.919e-06, "loss": 1.0036, "step": 976 }, { "epoch": 2.3718104495747268, "grad_norm": 4.232030868530273, "learning_rate": 2.922e-06, "loss": 0.7885, "step": 977 }, { "epoch": 2.3742405832320777, "grad_norm": 3.0989222526550293, "learning_rate": 2.9250000000000004e-06, "loss": 0.7166, "step": 978 }, { "epoch": 2.376670716889429, "grad_norm": 5.3148980140686035, "learning_rate": 2.928e-06, "loss": 0.7048, "step": 979 }, { "epoch": 2.37910085054678, "grad_norm": 6.840015888214111, "learning_rate": 2.931e-06, "loss": 0.6055, "step": 980 }, { "epoch": 2.3815309842041312, "grad_norm": 4.354755878448486, "learning_rate": 2.934e-06, "loss": 0.5131, "step": 981 }, { "epoch": 2.3839611178614826, "grad_norm": 8.380233764648438, "learning_rate": 2.937e-06, "loss": 0.4791, "step": 982 }, { "epoch": 2.3863912515188335, "grad_norm": 2.658780336380005, "learning_rate": 2.9400000000000002e-06, "loss": 0.4705, "step": 983 }, { "epoch": 2.388821385176185, "grad_norm": 6.274712562561035, "learning_rate": 2.9430000000000005e-06, "loss": 0.406, "step": 984 }, { "epoch": 2.3912515188335357, "grad_norm": 8.415331840515137, "learning_rate": 2.946e-06, "loss": 0.4435, "step": 985 }, { "epoch": 2.393681652490887, "grad_norm": 4.11092472076416, "learning_rate": 2.949e-06, "loss": 0.4381, "step": 986 }, { "epoch": 2.3961117861482384, "grad_norm": 2.158395528793335, "learning_rate": 2.9520000000000003e-06, "loss": 
0.4085, "step": 987 }, { "epoch": 2.3985419198055893, "grad_norm": 2.0115654468536377, "learning_rate": 2.955e-06, "loss": 0.4212, "step": 988 }, { "epoch": 2.4009720534629406, "grad_norm": 3.024912118911743, "learning_rate": 2.958e-06, "loss": 0.4509, "step": 989 }, { "epoch": 2.4034021871202915, "grad_norm": 2.589916706085205, "learning_rate": 2.961e-06, "loss": 0.424, "step": 990 }, { "epoch": 2.405832320777643, "grad_norm": 5.639590740203857, "learning_rate": 2.964e-06, "loss": 0.4604, "step": 991 }, { "epoch": 2.4082624544349938, "grad_norm": 14.912527084350586, "learning_rate": 2.967e-06, "loss": 0.4413, "step": 992 }, { "epoch": 2.410692588092345, "grad_norm": 4.136866569519043, "learning_rate": 2.9700000000000004e-06, "loss": 0.3943, "step": 993 }, { "epoch": 2.413122721749696, "grad_norm": 2.4516818523406982, "learning_rate": 2.9729999999999997e-06, "loss": 0.3877, "step": 994 }, { "epoch": 2.4155528554070473, "grad_norm": 2.407127618789673, "learning_rate": 2.976e-06, "loss": 0.3954, "step": 995 }, { "epoch": 2.4179829890643987, "grad_norm": 2.28560209274292, "learning_rate": 2.979e-06, "loss": 0.415, "step": 996 }, { "epoch": 2.4204131227217496, "grad_norm": 2.4941906929016113, "learning_rate": 2.982e-06, "loss": 0.3824, "step": 997 }, { "epoch": 2.422843256379101, "grad_norm": 2.9037418365478516, "learning_rate": 2.9850000000000002e-06, "loss": 0.4026, "step": 998 }, { "epoch": 2.425273390036452, "grad_norm": 2.0882151126861572, "learning_rate": 2.988e-06, "loss": 0.409, "step": 999 }, { "epoch": 2.427703523693803, "grad_norm": 6.898071765899658, "learning_rate": 2.991e-06, "loss": 0.4378, "step": 1000 }, { "epoch": 2.427703523693803, "eval_cer": 0.1906674162717854, "eval_loss": 0.7759953737258911, "eval_runtime": 7.9533, "eval_samples_per_second": 12.699, "eval_steps_per_second": 0.503, "eval_wer": 0.5952380952380952, "step": 1000 }, { "epoch": 2.4301336573511545, "grad_norm": 2.0185115337371826, "learning_rate": 2.994e-06, "loss": 0.4082, "step": 1001 
}, { "epoch": 2.4325637910085054, "grad_norm": 3.7299978733062744, "learning_rate": 2.9970000000000003e-06, "loss": 0.4192, "step": 1002 }, { "epoch": 2.4349939246658567, "grad_norm": 3.8798251152038574, "learning_rate": 3e-06, "loss": 0.3664, "step": 1003 }, { "epoch": 2.4374240583232076, "grad_norm": 2.136995792388916, "learning_rate": 3.003e-06, "loss": 0.4008, "step": 1004 }, { "epoch": 2.439854191980559, "grad_norm": 3.3633031845092773, "learning_rate": 3.006e-06, "loss": 0.4212, "step": 1005 }, { "epoch": 2.4422843256379103, "grad_norm": 2.408897638320923, "learning_rate": 3.009e-06, "loss": 0.4019, "step": 1006 }, { "epoch": 2.444714459295261, "grad_norm": 3.1437642574310303, "learning_rate": 3.012e-06, "loss": 0.3915, "step": 1007 }, { "epoch": 2.4471445929526126, "grad_norm": 2.9299182891845703, "learning_rate": 3.0150000000000004e-06, "loss": 0.4194, "step": 1008 }, { "epoch": 2.4495747266099634, "grad_norm": 3.451258659362793, "learning_rate": 3.0179999999999997e-06, "loss": 0.4585, "step": 1009 }, { "epoch": 2.452004860267315, "grad_norm": 3.2542076110839844, "learning_rate": 3.021e-06, "loss": 0.4541, "step": 1010 }, { "epoch": 2.4544349939246657, "grad_norm": 3.2307345867156982, "learning_rate": 3.024e-06, "loss": 0.443, "step": 1011 }, { "epoch": 2.456865127582017, "grad_norm": 3.068436861038208, "learning_rate": 3.027e-06, "loss": 0.4271, "step": 1012 }, { "epoch": 2.459295261239368, "grad_norm": 3.355825901031494, "learning_rate": 3.0300000000000002e-06, "loss": 0.4421, "step": 1013 }, { "epoch": 2.4617253948967193, "grad_norm": 3.126253843307495, "learning_rate": 3.033e-06, "loss": 0.4678, "step": 1014 }, { "epoch": 2.4641555285540706, "grad_norm": 3.1999881267547607, "learning_rate": 3.036e-06, "loss": 0.4145, "step": 1015 }, { "epoch": 2.4665856622114215, "grad_norm": 4.64090633392334, "learning_rate": 3.039e-06, "loss": 0.4093, "step": 1016 }, { "epoch": 2.469015795868773, "grad_norm": 3.9662094116210938, "learning_rate": 
3.0420000000000003e-06, "loss": 0.4235, "step": 1017 }, { "epoch": 2.4714459295261237, "grad_norm": 3.07952880859375, "learning_rate": 3.0450000000000005e-06, "loss": 0.4827, "step": 1018 }, { "epoch": 2.473876063183475, "grad_norm": 4.1216535568237305, "learning_rate": 3.048e-06, "loss": 0.4236, "step": 1019 }, { "epoch": 2.4763061968408264, "grad_norm": 3.968137502670288, "learning_rate": 3.051e-06, "loss": 0.5121, "step": 1020 }, { "epoch": 2.4787363304981773, "grad_norm": 6.021745681762695, "learning_rate": 3.0540000000000003e-06, "loss": 0.4982, "step": 1021 }, { "epoch": 2.4811664641555287, "grad_norm": 4.9343953132629395, "learning_rate": 3.057e-06, "loss": 0.5809, "step": 1022 }, { "epoch": 2.4835965978128796, "grad_norm": 4.141907691955566, "learning_rate": 3.06e-06, "loss": 0.5144, "step": 1023 }, { "epoch": 2.486026731470231, "grad_norm": 6.980617046356201, "learning_rate": 3.063e-06, "loss": 0.6966, "step": 1024 }, { "epoch": 2.4884568651275822, "grad_norm": 11.589492797851562, "learning_rate": 3.066e-06, "loss": 1.2888, "step": 1025 }, { "epoch": 2.490886998784933, "grad_norm": 9.175788879394531, "learning_rate": 3.069e-06, "loss": 1.0648, "step": 1026 }, { "epoch": 2.4933171324422845, "grad_norm": 5.728570461273193, "learning_rate": 3.0720000000000004e-06, "loss": 0.8014, "step": 1027 }, { "epoch": 2.4957472660996354, "grad_norm": 2.7607851028442383, "learning_rate": 3.0749999999999998e-06, "loss": 0.748, "step": 1028 }, { "epoch": 2.4981773997569867, "grad_norm": 4.498540878295898, "learning_rate": 3.078e-06, "loss": 0.815, "step": 1029 }, { "epoch": 2.500607533414338, "grad_norm": 5.386823654174805, "learning_rate": 3.0810000000000002e-06, "loss": 0.6278, "step": 1030 }, { "epoch": 2.503037667071689, "grad_norm": 4.582977294921875, "learning_rate": 3.084e-06, "loss": 0.5436, "step": 1031 }, { "epoch": 2.50546780072904, "grad_norm": 3.588542938232422, "learning_rate": 3.0870000000000003e-06, "loss": 0.4372, "step": 1032 }, { "epoch": 
2.507897934386391, "grad_norm": 5.559036731719971, "learning_rate": 3.09e-06, "loss": 0.4324, "step": 1033 }, { "epoch": 2.5103280680437425, "grad_norm": 2.390441417694092, "learning_rate": 3.093e-06, "loss": 0.4178, "step": 1034 }, { "epoch": 2.5127582017010934, "grad_norm": 2.4455037117004395, "learning_rate": 3.096e-06, "loss": 0.4525, "step": 1035 }, { "epoch": 2.5151883353584448, "grad_norm": 3.5434954166412354, "learning_rate": 3.0990000000000003e-06, "loss": 0.5428, "step": 1036 }, { "epoch": 2.5176184690157957, "grad_norm": 3.4629862308502197, "learning_rate": 3.102e-06, "loss": 0.4194, "step": 1037 }, { "epoch": 2.520048602673147, "grad_norm": 3.606041431427002, "learning_rate": 3.105e-06, "loss": 0.439, "step": 1038 }, { "epoch": 2.5224787363304984, "grad_norm": 2.707509756088257, "learning_rate": 3.108e-06, "loss": 0.4106, "step": 1039 }, { "epoch": 2.5249088699878492, "grad_norm": 2.647584915161133, "learning_rate": 3.111e-06, "loss": 0.4026, "step": 1040 }, { "epoch": 2.5273390036452006, "grad_norm": 3.5246989727020264, "learning_rate": 3.114e-06, "loss": 0.389, "step": 1041 }, { "epoch": 2.5297691373025515, "grad_norm": 2.426914691925049, "learning_rate": 3.1170000000000004e-06, "loss": 0.4421, "step": 1042 }, { "epoch": 2.532199270959903, "grad_norm": 2.581773519515991, "learning_rate": 3.1199999999999998e-06, "loss": 0.3532, "step": 1043 }, { "epoch": 2.534629404617254, "grad_norm": 3.3146607875823975, "learning_rate": 3.123e-06, "loss": 0.3858, "step": 1044 }, { "epoch": 2.537059538274605, "grad_norm": 3.0764684677124023, "learning_rate": 3.1260000000000002e-06, "loss": 0.418, "step": 1045 }, { "epoch": 2.5394896719319564, "grad_norm": 6.30286169052124, "learning_rate": 3.129e-06, "loss": 0.4183, "step": 1046 }, { "epoch": 2.5419198055893073, "grad_norm": 4.751224994659424, "learning_rate": 3.1320000000000003e-06, "loss": 0.4311, "step": 1047 }, { "epoch": 2.5443499392466586, "grad_norm": 2.3398067951202393, "learning_rate": 3.135e-06, "loss": 
0.5239, "step": 1048 }, { "epoch": 2.54678007290401, "grad_norm": 3.499000072479248, "learning_rate": 3.138e-06, "loss": 0.3552, "step": 1049 }, { "epoch": 2.549210206561361, "grad_norm": 10.223480224609375, "learning_rate": 3.141e-06, "loss": 0.3851, "step": 1050 }, { "epoch": 2.5516403402187118, "grad_norm": 3.101475238800049, "learning_rate": 3.1440000000000003e-06, "loss": 0.3956, "step": 1051 }, { "epoch": 2.554070473876063, "grad_norm": 2.3953280448913574, "learning_rate": 3.1469999999999997e-06, "loss": 0.4281, "step": 1052 }, { "epoch": 2.5565006075334145, "grad_norm": 4.566012859344482, "learning_rate": 3.15e-06, "loss": 0.4044, "step": 1053 }, { "epoch": 2.5589307411907654, "grad_norm": 4.6544413566589355, "learning_rate": 3.153e-06, "loss": 0.3448, "step": 1054 }, { "epoch": 2.5613608748481167, "grad_norm": 3.8602521419525146, "learning_rate": 3.156e-06, "loss": 0.3347, "step": 1055 }, { "epoch": 2.5637910085054676, "grad_norm": 4.047382354736328, "learning_rate": 3.159e-06, "loss": 0.4471, "step": 1056 }, { "epoch": 2.566221142162819, "grad_norm": 3.8531301021575928, "learning_rate": 3.162e-06, "loss": 0.442, "step": 1057 }, { "epoch": 2.5686512758201703, "grad_norm": 2.818635940551758, "learning_rate": 3.1649999999999998e-06, "loss": 0.4107, "step": 1058 }, { "epoch": 2.571081409477521, "grad_norm": 4.594295501708984, "learning_rate": 3.168e-06, "loss": 0.4548, "step": 1059 }, { "epoch": 2.5735115431348725, "grad_norm": 2.547137498855591, "learning_rate": 3.1710000000000002e-06, "loss": 0.4475, "step": 1060 }, { "epoch": 2.5759416767922234, "grad_norm": 3.669891834259033, "learning_rate": 3.1740000000000004e-06, "loss": 0.3691, "step": 1061 }, { "epoch": 2.5783718104495748, "grad_norm": 3.478529214859009, "learning_rate": 3.177e-06, "loss": 0.4609, "step": 1062 }, { "epoch": 2.580801944106926, "grad_norm": 4.130182266235352, "learning_rate": 3.18e-06, "loss": 0.4614, "step": 1063 }, { "epoch": 2.583232077764277, "grad_norm": 2.6962294578552246, 
"learning_rate": 3.1830000000000003e-06, "loss": 0.3915, "step": 1064 }, { "epoch": 2.5856622114216283, "grad_norm": 6.369646072387695, "learning_rate": 3.186e-06, "loss": 0.4535, "step": 1065 }, { "epoch": 2.5880923450789792, "grad_norm": 3.1115524768829346, "learning_rate": 3.1890000000000003e-06, "loss": 0.4279, "step": 1066 }, { "epoch": 2.5905224787363306, "grad_norm": 4.981843948364258, "learning_rate": 3.192e-06, "loss": 0.3485, "step": 1067 }, { "epoch": 2.592952612393682, "grad_norm": 3.160334348678589, "learning_rate": 3.195e-06, "loss": 0.4029, "step": 1068 }, { "epoch": 2.595382746051033, "grad_norm": 3.650153160095215, "learning_rate": 3.198e-06, "loss": 0.4395, "step": 1069 }, { "epoch": 2.5978128797083837, "grad_norm": 2.749948501586914, "learning_rate": 3.2010000000000004e-06, "loss": 0.4363, "step": 1070 }, { "epoch": 2.600243013365735, "grad_norm": 4.1056108474731445, "learning_rate": 3.204e-06, "loss": 0.4529, "step": 1071 }, { "epoch": 2.6026731470230864, "grad_norm": 5.464536190032959, "learning_rate": 3.207e-06, "loss": 0.5565, "step": 1072 }, { "epoch": 2.6051032806804373, "grad_norm": 6.545903205871582, "learning_rate": 3.21e-06, "loss": 0.4861, "step": 1073 }, { "epoch": 2.6075334143377886, "grad_norm": 5.222568035125732, "learning_rate": 3.213e-06, "loss": 0.6592, "step": 1074 }, { "epoch": 2.6099635479951395, "grad_norm": 10.82277774810791, "learning_rate": 3.216e-06, "loss": 1.2248, "step": 1075 }, { "epoch": 2.612393681652491, "grad_norm": 7.238896369934082, "learning_rate": 3.2190000000000004e-06, "loss": 1.0563, "step": 1076 }, { "epoch": 2.614823815309842, "grad_norm": 5.734050750732422, "learning_rate": 3.222e-06, "loss": 0.7968, "step": 1077 }, { "epoch": 2.617253948967193, "grad_norm": 3.7977490425109863, "learning_rate": 3.225e-06, "loss": 0.7156, "step": 1078 }, { "epoch": 2.6196840826245444, "grad_norm": 5.79496431350708, "learning_rate": 3.2280000000000003e-06, "loss": 0.5803, "step": 1079 }, { "epoch": 2.6221142162818953, 
"grad_norm": 5.440210819244385, "learning_rate": 3.231e-06, "loss": 0.5704, "step": 1080 }, { "epoch": 2.6245443499392467, "grad_norm": 5.547235488891602, "learning_rate": 3.2340000000000003e-06, "loss": 0.4775, "step": 1081 }, { "epoch": 2.626974483596598, "grad_norm": 4.627890110015869, "learning_rate": 3.237e-06, "loss": 0.4896, "step": 1082 }, { "epoch": 2.629404617253949, "grad_norm": 3.4648237228393555, "learning_rate": 3.24e-06, "loss": 0.4693, "step": 1083 }, { "epoch": 2.6318347509113003, "grad_norm": 2.456491708755493, "learning_rate": 3.243e-06, "loss": 0.4107, "step": 1084 }, { "epoch": 2.634264884568651, "grad_norm": 2.3736371994018555, "learning_rate": 3.2460000000000003e-06, "loss": 0.4088, "step": 1085 }, { "epoch": 2.6366950182260025, "grad_norm": 2.981717109680176, "learning_rate": 3.2489999999999997e-06, "loss": 0.3659, "step": 1086 }, { "epoch": 2.639125151883354, "grad_norm": 3.031247138977051, "learning_rate": 3.252e-06, "loss": 0.4598, "step": 1087 }, { "epoch": 2.6415552855407047, "grad_norm": 2.1551923751831055, "learning_rate": 3.255e-06, "loss": 0.3822, "step": 1088 }, { "epoch": 2.6439854191980556, "grad_norm": 2.5660393238067627, "learning_rate": 3.258e-06, "loss": 0.3846, "step": 1089 }, { "epoch": 2.646415552855407, "grad_norm": 3.169907331466675, "learning_rate": 3.261e-06, "loss": 0.4158, "step": 1090 }, { "epoch": 2.6488456865127583, "grad_norm": 2.427792549133301, "learning_rate": 3.264e-06, "loss": 0.3695, "step": 1091 }, { "epoch": 2.651275820170109, "grad_norm": 2.1565871238708496, "learning_rate": 3.267e-06, "loss": 0.365, "step": 1092 }, { "epoch": 2.6537059538274606, "grad_norm": 2.1120247840881348, "learning_rate": 3.27e-06, "loss": 0.339, "step": 1093 }, { "epoch": 2.6561360874848114, "grad_norm": 2.124668598175049, "learning_rate": 3.2730000000000003e-06, "loss": 0.3454, "step": 1094 }, { "epoch": 2.658566221142163, "grad_norm": 1.642741322517395, "learning_rate": 3.276e-06, "loss": 0.3068, "step": 1095 }, { "epoch": 
2.660996354799514, "grad_norm": 4.007686614990234, "learning_rate": 3.279e-06, "loss": 0.5404, "step": 1096 }, { "epoch": 2.663426488456865, "grad_norm": 3.1289219856262207, "learning_rate": 3.282e-06, "loss": 0.4198, "step": 1097 }, { "epoch": 2.6658566221142164, "grad_norm": 2.9560868740081787, "learning_rate": 3.285e-06, "loss": 0.4052, "step": 1098 }, { "epoch": 2.6682867557715673, "grad_norm": 1.9033448696136475, "learning_rate": 3.288e-06, "loss": 0.36, "step": 1099 }, { "epoch": 2.6707168894289186, "grad_norm": 2.5072078704833984, "learning_rate": 3.2910000000000003e-06, "loss": 0.4998, "step": 1100 }, { "epoch": 2.67314702308627, "grad_norm": 3.181910753250122, "learning_rate": 3.2939999999999997e-06, "loss": 0.3824, "step": 1101 }, { "epoch": 2.675577156743621, "grad_norm": 2.38913893699646, "learning_rate": 3.297e-06, "loss": 0.4285, "step": 1102 }, { "epoch": 2.678007290400972, "grad_norm": 2.3337161540985107, "learning_rate": 3.3e-06, "loss": 0.3704, "step": 1103 }, { "epoch": 2.680437424058323, "grad_norm": 3.1509275436401367, "learning_rate": 3.3030000000000004e-06, "loss": 0.383, "step": 1104 }, { "epoch": 2.6828675577156744, "grad_norm": 3.3982889652252197, "learning_rate": 3.306e-06, "loss": 0.365, "step": 1105 }, { "epoch": 2.6852976913730258, "grad_norm": 3.5445616245269775, "learning_rate": 3.309e-06, "loss": 0.4384, "step": 1106 }, { "epoch": 2.6877278250303767, "grad_norm": 2.515423536300659, "learning_rate": 3.3120000000000002e-06, "loss": 0.3484, "step": 1107 }, { "epoch": 2.6901579586877276, "grad_norm": 2.556387424468994, "learning_rate": 3.315e-06, "loss": 0.3945, "step": 1108 }, { "epoch": 2.692588092345079, "grad_norm": 2.6959121227264404, "learning_rate": 3.3180000000000003e-06, "loss": 0.4098, "step": 1109 }, { "epoch": 2.6950182260024302, "grad_norm": 3.496415376663208, "learning_rate": 3.3210000000000005e-06, "loss": 0.4619, "step": 1110 }, { "epoch": 2.697448359659781, "grad_norm": 2.7164735794067383, "learning_rate": 3.324e-06, 
"loss": 0.3859, "step": 1111 }, { "epoch": 2.6998784933171325, "grad_norm": 3.6564157009124756, "learning_rate": 3.327e-06, "loss": 0.522, "step": 1112 }, { "epoch": 2.7023086269744834, "grad_norm": 4.886316776275635, "learning_rate": 3.3300000000000003e-06, "loss": 0.3585, "step": 1113 }, { "epoch": 2.7047387606318347, "grad_norm": 26.84955406188965, "learning_rate": 3.333e-06, "loss": 0.3662, "step": 1114 }, { "epoch": 2.707168894289186, "grad_norm": 4.509939193725586, "learning_rate": 3.336e-06, "loss": 0.4098, "step": 1115 }, { "epoch": 2.709599027946537, "grad_norm": 2.365229845046997, "learning_rate": 3.339e-06, "loss": 0.3866, "step": 1116 }, { "epoch": 2.7120291616038883, "grad_norm": 2.9485018253326416, "learning_rate": 3.342e-06, "loss": 0.3611, "step": 1117 }, { "epoch": 2.714459295261239, "grad_norm": 4.847822666168213, "learning_rate": 3.345e-06, "loss": 0.5119, "step": 1118 }, { "epoch": 2.7168894289185905, "grad_norm": 2.798297643661499, "learning_rate": 3.3480000000000004e-06, "loss": 0.4022, "step": 1119 }, { "epoch": 2.719319562575942, "grad_norm": 5.17159366607666, "learning_rate": 3.3509999999999998e-06, "loss": 0.4517, "step": 1120 }, { "epoch": 2.7217496962332928, "grad_norm": 6.144942760467529, "learning_rate": 3.354e-06, "loss": 0.4354, "step": 1121 }, { "epoch": 2.724179829890644, "grad_norm": 3.2696051597595215, "learning_rate": 3.3570000000000002e-06, "loss": 0.4279, "step": 1122 }, { "epoch": 2.726609963547995, "grad_norm": 4.6051154136657715, "learning_rate": 3.36e-06, "loss": 0.4722, "step": 1123 }, { "epoch": 2.7290400972053463, "grad_norm": 5.383318901062012, "learning_rate": 3.3630000000000002e-06, "loss": 0.6469, "step": 1124 }, { "epoch": 2.7314702308626977, "grad_norm": 8.554316520690918, "learning_rate": 3.366e-06, "loss": 1.1423, "step": 1125 }, { "epoch": 2.7339003645200486, "grad_norm": 6.315384387969971, "learning_rate": 3.369e-06, "loss": 0.9587, "step": 1126 }, { "epoch": 2.7363304981773995, "grad_norm": 
3.4451589584350586, "learning_rate": 3.372e-06, "loss": 0.8012, "step": 1127 }, { "epoch": 2.738760631834751, "grad_norm": 3.289102554321289, "learning_rate": 3.3750000000000003e-06, "loss": 0.6659, "step": 1128 }, { "epoch": 2.741190765492102, "grad_norm": 6.752678871154785, "learning_rate": 3.378e-06, "loss": 0.5598, "step": 1129 }, { "epoch": 2.743620899149453, "grad_norm": 6.926419734954834, "learning_rate": 3.381e-06, "loss": 0.4934, "step": 1130 }, { "epoch": 2.7460510328068044, "grad_norm": 4.364245891571045, "learning_rate": 3.384e-06, "loss": 0.4767, "step": 1131 }, { "epoch": 2.7484811664641553, "grad_norm": 2.6576411724090576, "learning_rate": 3.387e-06, "loss": 0.4151, "step": 1132 }, { "epoch": 2.7509113001215066, "grad_norm": 2.804719924926758, "learning_rate": 3.39e-06, "loss": 0.3818, "step": 1133 }, { "epoch": 2.753341433778858, "grad_norm": 2.759122848510742, "learning_rate": 3.3930000000000004e-06, "loss": 0.4074, "step": 1134 }, { "epoch": 2.755771567436209, "grad_norm": 1.8816183805465698, "learning_rate": 3.3959999999999998e-06, "loss": 0.3288, "step": 1135 }, { "epoch": 2.75820170109356, "grad_norm": 2.5185859203338623, "learning_rate": 3.399e-06, "loss": 0.3636, "step": 1136 }, { "epoch": 2.760631834750911, "grad_norm": 2.1318092346191406, "learning_rate": 3.402e-06, "loss": 0.3279, "step": 1137 }, { "epoch": 2.7630619684082625, "grad_norm": 2.8860788345336914, "learning_rate": 3.405e-06, "loss": 0.4214, "step": 1138 }, { "epoch": 2.765492102065614, "grad_norm": 2.9840400218963623, "learning_rate": 3.4080000000000002e-06, "loss": 0.361, "step": 1139 }, { "epoch": 2.7679222357229647, "grad_norm": 2.6765387058258057, "learning_rate": 3.411e-06, "loss": 0.3932, "step": 1140 }, { "epoch": 2.770352369380316, "grad_norm": 4.249729633331299, "learning_rate": 3.414e-06, "loss": 0.3633, "step": 1141 }, { "epoch": 2.772782503037667, "grad_norm": 2.1742773056030273, "learning_rate": 3.417e-06, "loss": 0.3735, "step": 1142 }, { "epoch": 
2.7752126366950183, "grad_norm": 5.497856140136719, "learning_rate": 3.4200000000000003e-06, "loss": 0.3534, "step": 1143 }, { "epoch": 2.7776427703523696, "grad_norm": 2.3837735652923584, "learning_rate": 3.4229999999999997e-06, "loss": 0.3741, "step": 1144 }, { "epoch": 2.7800729040097205, "grad_norm": 1.87030029296875, "learning_rate": 3.426e-06, "loss": 0.3368, "step": 1145 }, { "epoch": 2.782503037667072, "grad_norm": 2.683340549468994, "learning_rate": 3.429e-06, "loss": 0.312, "step": 1146 }, { "epoch": 2.7849331713244228, "grad_norm": 3.3165860176086426, "learning_rate": 3.4320000000000003e-06, "loss": 0.3165, "step": 1147 }, { "epoch": 2.787363304981774, "grad_norm": 32.77874755859375, "learning_rate": 3.435e-06, "loss": 0.3126, "step": 1148 }, { "epoch": 2.789793438639125, "grad_norm": 2.303290605545044, "learning_rate": 3.438e-06, "loss": 0.3476, "step": 1149 }, { "epoch": 2.7922235722964763, "grad_norm": 13.477643013000488, "learning_rate": 3.441e-06, "loss": 0.3376, "step": 1150 }, { "epoch": 2.7946537059538272, "grad_norm": 3.6705455780029297, "learning_rate": 3.444e-06, "loss": 0.3738, "step": 1151 }, { "epoch": 2.7970838396111786, "grad_norm": 2.360508918762207, "learning_rate": 3.447e-06, "loss": 0.3093, "step": 1152 }, { "epoch": 2.79951397326853, "grad_norm": 4.048976898193359, "learning_rate": 3.4500000000000004e-06, "loss": 0.3188, "step": 1153 }, { "epoch": 2.801944106925881, "grad_norm": 2.0969531536102295, "learning_rate": 3.453e-06, "loss": 0.3574, "step": 1154 }, { "epoch": 2.804374240583232, "grad_norm": 3.4919495582580566, "learning_rate": 3.456e-06, "loss": 0.3513, "step": 1155 }, { "epoch": 2.806804374240583, "grad_norm": 2.375927209854126, "learning_rate": 3.4590000000000003e-06, "loss": 0.347, "step": 1156 }, { "epoch": 2.8092345078979344, "grad_norm": 2.912905216217041, "learning_rate": 3.462e-06, "loss": 0.3786, "step": 1157 }, { "epoch": 2.8116646415552857, "grad_norm": 3.116215229034424, "learning_rate": 3.4650000000000003e-06, 
"loss": 0.3673, "step": 1158 }, { "epoch": 2.8140947752126366, "grad_norm": 3.9118258953094482, "learning_rate": 3.468e-06, "loss": 0.3194, "step": 1159 }, { "epoch": 2.816524908869988, "grad_norm": 3.369077444076538, "learning_rate": 3.471e-06, "loss": 0.3762, "step": 1160 }, { "epoch": 2.818955042527339, "grad_norm": 2.696681261062622, "learning_rate": 3.474e-06, "loss": 0.3706, "step": 1161 }, { "epoch": 2.82138517618469, "grad_norm": 3.2620632648468018, "learning_rate": 3.4770000000000003e-06, "loss": 0.3395, "step": 1162 }, { "epoch": 2.8238153098420415, "grad_norm": 2.555659532546997, "learning_rate": 3.48e-06, "loss": 0.3415, "step": 1163 }, { "epoch": 2.8262454434993924, "grad_norm": 3.6026084423065186, "learning_rate": 3.483e-06, "loss": 0.4112, "step": 1164 }, { "epoch": 2.828675577156744, "grad_norm": 2.841728448867798, "learning_rate": 3.486e-06, "loss": 0.3284, "step": 1165 }, { "epoch": 2.8311057108140947, "grad_norm": 3.4841978549957275, "learning_rate": 3.489e-06, "loss": 0.388, "step": 1166 }, { "epoch": 2.833535844471446, "grad_norm": 2.3236422538757324, "learning_rate": 3.492e-06, "loss": 0.3139, "step": 1167 }, { "epoch": 2.8359659781287974, "grad_norm": 3.1816818714141846, "learning_rate": 3.4950000000000004e-06, "loss": 0.391, "step": 1168 }, { "epoch": 2.8383961117861483, "grad_norm": 4.783125877380371, "learning_rate": 3.498e-06, "loss": 0.3993, "step": 1169 }, { "epoch": 2.840826245443499, "grad_norm": 3.561491012573242, "learning_rate": 3.501e-06, "loss": 0.3746, "step": 1170 }, { "epoch": 2.8432563791008505, "grad_norm": 3.888559103012085, "learning_rate": 3.5040000000000002e-06, "loss": 0.4934, "step": 1171 }, { "epoch": 2.845686512758202, "grad_norm": 4.531956195831299, "learning_rate": 3.507e-06, "loss": 0.4387, "step": 1172 }, { "epoch": 2.8481166464155527, "grad_norm": 6.418872833251953, "learning_rate": 3.5100000000000003e-06, "loss": 0.3914, "step": 1173 }, { "epoch": 2.850546780072904, "grad_norm": 18.707765579223633, 
"learning_rate": 3.513e-06, "loss": 0.5258, "step": 1174 }, { "epoch": 2.852976913730255, "grad_norm": 6.018445014953613, "learning_rate": 3.516e-06, "loss": 1.0958, "step": 1175 }, { "epoch": 2.8554070473876063, "grad_norm": 2.4664156436920166, "learning_rate": 3.519e-06, "loss": 0.7826, "step": 1176 }, { "epoch": 2.8578371810449577, "grad_norm": 3.80051589012146, "learning_rate": 3.5220000000000003e-06, "loss": 0.5892, "step": 1177 }, { "epoch": 2.8602673147023085, "grad_norm": 2.4284555912017822, "learning_rate": 3.5249999999999997e-06, "loss": 0.6656, "step": 1178 }, { "epoch": 2.86269744835966, "grad_norm": 5.0006585121154785, "learning_rate": 3.528e-06, "loss": 0.5564, "step": 1179 }, { "epoch": 2.865127582017011, "grad_norm": 4.382360935211182, "learning_rate": 3.531e-06, "loss": 0.5293, "step": 1180 }, { "epoch": 2.867557715674362, "grad_norm": 3.9869589805603027, "learning_rate": 3.534e-06, "loss": 0.5343, "step": 1181 }, { "epoch": 2.8699878493317135, "grad_norm": 3.209017038345337, "learning_rate": 3.537e-06, "loss": 0.3368, "step": 1182 }, { "epoch": 2.8724179829890644, "grad_norm": 2.786210775375366, "learning_rate": 3.54e-06, "loss": 0.3335, "step": 1183 }, { "epoch": 2.8748481166464157, "grad_norm": 3.643336057662964, "learning_rate": 3.543e-06, "loss": 0.3683, "step": 1184 }, { "epoch": 2.8772782503037666, "grad_norm": 4.608928203582764, "learning_rate": 3.546e-06, "loss": 0.3722, "step": 1185 }, { "epoch": 2.879708383961118, "grad_norm": 2.1251137256622314, "learning_rate": 3.5490000000000002e-06, "loss": 0.3832, "step": 1186 }, { "epoch": 2.8821385176184693, "grad_norm": 1.9939638376235962, "learning_rate": 3.552e-06, "loss": 0.3123, "step": 1187 }, { "epoch": 2.88456865127582, "grad_norm": 1.982487440109253, "learning_rate": 3.555e-06, "loss": 0.2764, "step": 1188 }, { "epoch": 2.886998784933171, "grad_norm": 4.456912040710449, "learning_rate": 3.558e-06, "loss": 0.3067, "step": 1189 }, { "epoch": 2.8894289185905224, "grad_norm": 
4.312714099884033, "learning_rate": 3.5610000000000003e-06, "loss": 0.3194, "step": 1190 }, { "epoch": 2.8918590522478738, "grad_norm": 2.560370922088623, "learning_rate": 3.564e-06, "loss": 0.3149, "step": 1191 }, { "epoch": 2.8942891859052247, "grad_norm": 2.745138645172119, "learning_rate": 3.5670000000000003e-06, "loss": 0.373, "step": 1192 }, { "epoch": 2.896719319562576, "grad_norm": 2.011029005050659, "learning_rate": 3.57e-06, "loss": 0.3619, "step": 1193 }, { "epoch": 2.899149453219927, "grad_norm": 2.572124719619751, "learning_rate": 3.573e-06, "loss": 0.3178, "step": 1194 }, { "epoch": 2.9015795868772782, "grad_norm": 3.2681262493133545, "learning_rate": 3.576e-06, "loss": 0.3059, "step": 1195 }, { "epoch": 2.9040097205346296, "grad_norm": 2.9930315017700195, "learning_rate": 3.5790000000000004e-06, "loss": 0.3981, "step": 1196 }, { "epoch": 2.9064398541919805, "grad_norm": 3.226097822189331, "learning_rate": 3.582e-06, "loss": 0.3449, "step": 1197 }, { "epoch": 2.908869987849332, "grad_norm": 1.810941457748413, "learning_rate": 3.585e-06, "loss": 0.2539, "step": 1198 }, { "epoch": 2.9113001215066827, "grad_norm": 2.766091823577881, "learning_rate": 3.588e-06, "loss": 0.4153, "step": 1199 }, { "epoch": 2.913730255164034, "grad_norm": 14.901863098144531, "learning_rate": 3.591e-06, "loss": 0.3442, "step": 1200 }, { "epoch": 2.9161603888213854, "grad_norm": 2.5669050216674805, "learning_rate": 3.5940000000000002e-06, "loss": 0.3905, "step": 1201 }, { "epoch": 2.9185905224787363, "grad_norm": 2.151798725128174, "learning_rate": 3.5970000000000005e-06, "loss": 0.3255, "step": 1202 }, { "epoch": 2.9210206561360876, "grad_norm": 2.7560853958129883, "learning_rate": 3.6e-06, "loss": 0.3201, "step": 1203 }, { "epoch": 2.9234507897934385, "grad_norm": 2.948676586151123, "learning_rate": 3.603e-06, "loss": 0.3072, "step": 1204 }, { "epoch": 2.92588092345079, "grad_norm": 2.413700580596924, "learning_rate": 3.6060000000000003e-06, "loss": 0.3269, "step": 1205 }, { 
"epoch": 2.928311057108141, "grad_norm": 2.38494610786438, "learning_rate": 3.609e-06, "loss": 0.3233, "step": 1206 }, { "epoch": 2.930741190765492, "grad_norm": 2.3619563579559326, "learning_rate": 3.612e-06, "loss": 0.3155, "step": 1207 }, { "epoch": 2.933171324422843, "grad_norm": 3.6819708347320557, "learning_rate": 3.615e-06, "loss": 0.3909, "step": 1208 }, { "epoch": 2.9356014580801943, "grad_norm": 2.5638132095336914, "learning_rate": 3.618e-06, "loss": 0.3378, "step": 1209 }, { "epoch": 2.9380315917375457, "grad_norm": 2.7515835762023926, "learning_rate": 3.621e-06, "loss": 0.4365, "step": 1210 }, { "epoch": 2.9404617253948966, "grad_norm": 3.4276645183563232, "learning_rate": 3.6240000000000004e-06, "loss": 0.3539, "step": 1211 }, { "epoch": 2.942891859052248, "grad_norm": 2.8227462768554688, "learning_rate": 3.6269999999999997e-06, "loss": 0.2866, "step": 1212 }, { "epoch": 2.945321992709599, "grad_norm": 3.4011542797088623, "learning_rate": 3.63e-06, "loss": 0.3393, "step": 1213 }, { "epoch": 2.94775212636695, "grad_norm": 2.939096212387085, "learning_rate": 3.633e-06, "loss": 0.4145, "step": 1214 }, { "epoch": 2.9501822600243015, "grad_norm": 2.2784228324890137, "learning_rate": 3.636e-06, "loss": 0.3464, "step": 1215 }, { "epoch": 2.9526123936816524, "grad_norm": 4.14130163192749, "learning_rate": 3.6390000000000002e-06, "loss": 0.351, "step": 1216 }, { "epoch": 2.9550425273390037, "grad_norm": 2.9902687072753906, "learning_rate": 3.642e-06, "loss": 0.3007, "step": 1217 }, { "epoch": 2.9574726609963546, "grad_norm": 3.1736338138580322, "learning_rate": 3.645e-06, "loss": 0.3722, "step": 1218 }, { "epoch": 2.959902794653706, "grad_norm": 3.2590534687042236, "learning_rate": 3.648e-06, "loss": 0.3851, "step": 1219 }, { "epoch": 2.9623329283110573, "grad_norm": 3.271069288253784, "learning_rate": 3.6510000000000003e-06, "loss": 0.4014, "step": 1220 }, { "epoch": 2.964763061968408, "grad_norm": 4.503889083862305, "learning_rate": 3.654e-06, "loss": 0.447, 
"step": 1221 }, { "epoch": 2.9671931956257596, "grad_norm": 3.9880833625793457, "learning_rate": 3.657e-06, "loss": 0.418, "step": 1222 }, { "epoch": 2.9696233292831105, "grad_norm": 3.721221923828125, "learning_rate": 3.66e-06, "loss": 0.381, "step": 1223 }, { "epoch": 2.972053462940462, "grad_norm": 5.290297985076904, "learning_rate": 3.663e-06, "loss": 0.5544, "step": 1224 }, { "epoch": 2.974483596597813, "grad_norm": 5.897195816040039, "learning_rate": 3.666e-06, "loss": 0.851, "step": 1225 }, { "epoch": 2.976913730255164, "grad_norm": 2.385490655899048, "learning_rate": 3.6690000000000004e-06, "loss": 0.4869, "step": 1226 }, { "epoch": 2.979343863912515, "grad_norm": 1.95222806930542, "learning_rate": 3.6719999999999997e-06, "loss": 0.391, "step": 1227 }, { "epoch": 2.9817739975698663, "grad_norm": 2.4904258251190186, "learning_rate": 3.675e-06, "loss": 0.329, "step": 1228 }, { "epoch": 2.9842041312272176, "grad_norm": 2.4554452896118164, "learning_rate": 3.678e-06, "loss": 0.3222, "step": 1229 }, { "epoch": 2.9866342648845685, "grad_norm": 2.9280009269714355, "learning_rate": 3.681e-06, "loss": 0.3302, "step": 1230 }, { "epoch": 2.98906439854192, "grad_norm": 2.4630134105682373, "learning_rate": 3.6840000000000002e-06, "loss": 0.3052, "step": 1231 }, { "epoch": 2.9914945321992708, "grad_norm": 2.822155714035034, "learning_rate": 3.687e-06, "loss": 0.3252, "step": 1232 }, { "epoch": 2.993924665856622, "grad_norm": 3.7329795360565186, "learning_rate": 3.6900000000000002e-06, "loss": 0.3285, "step": 1233 }, { "epoch": 2.9963547995139734, "grad_norm": 3.821397542953491, "learning_rate": 3.693e-06, "loss": 0.3325, "step": 1234 }, { "epoch": 2.9987849331713243, "grad_norm": 3.563197135925293, "learning_rate": 3.6960000000000003e-06, "loss": 0.3882, "step": 1235 }, { "epoch": 3.0, "grad_norm": 3.3412139415740967, "learning_rate": 3.6990000000000005e-06, "loss": 0.343, "step": 1236 }, { "epoch": 3.0024301336573513, "grad_norm": 7.030354022979736, "learning_rate": 
3.702e-06, "loss": 1.0589, "step": 1237 }, { "epoch": 3.0048602673147022, "grad_norm": 4.254022598266602, "learning_rate": 3.705e-06, "loss": 0.8282, "step": 1238 }, { "epoch": 3.0072904009720536, "grad_norm": 3.132927656173706, "learning_rate": 3.7080000000000003e-06, "loss": 0.7017, "step": 1239 }, { "epoch": 3.0097205346294045, "grad_norm": 4.944055557250977, "learning_rate": 3.711e-06, "loss": 0.581, "step": 1240 }, { "epoch": 3.012150668286756, "grad_norm": 3.5163652896881104, "learning_rate": 3.714e-06, "loss": 0.5588, "step": 1241 }, { "epoch": 3.0145808019441067, "grad_norm": 3.0944507122039795, "learning_rate": 3.717e-06, "loss": 0.5468, "step": 1242 }, { "epoch": 3.017010935601458, "grad_norm": 2.9066600799560547, "learning_rate": 3.72e-06, "loss": 0.4258, "step": 1243 }, { "epoch": 3.0194410692588094, "grad_norm": 2.3904125690460205, "learning_rate": 3.723e-06, "loss": 0.3737, "step": 1244 }, { "epoch": 3.0218712029161603, "grad_norm": 2.691965341567993, "learning_rate": 3.7260000000000004e-06, "loss": 0.3259, "step": 1245 }, { "epoch": 3.0243013365735116, "grad_norm": 3.146987199783325, "learning_rate": 3.7289999999999998e-06, "loss": 0.3652, "step": 1246 }, { "epoch": 3.0267314702308625, "grad_norm": 6.751753807067871, "learning_rate": 3.732e-06, "loss": 0.373, "step": 1247 }, { "epoch": 3.029161603888214, "grad_norm": 1.9312688112258911, "learning_rate": 3.7350000000000002e-06, "loss": 0.3453, "step": 1248 }, { "epoch": 3.031591737545565, "grad_norm": 2.4646663665771484, "learning_rate": 3.738e-06, "loss": 0.3755, "step": 1249 }, { "epoch": 3.034021871202916, "grad_norm": 2.1455464363098145, "learning_rate": 3.7410000000000003e-06, "loss": 0.2549, "step": 1250 }, { "epoch": 3.0364520048602675, "grad_norm": 1.9814451932907104, "learning_rate": 3.744e-06, "loss": 0.2786, "step": 1251 }, { "epoch": 3.0388821385176183, "grad_norm": 2.0790629386901855, "learning_rate": 3.747e-06, "loss": 0.297, "step": 1252 }, { "epoch": 3.0413122721749697, "grad_norm": 
2.0316433906555176, "learning_rate": 3.75e-06, "loss": 0.3102, "step": 1253 }, { "epoch": 3.0437424058323206, "grad_norm": 2.272108554840088, "learning_rate": 3.753e-06, "loss": 0.2958, "step": 1254 }, { "epoch": 3.046172539489672, "grad_norm": 2.5826776027679443, "learning_rate": 3.756e-06, "loss": 0.2826, "step": 1255 }, { "epoch": 3.0486026731470233, "grad_norm": 2.3663249015808105, "learning_rate": 3.759e-06, "loss": 0.2854, "step": 1256 }, { "epoch": 3.051032806804374, "grad_norm": 2.804816484451294, "learning_rate": 3.7620000000000006e-06, "loss": 0.2797, "step": 1257 }, { "epoch": 3.0534629404617255, "grad_norm": 3.1649296283721924, "learning_rate": 3.765e-06, "loss": 0.3675, "step": 1258 }, { "epoch": 3.0558930741190764, "grad_norm": 2.0418474674224854, "learning_rate": 3.7679999999999998e-06, "loss": 0.3197, "step": 1259 }, { "epoch": 3.0583232077764277, "grad_norm": 2.0911166667938232, "learning_rate": 3.7710000000000004e-06, "loss": 0.2813, "step": 1260 }, { "epoch": 3.0607533414337786, "grad_norm": 2.7404067516326904, "learning_rate": 3.7739999999999998e-06, "loss": 0.4115, "step": 1261 }, { "epoch": 3.06318347509113, "grad_norm": 2.093015193939209, "learning_rate": 3.7770000000000004e-06, "loss": 0.3465, "step": 1262 }, { "epoch": 3.0656136087484813, "grad_norm": 2.0828957557678223, "learning_rate": 3.7800000000000002e-06, "loss": 0.2997, "step": 1263 }, { "epoch": 3.068043742405832, "grad_norm": 2.8220624923706055, "learning_rate": 3.7829999999999996e-06, "loss": 0.3582, "step": 1264 }, { "epoch": 3.0704738760631836, "grad_norm": 2.0660436153411865, "learning_rate": 3.7860000000000003e-06, "loss": 0.3055, "step": 1265 }, { "epoch": 3.0729040097205345, "grad_norm": 2.667062282562256, "learning_rate": 3.789e-06, "loss": 0.3001, "step": 1266 }, { "epoch": 3.075334143377886, "grad_norm": 2.187509298324585, "learning_rate": 3.7920000000000003e-06, "loss": 0.2959, "step": 1267 }, { "epoch": 3.077764277035237, "grad_norm": 2.176262140274048, "learning_rate": 
3.795e-06, "loss": 0.3145, "step": 1268 }, { "epoch": 3.080194410692588, "grad_norm": 4.212644577026367, "learning_rate": 3.798e-06, "loss": 0.2767, "step": 1269 }, { "epoch": 3.0826245443499394, "grad_norm": 2.0058035850524902, "learning_rate": 3.801e-06, "loss": 0.2863, "step": 1270 }, { "epoch": 3.0850546780072903, "grad_norm": 2.7391886711120605, "learning_rate": 3.804e-06, "loss": 0.2559, "step": 1271 }, { "epoch": 3.0874848116646416, "grad_norm": 2.868361711502075, "learning_rate": 3.8070000000000006e-06, "loss": 0.2904, "step": 1272 }, { "epoch": 3.0899149453219925, "grad_norm": 3.2735230922698975, "learning_rate": 3.81e-06, "loss": 0.4653, "step": 1273 }, { "epoch": 3.092345078979344, "grad_norm": 50.2281494140625, "learning_rate": 3.8129999999999997e-06, "loss": 0.3799, "step": 1274 }, { "epoch": 3.094775212636695, "grad_norm": 9.384848594665527, "learning_rate": 3.816e-06, "loss": 0.3072, "step": 1275 }, { "epoch": 3.097205346294046, "grad_norm": 7.828560829162598, "learning_rate": 3.819e-06, "loss": 0.3491, "step": 1276 }, { "epoch": 3.0996354799513974, "grad_norm": 2.602738857269287, "learning_rate": 3.822000000000001e-06, "loss": 0.2897, "step": 1277 }, { "epoch": 3.1020656136087483, "grad_norm": 2.637624740600586, "learning_rate": 3.825e-06, "loss": 0.321, "step": 1278 }, { "epoch": 3.1044957472660997, "grad_norm": 2.253551721572876, "learning_rate": 3.828e-06, "loss": 0.2794, "step": 1279 }, { "epoch": 3.106925880923451, "grad_norm": 3.5666985511779785, "learning_rate": 3.831e-06, "loss": 0.4047, "step": 1280 }, { "epoch": 3.109356014580802, "grad_norm": 3.0755741596221924, "learning_rate": 3.834e-06, "loss": 0.3663, "step": 1281 }, { "epoch": 3.1117861482381532, "grad_norm": 5.461564540863037, "learning_rate": 3.837000000000001e-06, "loss": 0.4213, "step": 1282 }, { "epoch": 3.114216281895504, "grad_norm": 3.2448959350585938, "learning_rate": 3.8400000000000005e-06, "loss": 0.3866, "step": 1283 }, { "epoch": 3.1166464155528555, "grad_norm": 
3.806419610977173, "learning_rate": 3.8429999999999995e-06, "loss": 0.5227, "step": 1284 }, { "epoch": 3.1190765492102064, "grad_norm": 3.4329280853271484, "learning_rate": 3.846e-06, "loss": 0.3715, "step": 1285 }, { "epoch": 3.1215066828675577, "grad_norm": 3.756901264190674, "learning_rate": 3.849e-06, "loss": 0.5299, "step": 1286 }, { "epoch": 3.123936816524909, "grad_norm": 8.83553409576416, "learning_rate": 3.852e-06, "loss": 1.0126, "step": 1287 }, { "epoch": 3.12636695018226, "grad_norm": 6.2253618240356445, "learning_rate": 3.855e-06, "loss": 0.8528, "step": 1288 }, { "epoch": 3.1287970838396113, "grad_norm": 2.2086713314056396, "learning_rate": 3.858e-06, "loss": 0.6679, "step": 1289 }, { "epoch": 3.131227217496962, "grad_norm": 5.306094646453857, "learning_rate": 3.861e-06, "loss": 0.6542, "step": 1290 }, { "epoch": 3.1336573511543135, "grad_norm": 6.518947124481201, "learning_rate": 3.864e-06, "loss": 0.6453, "step": 1291 }, { "epoch": 3.1360874848116644, "grad_norm": 7.149270057678223, "learning_rate": 3.8669999999999996e-06, "loss": 0.4676, "step": 1292 }, { "epoch": 3.138517618469016, "grad_norm": 7.44870138168335, "learning_rate": 3.87e-06, "loss": 0.4679, "step": 1293 }, { "epoch": 3.140947752126367, "grad_norm": 5.233057975769043, "learning_rate": 3.873e-06, "loss": 0.3468, "step": 1294 }, { "epoch": 3.143377885783718, "grad_norm": 2.9006285667419434, "learning_rate": 3.876000000000001e-06, "loss": 0.325, "step": 1295 }, { "epoch": 3.1458080194410694, "grad_norm": 2.34796404838562, "learning_rate": 3.8790000000000005e-06, "loss": 0.3631, "step": 1296 }, { "epoch": 3.1482381530984203, "grad_norm": 2.1857075691223145, "learning_rate": 3.8819999999999994e-06, "loss": 0.3715, "step": 1297 }, { "epoch": 3.1506682867557716, "grad_norm": 1.686023473739624, "learning_rate": 3.885e-06, "loss": 0.3061, "step": 1298 }, { "epoch": 3.153098420413123, "grad_norm": 2.537281036376953, "learning_rate": 3.888e-06, "loss": 0.3047, "step": 1299 }, { "epoch": 
3.155528554070474, "grad_norm": 2.6057419776916504, "learning_rate": 3.8910000000000005e-06, "loss": 0.2687, "step": 1300 }, { "epoch": 3.157958687727825, "grad_norm": 2.3771612644195557, "learning_rate": 3.894e-06, "loss": 0.2951, "step": 1301 }, { "epoch": 3.160388821385176, "grad_norm": 2.0313608646392822, "learning_rate": 3.897e-06, "loss": 0.2273, "step": 1302 }, { "epoch": 3.1628189550425274, "grad_norm": 1.7228941917419434, "learning_rate": 3.9e-06, "loss": 0.2887, "step": 1303 }, { "epoch": 3.1652490886998783, "grad_norm": 5.290918350219727, "learning_rate": 3.903e-06, "loss": 0.2836, "step": 1304 }, { "epoch": 3.1676792223572297, "grad_norm": 2.019310235977173, "learning_rate": 3.906e-06, "loss": 0.2604, "step": 1305 }, { "epoch": 3.170109356014581, "grad_norm": 2.017364025115967, "learning_rate": 3.909e-06, "loss": 0.3024, "step": 1306 }, { "epoch": 3.172539489671932, "grad_norm": 2.660456895828247, "learning_rate": 3.912e-06, "loss": 0.3204, "step": 1307 }, { "epoch": 3.1749696233292832, "grad_norm": 1.8808032274246216, "learning_rate": 3.915000000000001e-06, "loss": 0.2679, "step": 1308 }, { "epoch": 3.177399756986634, "grad_norm": 2.467817783355713, "learning_rate": 3.918e-06, "loss": 0.3114, "step": 1309 }, { "epoch": 3.1798298906439855, "grad_norm": 4.6025166511535645, "learning_rate": 3.921e-06, "loss": 0.2902, "step": 1310 }, { "epoch": 3.1822600243013364, "grad_norm": 2.7662110328674316, "learning_rate": 3.924e-06, "loss": 0.2693, "step": 1311 }, { "epoch": 3.1846901579586877, "grad_norm": 1.8626163005828857, "learning_rate": 3.927e-06, "loss": 0.2729, "step": 1312 }, { "epoch": 3.187120291616039, "grad_norm": 2.3901219367980957, "learning_rate": 3.9300000000000005e-06, "loss": 0.2942, "step": 1313 }, { "epoch": 3.18955042527339, "grad_norm": 1.6928192377090454, "learning_rate": 3.933e-06, "loss": 0.2801, "step": 1314 }, { "epoch": 3.1919805589307413, "grad_norm": 1.7538899183273315, "learning_rate": 3.936e-06, "loss": 0.292, "step": 1315 }, { 
"epoch": 3.194410692588092, "grad_norm": 4.1896209716796875, "learning_rate": 3.939e-06, "loss": 0.2855, "step": 1316 }, { "epoch": 3.1968408262454435, "grad_norm": 2.1621286869049072, "learning_rate": 3.942e-06, "loss": 0.2843, "step": 1317 }, { "epoch": 3.199270959902795, "grad_norm": 1.7516355514526367, "learning_rate": 3.945e-06, "loss": 0.2779, "step": 1318 }, { "epoch": 3.2017010935601458, "grad_norm": 1.909501552581787, "learning_rate": 3.948e-06, "loss": 0.2667, "step": 1319 }, { "epoch": 3.204131227217497, "grad_norm": 2.365070343017578, "learning_rate": 3.951000000000001e-06, "loss": 0.2967, "step": 1320 }, { "epoch": 3.206561360874848, "grad_norm": 3.210770845413208, "learning_rate": 3.954e-06, "loss": 0.3792, "step": 1321 }, { "epoch": 3.2089914945321993, "grad_norm": 2.607928514480591, "learning_rate": 3.9569999999999996e-06, "loss": 0.2863, "step": 1322 }, { "epoch": 3.2114216281895502, "grad_norm": 2.489051342010498, "learning_rate": 3.96e-06, "loss": 0.3595, "step": 1323 }, { "epoch": 3.2138517618469016, "grad_norm": 2.3942110538482666, "learning_rate": 3.963e-06, "loss": 0.2754, "step": 1324 }, { "epoch": 3.216281895504253, "grad_norm": 1.8327991962432861, "learning_rate": 3.966000000000001e-06, "loss": 0.2878, "step": 1325 }, { "epoch": 3.218712029161604, "grad_norm": 3.5669913291931152, "learning_rate": 3.9690000000000005e-06, "loss": 0.3663, "step": 1326 }, { "epoch": 3.221142162818955, "grad_norm": 6.946078300476074, "learning_rate": 3.971999999999999e-06, "loss": 0.284, "step": 1327 }, { "epoch": 3.223572296476306, "grad_norm": 3.278881311416626, "learning_rate": 3.975e-06, "loss": 0.306, "step": 1328 }, { "epoch": 3.2260024301336574, "grad_norm": 2.2652158737182617, "learning_rate": 3.978e-06, "loss": 0.3164, "step": 1329 }, { "epoch": 3.2284325637910083, "grad_norm": 5.341062068939209, "learning_rate": 3.9810000000000005e-06, "loss": 0.329, "step": 1330 }, { "epoch": 3.2308626974483596, "grad_norm": 2.695479154586792, "learning_rate": 
3.984e-06, "loss": 0.3489, "step": 1331 }, { "epoch": 3.233292831105711, "grad_norm": 3.101210594177246, "learning_rate": 3.987e-06, "loss": 0.3396, "step": 1332 }, { "epoch": 3.235722964763062, "grad_norm": 2.1024978160858154, "learning_rate": 3.99e-06, "loss": 0.332, "step": 1333 }, { "epoch": 3.238153098420413, "grad_norm": 3.2985634803771973, "learning_rate": 3.993e-06, "loss": 0.3349, "step": 1334 }, { "epoch": 3.240583232077764, "grad_norm": 8.215672492980957, "learning_rate": 3.996e-06, "loss": 0.5132, "step": 1335 }, { "epoch": 3.2430133657351154, "grad_norm": 5.442267417907715, "learning_rate": 3.999e-06, "loss": 0.5118, "step": 1336 }, { "epoch": 3.245443499392467, "grad_norm": 9.06329345703125, "learning_rate": 4.002e-06, "loss": 1.0557, "step": 1337 }, { "epoch": 3.2478736330498177, "grad_norm": 4.478384494781494, "learning_rate": 4.005000000000001e-06, "loss": 0.7499, "step": 1338 }, { "epoch": 3.250303766707169, "grad_norm": 4.851478099822998, "learning_rate": 4.008e-06, "loss": 0.7258, "step": 1339 }, { "epoch": 3.25273390036452, "grad_norm": 2.8782010078430176, "learning_rate": 4.011e-06, "loss": 0.5045, "step": 1340 }, { "epoch": 3.2551640340218713, "grad_norm": 4.831270694732666, "learning_rate": 4.014e-06, "loss": 0.5126, "step": 1341 }, { "epoch": 3.257594167679222, "grad_norm": 5.2604146003723145, "learning_rate": 4.017e-06, "loss": 0.4195, "step": 1342 }, { "epoch": 3.2600243013365735, "grad_norm": 2.970151901245117, "learning_rate": 4.0200000000000005e-06, "loss": 0.4191, "step": 1343 }, { "epoch": 3.262454434993925, "grad_norm": 2.5959219932556152, "learning_rate": 4.023e-06, "loss": 0.3789, "step": 1344 }, { "epoch": 3.2648845686512757, "grad_norm": 2.7785582542419434, "learning_rate": 4.026000000000001e-06, "loss": 0.4027, "step": 1345 }, { "epoch": 3.267314702308627, "grad_norm": 2.2580509185791016, "learning_rate": 4.029e-06, "loss": 0.3103, "step": 1346 }, { "epoch": 3.269744835965978, "grad_norm": 2.4186344146728516, "learning_rate": 
4.032e-06, "loss": 0.3108, "step": 1347 }, { "epoch": 3.2721749696233293, "grad_norm": 1.7792359590530396, "learning_rate": 4.035e-06, "loss": 0.3069, "step": 1348 }, { "epoch": 3.27460510328068, "grad_norm": 1.9310210943222046, "learning_rate": 4.038e-06, "loss": 0.2732, "step": 1349 }, { "epoch": 3.2770352369380316, "grad_norm": 2.277067184448242, "learning_rate": 4.041e-06, "loss": 0.2615, "step": 1350 }, { "epoch": 3.279465370595383, "grad_norm": 2.6634223461151123, "learning_rate": 4.044000000000001e-06, "loss": 0.3154, "step": 1351 }, { "epoch": 3.281895504252734, "grad_norm": 2.073141098022461, "learning_rate": 4.0469999999999995e-06, "loss": 0.2728, "step": 1352 }, { "epoch": 3.284325637910085, "grad_norm": 1.8761717081069946, "learning_rate": 4.05e-06, "loss": 0.2437, "step": 1353 }, { "epoch": 3.286755771567436, "grad_norm": 2.456753730773926, "learning_rate": 4.053e-06, "loss": 0.3354, "step": 1354 }, { "epoch": 3.2891859052247874, "grad_norm": 1.5650432109832764, "learning_rate": 4.056e-06, "loss": 0.2421, "step": 1355 }, { "epoch": 3.2916160388821387, "grad_norm": 1.9150296449661255, "learning_rate": 4.0590000000000004e-06, "loss": 0.2821, "step": 1356 }, { "epoch": 3.2940461725394896, "grad_norm": 1.748663306236267, "learning_rate": 4.062e-06, "loss": 0.2656, "step": 1357 }, { "epoch": 3.296476306196841, "grad_norm": 2.2144296169281006, "learning_rate": 4.065e-06, "loss": 0.2902, "step": 1358 }, { "epoch": 3.298906439854192, "grad_norm": 2.445939064025879, "learning_rate": 4.068e-06, "loss": 0.3114, "step": 1359 }, { "epoch": 3.301336573511543, "grad_norm": 2.2897701263427734, "learning_rate": 4.071e-06, "loss": 0.3481, "step": 1360 }, { "epoch": 3.3037667071688945, "grad_norm": 2.0824713706970215, "learning_rate": 4.074e-06, "loss": 0.263, "step": 1361 }, { "epoch": 3.3061968408262454, "grad_norm": 2.365267753601074, "learning_rate": 4.077e-06, "loss": 0.3075, "step": 1362 }, { "epoch": 3.3086269744835968, "grad_norm": 2.1901395320892334, 
"learning_rate": 4.080000000000001e-06, "loss": 0.2592, "step": 1363 }, { "epoch": 3.3110571081409477, "grad_norm": 1.7461378574371338, "learning_rate": 4.083e-06, "loss": 0.2789, "step": 1364 }, { "epoch": 3.313487241798299, "grad_norm": 1.6886016130447388, "learning_rate": 4.0859999999999995e-06, "loss": 0.2522, "step": 1365 }, { "epoch": 3.31591737545565, "grad_norm": 1.6384655237197876, "learning_rate": 4.089e-06, "loss": 0.2495, "step": 1366 }, { "epoch": 3.3183475091130012, "grad_norm": 1.7090295553207397, "learning_rate": 4.092e-06, "loss": 0.2561, "step": 1367 }, { "epoch": 3.320777642770352, "grad_norm": 2.003981590270996, "learning_rate": 4.095000000000001e-06, "loss": 0.2838, "step": 1368 }, { "epoch": 3.3232077764277035, "grad_norm": 2.3643994331359863, "learning_rate": 4.098e-06, "loss": 0.2689, "step": 1369 }, { "epoch": 3.325637910085055, "grad_norm": 2.2854361534118652, "learning_rate": 4.100999999999999e-06, "loss": 0.2591, "step": 1370 }, { "epoch": 3.3280680437424057, "grad_norm": 2.604581356048584, "learning_rate": 4.104e-06, "loss": 0.2712, "step": 1371 }, { "epoch": 3.330498177399757, "grad_norm": 2.948453664779663, "learning_rate": 4.107e-06, "loss": 0.3707, "step": 1372 }, { "epoch": 3.332928311057108, "grad_norm": 2.3802266120910645, "learning_rate": 4.1100000000000005e-06, "loss": 0.3231, "step": 1373 }, { "epoch": 3.3353584447144593, "grad_norm": 2.2947628498077393, "learning_rate": 4.113e-06, "loss": 0.2955, "step": 1374 }, { "epoch": 3.3377885783718106, "grad_norm": 2.5421690940856934, "learning_rate": 4.116e-06, "loss": 0.2906, "step": 1375 }, { "epoch": 3.3402187120291615, "grad_norm": 3.866685628890991, "learning_rate": 4.119e-06, "loss": 0.2994, "step": 1376 }, { "epoch": 3.342648845686513, "grad_norm": 2.5745153427124023, "learning_rate": 4.122e-06, "loss": 0.315, "step": 1377 }, { "epoch": 3.345078979343864, "grad_norm": 1.861351728439331, "learning_rate": 4.125e-06, "loss": 0.3109, "step": 1378 }, { "epoch": 3.347509113001215, 
"grad_norm": 4.166517734527588, "learning_rate": 4.128e-06, "loss": 0.3722, "step": 1379 }, { "epoch": 3.3499392466585665, "grad_norm": 2.8315396308898926, "learning_rate": 4.131e-06, "loss": 0.2927, "step": 1380 }, { "epoch": 3.3523693803159174, "grad_norm": 2.8393428325653076, "learning_rate": 4.1340000000000006e-06, "loss": 0.3311, "step": 1381 }, { "epoch": 3.3547995139732687, "grad_norm": 2.8073978424072266, "learning_rate": 4.137e-06, "loss": 0.3507, "step": 1382 }, { "epoch": 3.3572296476306196, "grad_norm": 3.218196392059326, "learning_rate": 4.14e-06, "loss": 0.3232, "step": 1383 }, { "epoch": 3.359659781287971, "grad_norm": 2.838557243347168, "learning_rate": 4.143e-06, "loss": 0.3509, "step": 1384 }, { "epoch": 3.362089914945322, "grad_norm": 4.745254993438721, "learning_rate": 4.146e-06, "loss": 0.3775, "step": 1385 }, { "epoch": 3.364520048602673, "grad_norm": 5.6992082595825195, "learning_rate": 4.1490000000000004e-06, "loss": 0.4862, "step": 1386 }, { "epoch": 3.366950182260024, "grad_norm": 6.210108757019043, "learning_rate": 4.152e-06, "loss": 0.9905, "step": 1387 }, { "epoch": 3.3693803159173754, "grad_norm": 4.675873279571533, "learning_rate": 4.155000000000001e-06, "loss": 0.8247, "step": 1388 }, { "epoch": 3.3718104495747268, "grad_norm": 2.173145055770874, "learning_rate": 4.158e-06, "loss": 0.6606, "step": 1389 }, { "epoch": 3.3742405832320777, "grad_norm": 2.3313047885894775, "learning_rate": 4.161e-06, "loss": 0.6245, "step": 1390 }, { "epoch": 3.376670716889429, "grad_norm": 4.7707319259643555, "learning_rate": 4.164e-06, "loss": 0.5682, "step": 1391 }, { "epoch": 3.37910085054678, "grad_norm": 5.267004489898682, "learning_rate": 4.167e-06, "loss": 0.4785, "step": 1392 }, { "epoch": 3.3815309842041312, "grad_norm": 3.5855114459991455, "learning_rate": 4.170000000000001e-06, "loss": 0.4374, "step": 1393 }, { "epoch": 3.3839611178614826, "grad_norm": 4.5357985496521, "learning_rate": 4.1730000000000005e-06, "loss": 0.3895, "step": 1394 }, { 
"epoch": 3.3863912515188335, "grad_norm": 2.381756067276001, "learning_rate": 4.1759999999999995e-06, "loss": 0.3156, "step": 1395 }, { "epoch": 3.388821385176185, "grad_norm": 2.1949045658111572, "learning_rate": 4.179e-06, "loss": 0.3172, "step": 1396 }, { "epoch": 3.3912515188335357, "grad_norm": 1.6498266458511353, "learning_rate": 4.182e-06, "loss": 0.2988, "step": 1397 }, { "epoch": 3.393681652490887, "grad_norm": 2.0045998096466064, "learning_rate": 4.185000000000001e-06, "loss": 0.2804, "step": 1398 }, { "epoch": 3.3961117861482384, "grad_norm": 1.7448824644088745, "learning_rate": 4.188e-06, "loss": 0.3065, "step": 1399 }, { "epoch": 3.3985419198055893, "grad_norm": 2.06725811958313, "learning_rate": 4.191e-06, "loss": 0.3335, "step": 1400 }, { "epoch": 3.4009720534629406, "grad_norm": 4.435703754425049, "learning_rate": 4.194e-06, "loss": 0.2621, "step": 1401 }, { "epoch": 3.4034021871202915, "grad_norm": 3.270415782928467, "learning_rate": 4.197e-06, "loss": 0.2812, "step": 1402 }, { "epoch": 3.405832320777643, "grad_norm": 2.036071300506592, "learning_rate": 4.2000000000000004e-06, "loss": 0.2464, "step": 1403 }, { "epoch": 3.4082624544349938, "grad_norm": 1.69017493724823, "learning_rate": 4.203e-06, "loss": 0.2554, "step": 1404 }, { "epoch": 3.410692588092345, "grad_norm": 1.9608148336410522, "learning_rate": 4.206e-06, "loss": 0.3166, "step": 1405 }, { "epoch": 3.413122721749696, "grad_norm": 2.6501028537750244, "learning_rate": 4.209000000000001e-06, "loss": 0.2541, "step": 1406 }, { "epoch": 3.4155528554070473, "grad_norm": 1.971596121788025, "learning_rate": 4.212e-06, "loss": 0.2468, "step": 1407 }, { "epoch": 3.4179829890643987, "grad_norm": 2.0661723613739014, "learning_rate": 4.215e-06, "loss": 0.2517, "step": 1408 }, { "epoch": 3.4204131227217496, "grad_norm": 2.5195562839508057, "learning_rate": 4.218e-06, "loss": 0.2705, "step": 1409 }, { "epoch": 3.422843256379101, "grad_norm": 2.057877540588379, "learning_rate": 4.221e-06, "loss": 0.2318, 
"step": 1410 }, { "epoch": 3.425273390036452, "grad_norm": 1.8271549940109253, "learning_rate": 4.2240000000000006e-06, "loss": 0.2375, "step": 1411 }, { "epoch": 3.427703523693803, "grad_norm": 1.8545873165130615, "learning_rate": 4.227e-06, "loss": 0.2607, "step": 1412 }, { "epoch": 3.4301336573511545, "grad_norm": 2.5197396278381348, "learning_rate": 4.229999999999999e-06, "loss": 0.2556, "step": 1413 }, { "epoch": 3.4325637910085054, "grad_norm": 1.8821656703948975, "learning_rate": 4.233e-06, "loss": 0.2486, "step": 1414 }, { "epoch": 3.4349939246658567, "grad_norm": 2.540170669555664, "learning_rate": 4.236e-06, "loss": 0.2578, "step": 1415 }, { "epoch": 3.4374240583232076, "grad_norm": 1.9620877504348755, "learning_rate": 4.239e-06, "loss": 0.2769, "step": 1416 }, { "epoch": 3.439854191980559, "grad_norm": 4.247562408447266, "learning_rate": 4.242e-06, "loss": 0.2785, "step": 1417 }, { "epoch": 3.4422843256379103, "grad_norm": 2.115781307220459, "learning_rate": 4.245e-06, "loss": 0.2901, "step": 1418 }, { "epoch": 3.444714459295261, "grad_norm": 2.5213568210601807, "learning_rate": 4.248e-06, "loss": 0.3104, "step": 1419 }, { "epoch": 3.4471445929526126, "grad_norm": 1.9135065078735352, "learning_rate": 4.251e-06, "loss": 0.244, "step": 1420 }, { "epoch": 3.4495747266099634, "grad_norm": 2.6069374084472656, "learning_rate": 4.254e-06, "loss": 0.2854, "step": 1421 }, { "epoch": 3.452004860267315, "grad_norm": 3.954155206680298, "learning_rate": 4.257e-06, "loss": 0.2878, "step": 1422 }, { "epoch": 3.4544349939246657, "grad_norm": 2.222317695617676, "learning_rate": 4.26e-06, "loss": 0.2502, "step": 1423 }, { "epoch": 3.456865127582017, "grad_norm": 2.520677328109741, "learning_rate": 4.2630000000000005e-06, "loss": 0.2899, "step": 1424 }, { "epoch": 3.459295261239368, "grad_norm": 2.1228301525115967, "learning_rate": 4.266e-06, "loss": 0.2697, "step": 1425 }, { "epoch": 3.4617253948967193, "grad_norm": 2.357494354248047, "learning_rate": 4.269e-06, "loss": 
0.2459, "step": 1426 }, { "epoch": 3.4641555285540706, "grad_norm": 2.321075201034546, "learning_rate": 4.272e-06, "loss": 0.2768, "step": 1427 }, { "epoch": 3.4665856622114215, "grad_norm": 3.0324642658233643, "learning_rate": 4.275e-06, "loss": 0.3449, "step": 1428 }, { "epoch": 3.469015795868773, "grad_norm": 4.031352519989014, "learning_rate": 4.278e-06, "loss": 0.3028, "step": 1429 }, { "epoch": 3.4714459295261237, "grad_norm": 2.297471284866333, "learning_rate": 4.281e-06, "loss": 0.2747, "step": 1430 }, { "epoch": 3.473876063183475, "grad_norm": 3.6083197593688965, "learning_rate": 4.284000000000001e-06, "loss": 0.2888, "step": 1431 }, { "epoch": 3.4763061968408264, "grad_norm": 2.520165205001831, "learning_rate": 4.287e-06, "loss": 0.2676, "step": 1432 }, { "epoch": 3.4787363304981773, "grad_norm": 2.3082621097564697, "learning_rate": 4.29e-06, "loss": 0.2713, "step": 1433 }, { "epoch": 3.4811664641555287, "grad_norm": 3.4308948516845703, "learning_rate": 4.293e-06, "loss": 0.3815, "step": 1434 }, { "epoch": 3.4835965978128796, "grad_norm": 2.7119197845458984, "learning_rate": 4.296e-06, "loss": 0.2707, "step": 1435 }, { "epoch": 3.486026731470231, "grad_norm": 4.229110240936279, "learning_rate": 4.299000000000001e-06, "loss": 0.4281, "step": 1436 }, { "epoch": 3.4884568651275822, "grad_norm": 3.001671075820923, "learning_rate": 4.3020000000000005e-06, "loss": 1.0258, "step": 1437 }, { "epoch": 3.490886998784933, "grad_norm": 2.3685147762298584, "learning_rate": 4.3049999999999994e-06, "loss": 0.7195, "step": 1438 }, { "epoch": 3.4933171324422845, "grad_norm": 5.387795925140381, "learning_rate": 4.308e-06, "loss": 0.6236, "step": 1439 }, { "epoch": 3.4957472660996354, "grad_norm": 2.4780988693237305, "learning_rate": 4.311e-06, "loss": 0.6777, "step": 1440 }, { "epoch": 3.4981773997569867, "grad_norm": 2.0697131156921387, "learning_rate": 4.3140000000000005e-06, "loss": 0.5452, "step": 1441 }, { "epoch": 3.500607533414338, "grad_norm": 2.5698294639587402, 
"learning_rate": 4.317e-06, "loss": 0.4364, "step": 1442 }, { "epoch": 3.503037667071689, "grad_norm": 2.9762790203094482, "learning_rate": 4.32e-06, "loss": 0.3538, "step": 1443 }, { "epoch": 3.50546780072904, "grad_norm": 2.076737642288208, "learning_rate": 4.323e-06, "loss": 0.2836, "step": 1444 }, { "epoch": 3.507897934386391, "grad_norm": 1.8715184926986694, "learning_rate": 4.326e-06, "loss": 0.3212, "step": 1445 }, { "epoch": 3.5103280680437425, "grad_norm": 4.614205837249756, "learning_rate": 4.329e-06, "loss": 0.2716, "step": 1446 }, { "epoch": 3.5127582017010934, "grad_norm": 3.4961299896240234, "learning_rate": 4.332e-06, "loss": 0.3087, "step": 1447 }, { "epoch": 3.5151883353584448, "grad_norm": 2.024791955947876, "learning_rate": 4.335e-06, "loss": 0.262, "step": 1448 }, { "epoch": 3.5176184690157957, "grad_norm": 3.5721957683563232, "learning_rate": 4.338000000000001e-06, "loss": 0.2944, "step": 1449 }, { "epoch": 3.520048602673147, "grad_norm": 5.1159281730651855, "learning_rate": 4.341e-06, "loss": 0.2896, "step": 1450 }, { "epoch": 3.5224787363304984, "grad_norm": 2.2629528045654297, "learning_rate": 4.344e-06, "loss": 0.2308, "step": 1451 }, { "epoch": 3.5249088699878492, "grad_norm": 2.1645171642303467, "learning_rate": 4.347e-06, "loss": 0.2923, "step": 1452 }, { "epoch": 3.5273390036452006, "grad_norm": 2.734567880630493, "learning_rate": 4.35e-06, "loss": 0.3553, "step": 1453 }, { "epoch": 3.5297691373025515, "grad_norm": 1.6842414140701294, "learning_rate": 4.3530000000000005e-06, "loss": 0.3348, "step": 1454 }, { "epoch": 3.532199270959903, "grad_norm": 1.6494600772857666, "learning_rate": 4.356e-06, "loss": 0.2441, "step": 1455 }, { "epoch": 3.534629404617254, "grad_norm": 2.094094753265381, "learning_rate": 4.359e-06, "loss": 0.252, "step": 1456 }, { "epoch": 3.537059538274605, "grad_norm": 1.4743338823318481, "learning_rate": 4.362e-06, "loss": 0.224, "step": 1457 }, { "epoch": 3.5394896719319564, "grad_norm": 3.0043134689331055, 
"learning_rate": 4.365e-06, "loss": 0.3451, "step": 1458 }, { "epoch": 3.5419198055893073, "grad_norm": 2.3353962898254395, "learning_rate": 4.368e-06, "loss": 0.2952, "step": 1459 }, { "epoch": 3.5443499392466586, "grad_norm": 1.560339093208313, "learning_rate": 4.371e-06, "loss": 0.2166, "step": 1460 }, { "epoch": 3.54678007290401, "grad_norm": 1.944671392440796, "learning_rate": 4.374000000000001e-06, "loss": 0.2401, "step": 1461 }, { "epoch": 3.549210206561361, "grad_norm": 2.1660192012786865, "learning_rate": 4.377e-06, "loss": 0.2491, "step": 1462 }, { "epoch": 3.5516403402187118, "grad_norm": 3.754913330078125, "learning_rate": 4.3799999999999996e-06, "loss": 0.2467, "step": 1463 }, { "epoch": 3.554070473876063, "grad_norm": 1.9183053970336914, "learning_rate": 4.383e-06, "loss": 0.2851, "step": 1464 }, { "epoch": 3.5565006075334145, "grad_norm": 7.557110786437988, "learning_rate": 4.386e-06, "loss": 0.212, "step": 1465 }, { "epoch": 3.5589307411907654, "grad_norm": 2.411710023880005, "learning_rate": 4.389000000000001e-06, "loss": 0.2705, "step": 1466 }, { "epoch": 3.5613608748481167, "grad_norm": 1.8416589498519897, "learning_rate": 4.3920000000000005e-06, "loss": 0.2342, "step": 1467 }, { "epoch": 3.5637910085054676, "grad_norm": 2.3694982528686523, "learning_rate": 4.395e-06, "loss": 0.3265, "step": 1468 }, { "epoch": 3.566221142162819, "grad_norm": 2.048614978790283, "learning_rate": 4.398e-06, "loss": 0.3153, "step": 1469 }, { "epoch": 3.5686512758201703, "grad_norm": 2.028179407119751, "learning_rate": 4.401e-06, "loss": 0.2583, "step": 1470 }, { "epoch": 3.571081409477521, "grad_norm": 2.593938112258911, "learning_rate": 4.4040000000000005e-06, "loss": 0.2522, "step": 1471 }, { "epoch": 3.5735115431348725, "grad_norm": 1.9909794330596924, "learning_rate": 4.407e-06, "loss": 0.3219, "step": 1472 }, { "epoch": 3.5759416767922234, "grad_norm": 2.155956268310547, "learning_rate": 4.41e-06, "loss": 0.2664, "step": 1473 }, { "epoch": 3.5783718104495748, 
"grad_norm": 2.050661087036133, "learning_rate": 4.413000000000001e-06, "loss": 0.2492, "step": 1474 }, { "epoch": 3.580801944106926, "grad_norm": 2.5636134147644043, "learning_rate": 4.416e-06, "loss": 0.3311, "step": 1475 }, { "epoch": 3.583232077764277, "grad_norm": 1.8698439598083496, "learning_rate": 4.4189999999999995e-06, "loss": 0.2621, "step": 1476 }, { "epoch": 3.5856622114216283, "grad_norm": 2.5002057552337646, "learning_rate": 4.422e-06, "loss": 0.2937, "step": 1477 }, { "epoch": 3.5880923450789792, "grad_norm": 3.590641975402832, "learning_rate": 4.425e-06, "loss": 0.2963, "step": 1478 }, { "epoch": 3.5905224787363306, "grad_norm": 1.8234145641326904, "learning_rate": 4.428000000000001e-06, "loss": 0.2229, "step": 1479 }, { "epoch": 3.592952612393682, "grad_norm": 1.912773847579956, "learning_rate": 4.4310000000000004e-06, "loss": 0.25, "step": 1480 }, { "epoch": 3.595382746051033, "grad_norm": 2.5723259449005127, "learning_rate": 4.433999999999999e-06, "loss": 0.2679, "step": 1481 }, { "epoch": 3.5978128797083837, "grad_norm": 2.579493999481201, "learning_rate": 4.437e-06, "loss": 0.3737, "step": 1482 }, { "epoch": 3.600243013365735, "grad_norm": 3.5581765174865723, "learning_rate": 4.44e-06, "loss": 0.3649, "step": 1483 }, { "epoch": 3.6026731470230864, "grad_norm": 3.1696033477783203, "learning_rate": 4.4430000000000005e-06, "loss": 0.3015, "step": 1484 }, { "epoch": 3.6051032806804373, "grad_norm": 3.8741455078125, "learning_rate": 4.446e-06, "loss": 0.3541, "step": 1485 }, { "epoch": 3.6075334143377886, "grad_norm": 3.4001102447509766, "learning_rate": 4.449e-06, "loss": 0.3842, "step": 1486 }, { "epoch": 3.6099635479951395, "grad_norm": 4.633135795593262, "learning_rate": 4.452e-06, "loss": 0.8902, "step": 1487 }, { "epoch": 3.612393681652491, "grad_norm": 5.694637775421143, "learning_rate": 4.455e-06, "loss": 0.7441, "step": 1488 }, { "epoch": 3.614823815309842, "grad_norm": 1.9066975116729736, "learning_rate": 4.458e-06, "loss": 0.5007, 
"step": 1489 }, { "epoch": 3.617253948967193, "grad_norm": 3.494570255279541, "learning_rate": 4.461e-06, "loss": 0.6151, "step": 1490 }, { "epoch": 3.6196840826245444, "grad_norm": 3.45949387550354, "learning_rate": 4.464e-06, "loss": 0.4066, "step": 1491 }, { "epoch": 3.6221142162818953, "grad_norm": 3.2609477043151855, "learning_rate": 4.467000000000001e-06, "loss": 0.413, "step": 1492 }, { "epoch": 3.6245443499392467, "grad_norm": 1.6855485439300537, "learning_rate": 4.4699999999999996e-06, "loss": 0.2998, "step": 1493 }, { "epoch": 3.626974483596598, "grad_norm": 1.4479116201400757, "learning_rate": 4.473e-06, "loss": 0.3129, "step": 1494 }, { "epoch": 3.629404617253949, "grad_norm": 12.56561279296875, "learning_rate": 4.476e-06, "loss": 0.2562, "step": 1495 }, { "epoch": 3.6318347509113003, "grad_norm": 1.7998236417770386, "learning_rate": 4.479e-06, "loss": 0.2317, "step": 1496 }, { "epoch": 3.634264884568651, "grad_norm": 2.2252895832061768, "learning_rate": 4.4820000000000005e-06, "loss": 0.2914, "step": 1497 }, { "epoch": 3.6366950182260025, "grad_norm": 2.232365369796753, "learning_rate": 4.485e-06, "loss": 0.3051, "step": 1498 }, { "epoch": 3.639125151883354, "grad_norm": 1.291721224784851, "learning_rate": 4.488e-06, "loss": 0.2361, "step": 1499 }, { "epoch": 3.6415552855407047, "grad_norm": 1.5375502109527588, "learning_rate": 4.491e-06, "loss": 0.2275, "step": 1500 }, { "epoch": 3.6439854191980556, "grad_norm": 1.8030821084976196, "learning_rate": 4.494e-06, "loss": 0.2756, "step": 1501 }, { "epoch": 3.646415552855407, "grad_norm": 2.1909966468811035, "learning_rate": 4.497e-06, "loss": 0.2597, "step": 1502 }, { "epoch": 3.6488456865127583, "grad_norm": 1.65114426612854, "learning_rate": 4.5e-06, "loss": 0.255, "step": 1503 }, { "epoch": 3.651275820170109, "grad_norm": 1.6610959768295288, "learning_rate": 4.503000000000001e-06, "loss": 0.2257, "step": 1504 }, { "epoch": 3.6537059538274606, "grad_norm": 1.3305914402008057, "learning_rate": 4.506e-06, 
"loss": 0.2825, "step": 1505 }, { "epoch": 3.6561360874848114, "grad_norm": 1.773088812828064, "learning_rate": 4.5089999999999995e-06, "loss": 0.3118, "step": 1506 }, { "epoch": 3.658566221142163, "grad_norm": 1.551355242729187, "learning_rate": 4.512e-06, "loss": 0.2386, "step": 1507 }, { "epoch": 3.660996354799514, "grad_norm": 1.8488825559616089, "learning_rate": 4.515e-06, "loss": 0.2403, "step": 1508 }, { "epoch": 3.663426488456865, "grad_norm": 2.7082407474517822, "learning_rate": 4.518000000000001e-06, "loss": 0.2383, "step": 1509 }, { "epoch": 3.6658566221142164, "grad_norm": 1.4943426847457886, "learning_rate": 4.521e-06, "loss": 0.2293, "step": 1510 }, { "epoch": 3.6682867557715673, "grad_norm": 2.384434461593628, "learning_rate": 4.524e-06, "loss": 0.253, "step": 1511 }, { "epoch": 3.6707168894289186, "grad_norm": 1.786576747894287, "learning_rate": 4.527e-06, "loss": 0.3412, "step": 1512 }, { "epoch": 3.67314702308627, "grad_norm": 1.9114006757736206, "learning_rate": 4.53e-06, "loss": 0.247, "step": 1513 }, { "epoch": 3.675577156743621, "grad_norm": 1.7760275602340698, "learning_rate": 4.5330000000000005e-06, "loss": 0.2453, "step": 1514 }, { "epoch": 3.678007290400972, "grad_norm": 2.0147202014923096, "learning_rate": 4.536e-06, "loss": 0.2172, "step": 1515 }, { "epoch": 3.680437424058323, "grad_norm": 1.5592120885849, "learning_rate": 4.539e-06, "loss": 0.2148, "step": 1516 }, { "epoch": 3.6828675577156744, "grad_norm": 1.8135122060775757, "learning_rate": 4.542000000000001e-06, "loss": 0.2291, "step": 1517 }, { "epoch": 3.6852976913730258, "grad_norm": 1.727003812789917, "learning_rate": 4.545e-06, "loss": 0.2022, "step": 1518 }, { "epoch": 3.6877278250303767, "grad_norm": 1.504385232925415, "learning_rate": 4.548e-06, "loss": 0.2464, "step": 1519 }, { "epoch": 3.6901579586877276, "grad_norm": 1.8358674049377441, "learning_rate": 4.551e-06, "loss": 0.2947, "step": 1520 }, { "epoch": 3.692588092345079, "grad_norm": 4.284717559814453, 
"learning_rate": 4.554e-06, "loss": 0.2468, "step": 1521 }, { "epoch": 3.6950182260024302, "grad_norm": 1.8679287433624268, "learning_rate": 4.557000000000001e-06, "loss": 0.2799, "step": 1522 }, { "epoch": 3.697448359659781, "grad_norm": 1.6816794872283936, "learning_rate": 4.56e-06, "loss": 0.1973, "step": 1523 }, { "epoch": 3.6998784933171325, "grad_norm": 2.5337889194488525, "learning_rate": 4.563e-06, "loss": 0.3153, "step": 1524 }, { "epoch": 3.7023086269744834, "grad_norm": 2.644315719604492, "learning_rate": 4.566e-06, "loss": 0.2735, "step": 1525 }, { "epoch": 3.7047387606318347, "grad_norm": 2.5981924533843994, "learning_rate": 4.569e-06, "loss": 0.3141, "step": 1526 }, { "epoch": 3.707168894289186, "grad_norm": 2.15722918510437, "learning_rate": 4.5720000000000004e-06, "loss": 0.2822, "step": 1527 }, { "epoch": 3.709599027946537, "grad_norm": 2.158390998840332, "learning_rate": 4.575e-06, "loss": 0.3008, "step": 1528 }, { "epoch": 3.7120291616038883, "grad_norm": 2.1464197635650635, "learning_rate": 4.578000000000001e-06, "loss": 0.2956, "step": 1529 }, { "epoch": 3.714459295261239, "grad_norm": 1.7620192766189575, "learning_rate": 4.581e-06, "loss": 0.2778, "step": 1530 }, { "epoch": 3.7168894289185905, "grad_norm": 3.2140893936157227, "learning_rate": 4.584e-06, "loss": 0.2956, "step": 1531 }, { "epoch": 3.719319562575942, "grad_norm": 2.1044209003448486, "learning_rate": 4.587e-06, "loss": 0.2781, "step": 1532 }, { "epoch": 3.7217496962332928, "grad_norm": 2.1493213176727295, "learning_rate": 4.59e-06, "loss": 0.22, "step": 1533 }, { "epoch": 3.724179829890644, "grad_norm": 2.7268457412719727, "learning_rate": 4.593000000000001e-06, "loss": 0.339, "step": 1534 }, { "epoch": 3.726609963547995, "grad_norm": 2.6760380268096924, "learning_rate": 4.5960000000000006e-06, "loss": 0.2748, "step": 1535 }, { "epoch": 3.7290400972053463, "grad_norm": 5.229881763458252, "learning_rate": 4.5989999999999995e-06, "loss": 0.5197, "step": 1536 }, { "epoch": 
3.7314702308626977, "grad_norm": 3.3976452350616455, "learning_rate": 4.602e-06, "loss": 0.784, "step": 1537 }, { "epoch": 3.7339003645200486, "grad_norm": 3.3200042247772217, "learning_rate": 4.605e-06, "loss": 0.6353, "step": 1538 }, { "epoch": 3.7363304981773995, "grad_norm": 5.655397415161133, "learning_rate": 4.608e-06, "loss": 0.6301, "step": 1539 }, { "epoch": 3.738760631834751, "grad_norm": 1.7299044132232666, "learning_rate": 4.611e-06, "loss": 0.4903, "step": 1540 }, { "epoch": 3.741190765492102, "grad_norm": 2.8534762859344482, "learning_rate": 4.614e-06, "loss": 0.4595, "step": 1541 }, { "epoch": 3.743620899149453, "grad_norm": 2.860318422317505, "learning_rate": 4.617e-06, "loss": 0.3739, "step": 1542 }, { "epoch": 3.7460510328068044, "grad_norm": 2.6479616165161133, "learning_rate": 4.62e-06, "loss": 0.3474, "step": 1543 }, { "epoch": 3.7484811664641553, "grad_norm": 1.4509981870651245, "learning_rate": 4.623e-06, "loss": 0.2875, "step": 1544 }, { "epoch": 3.7509113001215066, "grad_norm": 3.2458784580230713, "learning_rate": 4.626e-06, "loss": 0.3339, "step": 1545 }, { "epoch": 3.753341433778858, "grad_norm": 2.5539071559906006, "learning_rate": 4.629e-06, "loss": 0.3089, "step": 1546 }, { "epoch": 3.755771567436209, "grad_norm": 2.510977268218994, "learning_rate": 4.632000000000001e-06, "loss": 0.2829, "step": 1547 }, { "epoch": 3.75820170109356, "grad_norm": 3.127358913421631, "learning_rate": 4.635e-06, "loss": 0.2485, "step": 1548 }, { "epoch": 3.760631834750911, "grad_norm": 1.7254531383514404, "learning_rate": 4.6379999999999995e-06, "loss": 0.2413, "step": 1549 }, { "epoch": 3.7630619684082625, "grad_norm": 1.5774308443069458, "learning_rate": 4.641e-06, "loss": 0.2703, "step": 1550 }, { "epoch": 3.765492102065614, "grad_norm": 1.7420504093170166, "learning_rate": 4.644e-06, "loss": 0.2615, "step": 1551 }, { "epoch": 3.7679222357229647, "grad_norm": 1.6576149463653564, "learning_rate": 4.6470000000000006e-06, "loss": 0.2617, "step": 1552 }, { 
"epoch": 3.770352369380316, "grad_norm": 1.6151537895202637, "learning_rate": 4.65e-06, "loss": 0.2791, "step": 1553 }, { "epoch": 3.772782503037667, "grad_norm": 2.02516508102417, "learning_rate": 4.653e-06, "loss": 0.2088, "step": 1554 }, { "epoch": 3.7752126366950183, "grad_norm": 1.5890021324157715, "learning_rate": 4.656e-06, "loss": 0.2301, "step": 1555 }, { "epoch": 3.7776427703523696, "grad_norm": 2.7520248889923096, "learning_rate": 4.659e-06, "loss": 0.2146, "step": 1556 }, { "epoch": 3.7800729040097205, "grad_norm": 1.342589020729065, "learning_rate": 4.6620000000000004e-06, "loss": 0.1971, "step": 1557 }, { "epoch": 3.782503037667072, "grad_norm": 2.234562635421753, "learning_rate": 4.665e-06, "loss": 0.2115, "step": 1558 }, { "epoch": 3.7849331713244228, "grad_norm": 1.5016874074935913, "learning_rate": 4.668e-06, "loss": 0.2194, "step": 1559 }, { "epoch": 3.787363304981774, "grad_norm": 3.2617037296295166, "learning_rate": 4.671000000000001e-06, "loss": 0.1896, "step": 1560 }, { "epoch": 3.789793438639125, "grad_norm": 1.4841028451919556, "learning_rate": 4.674e-06, "loss": 0.2059, "step": 1561 }, { "epoch": 3.7922235722964763, "grad_norm": 1.7997411489486694, "learning_rate": 4.677e-06, "loss": 0.2393, "step": 1562 }, { "epoch": 3.7946537059538272, "grad_norm": 1.8655977249145508, "learning_rate": 4.68e-06, "loss": 0.2096, "step": 1563 }, { "epoch": 3.7970838396111786, "grad_norm": 5.121084690093994, "learning_rate": 4.683e-06, "loss": 0.2521, "step": 1564 }, { "epoch": 3.79951397326853, "grad_norm": 1.6578811407089233, "learning_rate": 4.6860000000000005e-06, "loss": 0.1988, "step": 1565 }, { "epoch": 3.801944106925881, "grad_norm": 2.0233490467071533, "learning_rate": 4.689e-06, "loss": 0.2451, "step": 1566 }, { "epoch": 3.804374240583232, "grad_norm": 4.112019062042236, "learning_rate": 4.692e-06, "loss": 0.2399, "step": 1567 }, { "epoch": 3.806804374240583, "grad_norm": 2.1396050453186035, "learning_rate": 4.695e-06, "loss": 0.2083, "step": 1568 
}, { "epoch": 3.8092345078979344, "grad_norm": 1.4892563819885254, "learning_rate": 4.698e-06, "loss": 0.2162, "step": 1569 }, { "epoch": 3.8116646415552857, "grad_norm": 2.917013645172119, "learning_rate": 4.701e-06, "loss": 0.2932, "step": 1570 }, { "epoch": 3.8140947752126366, "grad_norm": 3.1253905296325684, "learning_rate": 4.704e-06, "loss": 0.3124, "step": 1571 }, { "epoch": 3.816524908869988, "grad_norm": 2.8988139629364014, "learning_rate": 4.707000000000001e-06, "loss": 0.2563, "step": 1572 }, { "epoch": 3.818955042527339, "grad_norm": 1.6377811431884766, "learning_rate": 4.71e-06, "loss": 0.212, "step": 1573 }, { "epoch": 3.82138517618469, "grad_norm": 1.7875412702560425, "learning_rate": 4.713e-06, "loss": 0.2606, "step": 1574 }, { "epoch": 3.8238153098420415, "grad_norm": 2.129967212677002, "learning_rate": 4.716e-06, "loss": 0.2434, "step": 1575 }, { "epoch": 3.8262454434993924, "grad_norm": 2.6740431785583496, "learning_rate": 4.719e-06, "loss": 0.2489, "step": 1576 }, { "epoch": 3.828675577156744, "grad_norm": 1.9921178817749023, "learning_rate": 4.722000000000001e-06, "loss": 0.2846, "step": 1577 }, { "epoch": 3.8311057108140947, "grad_norm": 2.1628503799438477, "learning_rate": 4.7250000000000005e-06, "loss": 0.2572, "step": 1578 }, { "epoch": 3.833535844471446, "grad_norm": 2.3979837894439697, "learning_rate": 4.7279999999999995e-06, "loss": 0.2187, "step": 1579 }, { "epoch": 3.8359659781287974, "grad_norm": 2.088470935821533, "learning_rate": 4.731e-06, "loss": 0.2379, "step": 1580 }, { "epoch": 3.8383961117861483, "grad_norm": 2.4790802001953125, "learning_rate": 4.734e-06, "loss": 0.3087, "step": 1581 }, { "epoch": 3.840826245443499, "grad_norm": 1.9745177030563354, "learning_rate": 4.7370000000000006e-06, "loss": 0.2293, "step": 1582 }, { "epoch": 3.8432563791008505, "grad_norm": 3.942844867706299, "learning_rate": 4.74e-06, "loss": 0.2715, "step": 1583 }, { "epoch": 3.845686512758202, "grad_norm": 2.721454620361328, "learning_rate": 
4.743e-06, "loss": 0.3019, "step": 1584 }, { "epoch": 3.8481166464155527, "grad_norm": 3.1269404888153076, "learning_rate": 4.746e-06, "loss": 0.3358, "step": 1585 }, { "epoch": 3.850546780072904, "grad_norm": 7.284228324890137, "learning_rate": 4.749e-06, "loss": 0.4898, "step": 1586 }, { "epoch": 3.852976913730255, "grad_norm": 5.6373443603515625, "learning_rate": 4.752e-06, "loss": 0.8787, "step": 1587 }, { "epoch": 3.8554070473876063, "grad_norm": 3.8329145908355713, "learning_rate": 4.755e-06, "loss": 0.7243, "step": 1588 }, { "epoch": 3.8578371810449577, "grad_norm": 3.2410359382629395, "learning_rate": 4.758e-06, "loss": 0.5915, "step": 1589 }, { "epoch": 3.8602673147023085, "grad_norm": 4.630851745605469, "learning_rate": 4.761000000000001e-06, "loss": 0.5498, "step": 1590 }, { "epoch": 3.86269744835966, "grad_norm": 4.0952982902526855, "learning_rate": 4.764e-06, "loss": 0.4213, "step": 1591 }, { "epoch": 3.865127582017011, "grad_norm": 1.8788973093032837, "learning_rate": 4.767e-06, "loss": 0.4051, "step": 1592 }, { "epoch": 3.867557715674362, "grad_norm": 1.5706899166107178, "learning_rate": 4.77e-06, "loss": 0.3944, "step": 1593 }, { "epoch": 3.8699878493317135, "grad_norm": 1.4180454015731812, "learning_rate": 4.773e-06, "loss": 0.3309, "step": 1594 }, { "epoch": 3.8724179829890644, "grad_norm": 1.8142223358154297, "learning_rate": 4.7760000000000005e-06, "loss": 0.297, "step": 1595 }, { "epoch": 3.8748481166464157, "grad_norm": 1.5140457153320312, "learning_rate": 4.779e-06, "loss": 0.2233, "step": 1596 }, { "epoch": 3.8772782503037666, "grad_norm": 1.7824970483779907, "learning_rate": 4.782e-06, "loss": 0.2813, "step": 1597 }, { "epoch": 3.879708383961118, "grad_norm": 1.6344701051712036, "learning_rate": 4.785e-06, "loss": 0.218, "step": 1598 }, { "epoch": 3.8821385176184693, "grad_norm": 1.4496451616287231, "learning_rate": 4.788e-06, "loss": 0.1923, "step": 1599 }, { "epoch": 3.88456865127582, "grad_norm": 1.6140903234481812, "learning_rate": 
4.791e-06, "loss": 0.2786, "step": 1600 }, { "epoch": 3.886998784933171, "grad_norm": 1.8923370838165283, "learning_rate": 4.794e-06, "loss": 0.2702, "step": 1601 }, { "epoch": 3.8894289185905224, "grad_norm": 1.8905396461486816, "learning_rate": 4.797e-06, "loss": 0.2307, "step": 1602 }, { "epoch": 3.8918590522478738, "grad_norm": 1.867226243019104, "learning_rate": 4.800000000000001e-06, "loss": 0.1783, "step": 1603 }, { "epoch": 3.8942891859052247, "grad_norm": 2.5952038764953613, "learning_rate": 4.803e-06, "loss": 0.26, "step": 1604 }, { "epoch": 3.896719319562576, "grad_norm": 1.5629022121429443, "learning_rate": 4.806e-06, "loss": 0.212, "step": 1605 }, { "epoch": 3.899149453219927, "grad_norm": 1.5017707347869873, "learning_rate": 4.809e-06, "loss": 0.2171, "step": 1606 }, { "epoch": 3.9015795868772782, "grad_norm": 1.3841748237609863, "learning_rate": 4.812e-06, "loss": 0.2141, "step": 1607 }, { "epoch": 3.9040097205346296, "grad_norm": 1.288071870803833, "learning_rate": 4.8150000000000005e-06, "loss": 0.1628, "step": 1608 }, { "epoch": 3.9064398541919805, "grad_norm": 2.9336445331573486, "learning_rate": 4.818e-06, "loss": 0.2572, "step": 1609 }, { "epoch": 3.908869987849332, "grad_norm": 1.419750452041626, "learning_rate": 4.821e-06, "loss": 0.2049, "step": 1610 }, { "epoch": 3.9113001215066827, "grad_norm": 1.251624584197998, "learning_rate": 4.824e-06, "loss": 0.1736, "step": 1611 }, { "epoch": 3.913730255164034, "grad_norm": 2.190908193588257, "learning_rate": 4.827e-06, "loss": 0.2471, "step": 1612 }, { "epoch": 3.9161603888213854, "grad_norm": 1.6615674495697021, "learning_rate": 4.83e-06, "loss": 0.2334, "step": 1613 }, { "epoch": 3.9185905224787363, "grad_norm": 1.5896936655044556, "learning_rate": 4.833e-06, "loss": 0.2064, "step": 1614 }, { "epoch": 3.9210206561360876, "grad_norm": 2.080543041229248, "learning_rate": 4.836000000000001e-06, "loss": 0.2801, "step": 1615 }, { "epoch": 3.9234507897934385, "grad_norm": 1.785494089126587, 
"learning_rate": 4.839e-06, "loss": 0.2198, "step": 1616 }, { "epoch": 3.92588092345079, "grad_norm": 1.715135931968689, "learning_rate": 4.8419999999999996e-06, "loss": 0.2334, "step": 1617 }, { "epoch": 3.928311057108141, "grad_norm": 2.234251022338867, "learning_rate": 4.845e-06, "loss": 0.2242, "step": 1618 }, { "epoch": 3.930741190765492, "grad_norm": 2.3580336570739746, "learning_rate": 4.848e-06, "loss": 0.242, "step": 1619 }, { "epoch": 3.933171324422843, "grad_norm": 1.955499529838562, "learning_rate": 4.851000000000001e-06, "loss": 0.2297, "step": 1620 }, { "epoch": 3.9356014580801943, "grad_norm": 1.6946269273757935, "learning_rate": 4.8540000000000005e-06, "loss": 0.2716, "step": 1621 }, { "epoch": 3.9380315917375457, "grad_norm": 1.7606561183929443, "learning_rate": 4.856999999999999e-06, "loss": 0.2435, "step": 1622 }, { "epoch": 3.9404617253948966, "grad_norm": 2.0043540000915527, "learning_rate": 4.86e-06, "loss": 0.2913, "step": 1623 }, { "epoch": 3.942891859052248, "grad_norm": 5.668542385101318, "learning_rate": 4.863e-06, "loss": 0.2157, "step": 1624 }, { "epoch": 3.945321992709599, "grad_norm": 2.1096341609954834, "learning_rate": 4.8660000000000005e-06, "loss": 0.2317, "step": 1625 }, { "epoch": 3.94775212636695, "grad_norm": 1.945953607559204, "learning_rate": 4.869e-06, "loss": 0.2209, "step": 1626 }, { "epoch": 3.9501822600243015, "grad_norm": 2.4760451316833496, "learning_rate": 4.872e-06, "loss": 0.1965, "step": 1627 }, { "epoch": 3.9526123936816524, "grad_norm": 1.937138319015503, "learning_rate": 4.875e-06, "loss": 0.2471, "step": 1628 }, { "epoch": 3.9550425273390037, "grad_norm": 1.932438611984253, "learning_rate": 4.878e-06, "loss": 0.1846, "step": 1629 }, { "epoch": 3.9574726609963546, "grad_norm": 2.0603599548339844, "learning_rate": 4.881e-06, "loss": 0.2394, "step": 1630 }, { "epoch": 3.959902794653706, "grad_norm": 4.214655876159668, "learning_rate": 4.884e-06, "loss": 0.3068, "step": 1631 }, { "epoch": 3.9623329283110573, 
"grad_norm": 2.162440061569214, "learning_rate": 4.887e-06, "loss": 0.2438, "step": 1632 }, { "epoch": 3.964763061968408, "grad_norm": 2.6595020294189453, "learning_rate": 4.890000000000001e-06, "loss": 0.318, "step": 1633 }, { "epoch": 3.9671931956257596, "grad_norm": 3.8510773181915283, "learning_rate": 4.8929999999999996e-06, "loss": 0.2947, "step": 1634 }, { "epoch": 3.9696233292831105, "grad_norm": 5.403232574462891, "learning_rate": 4.896e-06, "loss": 0.3065, "step": 1635 }, { "epoch": 3.972053462940462, "grad_norm": 4.334510326385498, "learning_rate": 4.899e-06, "loss": 0.446, "step": 1636 }, { "epoch": 3.974483596597813, "grad_norm": 4.946133613586426, "learning_rate": 4.902e-06, "loss": 0.6664, "step": 1637 }, { "epoch": 3.976913730255164, "grad_norm": 2.528921127319336, "learning_rate": 4.9050000000000005e-06, "loss": 0.3685, "step": 1638 }, { "epoch": 3.979343863912515, "grad_norm": 2.124689817428589, "learning_rate": 4.908e-06, "loss": 0.2533, "step": 1639 }, { "epoch": 3.9817739975698663, "grad_norm": 2.1054625511169434, "learning_rate": 4.911e-06, "loss": 0.228, "step": 1640 }, { "epoch": 3.9842041312272176, "grad_norm": 2.0322821140289307, "learning_rate": 4.914e-06, "loss": 0.2298, "step": 1641 }, { "epoch": 3.9866342648845685, "grad_norm": 1.9720362424850464, "learning_rate": 4.917e-06, "loss": 0.2971, "step": 1642 }, { "epoch": 3.98906439854192, "grad_norm": 2.058056116104126, "learning_rate": 4.92e-06, "loss": 0.2144, "step": 1643 }, { "epoch": 3.9914945321992708, "grad_norm": 2.2673676013946533, "learning_rate": 4.923e-06, "loss": 0.2601, "step": 1644 }, { "epoch": 3.993924665856622, "grad_norm": 2.089221239089966, "learning_rate": 4.926000000000001e-06, "loss": 0.231, "step": 1645 }, { "epoch": 3.9963547995139734, "grad_norm": 9.859814643859863, "learning_rate": 4.929000000000001e-06, "loss": 0.2199, "step": 1646 }, { "epoch": 3.9987849331713243, "grad_norm": 5.170797824859619, "learning_rate": 4.9319999999999995e-06, "loss": 0.2866, "step": 
1647 }, { "epoch": 4.0, "grad_norm": 2.47273588180542, "learning_rate": 4.935e-06, "loss": 0.2219, "step": 1648 }, { "epoch": 4.002430133657351, "grad_norm": 7.494588375091553, "learning_rate": 4.938e-06, "loss": 0.802, "step": 1649 }, { "epoch": 4.004860267314703, "grad_norm": 4.516654014587402, "learning_rate": 4.941000000000001e-06, "loss": 0.6913, "step": 1650 }, { "epoch": 4.007290400972053, "grad_norm": 2.3543734550476074, "learning_rate": 4.9440000000000004e-06, "loss": 0.5613, "step": 1651 }, { "epoch": 4.0097205346294045, "grad_norm": 2.522566795349121, "learning_rate": 4.947e-06, "loss": 0.543, "step": 1652 }, { "epoch": 4.012150668286756, "grad_norm": 5.597517967224121, "learning_rate": 4.95e-06, "loss": 0.3957, "step": 1653 }, { "epoch": 4.014580801944107, "grad_norm": 4.3787407875061035, "learning_rate": 4.953e-06, "loss": 0.3742, "step": 1654 }, { "epoch": 4.0170109356014585, "grad_norm": 2.5416100025177, "learning_rate": 4.9560000000000005e-06, "loss": 0.3108, "step": 1655 }, { "epoch": 4.019441069258809, "grad_norm": 1.5872336626052856, "learning_rate": 4.959e-06, "loss": 0.2491, "step": 1656 }, { "epoch": 4.02187120291616, "grad_norm": 1.818698763847351, "learning_rate": 4.962e-06, "loss": 0.2845, "step": 1657 }, { "epoch": 4.024301336573512, "grad_norm": 1.704495906829834, "learning_rate": 4.965000000000001e-06, "loss": 0.2878, "step": 1658 }, { "epoch": 4.026731470230863, "grad_norm": 2.7189252376556396, "learning_rate": 4.968e-06, "loss": 0.2501, "step": 1659 }, { "epoch": 4.029161603888213, "grad_norm": 1.3408466577529907, "learning_rate": 4.9709999999999995e-06, "loss": 0.2358, "step": 1660 }, { "epoch": 4.031591737545565, "grad_norm": 1.681044578552246, "learning_rate": 4.974e-06, "loss": 0.2309, "step": 1661 }, { "epoch": 4.034021871202916, "grad_norm": 1.5330641269683838, "learning_rate": 4.977e-06, "loss": 0.1972, "step": 1662 }, { "epoch": 4.0364520048602675, "grad_norm": 1.3965768814086914, "learning_rate": 4.980000000000001e-06, "loss": 
0.1898, "step": 1663 }, { "epoch": 4.038882138517619, "grad_norm": 1.3735432624816895, "learning_rate": 4.983e-06, "loss": 0.2144, "step": 1664 }, { "epoch": 4.041312272174969, "grad_norm": 1.7565068006515503, "learning_rate": 4.985999999999999e-06, "loss": 0.1792, "step": 1665 }, { "epoch": 4.043742405832321, "grad_norm": 1.3186124563217163, "learning_rate": 4.989e-06, "loss": 0.1814, "step": 1666 }, { "epoch": 4.046172539489672, "grad_norm": 2.069122791290283, "learning_rate": 4.992e-06, "loss": 0.2131, "step": 1667 }, { "epoch": 4.048602673147023, "grad_norm": 1.4072562456130981, "learning_rate": 4.9950000000000005e-06, "loss": 0.1821, "step": 1668 }, { "epoch": 4.051032806804375, "grad_norm": 1.8426507711410522, "learning_rate": 4.998e-06, "loss": 0.2326, "step": 1669 }, { "epoch": 4.053462940461725, "grad_norm": 2.004849910736084, "learning_rate": 5.001e-06, "loss": 0.2309, "step": 1670 }, { "epoch": 4.055893074119076, "grad_norm": 1.1780518293380737, "learning_rate": 5.004e-06, "loss": 0.1809, "step": 1671 }, { "epoch": 4.058323207776428, "grad_norm": 2.513725757598877, "learning_rate": 5.007e-06, "loss": 0.227, "step": 1672 }, { "epoch": 4.060753341433779, "grad_norm": 1.3677433729171753, "learning_rate": 5.01e-06, "loss": 0.1641, "step": 1673 }, { "epoch": 4.06318347509113, "grad_norm": 1.8106915950775146, "learning_rate": 5.013e-06, "loss": 0.1811, "step": 1674 }, { "epoch": 4.065613608748481, "grad_norm": 1.3781343698501587, "learning_rate": 5.016e-06, "loss": 0.2163, "step": 1675 }, { "epoch": 4.068043742405832, "grad_norm": 1.4118961095809937, "learning_rate": 5.0190000000000006e-06, "loss": 0.193, "step": 1676 }, { "epoch": 4.070473876063184, "grad_norm": 1.9015134572982788, "learning_rate": 5.0219999999999995e-06, "loss": 0.2083, "step": 1677 }, { "epoch": 4.072904009720535, "grad_norm": 1.6386663913726807, "learning_rate": 5.025e-06, "loss": 0.1856, "step": 1678 }, { "epoch": 4.075334143377885, "grad_norm": 1.3059157133102417, "learning_rate": 
5.028e-06, "loss": 0.1621, "step": 1679 }, { "epoch": 4.077764277035237, "grad_norm": 1.90481436252594, "learning_rate": 5.031e-06, "loss": 0.24, "step": 1680 }, { "epoch": 4.080194410692588, "grad_norm": 1.357619285583496, "learning_rate": 5.034e-06, "loss": 0.1381, "step": 1681 }, { "epoch": 4.082624544349939, "grad_norm": 1.570247769355774, "learning_rate": 5.037e-06, "loss": 0.1896, "step": 1682 }, { "epoch": 4.085054678007291, "grad_norm": 1.823864221572876, "learning_rate": 5.04e-06, "loss": 0.1965, "step": 1683 }, { "epoch": 4.087484811664641, "grad_norm": 1.7796754837036133, "learning_rate": 5.043e-06, "loss": 0.1894, "step": 1684 }, { "epoch": 4.0899149453219925, "grad_norm": 2.760305166244507, "learning_rate": 5.046e-06, "loss": 0.3361, "step": 1685 }, { "epoch": 4.092345078979344, "grad_norm": 2.4547641277313232, "learning_rate": 5.049e-06, "loss": 0.2374, "step": 1686 }, { "epoch": 4.094775212636695, "grad_norm": 1.6009910106658936, "learning_rate": 5.052e-06, "loss": 0.196, "step": 1687 }, { "epoch": 4.0972053462940465, "grad_norm": 1.4457600116729736, "learning_rate": 5.055000000000001e-06, "loss": 0.1916, "step": 1688 }, { "epoch": 4.099635479951397, "grad_norm": 2.0688538551330566, "learning_rate": 5.0580000000000005e-06, "loss": 0.2487, "step": 1689 }, { "epoch": 4.102065613608748, "grad_norm": 2.2532973289489746, "learning_rate": 5.0609999999999995e-06, "loss": 0.2061, "step": 1690 }, { "epoch": 4.1044957472661, "grad_norm": 2.963718891143799, "learning_rate": 5.064e-06, "loss": 0.1919, "step": 1691 }, { "epoch": 4.106925880923451, "grad_norm": 2.005188465118408, "learning_rate": 5.067e-06, "loss": 0.2197, "step": 1692 }, { "epoch": 4.109356014580802, "grad_norm": 2.4467718601226807, "learning_rate": 5.070000000000001e-06, "loss": 0.2575, "step": 1693 }, { "epoch": 4.111786148238153, "grad_norm": 1.7049946784973145, "learning_rate": 5.073e-06, "loss": 0.2365, "step": 1694 }, { "epoch": 4.114216281895504, "grad_norm": 2.4907069206237793, 
"learning_rate": 5.076e-06, "loss": 0.2664, "step": 1695 }, { "epoch": 4.1166464155528555, "grad_norm": 2.718737840652466, "learning_rate": 5.079e-06, "loss": 0.2621, "step": 1696 }, { "epoch": 4.119076549210207, "grad_norm": 3.1300859451293945, "learning_rate": 5.082e-06, "loss": 0.2862, "step": 1697 }, { "epoch": 4.121506682867557, "grad_norm": 4.405331134796143, "learning_rate": 5.0850000000000004e-06, "loss": 0.4507, "step": 1698 }, { "epoch": 4.123936816524909, "grad_norm": 2.3172099590301514, "learning_rate": 5.088e-06, "loss": 0.7608, "step": 1699 }, { "epoch": 4.12636695018226, "grad_norm": 3.369312047958374, "learning_rate": 5.091e-06, "loss": 0.6258, "step": 1700 }, { "epoch": 4.128797083839611, "grad_norm": 2.3759613037109375, "learning_rate": 5.094000000000001e-06, "loss": 0.5741, "step": 1701 }, { "epoch": 4.131227217496963, "grad_norm": 2.0789361000061035, "learning_rate": 5.097e-06, "loss": 0.5005, "step": 1702 }, { "epoch": 4.133657351154313, "grad_norm": 1.741218090057373, "learning_rate": 5.1e-06, "loss": 0.5004, "step": 1703 }, { "epoch": 4.136087484811664, "grad_norm": 4.753630638122559, "learning_rate": 5.103e-06, "loss": 0.4025, "step": 1704 }, { "epoch": 4.138517618469016, "grad_norm": 5.642798900604248, "learning_rate": 5.106e-06, "loss": 0.3695, "step": 1705 }, { "epoch": 4.140947752126367, "grad_norm": 3.8923327922821045, "learning_rate": 5.1090000000000006e-06, "loss": 0.3337, "step": 1706 }, { "epoch": 4.1433778857837185, "grad_norm": 3.328474283218384, "learning_rate": 5.112e-06, "loss": 0.3726, "step": 1707 }, { "epoch": 4.145808019441069, "grad_norm": 1.4016109704971313, "learning_rate": 5.115e-06, "loss": 0.2249, "step": 1708 }, { "epoch": 4.14823815309842, "grad_norm": 1.3586350679397583, "learning_rate": 5.118e-06, "loss": 0.2069, "step": 1709 }, { "epoch": 4.150668286755772, "grad_norm": 1.6208165884017944, "learning_rate": 5.121e-06, "loss": 0.2133, "step": 1710 }, { "epoch": 4.153098420413123, "grad_norm": 1.4725512266159058, 
"learning_rate": 5.124e-06, "loss": 0.216, "step": 1711 }, { "epoch": 4.155528554070474, "grad_norm": 1.2525608539581299, "learning_rate": 5.127e-06, "loss": 0.1894, "step": 1712 }, { "epoch": 4.157958687727825, "grad_norm": 1.6206961870193481, "learning_rate": 5.130000000000001e-06, "loss": 0.2172, "step": 1713 }, { "epoch": 4.160388821385176, "grad_norm": 1.6797181367874146, "learning_rate": 5.133e-06, "loss": 0.2136, "step": 1714 }, { "epoch": 4.162818955042527, "grad_norm": 1.800667643547058, "learning_rate": 5.136e-06, "loss": 0.2176, "step": 1715 }, { "epoch": 4.165249088699879, "grad_norm": 1.637252688407898, "learning_rate": 5.139e-06, "loss": 0.2627, "step": 1716 }, { "epoch": 4.167679222357229, "grad_norm": 1.4002078771591187, "learning_rate": 5.142e-06, "loss": 0.1871, "step": 1717 }, { "epoch": 4.1701093560145805, "grad_norm": 1.4224469661712646, "learning_rate": 5.145000000000001e-06, "loss": 0.1598, "step": 1718 }, { "epoch": 4.172539489671932, "grad_norm": 1.4677361249923706, "learning_rate": 5.1480000000000005e-06, "loss": 0.2059, "step": 1719 }, { "epoch": 4.174969623329283, "grad_norm": 1.4446908235549927, "learning_rate": 5.1509999999999995e-06, "loss": 0.1644, "step": 1720 }, { "epoch": 4.177399756986635, "grad_norm": 2.271099805831909, "learning_rate": 5.154e-06, "loss": 0.2778, "step": 1721 }, { "epoch": 4.179829890643985, "grad_norm": 1.7450958490371704, "learning_rate": 5.157e-06, "loss": 0.1867, "step": 1722 }, { "epoch": 4.182260024301336, "grad_norm": 1.6806482076644897, "learning_rate": 5.16e-06, "loss": 0.1872, "step": 1723 }, { "epoch": 4.184690157958688, "grad_norm": 2.07155442237854, "learning_rate": 5.163e-06, "loss": 0.1939, "step": 1724 }, { "epoch": 4.187120291616039, "grad_norm": 2.422714948654175, "learning_rate": 5.166e-06, "loss": 0.2001, "step": 1725 }, { "epoch": 4.18955042527339, "grad_norm": 1.6861555576324463, "learning_rate": 5.169e-06, "loss": 0.1611, "step": 1726 }, { "epoch": 4.191980558930741, "grad_norm": 
2.4591662883758545, "learning_rate": 5.172e-06, "loss": 0.2377, "step": 1727 }, { "epoch": 4.194410692588092, "grad_norm": 1.7046971321105957, "learning_rate": 5.175e-06, "loss": 0.1763, "step": 1728 }, { "epoch": 4.1968408262454435, "grad_norm": 2.1571366786956787, "learning_rate": 5.178e-06, "loss": 0.2165, "step": 1729 }, { "epoch": 4.199270959902795, "grad_norm": 1.7640998363494873, "learning_rate": 5.181e-06, "loss": 0.1858, "step": 1730 }, { "epoch": 4.201701093560146, "grad_norm": 1.9501477479934692, "learning_rate": 5.184000000000001e-06, "loss": 0.2223, "step": 1731 }, { "epoch": 4.204131227217497, "grad_norm": 1.6958156824111938, "learning_rate": 5.1870000000000005e-06, "loss": 0.2072, "step": 1732 }, { "epoch": 4.206561360874848, "grad_norm": 1.5066970586776733, "learning_rate": 5.1899999999999994e-06, "loss": 0.182, "step": 1733 }, { "epoch": 4.208991494532199, "grad_norm": 3.0253708362579346, "learning_rate": 5.193e-06, "loss": 0.2356, "step": 1734 }, { "epoch": 4.211421628189551, "grad_norm": 1.920958161354065, "learning_rate": 5.196e-06, "loss": 0.2196, "step": 1735 }, { "epoch": 4.213851761846902, "grad_norm": 2.4140853881835938, "learning_rate": 5.1990000000000005e-06, "loss": 0.1696, "step": 1736 }, { "epoch": 4.2162818955042525, "grad_norm": 1.9189488887786865, "learning_rate": 5.202e-06, "loss": 0.2068, "step": 1737 }, { "epoch": 4.218712029161604, "grad_norm": 4.0640082359313965, "learning_rate": 5.205e-06, "loss": 0.2731, "step": 1738 }, { "epoch": 4.221142162818955, "grad_norm": 1.7091023921966553, "learning_rate": 5.208e-06, "loss": 0.1841, "step": 1739 }, { "epoch": 4.2235722964763065, "grad_norm": 2.078399658203125, "learning_rate": 5.211e-06, "loss": 0.2534, "step": 1740 }, { "epoch": 4.226002430133657, "grad_norm": 2.4132285118103027, "learning_rate": 5.214e-06, "loss": 0.1853, "step": 1741 }, { "epoch": 4.228432563791008, "grad_norm": 1.9828070402145386, "learning_rate": 5.217e-06, "loss": 0.2163, "step": 1742 }, { "epoch": 
4.23086269744836, "grad_norm": 1.9420913457870483, "learning_rate": 5.22e-06, "loss": 0.1838, "step": 1743 }, { "epoch": 4.233292831105711, "grad_norm": 2.8796393871307373, "learning_rate": 5.223000000000001e-06, "loss": 0.2944, "step": 1744 }, { "epoch": 4.235722964763062, "grad_norm": 4.2451395988464355, "learning_rate": 5.226e-06, "loss": 0.242, "step": 1745 }, { "epoch": 4.238153098420413, "grad_norm": 3.2513480186462402, "learning_rate": 5.229e-06, "loss": 0.2846, "step": 1746 }, { "epoch": 4.240583232077764, "grad_norm": 3.2159340381622314, "learning_rate": 5.232e-06, "loss": 0.2584, "step": 1747 }, { "epoch": 4.2430133657351154, "grad_norm": 5.155735492706299, "learning_rate": 5.235e-06, "loss": 0.376, "step": 1748 }, { "epoch": 4.245443499392467, "grad_norm": 11.141236305236816, "learning_rate": 5.2380000000000005e-06, "loss": 0.9368, "step": 1749 }, { "epoch": 4.247873633049818, "grad_norm": 3.8541102409362793, "learning_rate": 5.241e-06, "loss": 0.6723, "step": 1750 }, { "epoch": 4.250303766707169, "grad_norm": 7.099707126617432, "learning_rate": 5.244e-06, "loss": 0.564, "step": 1751 }, { "epoch": 4.25273390036452, "grad_norm": 1.748732089996338, "learning_rate": 5.247e-06, "loss": 0.4733, "step": 1752 }, { "epoch": 4.255164034021871, "grad_norm": 1.7004426717758179, "learning_rate": 5.25e-06, "loss": 0.3595, "step": 1753 }, { "epoch": 4.257594167679223, "grad_norm": 1.4353837966918945, "learning_rate": 5.253e-06, "loss": 0.251, "step": 1754 }, { "epoch": 4.260024301336573, "grad_norm": 1.5201852321624756, "learning_rate": 5.256e-06, "loss": 0.2647, "step": 1755 }, { "epoch": 4.262454434993924, "grad_norm": 1.9534832239151, "learning_rate": 5.259000000000001e-06, "loss": 0.2353, "step": 1756 }, { "epoch": 4.264884568651276, "grad_norm": 1.203300952911377, "learning_rate": 5.262e-06, "loss": 0.2269, "step": 1757 }, { "epoch": 4.267314702308627, "grad_norm": 1.5077012777328491, "learning_rate": 5.2649999999999996e-06, "loss": 0.1933, "step": 1758 }, { 
"epoch": 4.269744835965978, "grad_norm": 1.4007681608200073, "learning_rate": 5.268e-06, "loss": 0.2374, "step": 1759 }, { "epoch": 4.272174969623329, "grad_norm": 1.5341392755508423, "learning_rate": 5.271e-06, "loss": 0.2113, "step": 1760 }, { "epoch": 4.27460510328068, "grad_norm": 6.01198673248291, "learning_rate": 5.274000000000001e-06, "loss": 0.2842, "step": 1761 }, { "epoch": 4.277035236938032, "grad_norm": 2.246304750442505, "learning_rate": 5.2770000000000005e-06, "loss": 0.2789, "step": 1762 }, { "epoch": 4.279465370595383, "grad_norm": 1.1875550746917725, "learning_rate": 5.279999999999999e-06, "loss": 0.1811, "step": 1763 }, { "epoch": 4.281895504252734, "grad_norm": 1.2873291969299316, "learning_rate": 5.283e-06, "loss": 0.1743, "step": 1764 }, { "epoch": 4.284325637910085, "grad_norm": 1.476625680923462, "learning_rate": 5.286e-06, "loss": 0.1676, "step": 1765 }, { "epoch": 4.286755771567436, "grad_norm": 1.4470710754394531, "learning_rate": 5.2890000000000005e-06, "loss": 0.2104, "step": 1766 }, { "epoch": 4.289185905224787, "grad_norm": 1.198807716369629, "learning_rate": 5.292e-06, "loss": 0.1823, "step": 1767 }, { "epoch": 4.291616038882139, "grad_norm": 1.5879220962524414, "learning_rate": 5.295e-06, "loss": 0.188, "step": 1768 }, { "epoch": 4.29404617253949, "grad_norm": 1.3011834621429443, "learning_rate": 5.298e-06, "loss": 0.1758, "step": 1769 }, { "epoch": 4.2964763061968405, "grad_norm": 1.7260212898254395, "learning_rate": 5.301e-06, "loss": 0.1876, "step": 1770 }, { "epoch": 4.298906439854192, "grad_norm": 1.7747083902359009, "learning_rate": 5.304e-06, "loss": 0.2335, "step": 1771 }, { "epoch": 4.301336573511543, "grad_norm": 1.5664809942245483, "learning_rate": 5.307e-06, "loss": 0.1853, "step": 1772 }, { "epoch": 4.3037667071688945, "grad_norm": 2.2108025550842285, "learning_rate": 5.31e-06, "loss": 0.1696, "step": 1773 }, { "epoch": 4.306196840826246, "grad_norm": 1.866366982460022, "learning_rate": 5.313000000000001e-06, "loss": 
0.2055, "step": 1774 }, { "epoch": 4.308626974483596, "grad_norm": 2.294645309448242, "learning_rate": 5.3160000000000004e-06, "loss": 0.1704, "step": 1775 }, { "epoch": 4.311057108140948, "grad_norm": 2.153461217880249, "learning_rate": 5.319e-06, "loss": 0.2653, "step": 1776 }, { "epoch": 4.313487241798299, "grad_norm": 1.5174921751022339, "learning_rate": 5.322e-06, "loss": 0.1818, "step": 1777 }, { "epoch": 4.31591737545565, "grad_norm": 1.5844870805740356, "learning_rate": 5.325e-06, "loss": 0.1567, "step": 1778 }, { "epoch": 4.318347509113001, "grad_norm": 1.8066294193267822, "learning_rate": 5.3280000000000005e-06, "loss": 0.1559, "step": 1779 }, { "epoch": 4.320777642770352, "grad_norm": 2.989042282104492, "learning_rate": 5.331e-06, "loss": 0.2121, "step": 1780 }, { "epoch": 4.3232077764277035, "grad_norm": 2.0323822498321533, "learning_rate": 5.334000000000001e-06, "loss": 0.2166, "step": 1781 }, { "epoch": 4.325637910085055, "grad_norm": 4.590256214141846, "learning_rate": 5.337e-06, "loss": 0.3363, "step": 1782 }, { "epoch": 4.328068043742406, "grad_norm": 2.04875111579895, "learning_rate": 5.34e-06, "loss": 0.1758, "step": 1783 }, { "epoch": 4.330498177399757, "grad_norm": 1.5852711200714111, "learning_rate": 5.343e-06, "loss": 0.186, "step": 1784 }, { "epoch": 4.332928311057108, "grad_norm": 1.6695536375045776, "learning_rate": 5.346e-06, "loss": 0.1742, "step": 1785 }, { "epoch": 4.335358444714459, "grad_norm": 2.1438498497009277, "learning_rate": 5.349e-06, "loss": 0.1937, "step": 1786 }, { "epoch": 4.337788578371811, "grad_norm": 1.702541470527649, "learning_rate": 5.352000000000001e-06, "loss": 0.1836, "step": 1787 }, { "epoch": 4.340218712029162, "grad_norm": 1.8796508312225342, "learning_rate": 5.3549999999999996e-06, "loss": 0.1975, "step": 1788 }, { "epoch": 4.342648845686512, "grad_norm": 2.040435552597046, "learning_rate": 5.358e-06, "loss": 0.2073, "step": 1789 }, { "epoch": 4.345078979343864, "grad_norm": 2.942150831222534, 
"learning_rate": 5.361e-06, "loss": 0.2283, "step": 1790 }, { "epoch": 4.347509113001215, "grad_norm": 1.6010209321975708, "learning_rate": 5.364e-06, "loss": 0.1712, "step": 1791 }, { "epoch": 4.3499392466585665, "grad_norm": 3.1842167377471924, "learning_rate": 5.3670000000000005e-06, "loss": 0.207, "step": 1792 }, { "epoch": 4.352369380315917, "grad_norm": 2.525632858276367, "learning_rate": 5.37e-06, "loss": 0.1956, "step": 1793 }, { "epoch": 4.354799513973268, "grad_norm": 1.9679182767868042, "learning_rate": 5.373e-06, "loss": 0.2082, "step": 1794 }, { "epoch": 4.35722964763062, "grad_norm": 2.744028091430664, "learning_rate": 5.376e-06, "loss": 0.2743, "step": 1795 }, { "epoch": 4.359659781287971, "grad_norm": 3.2603776454925537, "learning_rate": 5.379e-06, "loss": 0.2982, "step": 1796 }, { "epoch": 4.362089914945322, "grad_norm": 2.5690033435821533, "learning_rate": 5.382e-06, "loss": 0.2479, "step": 1797 }, { "epoch": 4.364520048602673, "grad_norm": 4.17588472366333, "learning_rate": 5.385e-06, "loss": 0.4122, "step": 1798 }, { "epoch": 4.366950182260024, "grad_norm": 7.068970203399658, "learning_rate": 5.388000000000001e-06, "loss": 0.8077, "step": 1799 }, { "epoch": 4.369380315917375, "grad_norm": 3.1573870182037354, "learning_rate": 5.391e-06, "loss": 0.5777, "step": 1800 }, { "epoch": 4.371810449574727, "grad_norm": 1.9126935005187988, "learning_rate": 5.3939999999999995e-06, "loss": 0.421, "step": 1801 }, { "epoch": 4.374240583232078, "grad_norm": 2.239323616027832, "learning_rate": 5.397e-06, "loss": 0.4024, "step": 1802 }, { "epoch": 4.3766707168894285, "grad_norm": 3.789628744125366, "learning_rate": 5.4e-06, "loss": 0.4913, "step": 1803 }, { "epoch": 4.37910085054678, "grad_norm": 3.2793161869049072, "learning_rate": 5.403000000000001e-06, "loss": 0.311, "step": 1804 }, { "epoch": 4.381530984204131, "grad_norm": 1.85017728805542, "learning_rate": 5.406e-06, "loss": 0.2348, "step": 1805 }, { "epoch": 4.383961117861483, "grad_norm": 
2.6642096042633057, "learning_rate": 5.408999999999999e-06, "loss": 0.3134, "step": 1806 }, { "epoch": 4.386391251518834, "grad_norm": 1.7164502143859863, "learning_rate": 5.412e-06, "loss": 0.2158, "step": 1807 }, { "epoch": 4.388821385176184, "grad_norm": 1.44251549243927, "learning_rate": 5.415e-06, "loss": 0.1974, "step": 1808 }, { "epoch": 4.391251518833536, "grad_norm": 1.6780251264572144, "learning_rate": 5.4180000000000005e-06, "loss": 0.2277, "step": 1809 }, { "epoch": 4.393681652490887, "grad_norm": 1.5353238582611084, "learning_rate": 5.421e-06, "loss": 0.198, "step": 1810 }, { "epoch": 4.396111786148238, "grad_norm": 2.2592904567718506, "learning_rate": 5.424e-06, "loss": 0.1923, "step": 1811 }, { "epoch": 4.39854191980559, "grad_norm": 1.4176841974258423, "learning_rate": 5.427e-06, "loss": 0.1601, "step": 1812 }, { "epoch": 4.40097205346294, "grad_norm": 1.4912850856781006, "learning_rate": 5.43e-06, "loss": 0.2073, "step": 1813 }, { "epoch": 4.4034021871202915, "grad_norm": 1.5519843101501465, "learning_rate": 5.433e-06, "loss": 0.1868, "step": 1814 }, { "epoch": 4.405832320777643, "grad_norm": 1.277956247329712, "learning_rate": 5.436e-06, "loss": 0.1744, "step": 1815 }, { "epoch": 4.408262454434994, "grad_norm": 1.6108448505401611, "learning_rate": 5.439e-06, "loss": 0.2245, "step": 1816 }, { "epoch": 4.4106925880923455, "grad_norm": 1.4259332418441772, "learning_rate": 5.442000000000001e-06, "loss": 0.1809, "step": 1817 }, { "epoch": 4.413122721749696, "grad_norm": 2.1358728408813477, "learning_rate": 5.445e-06, "loss": 0.2126, "step": 1818 }, { "epoch": 4.415552855407047, "grad_norm": 1.3537133932113647, "learning_rate": 5.448e-06, "loss": 0.1939, "step": 1819 }, { "epoch": 4.417982989064399, "grad_norm": 1.743535041809082, "learning_rate": 5.451e-06, "loss": 0.1839, "step": 1820 }, { "epoch": 4.42041312272175, "grad_norm": 2.206594944000244, "learning_rate": 5.454e-06, "loss": 0.1706, "step": 1821 }, { "epoch": 4.4228432563791005, "grad_norm": 
1.3354880809783936, "learning_rate": 5.4570000000000004e-06, "loss": 0.1736, "step": 1822 }, { "epoch": 4.425273390036452, "grad_norm": 1.4133659601211548, "learning_rate": 5.46e-06, "loss": 0.1923, "step": 1823 }, { "epoch": 4.427703523693803, "grad_norm": 1.1884126663208008, "learning_rate": 5.463000000000001e-06, "loss": 0.1503, "step": 1824 }, { "epoch": 4.4301336573511545, "grad_norm": 1.8823184967041016, "learning_rate": 5.466e-06, "loss": 0.1899, "step": 1825 }, { "epoch": 4.432563791008506, "grad_norm": 1.5439237356185913, "learning_rate": 5.469e-06, "loss": 0.2729, "step": 1826 }, { "epoch": 4.434993924665856, "grad_norm": 1.812919020652771, "learning_rate": 5.472e-06, "loss": 0.1657, "step": 1827 }, { "epoch": 4.437424058323208, "grad_norm": 1.4969556331634521, "learning_rate": 5.475e-06, "loss": 0.1683, "step": 1828 }, { "epoch": 4.439854191980559, "grad_norm": 2.285355806350708, "learning_rate": 5.478000000000001e-06, "loss": 0.232, "step": 1829 }, { "epoch": 4.44228432563791, "grad_norm": 1.2815271615982056, "learning_rate": 5.4810000000000005e-06, "loss": 0.1549, "step": 1830 }, { "epoch": 4.444714459295262, "grad_norm": 1.4957445859909058, "learning_rate": 5.4839999999999995e-06, "loss": 0.2174, "step": 1831 }, { "epoch": 4.447144592952612, "grad_norm": 1.8503540754318237, "learning_rate": 5.487e-06, "loss": 0.2497, "step": 1832 }, { "epoch": 4.4495747266099634, "grad_norm": 1.580716609954834, "learning_rate": 5.49e-06, "loss": 0.1914, "step": 1833 }, { "epoch": 4.452004860267315, "grad_norm": 1.912794589996338, "learning_rate": 5.493000000000001e-06, "loss": 0.2242, "step": 1834 }, { "epoch": 4.454434993924666, "grad_norm": 12.854998588562012, "learning_rate": 5.496e-06, "loss": 0.349, "step": 1835 }, { "epoch": 4.456865127582017, "grad_norm": 1.878293514251709, "learning_rate": 5.499e-06, "loss": 0.1518, "step": 1836 }, { "epoch": 4.459295261239368, "grad_norm": 2.135802745819092, "learning_rate": 5.502e-06, "loss": 0.2113, "step": 1837 }, { 
"epoch": 4.461725394896719, "grad_norm": 1.9525222778320312, "learning_rate": 5.505e-06, "loss": 0.2192, "step": 1838 }, { "epoch": 4.464155528554071, "grad_norm": 1.6301546096801758, "learning_rate": 5.5080000000000005e-06, "loss": 0.209, "step": 1839 }, { "epoch": 4.466585662211422, "grad_norm": 1.9419358968734741, "learning_rate": 5.511e-06, "loss": 0.1918, "step": 1840 }, { "epoch": 4.469015795868772, "grad_norm": 1.473796010017395, "learning_rate": 5.514e-06, "loss": 0.1829, "step": 1841 }, { "epoch": 4.471445929526124, "grad_norm": 2.389822244644165, "learning_rate": 5.517000000000001e-06, "loss": 0.2538, "step": 1842 }, { "epoch": 4.473876063183475, "grad_norm": 2.123887300491333, "learning_rate": 5.52e-06, "loss": 0.2055, "step": 1843 }, { "epoch": 4.476306196840826, "grad_norm": 2.2782912254333496, "learning_rate": 5.523e-06, "loss": 0.2365, "step": 1844 }, { "epoch": 4.478736330498178, "grad_norm": 3.4686989784240723, "learning_rate": 5.526e-06, "loss": 0.2061, "step": 1845 }, { "epoch": 4.481166464155528, "grad_norm": 2.875196933746338, "learning_rate": 5.529e-06, "loss": 0.2555, "step": 1846 }, { "epoch": 4.48359659781288, "grad_norm": 3.103397846221924, "learning_rate": 5.5320000000000006e-06, "loss": 0.2754, "step": 1847 }, { "epoch": 4.486026731470231, "grad_norm": 4.411263942718506, "learning_rate": 5.535e-06, "loss": 0.3616, "step": 1848 }, { "epoch": 4.488456865127582, "grad_norm": 7.369452953338623, "learning_rate": 5.537999999999999e-06, "loss": 0.7852, "step": 1849 }, { "epoch": 4.490886998784934, "grad_norm": 1.950388789176941, "learning_rate": 5.541e-06, "loss": 0.5722, "step": 1850 }, { "epoch": 4.493317132442284, "grad_norm": 1.6450307369232178, "learning_rate": 5.544e-06, "loss": 0.5934, "step": 1851 }, { "epoch": 4.495747266099635, "grad_norm": 3.3621950149536133, "learning_rate": 5.547e-06, "loss": 0.4237, "step": 1852 }, { "epoch": 4.498177399756987, "grad_norm": 4.405115127563477, "learning_rate": 5.55e-06, "loss": 0.4212, "step": 1853 
}, { "epoch": 4.500607533414338, "grad_norm": 3.7408251762390137, "learning_rate": 5.553e-06, "loss": 0.3168, "step": 1854 }, { "epoch": 4.503037667071689, "grad_norm": 1.9411730766296387, "learning_rate": 5.556e-06, "loss": 0.3207, "step": 1855 }, { "epoch": 4.50546780072904, "grad_norm": 1.386409044265747, "learning_rate": 5.559e-06, "loss": 0.2966, "step": 1856 }, { "epoch": 4.507897934386391, "grad_norm": 1.3260694742202759, "learning_rate": 5.562e-06, "loss": 0.1893, "step": 1857 }, { "epoch": 4.5103280680437425, "grad_norm": 1.861108422279358, "learning_rate": 5.565e-06, "loss": 0.2837, "step": 1858 }, { "epoch": 4.512758201701094, "grad_norm": 2.8412926197052, "learning_rate": 5.568e-06, "loss": 0.2039, "step": 1859 }, { "epoch": 4.515188335358444, "grad_norm": 1.6438075304031372, "learning_rate": 5.5710000000000005e-06, "loss": 0.191, "step": 1860 }, { "epoch": 4.517618469015796, "grad_norm": 1.3131718635559082, "learning_rate": 5.574e-06, "loss": 0.1901, "step": 1861 }, { "epoch": 4.520048602673147, "grad_norm": 1.6882343292236328, "learning_rate": 5.577e-06, "loss": 0.1713, "step": 1862 }, { "epoch": 4.522478736330498, "grad_norm": 1.7356617450714111, "learning_rate": 5.58e-06, "loss": 0.2079, "step": 1863 }, { "epoch": 4.52490886998785, "grad_norm": 1.5087804794311523, "learning_rate": 5.583e-06, "loss": 0.2472, "step": 1864 }, { "epoch": 4.5273390036452, "grad_norm": 1.3663551807403564, "learning_rate": 5.586e-06, "loss": 0.1797, "step": 1865 }, { "epoch": 4.5297691373025515, "grad_norm": 1.4138344526290894, "learning_rate": 5.589e-06, "loss": 0.1788, "step": 1866 }, { "epoch": 4.532199270959903, "grad_norm": 1.8907309770584106, "learning_rate": 5.592000000000001e-06, "loss": 0.1745, "step": 1867 }, { "epoch": 4.534629404617254, "grad_norm": 2.156958818435669, "learning_rate": 5.595e-06, "loss": 0.1905, "step": 1868 }, { "epoch": 4.537059538274605, "grad_norm": 1.4983723163604736, "learning_rate": 5.598e-06, "loss": 0.1513, "step": 1869 }, { "epoch": 
4.539489671931956, "grad_norm": 3.322011709213257, "learning_rate": 5.601e-06, "loss": 0.2022, "step": 1870 }, { "epoch": 4.541919805589307, "grad_norm": 1.4150173664093018, "learning_rate": 5.604e-06, "loss": 0.1602, "step": 1871 }, { "epoch": 4.544349939246659, "grad_norm": 1.2507104873657227, "learning_rate": 5.607000000000001e-06, "loss": 0.1664, "step": 1872 }, { "epoch": 4.54678007290401, "grad_norm": 1.3469231128692627, "learning_rate": 5.6100000000000005e-06, "loss": 0.2787, "step": 1873 }, { "epoch": 4.54921020656136, "grad_norm": 1.562269926071167, "learning_rate": 5.6129999999999995e-06, "loss": 0.1763, "step": 1874 }, { "epoch": 4.551640340218712, "grad_norm": 1.175022006034851, "learning_rate": 5.616e-06, "loss": 0.1799, "step": 1875 }, { "epoch": 4.554070473876063, "grad_norm": 2.321749210357666, "learning_rate": 5.619e-06, "loss": 0.1952, "step": 1876 }, { "epoch": 4.5565006075334145, "grad_norm": 1.911288857460022, "learning_rate": 5.6220000000000006e-06, "loss": 0.1825, "step": 1877 }, { "epoch": 4.558930741190766, "grad_norm": 1.73954439163208, "learning_rate": 5.625e-06, "loss": 0.1779, "step": 1878 }, { "epoch": 4.561360874848116, "grad_norm": 1.7176120281219482, "learning_rate": 5.628e-06, "loss": 0.1779, "step": 1879 }, { "epoch": 4.563791008505468, "grad_norm": 1.3302104473114014, "learning_rate": 5.631e-06, "loss": 0.1713, "step": 1880 }, { "epoch": 4.566221142162819, "grad_norm": 1.596643328666687, "learning_rate": 5.634e-06, "loss": 0.1884, "step": 1881 }, { "epoch": 4.56865127582017, "grad_norm": 1.8169488906860352, "learning_rate": 5.637e-06, "loss": 0.2323, "step": 1882 }, { "epoch": 4.571081409477522, "grad_norm": 1.651482343673706, "learning_rate": 5.64e-06, "loss": 0.19, "step": 1883 }, { "epoch": 4.573511543134872, "grad_norm": 1.5596630573272705, "learning_rate": 5.643e-06, "loss": 0.2334, "step": 1884 }, { "epoch": 4.575941676792223, "grad_norm": 8.296601295471191, "learning_rate": 5.646000000000001e-06, "loss": 0.201, "step": 
1885 }, { "epoch": 4.578371810449575, "grad_norm": 1.6493960618972778, "learning_rate": 5.649e-06, "loss": 0.1661, "step": 1886 }, { "epoch": 4.580801944106926, "grad_norm": 1.5663546323776245, "learning_rate": 5.652e-06, "loss": 0.2369, "step": 1887 }, { "epoch": 4.583232077764277, "grad_norm": 2.458096742630005, "learning_rate": 5.655e-06, "loss": 0.1829, "step": 1888 }, { "epoch": 4.585662211421628, "grad_norm": 1.6443874835968018, "learning_rate": 5.658e-06, "loss": 0.2089, "step": 1889 }, { "epoch": 4.588092345078979, "grad_norm": 2.065129280090332, "learning_rate": 5.6610000000000005e-06, "loss": 0.2312, "step": 1890 }, { "epoch": 4.590522478736331, "grad_norm": 1.4812114238739014, "learning_rate": 5.664e-06, "loss": 0.1742, "step": 1891 }, { "epoch": 4.592952612393682, "grad_norm": 1.815509557723999, "learning_rate": 5.667e-06, "loss": 0.2678, "step": 1892 }, { "epoch": 4.595382746051033, "grad_norm": 8.039273262023926, "learning_rate": 5.67e-06, "loss": 0.2274, "step": 1893 }, { "epoch": 4.597812879708384, "grad_norm": 2.1029670238494873, "learning_rate": 5.673e-06, "loss": 0.2362, "step": 1894 }, { "epoch": 4.600243013365735, "grad_norm": 2.047708749771118, "learning_rate": 5.676e-06, "loss": 0.2518, "step": 1895 }, { "epoch": 4.602673147023086, "grad_norm": 3.6896555423736572, "learning_rate": 5.679e-06, "loss": 0.2864, "step": 1896 }, { "epoch": 4.605103280680438, "grad_norm": 2.890073537826538, "learning_rate": 5.682000000000001e-06, "loss": 0.3215, "step": 1897 }, { "epoch": 4.607533414337789, "grad_norm": 3.7471675872802734, "learning_rate": 5.685e-06, "loss": 0.3392, "step": 1898 }, { "epoch": 4.6099635479951395, "grad_norm": 3.0351836681365967, "learning_rate": 5.688e-06, "loss": 0.7371, "step": 1899 }, { "epoch": 4.612393681652491, "grad_norm": 1.9809123277664185, "learning_rate": 5.691e-06, "loss": 0.6489, "step": 1900 }, { "epoch": 4.614823815309842, "grad_norm": 1.4867310523986816, "learning_rate": 5.694e-06, "loss": 0.5368, "step": 1901 }, { 
"epoch": 4.6172539489671935, "grad_norm": 2.734384775161743, "learning_rate": 5.697000000000001e-06, "loss": 0.4144, "step": 1902 }, { "epoch": 4.619684082624544, "grad_norm": 35.945804595947266, "learning_rate": 5.7000000000000005e-06, "loss": 0.461, "step": 1903 }, { "epoch": 4.622114216281895, "grad_norm": 2.9065394401550293, "learning_rate": 5.703e-06, "loss": 0.3578, "step": 1904 }, { "epoch": 4.624544349939247, "grad_norm": 1.583162784576416, "learning_rate": 5.706e-06, "loss": 0.2169, "step": 1905 }, { "epoch": 4.626974483596598, "grad_norm": 1.5328911542892456, "learning_rate": 5.709e-06, "loss": 0.244, "step": 1906 }, { "epoch": 4.6294046172539485, "grad_norm": 1.3771393299102783, "learning_rate": 5.7120000000000005e-06, "loss": 0.2243, "step": 1907 }, { "epoch": 4.6318347509113, "grad_norm": 1.207829236984253, "learning_rate": 5.715e-06, "loss": 0.1796, "step": 1908 }, { "epoch": 4.634264884568651, "grad_norm": 1.3164597749710083, "learning_rate": 5.718e-06, "loss": 0.1712, "step": 1909 }, { "epoch": 4.6366950182260025, "grad_norm": 1.7641732692718506, "learning_rate": 5.721000000000001e-06, "loss": 0.1931, "step": 1910 }, { "epoch": 4.639125151883354, "grad_norm": 1.9462960958480835, "learning_rate": 5.724e-06, "loss": 0.1884, "step": 1911 }, { "epoch": 4.641555285540704, "grad_norm": 1.328927993774414, "learning_rate": 5.7269999999999995e-06, "loss": 0.1373, "step": 1912 }, { "epoch": 4.643985419198056, "grad_norm": 1.3034671545028687, "learning_rate": 5.73e-06, "loss": 0.1947, "step": 1913 }, { "epoch": 4.646415552855407, "grad_norm": 1.887528896331787, "learning_rate": 5.733e-06, "loss": 0.2058, "step": 1914 }, { "epoch": 4.648845686512758, "grad_norm": 1.5494303703308105, "learning_rate": 5.736000000000001e-06, "loss": 0.1647, "step": 1915 }, { "epoch": 4.65127582017011, "grad_norm": 1.8080852031707764, "learning_rate": 5.7390000000000004e-06, "loss": 0.2212, "step": 1916 }, { "epoch": 4.65370595382746, "grad_norm": 1.698225498199463, 
"learning_rate": 5.741999999999999e-06, "loss": 0.1512, "step": 1917 }, { "epoch": 4.6561360874848114, "grad_norm": 1.272809386253357, "learning_rate": 5.745e-06, "loss": 0.148, "step": 1918 }, { "epoch": 4.658566221142163, "grad_norm": 1.7800363302230835, "learning_rate": 5.748e-06, "loss": 0.1779, "step": 1919 }, { "epoch": 4.660996354799514, "grad_norm": 1.337418556213379, "learning_rate": 5.7510000000000005e-06, "loss": 0.1819, "step": 1920 }, { "epoch": 4.6634264884568655, "grad_norm": 1.2919321060180664, "learning_rate": 5.754e-06, "loss": 0.1683, "step": 1921 }, { "epoch": 4.665856622114216, "grad_norm": 1.2676877975463867, "learning_rate": 5.757e-06, "loss": 0.1659, "step": 1922 }, { "epoch": 4.668286755771567, "grad_norm": 1.6004278659820557, "learning_rate": 5.76e-06, "loss": 0.1812, "step": 1923 }, { "epoch": 4.670716889428919, "grad_norm": 1.8933849334716797, "learning_rate": 5.763e-06, "loss": 0.1819, "step": 1924 }, { "epoch": 4.67314702308627, "grad_norm": 1.8495893478393555, "learning_rate": 5.766e-06, "loss": 0.1823, "step": 1925 }, { "epoch": 4.675577156743621, "grad_norm": 1.2050886154174805, "learning_rate": 5.769e-06, "loss": 0.1451, "step": 1926 }, { "epoch": 4.678007290400972, "grad_norm": 1.9505311250686646, "learning_rate": 5.772e-06, "loss": 0.1902, "step": 1927 }, { "epoch": 4.680437424058323, "grad_norm": 1.3044872283935547, "learning_rate": 5.775000000000001e-06, "loss": 0.1638, "step": 1928 }, { "epoch": 4.682867557715674, "grad_norm": 1.4061836004257202, "learning_rate": 5.7779999999999996e-06, "loss": 0.201, "step": 1929 }, { "epoch": 4.685297691373026, "grad_norm": 1.5126975774765015, "learning_rate": 5.781e-06, "loss": 0.1851, "step": 1930 }, { "epoch": 4.687727825030377, "grad_norm": 1.5117807388305664, "learning_rate": 5.784e-06, "loss": 0.2114, "step": 1931 }, { "epoch": 4.690157958687728, "grad_norm": 1.2435661554336548, "learning_rate": 5.787e-06, "loss": 0.1452, "step": 1932 }, { "epoch": 4.692588092345079, "grad_norm": 
1.6299012899398804, "learning_rate": 5.7900000000000005e-06, "loss": 0.19, "step": 1933 }, { "epoch": 4.69501822600243, "grad_norm": 0.986369252204895, "learning_rate": 5.793e-06, "loss": 0.1318, "step": 1934 }, { "epoch": 4.697448359659782, "grad_norm": 1.5092589855194092, "learning_rate": 5.796e-06, "loss": 0.202, "step": 1935 }, { "epoch": 4.699878493317133, "grad_norm": 1.5897372961044312, "learning_rate": 5.799e-06, "loss": 0.2114, "step": 1936 }, { "epoch": 4.702308626974483, "grad_norm": 2.171642780303955, "learning_rate": 5.802e-06, "loss": 0.1601, "step": 1937 }, { "epoch": 4.704738760631835, "grad_norm": 2.85451078414917, "learning_rate": 5.805e-06, "loss": 0.1751, "step": 1938 }, { "epoch": 4.707168894289186, "grad_norm": 1.5633336305618286, "learning_rate": 5.808e-06, "loss": 0.1792, "step": 1939 }, { "epoch": 4.709599027946537, "grad_norm": 2.490140914916992, "learning_rate": 5.811000000000001e-06, "loss": 0.1878, "step": 1940 }, { "epoch": 4.712029161603888, "grad_norm": 1.563924789428711, "learning_rate": 5.814e-06, "loss": 0.2197, "step": 1941 }, { "epoch": 4.714459295261239, "grad_norm": 1.8351848125457764, "learning_rate": 5.8169999999999995e-06, "loss": 0.1882, "step": 1942 }, { "epoch": 4.7168894289185905, "grad_norm": 3.2050962448120117, "learning_rate": 5.82e-06, "loss": 0.2599, "step": 1943 }, { "epoch": 4.719319562575942, "grad_norm": 2.1698648929595947, "learning_rate": 5.823e-06, "loss": 0.2101, "step": 1944 }, { "epoch": 4.721749696233293, "grad_norm": 2.8197295665740967, "learning_rate": 5.826000000000001e-06, "loss": 0.2816, "step": 1945 }, { "epoch": 4.724179829890644, "grad_norm": 2.38775634765625, "learning_rate": 5.8290000000000004e-06, "loss": 0.2605, "step": 1946 }, { "epoch": 4.726609963547995, "grad_norm": 3.707838535308838, "learning_rate": 5.832e-06, "loss": 0.3446, "step": 1947 }, { "epoch": 4.729040097205346, "grad_norm": 3.2842016220092773, "learning_rate": 5.835e-06, "loss": 0.3272, "step": 1948 }, { "epoch": 
4.731470230862698, "grad_norm": 3.789127826690674, "learning_rate": 5.838e-06, "loss": 0.6765, "step": 1949 }, { "epoch": 4.733900364520048, "grad_norm": 2.13421893119812, "learning_rate": 5.8410000000000005e-06, "loss": 0.5651, "step": 1950 }, { "epoch": 4.7363304981773995, "grad_norm": 1.666982650756836, "learning_rate": 5.844e-06, "loss": 0.4269, "step": 1951 }, { "epoch": 4.738760631834751, "grad_norm": 2.845935106277466, "learning_rate": 5.847e-06, "loss": 0.4329, "step": 1952 }, { "epoch": 4.741190765492102, "grad_norm": 1.509891390800476, "learning_rate": 5.850000000000001e-06, "loss": 0.397, "step": 1953 }, { "epoch": 4.7436208991494535, "grad_norm": 9.200102806091309, "learning_rate": 5.853e-06, "loss": 0.3679, "step": 1954 }, { "epoch": 4.746051032806804, "grad_norm": 2.2708067893981934, "learning_rate": 5.856e-06, "loss": 0.3409, "step": 1955 }, { "epoch": 4.748481166464155, "grad_norm": 1.9165021181106567, "learning_rate": 5.859e-06, "loss": 0.2144, "step": 1956 }, { "epoch": 4.750911300121507, "grad_norm": 4.297070503234863, "learning_rate": 5.862e-06, "loss": 0.1971, "step": 1957 }, { "epoch": 4.753341433778858, "grad_norm": 10.535781860351562, "learning_rate": 5.865000000000001e-06, "loss": 0.2305, "step": 1958 }, { "epoch": 4.755771567436209, "grad_norm": 1.4356271028518677, "learning_rate": 5.868e-06, "loss": 0.2188, "step": 1959 }, { "epoch": 4.75820170109356, "grad_norm": 1.2159662246704102, "learning_rate": 5.871e-06, "loss": 0.1697, "step": 1960 }, { "epoch": 4.760631834750911, "grad_norm": 1.5806480646133423, "learning_rate": 5.874e-06, "loss": 0.192, "step": 1961 }, { "epoch": 4.7630619684082625, "grad_norm": 1.4228006601333618, "learning_rate": 5.877e-06, "loss": 0.1869, "step": 1962 }, { "epoch": 4.765492102065614, "grad_norm": 1.224684715270996, "learning_rate": 5.8800000000000005e-06, "loss": 0.1838, "step": 1963 }, { "epoch": 4.767922235722965, "grad_norm": 1.2506756782531738, "learning_rate": 5.883e-06, "loss": 0.1774, "step": 1964 }, { 
"epoch": 4.770352369380316, "grad_norm": 1.5204352140426636, "learning_rate": 5.886000000000001e-06, "loss": 0.1967, "step": 1965 }, { "epoch": 4.772782503037667, "grad_norm": 1.3023831844329834, "learning_rate": 5.889e-06, "loss": 0.1494, "step": 1966 }, { "epoch": 4.775212636695018, "grad_norm": 1.1708446741104126, "learning_rate": 5.892e-06, "loss": 0.1735, "step": 1967 }, { "epoch": 4.77764277035237, "grad_norm": 1.8564238548278809, "learning_rate": 5.895e-06, "loss": 0.1481, "step": 1968 }, { "epoch": 4.780072904009721, "grad_norm": 1.0768914222717285, "learning_rate": 5.898e-06, "loss": 0.1342, "step": 1969 }, { "epoch": 4.782503037667071, "grad_norm": 1.615625023841858, "learning_rate": 5.901000000000001e-06, "loss": 0.1884, "step": 1970 }, { "epoch": 4.784933171324423, "grad_norm": 1.2395107746124268, "learning_rate": 5.9040000000000006e-06, "loss": 0.168, "step": 1971 }, { "epoch": 4.787363304981774, "grad_norm": 1.1892458200454712, "learning_rate": 5.9069999999999995e-06, "loss": 0.1612, "step": 1972 }, { "epoch": 4.789793438639125, "grad_norm": 1.3564504384994507, "learning_rate": 5.91e-06, "loss": 0.1792, "step": 1973 }, { "epoch": 4.792223572296477, "grad_norm": 1.239201307296753, "learning_rate": 5.913e-06, "loss": 0.136, "step": 1974 }, { "epoch": 4.794653705953827, "grad_norm": 1.637007236480713, "learning_rate": 5.916e-06, "loss": 0.1745, "step": 1975 }, { "epoch": 4.797083839611179, "grad_norm": 2.333134412765503, "learning_rate": 5.919e-06, "loss": 0.1835, "step": 1976 }, { "epoch": 4.79951397326853, "grad_norm": 1.58845853805542, "learning_rate": 5.922e-06, "loss": 0.2021, "step": 1977 }, { "epoch": 4.801944106925881, "grad_norm": 1.6695548295974731, "learning_rate": 5.925e-06, "loss": 0.1917, "step": 1978 }, { "epoch": 4.804374240583232, "grad_norm": 1.7628523111343384, "learning_rate": 5.928e-06, "loss": 0.1979, "step": 1979 }, { "epoch": 4.806804374240583, "grad_norm": 1.5853688716888428, "learning_rate": 5.931e-06, "loss": 0.1556, "step": 
1980 }, { "epoch": 4.809234507897934, "grad_norm": 1.4383217096328735, "learning_rate": 5.934e-06, "loss": 0.1594, "step": 1981 }, { "epoch": 4.811664641555286, "grad_norm": 1.9810978174209595, "learning_rate": 5.937e-06, "loss": 0.1995, "step": 1982 }, { "epoch": 4.814094775212637, "grad_norm": 1.509507656097412, "learning_rate": 5.940000000000001e-06, "loss": 0.1924, "step": 1983 }, { "epoch": 4.8165249088699875, "grad_norm": 1.8531895875930786, "learning_rate": 5.943e-06, "loss": 0.1809, "step": 1984 }, { "epoch": 4.818955042527339, "grad_norm": 1.6575932502746582, "learning_rate": 5.9459999999999995e-06, "loss": 0.1736, "step": 1985 }, { "epoch": 4.82138517618469, "grad_norm": 4.678852558135986, "learning_rate": 5.949e-06, "loss": 0.2025, "step": 1986 }, { "epoch": 4.8238153098420415, "grad_norm": 1.6451393365859985, "learning_rate": 5.952e-06, "loss": 0.2009, "step": 1987 }, { "epoch": 4.826245443499392, "grad_norm": 2.141374349594116, "learning_rate": 5.955000000000001e-06, "loss": 0.2543, "step": 1988 }, { "epoch": 4.828675577156743, "grad_norm": 1.543433666229248, "learning_rate": 5.958e-06, "loss": 0.1771, "step": 1989 }, { "epoch": 4.831105710814095, "grad_norm": 1.7099214792251587, "learning_rate": 5.961e-06, "loss": 0.1926, "step": 1990 }, { "epoch": 4.833535844471446, "grad_norm": 1.6301573514938354, "learning_rate": 5.964e-06, "loss": 0.1784, "step": 1991 }, { "epoch": 4.835965978128797, "grad_norm": 5.124726295471191, "learning_rate": 5.967e-06, "loss": 0.3167, "step": 1992 }, { "epoch": 4.838396111786148, "grad_norm": 2.080595016479492, "learning_rate": 5.9700000000000004e-06, "loss": 0.2391, "step": 1993 }, { "epoch": 4.840826245443499, "grad_norm": 2.293670892715454, "learning_rate": 5.973e-06, "loss": 0.2002, "step": 1994 }, { "epoch": 4.8432563791008505, "grad_norm": 3.0245444774627686, "learning_rate": 5.976e-06, "loss": 0.2364, "step": 1995 }, { "epoch": 4.845686512758202, "grad_norm": 2.141958713531494, "learning_rate": 5.979000000000001e-06, 
"loss": 0.2358, "step": 1996 }, { "epoch": 4.848116646415553, "grad_norm": 4.026017189025879, "learning_rate": 5.982e-06, "loss": 0.2823, "step": 1997 }, { "epoch": 4.850546780072904, "grad_norm": 3.3342015743255615, "learning_rate": 5.985e-06, "loss": 0.4367, "step": 1998 }, { "epoch": 4.852976913730255, "grad_norm": 4.056601047515869, "learning_rate": 5.988e-06, "loss": 0.8518, "step": 1999 }, { "epoch": 4.855407047387606, "grad_norm": 1.8717001676559448, "learning_rate": 5.991e-06, "loss": 0.5611, "step": 2000 }, { "epoch": 4.855407047387606, "eval_cer": 0.14215725644526545, "eval_loss": 0.4460384249687195, "eval_runtime": 8.0217, "eval_samples_per_second": 12.591, "eval_steps_per_second": 0.499, "eval_wer": 0.42548500881834217, "step": 2000 }, { "epoch": 4.857837181044958, "grad_norm": 1.6536273956298828, "learning_rate": 5.9940000000000005e-06, "loss": 0.5707, "step": 2001 }, { "epoch": 4.860267314702309, "grad_norm": 3.0374886989593506, "learning_rate": 5.997e-06, "loss": 0.4189, "step": 2002 }, { "epoch": 4.8626974483596594, "grad_norm": 2.2227776050567627, "learning_rate": 6e-06, "loss": 0.3554, "step": 2003 }, { "epoch": 4.865127582017011, "grad_norm": 1.4814327955245972, "learning_rate": 6.003e-06, "loss": 0.2834, "step": 2004 }, { "epoch": 4.867557715674362, "grad_norm": 1.7463345527648926, "learning_rate": 6.006e-06, "loss": 0.2977, "step": 2005 }, { "epoch": 4.8699878493317135, "grad_norm": 1.5954495668411255, "learning_rate": 6.009e-06, "loss": 0.2125, "step": 2006 }, { "epoch": 4.872417982989065, "grad_norm": 1.8921070098876953, "learning_rate": 6.012e-06, "loss": 0.1785, "step": 2007 }, { "epoch": 4.874848116646415, "grad_norm": 1.05350923538208, "learning_rate": 6.015000000000001e-06, "loss": 0.172, "step": 2008 }, { "epoch": 4.877278250303767, "grad_norm": 1.625196099281311, "learning_rate": 6.018e-06, "loss": 0.2008, "step": 2009 }, { "epoch": 4.879708383961118, "grad_norm": 1.3968639373779297, "learning_rate": 6.021e-06, "loss": 0.1987, "step": 
2010 }, { "epoch": 4.882138517618469, "grad_norm": 1.656152606010437, "learning_rate": 6.024e-06, "loss": 0.1828, "step": 2011 }, { "epoch": 4.884568651275821, "grad_norm": 1.3448129892349243, "learning_rate": 6.027e-06, "loss": 0.1812, "step": 2012 }, { "epoch": 4.886998784933171, "grad_norm": 1.569520115852356, "learning_rate": 6.030000000000001e-06, "loss": 0.2025, "step": 2013 }, { "epoch": 4.889428918590522, "grad_norm": 1.131117582321167, "learning_rate": 6.0330000000000005e-06, "loss": 0.1767, "step": 2014 }, { "epoch": 4.891859052247874, "grad_norm": 1.0621347427368164, "learning_rate": 6.0359999999999995e-06, "loss": 0.1285, "step": 2015 }, { "epoch": 4.894289185905225, "grad_norm": 1.4255393743515015, "learning_rate": 6.039e-06, "loss": 0.1904, "step": 2016 }, { "epoch": 4.896719319562576, "grad_norm": 1.274394154548645, "learning_rate": 6.042e-06, "loss": 0.1853, "step": 2017 }, { "epoch": 4.899149453219927, "grad_norm": 2.9067978858947754, "learning_rate": 6.0450000000000006e-06, "loss": 0.1309, "step": 2018 }, { "epoch": 4.901579586877278, "grad_norm": 1.3560593128204346, "learning_rate": 6.048e-06, "loss": 0.2033, "step": 2019 }, { "epoch": 4.90400972053463, "grad_norm": 1.1525437831878662, "learning_rate": 6.051e-06, "loss": 0.1397, "step": 2020 }, { "epoch": 4.906439854191981, "grad_norm": 0.9955020546913147, "learning_rate": 6.054e-06, "loss": 0.1488, "step": 2021 }, { "epoch": 4.908869987849331, "grad_norm": 1.3095829486846924, "learning_rate": 6.057e-06, "loss": 0.2438, "step": 2022 }, { "epoch": 4.911300121506683, "grad_norm": 1.4901729822158813, "learning_rate": 6.0600000000000004e-06, "loss": 0.2049, "step": 2023 }, { "epoch": 4.913730255164034, "grad_norm": 1.269317626953125, "learning_rate": 6.063e-06, "loss": 0.1723, "step": 2024 }, { "epoch": 4.916160388821385, "grad_norm": 1.59920072555542, "learning_rate": 6.066e-06, "loss": 0.1823, "step": 2025 }, { "epoch": 4.918590522478736, "grad_norm": 1.3379626274108887, "learning_rate": 
6.069000000000001e-06, "loss": 0.1707, "step": 2026 }, { "epoch": 4.921020656136087, "grad_norm": 1.4529684782028198, "learning_rate": 6.072e-06, "loss": 0.1611, "step": 2027 }, { "epoch": 4.9234507897934385, "grad_norm": 1.1586722135543823, "learning_rate": 6.075e-06, "loss": 0.1422, "step": 2028 }, { "epoch": 4.92588092345079, "grad_norm": 1.2863043546676636, "learning_rate": 6.078e-06, "loss": 0.1371, "step": 2029 }, { "epoch": 4.928311057108141, "grad_norm": 1.6615211963653564, "learning_rate": 6.081e-06, "loss": 0.1668, "step": 2030 }, { "epoch": 4.930741190765492, "grad_norm": 1.3711858987808228, "learning_rate": 6.0840000000000005e-06, "loss": 0.1577, "step": 2031 }, { "epoch": 4.933171324422843, "grad_norm": 1.1942662000656128, "learning_rate": 6.087e-06, "loss": 0.15, "step": 2032 }, { "epoch": 4.935601458080194, "grad_norm": 1.5801631212234497, "learning_rate": 6.090000000000001e-06, "loss": 0.2115, "step": 2033 }, { "epoch": 4.938031591737546, "grad_norm": 1.4464904069900513, "learning_rate": 6.093e-06, "loss": 0.1615, "step": 2034 }, { "epoch": 4.940461725394897, "grad_norm": 1.120173454284668, "learning_rate": 6.096e-06, "loss": 0.1433, "step": 2035 }, { "epoch": 4.9428918590522475, "grad_norm": 1.3976536989212036, "learning_rate": 6.099e-06, "loss": 0.1598, "step": 2036 }, { "epoch": 4.945321992709599, "grad_norm": 1.75798499584198, "learning_rate": 6.102e-06, "loss": 0.1728, "step": 2037 }, { "epoch": 4.94775212636695, "grad_norm": 1.7059309482574463, "learning_rate": 6.105e-06, "loss": 0.1765, "step": 2038 }, { "epoch": 4.9501822600243015, "grad_norm": 1.7417452335357666, "learning_rate": 6.108000000000001e-06, "loss": 0.1965, "step": 2039 }, { "epoch": 4.952612393681653, "grad_norm": 2.2495720386505127, "learning_rate": 6.111e-06, "loss": 0.1931, "step": 2040 }, { "epoch": 4.955042527339003, "grad_norm": 1.4375771284103394, "learning_rate": 6.114e-06, "loss": 0.1629, "step": 2041 }, { "epoch": 4.957472660996355, "grad_norm": 1.7308903932571411, 
"learning_rate": 6.117e-06, "loss": 0.1938, "step": 2042 }, { "epoch": 4.959902794653706, "grad_norm": 1.9524012804031372, "learning_rate": 6.12e-06, "loss": 0.2258, "step": 2043 }, { "epoch": 4.962332928311057, "grad_norm": 1.9366120100021362, "learning_rate": 6.1230000000000005e-06, "loss": 0.2389, "step": 2044 }, { "epoch": 4.964763061968409, "grad_norm": 2.477975368499756, "learning_rate": 6.126e-06, "loss": 0.2253, "step": 2045 }, { "epoch": 4.967193195625759, "grad_norm": 2.60300874710083, "learning_rate": 6.129e-06, "loss": 0.2807, "step": 2046 }, { "epoch": 4.9696233292831105, "grad_norm": 2.2960731983184814, "learning_rate": 6.132e-06, "loss": 0.2664, "step": 2047 }, { "epoch": 4.972053462940462, "grad_norm": 3.917039155960083, "learning_rate": 6.135e-06, "loss": 0.3142, "step": 2048 }, { "epoch": 4.974483596597813, "grad_norm": 2.231684923171997, "learning_rate": 6.138e-06, "loss": 0.5622, "step": 2049 }, { "epoch": 4.9769137302551645, "grad_norm": 1.3376240730285645, "learning_rate": 6.141e-06, "loss": 0.3256, "step": 2050 }, { "epoch": 4.979343863912515, "grad_norm": 1.5137596130371094, "learning_rate": 6.144000000000001e-06, "loss": 0.1518, "step": 2051 }, { "epoch": 4.981773997569866, "grad_norm": 1.5095763206481934, "learning_rate": 6.147e-06, "loss": 0.223, "step": 2052 }, { "epoch": 4.984204131227218, "grad_norm": 1.9984349012374878, "learning_rate": 6.1499999999999996e-06, "loss": 0.1583, "step": 2053 }, { "epoch": 4.986634264884569, "grad_norm": 1.771902322769165, "learning_rate": 6.153e-06, "loss": 0.1382, "step": 2054 }, { "epoch": 4.98906439854192, "grad_norm": 1.1490647792816162, "learning_rate": 6.156e-06, "loss": 0.1937, "step": 2055 }, { "epoch": 4.991494532199271, "grad_norm": 1.5848791599273682, "learning_rate": 6.159000000000001e-06, "loss": 0.1672, "step": 2056 }, { "epoch": 4.993924665856622, "grad_norm": 1.998736023902893, "learning_rate": 6.1620000000000005e-06, "loss": 0.1979, "step": 2057 }, { "epoch": 4.996354799513973, 
"grad_norm": 1.434244155883789, "learning_rate": 6.164999999999999e-06, "loss": 0.1684, "step": 2058 }, { "epoch": 4.998784933171325, "grad_norm": 3.153454303741455, "learning_rate": 6.168e-06, "loss": 0.3255, "step": 2059 }, { "epoch": 5.0, "grad_norm": 2.5789785385131836, "learning_rate": 6.171e-06, "loss": 0.2775, "step": 2060 }, { "epoch": 5.002430133657351, "grad_norm": 1.7373229265213013, "learning_rate": 6.1740000000000005e-06, "loss": 0.5595, "step": 2061 }, { "epoch": 5.004860267314703, "grad_norm": 3.9317221641540527, "learning_rate": 6.177e-06, "loss": 0.5035, "step": 2062 }, { "epoch": 5.007290400972053, "grad_norm": 1.5208433866500854, "learning_rate": 6.18e-06, "loss": 0.4948, "step": 2063 }, { "epoch": 5.0097205346294045, "grad_norm": 1.7340798377990723, "learning_rate": 6.183e-06, "loss": 0.3939, "step": 2064 }, { "epoch": 5.012150668286756, "grad_norm": 2.328009605407715, "learning_rate": 6.186e-06, "loss": 0.4338, "step": 2065 }, { "epoch": 5.014580801944107, "grad_norm": 3.420335054397583, "learning_rate": 6.189e-06, "loss": 0.3625, "step": 2066 }, { "epoch": 5.0170109356014585, "grad_norm": 1.5266108512878418, "learning_rate": 6.192e-06, "loss": 0.2424, "step": 2067 }, { "epoch": 5.019441069258809, "grad_norm": 1.4175134897232056, "learning_rate": 6.195e-06, "loss": 0.2481, "step": 2068 }, { "epoch": 5.02187120291616, "grad_norm": 1.1909509897232056, "learning_rate": 6.198000000000001e-06, "loss": 0.228, "step": 2069 }, { "epoch": 5.024301336573512, "grad_norm": 1.2500370740890503, "learning_rate": 6.201e-06, "loss": 0.1642, "step": 2070 }, { "epoch": 5.026731470230863, "grad_norm": 1.0301834344863892, "learning_rate": 6.204e-06, "loss": 0.172, "step": 2071 }, { "epoch": 5.029161603888213, "grad_norm": 1.0930485725402832, "learning_rate": 6.207e-06, "loss": 0.183, "step": 2072 }, { "epoch": 5.031591737545565, "grad_norm": 1.62590754032135, "learning_rate": 6.21e-06, "loss": 0.1925, "step": 2073 }, { "epoch": 5.034021871202916, "grad_norm": 
1.195167064666748, "learning_rate": 6.2130000000000005e-06, "loss": 0.155, "step": 2074 }, { "epoch": 5.0364520048602675, "grad_norm": 1.3314337730407715, "learning_rate": 6.216e-06, "loss": 0.1702, "step": 2075 }, { "epoch": 5.038882138517619, "grad_norm": 1.201750636100769, "learning_rate": 6.219000000000001e-06, "loss": 0.1705, "step": 2076 }, { "epoch": 5.041312272174969, "grad_norm": 1.5041757822036743, "learning_rate": 6.222e-06, "loss": 0.153, "step": 2077 }, { "epoch": 5.043742405832321, "grad_norm": 0.9938594698905945, "learning_rate": 6.225e-06, "loss": 0.1191, "step": 2078 }, { "epoch": 5.046172539489672, "grad_norm": 1.0962486267089844, "learning_rate": 6.228e-06, "loss": 0.1416, "step": 2079 }, { "epoch": 5.048602673147023, "grad_norm": 1.3706415891647339, "learning_rate": 6.231e-06, "loss": 0.1419, "step": 2080 }, { "epoch": 5.051032806804375, "grad_norm": 0.9020640254020691, "learning_rate": 6.234000000000001e-06, "loss": 0.1402, "step": 2081 }, { "epoch": 5.053462940461725, "grad_norm": 1.2581090927124023, "learning_rate": 6.237000000000001e-06, "loss": 0.1495, "step": 2082 }, { "epoch": 5.055893074119076, "grad_norm": 1.3030370473861694, "learning_rate": 6.2399999999999995e-06, "loss": 0.1187, "step": 2083 }, { "epoch": 5.058323207776428, "grad_norm": 1.0414791107177734, "learning_rate": 6.243e-06, "loss": 0.1177, "step": 2084 }, { "epoch": 5.060753341433779, "grad_norm": 1.052876591682434, "learning_rate": 6.246e-06, "loss": 0.1264, "step": 2085 }, { "epoch": 5.06318347509113, "grad_norm": 1.6032778024673462, "learning_rate": 6.249000000000001e-06, "loss": 0.1512, "step": 2086 }, { "epoch": 5.065613608748481, "grad_norm": 1.3668875694274902, "learning_rate": 6.2520000000000004e-06, "loss": 0.1362, "step": 2087 }, { "epoch": 5.068043742405832, "grad_norm": 1.2594555616378784, "learning_rate": 6.255e-06, "loss": 0.1471, "step": 2088 }, { "epoch": 5.070473876063184, "grad_norm": 1.8467954397201538, "learning_rate": 6.258e-06, "loss": 0.138, "step": 
2089 }, { "epoch": 5.072904009720535, "grad_norm": 1.3512202501296997, "learning_rate": 6.261e-06, "loss": 0.1584, "step": 2090 }, { "epoch": 5.075334143377885, "grad_norm": 1.414244294166565, "learning_rate": 6.2640000000000005e-06, "loss": 0.1451, "step": 2091 }, { "epoch": 5.077764277035237, "grad_norm": 1.5731158256530762, "learning_rate": 6.267e-06, "loss": 0.1488, "step": 2092 }, { "epoch": 5.080194410692588, "grad_norm": 1.6513988971710205, "learning_rate": 6.27e-06, "loss": 0.1801, "step": 2093 }, { "epoch": 5.082624544349939, "grad_norm": 1.7843595743179321, "learning_rate": 6.273000000000001e-06, "loss": 0.1373, "step": 2094 }, { "epoch": 5.085054678007291, "grad_norm": 1.6505777835845947, "learning_rate": 6.276e-06, "loss": 0.2897, "step": 2095 }, { "epoch": 5.087484811664641, "grad_norm": 1.8535442352294922, "learning_rate": 6.279e-06, "loss": 0.2094, "step": 2096 }, { "epoch": 5.0899149453219925, "grad_norm": 1.4256974458694458, "learning_rate": 6.282e-06, "loss": 0.143, "step": 2097 }, { "epoch": 5.092345078979344, "grad_norm": 1.5630801916122437, "learning_rate": 6.285e-06, "loss": 0.1868, "step": 2098 }, { "epoch": 5.094775212636695, "grad_norm": 1.997485637664795, "learning_rate": 6.288000000000001e-06, "loss": 0.1815, "step": 2099 }, { "epoch": 5.0972053462940465, "grad_norm": 1.2098675966262817, "learning_rate": 6.291e-06, "loss": 0.1732, "step": 2100 }, { "epoch": 5.099635479951397, "grad_norm": 2.047820806503296, "learning_rate": 6.293999999999999e-06, "loss": 0.219, "step": 2101 }, { "epoch": 5.102065613608748, "grad_norm": 1.6169476509094238, "learning_rate": 6.297e-06, "loss": 0.1744, "step": 2102 }, { "epoch": 5.1044957472661, "grad_norm": 1.406002163887024, "learning_rate": 6.3e-06, "loss": 0.1851, "step": 2103 }, { "epoch": 5.106925880923451, "grad_norm": 1.3636443614959717, "learning_rate": 6.3030000000000005e-06, "loss": 0.1398, "step": 2104 }, { "epoch": 5.109356014580802, "grad_norm": 1.991511344909668, "learning_rate": 6.306e-06, 
"loss": 0.1774, "step": 2105 }, { "epoch": 5.111786148238153, "grad_norm": 2.2009057998657227, "learning_rate": 6.309e-06, "loss": 0.2355, "step": 2106 }, { "epoch": 5.114216281895504, "grad_norm": 1.6643071174621582, "learning_rate": 6.312e-06, "loss": 0.2028, "step": 2107 }, { "epoch": 5.1166464155528555, "grad_norm": 2.082120656967163, "learning_rate": 6.315e-06, "loss": 0.185, "step": 2108 }, { "epoch": 5.119076549210207, "grad_norm": 2.4936845302581787, "learning_rate": 6.318e-06, "loss": 0.2365, "step": 2109 }, { "epoch": 5.121506682867557, "grad_norm": 3.363543748855591, "learning_rate": 6.321e-06, "loss": 0.3351, "step": 2110 }, { "epoch": 5.123936816524909, "grad_norm": 3.9749438762664795, "learning_rate": 6.324e-06, "loss": 0.6598, "step": 2111 }, { "epoch": 5.12636695018226, "grad_norm": 2.084512233734131, "learning_rate": 6.327000000000001e-06, "loss": 0.5595, "step": 2112 }, { "epoch": 5.128797083839611, "grad_norm": 1.588382601737976, "learning_rate": 6.3299999999999995e-06, "loss": 0.3998, "step": 2113 }, { "epoch": 5.131227217496963, "grad_norm": 2.5016326904296875, "learning_rate": 6.333e-06, "loss": 0.4328, "step": 2114 }, { "epoch": 5.133657351154313, "grad_norm": 1.7975300550460815, "learning_rate": 6.336e-06, "loss": 0.3188, "step": 2115 }, { "epoch": 5.136087484811664, "grad_norm": 2.118603229522705, "learning_rate": 6.339e-06, "loss": 0.2692, "step": 2116 }, { "epoch": 5.138517618469016, "grad_norm": 1.5072038173675537, "learning_rate": 6.3420000000000004e-06, "loss": 0.2712, "step": 2117 }, { "epoch": 5.140947752126367, "grad_norm": 1.3462082147598267, "learning_rate": 6.345e-06, "loss": 0.2056, "step": 2118 }, { "epoch": 5.1433778857837185, "grad_norm": 1.6033987998962402, "learning_rate": 6.348000000000001e-06, "loss": 0.2517, "step": 2119 }, { "epoch": 5.145808019441069, "grad_norm": 1.3548815250396729, "learning_rate": 6.351e-06, "loss": 0.2016, "step": 2120 }, { "epoch": 5.14823815309842, "grad_norm": 1.1769148111343384, 
"learning_rate": 6.354e-06, "loss": 0.1597, "step": 2121 }, { "epoch": 5.150668286755772, "grad_norm": 1.0125608444213867, "learning_rate": 6.357e-06, "loss": 0.1301, "step": 2122 }, { "epoch": 5.153098420413123, "grad_norm": 1.0532119274139404, "learning_rate": 6.36e-06, "loss": 0.1451, "step": 2123 }, { "epoch": 5.155528554070474, "grad_norm": 1.052171230316162, "learning_rate": 6.363000000000001e-06, "loss": 0.1683, "step": 2124 }, { "epoch": 5.157958687727825, "grad_norm": 1.0506101846694946, "learning_rate": 6.3660000000000005e-06, "loss": 0.1471, "step": 2125 }, { "epoch": 5.160388821385176, "grad_norm": 1.2568771839141846, "learning_rate": 6.3689999999999995e-06, "loss": 0.135, "step": 2126 }, { "epoch": 5.162818955042527, "grad_norm": 1.2575539350509644, "learning_rate": 6.372e-06, "loss": 0.1401, "step": 2127 }, { "epoch": 5.165249088699879, "grad_norm": 1.2944303750991821, "learning_rate": 6.375e-06, "loss": 0.1631, "step": 2128 }, { "epoch": 5.167679222357229, "grad_norm": 1.1255377531051636, "learning_rate": 6.378000000000001e-06, "loss": 0.1244, "step": 2129 }, { "epoch": 5.1701093560145805, "grad_norm": 1.7201008796691895, "learning_rate": 6.381e-06, "loss": 0.1564, "step": 2130 }, { "epoch": 5.172539489671932, "grad_norm": 0.8651052117347717, "learning_rate": 6.384e-06, "loss": 0.1243, "step": 2131 }, { "epoch": 5.174969623329283, "grad_norm": 1.4456082582473755, "learning_rate": 6.387e-06, "loss": 0.1552, "step": 2132 }, { "epoch": 5.177399756986635, "grad_norm": 1.2946549654006958, "learning_rate": 6.39e-06, "loss": 0.1801, "step": 2133 }, { "epoch": 5.179829890643985, "grad_norm": 1.5035477876663208, "learning_rate": 6.3930000000000005e-06, "loss": 0.1419, "step": 2134 }, { "epoch": 5.182260024301336, "grad_norm": 1.033281683921814, "learning_rate": 6.396e-06, "loss": 0.1219, "step": 2135 }, { "epoch": 5.184690157958688, "grad_norm": 1.1366639137268066, "learning_rate": 6.399e-06, "loss": 0.143, "step": 2136 }, { "epoch": 5.187120291616039, 
"grad_norm": 1.253104329109192, "learning_rate": 6.402000000000001e-06, "loss": 0.1463, "step": 2137 }, { "epoch": 5.18955042527339, "grad_norm": 1.5319545269012451, "learning_rate": 6.405e-06, "loss": 0.1636, "step": 2138 }, { "epoch": 5.191980558930741, "grad_norm": 1.1692386865615845, "learning_rate": 6.408e-06, "loss": 0.1322, "step": 2139 }, { "epoch": 5.194410692588092, "grad_norm": 1.2128897905349731, "learning_rate": 6.411e-06, "loss": 0.157, "step": 2140 }, { "epoch": 5.1968408262454435, "grad_norm": 1.8712838888168335, "learning_rate": 6.414e-06, "loss": 0.1912, "step": 2141 }, { "epoch": 5.199270959902795, "grad_norm": 1.2639161348342896, "learning_rate": 6.4170000000000006e-06, "loss": 0.1128, "step": 2142 }, { "epoch": 5.201701093560146, "grad_norm": 1.3463314771652222, "learning_rate": 6.42e-06, "loss": 0.1717, "step": 2143 }, { "epoch": 5.204131227217497, "grad_norm": 1.9516428709030151, "learning_rate": 6.423e-06, "loss": 0.1559, "step": 2144 }, { "epoch": 5.206561360874848, "grad_norm": 1.4295710325241089, "learning_rate": 6.426e-06, "loss": 0.1643, "step": 2145 }, { "epoch": 5.208991494532199, "grad_norm": 1.4648627042770386, "learning_rate": 6.429e-06, "loss": 0.1473, "step": 2146 }, { "epoch": 5.211421628189551, "grad_norm": 1.5241515636444092, "learning_rate": 6.432e-06, "loss": 0.1545, "step": 2147 }, { "epoch": 5.213851761846902, "grad_norm": 1.5305542945861816, "learning_rate": 6.435e-06, "loss": 0.1345, "step": 2148 }, { "epoch": 5.2162818955042525, "grad_norm": 2.1125762462615967, "learning_rate": 6.438000000000001e-06, "loss": 0.2103, "step": 2149 }, { "epoch": 5.218712029161604, "grad_norm": 1.8480637073516846, "learning_rate": 6.441e-06, "loss": 0.2356, "step": 2150 }, { "epoch": 5.221142162818955, "grad_norm": 1.391878366470337, "learning_rate": 6.444e-06, "loss": 0.1703, "step": 2151 }, { "epoch": 5.2235722964763065, "grad_norm": 1.5357526540756226, "learning_rate": 6.447e-06, "loss": 0.1799, "step": 2152 }, { "epoch": 
5.226002430133657, "grad_norm": 1.6115163564682007, "learning_rate": 6.45e-06, "loss": 0.1584, "step": 2153 }, { "epoch": 5.228432563791008, "grad_norm": 2.6158416271209717, "learning_rate": 6.453000000000001e-06, "loss": 0.232, "step": 2154 }, { "epoch": 5.23086269744836, "grad_norm": 2.094284772872925, "learning_rate": 6.4560000000000005e-06, "loss": 0.1657, "step": 2155 }, { "epoch": 5.233292831105711, "grad_norm": 1.950670599937439, "learning_rate": 6.4589999999999995e-06, "loss": 0.1628, "step": 2156 }, { "epoch": 5.235722964763062, "grad_norm": 2.012681722640991, "learning_rate": 6.462e-06, "loss": 0.2112, "step": 2157 }, { "epoch": 5.238153098420413, "grad_norm": 1.951025366783142, "learning_rate": 6.465e-06, "loss": 0.2182, "step": 2158 }, { "epoch": 5.240583232077764, "grad_norm": 1.894200086593628, "learning_rate": 6.468000000000001e-06, "loss": 0.1278, "step": 2159 }, { "epoch": 5.2430133657351154, "grad_norm": 3.4696991443634033, "learning_rate": 6.471e-06, "loss": 0.3165, "step": 2160 }, { "epoch": 5.245443499392467, "grad_norm": 4.180868625640869, "learning_rate": 6.474e-06, "loss": 0.7444, "step": 2161 }, { "epoch": 5.247873633049818, "grad_norm": 1.7009941339492798, "learning_rate": 6.477000000000001e-06, "loss": 0.5395, "step": 2162 }, { "epoch": 5.250303766707169, "grad_norm": 1.5282583236694336, "learning_rate": 6.48e-06, "loss": 0.4644, "step": 2163 }, { "epoch": 5.25273390036452, "grad_norm": 4.842777252197266, "learning_rate": 6.483e-06, "loss": 0.428, "step": 2164 }, { "epoch": 5.255164034021871, "grad_norm": 3.70592999458313, "learning_rate": 6.486e-06, "loss": 0.4071, "step": 2165 }, { "epoch": 5.257594167679223, "grad_norm": 3.0719046592712402, "learning_rate": 6.489e-06, "loss": 0.3437, "step": 2166 }, { "epoch": 5.260024301336573, "grad_norm": 1.8158752918243408, "learning_rate": 6.492000000000001e-06, "loss": 0.2276, "step": 2167 }, { "epoch": 5.262454434993924, "grad_norm": 1.1685572862625122, "learning_rate": 6.4950000000000005e-06, 
"loss": 0.1939, "step": 2168 }, { "epoch": 5.264884568651276, "grad_norm": 1.2693920135498047, "learning_rate": 6.4979999999999994e-06, "loss": 0.1716, "step": 2169 }, { "epoch": 5.267314702308627, "grad_norm": 1.300469994544983, "learning_rate": 6.501e-06, "loss": 0.1547, "step": 2170 }, { "epoch": 5.269744835965978, "grad_norm": 1.940430998802185, "learning_rate": 6.504e-06, "loss": 0.1781, "step": 2171 }, { "epoch": 5.272174969623329, "grad_norm": 1.7020665407180786, "learning_rate": 6.5070000000000005e-06, "loss": 0.1581, "step": 2172 }, { "epoch": 5.27460510328068, "grad_norm": 1.9469162225723267, "learning_rate": 6.51e-06, "loss": 0.1513, "step": 2173 }, { "epoch": 5.277035236938032, "grad_norm": 1.394566535949707, "learning_rate": 6.513e-06, "loss": 0.114, "step": 2174 }, { "epoch": 5.279465370595383, "grad_norm": 1.2992274761199951, "learning_rate": 6.516e-06, "loss": 0.1525, "step": 2175 }, { "epoch": 5.281895504252734, "grad_norm": 1.50666344165802, "learning_rate": 6.519e-06, "loss": 0.1528, "step": 2176 }, { "epoch": 5.284325637910085, "grad_norm": 1.1626391410827637, "learning_rate": 6.522e-06, "loss": 0.1537, "step": 2177 }, { "epoch": 5.286755771567436, "grad_norm": 1.4266700744628906, "learning_rate": 6.525e-06, "loss": 0.2037, "step": 2178 }, { "epoch": 5.289185905224787, "grad_norm": 1.6053205728530884, "learning_rate": 6.528e-06, "loss": 0.1179, "step": 2179 }, { "epoch": 5.291616038882139, "grad_norm": 1.2346301078796387, "learning_rate": 6.531000000000001e-06, "loss": 0.1301, "step": 2180 }, { "epoch": 5.29404617253949, "grad_norm": 1.5613534450531006, "learning_rate": 6.534e-06, "loss": 0.1647, "step": 2181 }, { "epoch": 5.2964763061968405, "grad_norm": 1.2803704738616943, "learning_rate": 6.537e-06, "loss": 0.1638, "step": 2182 }, { "epoch": 5.298906439854192, "grad_norm": 1.2329548597335815, "learning_rate": 6.54e-06, "loss": 0.1992, "step": 2183 }, { "epoch": 5.301336573511543, "grad_norm": 1.65999436378479, "learning_rate": 6.543e-06, 
"loss": 0.1493, "step": 2184 }, { "epoch": 5.3037667071688945, "grad_norm": 1.005844235420227, "learning_rate": 6.5460000000000005e-06, "loss": 0.128, "step": 2185 }, { "epoch": 5.306196840826246, "grad_norm": 1.6683216094970703, "learning_rate": 6.549e-06, "loss": 0.1595, "step": 2186 }, { "epoch": 5.308626974483596, "grad_norm": 1.0778127908706665, "learning_rate": 6.552e-06, "loss": 0.1412, "step": 2187 }, { "epoch": 5.311057108140948, "grad_norm": 0.9392564296722412, "learning_rate": 6.555e-06, "loss": 0.131, "step": 2188 }, { "epoch": 5.313487241798299, "grad_norm": 1.2626423835754395, "learning_rate": 6.558e-06, "loss": 0.1295, "step": 2189 }, { "epoch": 5.31591737545565, "grad_norm": 1.3273791074752808, "learning_rate": 6.561e-06, "loss": 0.1664, "step": 2190 }, { "epoch": 5.318347509113001, "grad_norm": 1.043821096420288, "learning_rate": 6.564e-06, "loss": 0.1146, "step": 2191 }, { "epoch": 5.320777642770352, "grad_norm": 2.043778896331787, "learning_rate": 6.567000000000001e-06, "loss": 0.157, "step": 2192 }, { "epoch": 5.3232077764277035, "grad_norm": 1.4177178144454956, "learning_rate": 6.57e-06, "loss": 0.16, "step": 2193 }, { "epoch": 5.325637910085055, "grad_norm": 1.3311121463775635, "learning_rate": 6.573e-06, "loss": 0.1476, "step": 2194 }, { "epoch": 5.328068043742406, "grad_norm": 2.585063934326172, "learning_rate": 6.576e-06, "loss": 0.2182, "step": 2195 }, { "epoch": 5.330498177399757, "grad_norm": 2.906404495239258, "learning_rate": 6.579e-06, "loss": 0.1657, "step": 2196 }, { "epoch": 5.332928311057108, "grad_norm": 1.4411718845367432, "learning_rate": 6.582000000000001e-06, "loss": 0.2162, "step": 2197 }, { "epoch": 5.335358444714459, "grad_norm": 1.3868292570114136, "learning_rate": 6.5850000000000005e-06, "loss": 0.1581, "step": 2198 }, { "epoch": 5.337788578371811, "grad_norm": 1.8510197401046753, "learning_rate": 6.5879999999999994e-06, "loss": 0.1468, "step": 2199 }, { "epoch": 5.340218712029162, "grad_norm": 1.6119858026504517, 
"learning_rate": 6.591e-06, "loss": 0.1453, "step": 2200 }, { "epoch": 5.342648845686512, "grad_norm": 1.4827007055282593, "learning_rate": 6.594e-06, "loss": 0.154, "step": 2201 }, { "epoch": 5.345078979343864, "grad_norm": 1.6928266286849976, "learning_rate": 6.5970000000000005e-06, "loss": 0.1774, "step": 2202 }, { "epoch": 5.347509113001215, "grad_norm": 1.706632137298584, "learning_rate": 6.6e-06, "loss": 0.1836, "step": 2203 }, { "epoch": 5.3499392466585665, "grad_norm": 2.134814739227295, "learning_rate": 6.603e-06, "loss": 0.1913, "step": 2204 }, { "epoch": 5.352369380315917, "grad_norm": 1.579714059829712, "learning_rate": 6.606000000000001e-06, "loss": 0.1902, "step": 2205 }, { "epoch": 5.354799513973268, "grad_norm": 1.6738039255142212, "learning_rate": 6.609e-06, "loss": 0.1746, "step": 2206 }, { "epoch": 5.35722964763062, "grad_norm": 2.1965510845184326, "learning_rate": 6.612e-06, "loss": 0.2267, "step": 2207 }, { "epoch": 5.359659781287971, "grad_norm": 1.7433375120162964, "learning_rate": 6.615e-06, "loss": 0.188, "step": 2208 }, { "epoch": 5.362089914945322, "grad_norm": 2.427095651626587, "learning_rate": 6.618e-06, "loss": 0.2565, "step": 2209 }, { "epoch": 5.364520048602673, "grad_norm": 3.8726754188537598, "learning_rate": 6.621000000000001e-06, "loss": 0.2515, "step": 2210 }, { "epoch": 5.366950182260024, "grad_norm": 2.454218626022339, "learning_rate": 6.6240000000000004e-06, "loss": 0.6395, "step": 2211 }, { "epoch": 5.369380315917375, "grad_norm": 4.411599159240723, "learning_rate": 6.627e-06, "loss": 0.5956, "step": 2212 }, { "epoch": 5.371810449574727, "grad_norm": 1.3064870834350586, "learning_rate": 6.63e-06, "loss": 0.46, "step": 2213 }, { "epoch": 5.374240583232078, "grad_norm": 1.7214102745056152, "learning_rate": 6.633e-06, "loss": 0.3663, "step": 2214 }, { "epoch": 5.3766707168894285, "grad_norm": 4.733809947967529, "learning_rate": 6.6360000000000005e-06, "loss": 0.3151, "step": 2215 }, { "epoch": 5.37910085054678, "grad_norm": 
1.9748938083648682, "learning_rate": 6.639e-06, "loss": 0.229, "step": 2216 }, { "epoch": 5.381530984204131, "grad_norm": 1.457720160484314, "learning_rate": 6.642000000000001e-06, "loss": 0.1965, "step": 2217 }, { "epoch": 5.383961117861483, "grad_norm": 1.1364902257919312, "learning_rate": 6.645e-06, "loss": 0.1967, "step": 2218 }, { "epoch": 5.386391251518834, "grad_norm": 1.1856987476348877, "learning_rate": 6.648e-06, "loss": 0.1655, "step": 2219 }, { "epoch": 5.388821385176184, "grad_norm": 1.6158689260482788, "learning_rate": 6.651e-06, "loss": 0.2051, "step": 2220 }, { "epoch": 5.391251518833536, "grad_norm": 1.1561895608901978, "learning_rate": 6.654e-06, "loss": 0.1429, "step": 2221 }, { "epoch": 5.393681652490887, "grad_norm": 1.2033296823501587, "learning_rate": 6.657e-06, "loss": 0.1716, "step": 2222 }, { "epoch": 5.396111786148238, "grad_norm": 1.2696352005004883, "learning_rate": 6.660000000000001e-06, "loss": 0.1981, "step": 2223 }, { "epoch": 5.39854191980559, "grad_norm": 1.185328483581543, "learning_rate": 6.6629999999999996e-06, "loss": 0.1557, "step": 2224 }, { "epoch": 5.40097205346294, "grad_norm": 1.251286506652832, "learning_rate": 6.666e-06, "loss": 0.1302, "step": 2225 }, { "epoch": 5.4034021871202915, "grad_norm": 1.2039122581481934, "learning_rate": 6.669e-06, "loss": 0.1904, "step": 2226 }, { "epoch": 5.405832320777643, "grad_norm": 0.9781752228736877, "learning_rate": 6.672e-06, "loss": 0.1621, "step": 2227 }, { "epoch": 5.408262454434994, "grad_norm": 1.666504979133606, "learning_rate": 6.6750000000000005e-06, "loss": 0.1232, "step": 2228 }, { "epoch": 5.4106925880923455, "grad_norm": 1.1960588693618774, "learning_rate": 6.678e-06, "loss": 0.1448, "step": 2229 }, { "epoch": 5.413122721749696, "grad_norm": 1.1513513326644897, "learning_rate": 6.681e-06, "loss": 0.1408, "step": 2230 }, { "epoch": 5.415552855407047, "grad_norm": 1.2458841800689697, "learning_rate": 6.684e-06, "loss": 0.1281, "step": 2231 }, { "epoch": 5.417982989064399, 
"grad_norm": 1.2741998434066772, "learning_rate": 6.687e-06, "loss": 0.1811, "step": 2232 }, { "epoch": 5.42041312272175, "grad_norm": 1.266586422920227, "learning_rate": 6.69e-06, "loss": 0.1316, "step": 2233 }, { "epoch": 5.4228432563791005, "grad_norm": 2.0625946521759033, "learning_rate": 6.693e-06, "loss": 0.1504, "step": 2234 }, { "epoch": 5.425273390036452, "grad_norm": 1.7009296417236328, "learning_rate": 6.696000000000001e-06, "loss": 0.276, "step": 2235 }, { "epoch": 5.427703523693803, "grad_norm": 2.342557907104492, "learning_rate": 6.699e-06, "loss": 0.2172, "step": 2236 }, { "epoch": 5.4301336573511545, "grad_norm": 1.6109771728515625, "learning_rate": 6.7019999999999995e-06, "loss": 0.1544, "step": 2237 }, { "epoch": 5.432563791008506, "grad_norm": 1.2057085037231445, "learning_rate": 6.705e-06, "loss": 0.1733, "step": 2238 }, { "epoch": 5.434993924665856, "grad_norm": 1.1559010744094849, "learning_rate": 6.708e-06, "loss": 0.1442, "step": 2239 }, { "epoch": 5.437424058323208, "grad_norm": 1.005570650100708, "learning_rate": 6.711000000000001e-06, "loss": 0.1174, "step": 2240 }, { "epoch": 5.439854191980559, "grad_norm": 1.1844117641448975, "learning_rate": 6.7140000000000004e-06, "loss": 0.1279, "step": 2241 }, { "epoch": 5.44228432563791, "grad_norm": 1.0868116617202759, "learning_rate": 6.716999999999999e-06, "loss": 0.0964, "step": 2242 }, { "epoch": 5.444714459295262, "grad_norm": 1.6001198291778564, "learning_rate": 6.72e-06, "loss": 0.1458, "step": 2243 }, { "epoch": 5.447144592952612, "grad_norm": 1.9431627988815308, "learning_rate": 6.723e-06, "loss": 0.207, "step": 2244 }, { "epoch": 5.4495747266099634, "grad_norm": 1.1170119047164917, "learning_rate": 6.7260000000000005e-06, "loss": 0.1116, "step": 2245 }, { "epoch": 5.452004860267315, "grad_norm": 1.2443063259124756, "learning_rate": 6.729e-06, "loss": 0.1479, "step": 2246 }, { "epoch": 5.454434993924666, "grad_norm": 1.9282934665679932, "learning_rate": 6.732e-06, "loss": 0.1725, "step": 
2247 }, { "epoch": 5.456865127582017, "grad_norm": 1.328830599784851, "learning_rate": 6.735000000000001e-06, "loss": 0.1372, "step": 2248 }, { "epoch": 5.459295261239368, "grad_norm": 1.2540639638900757, "learning_rate": 6.738e-06, "loss": 0.1496, "step": 2249 }, { "epoch": 5.461725394896719, "grad_norm": 2.2879226207733154, "learning_rate": 6.741e-06, "loss": 0.1827, "step": 2250 }, { "epoch": 5.464155528554071, "grad_norm": 1.4571112394332886, "learning_rate": 6.744e-06, "loss": 0.1577, "step": 2251 }, { "epoch": 5.466585662211422, "grad_norm": 1.2270182371139526, "learning_rate": 6.747e-06, "loss": 0.1216, "step": 2252 }, { "epoch": 5.469015795868772, "grad_norm": 1.5295249223709106, "learning_rate": 6.750000000000001e-06, "loss": 0.1609, "step": 2253 }, { "epoch": 5.471445929526124, "grad_norm": 1.6558626890182495, "learning_rate": 6.753e-06, "loss": 0.1083, "step": 2254 }, { "epoch": 5.473876063183475, "grad_norm": 1.685438632965088, "learning_rate": 6.756e-06, "loss": 0.1712, "step": 2255 }, { "epoch": 5.476306196840826, "grad_norm": 1.7380964756011963, "learning_rate": 6.759e-06, "loss": 0.1439, "step": 2256 }, { "epoch": 5.478736330498178, "grad_norm": 2.2298784255981445, "learning_rate": 6.762e-06, "loss": 0.2151, "step": 2257 }, { "epoch": 5.481166464155528, "grad_norm": 11.24909782409668, "learning_rate": 6.7650000000000005e-06, "loss": 0.2334, "step": 2258 }, { "epoch": 5.48359659781288, "grad_norm": 2.2774674892425537, "learning_rate": 6.768e-06, "loss": 0.2364, "step": 2259 }, { "epoch": 5.486026731470231, "grad_norm": 3.199474573135376, "learning_rate": 6.771000000000001e-06, "loss": 0.2754, "step": 2260 }, { "epoch": 5.488456865127582, "grad_norm": 1.8072056770324707, "learning_rate": 6.774e-06, "loss": 0.5688, "step": 2261 }, { "epoch": 5.490886998784934, "grad_norm": 1.2870581150054932, "learning_rate": 6.777e-06, "loss": 0.4451, "step": 2262 }, { "epoch": 5.493317132442284, "grad_norm": 1.4018611907958984, "learning_rate": 6.78e-06, "loss": 
0.3848, "step": 2263 }, { "epoch": 5.495747266099635, "grad_norm": 1.255428433418274, "learning_rate": 6.783e-06, "loss": 0.3975, "step": 2264 }, { "epoch": 5.498177399756987, "grad_norm": 2.2204723358154297, "learning_rate": 6.786000000000001e-06, "loss": 0.3258, "step": 2265 }, { "epoch": 5.500607533414338, "grad_norm": 1.9539499282836914, "learning_rate": 6.7890000000000006e-06, "loss": 0.2423, "step": 2266 }, { "epoch": 5.503037667071689, "grad_norm": 1.2773810625076294, "learning_rate": 6.7919999999999995e-06, "loss": 0.2709, "step": 2267 }, { "epoch": 5.50546780072904, "grad_norm": 1.5884236097335815, "learning_rate": 6.795e-06, "loss": 0.2054, "step": 2268 }, { "epoch": 5.507897934386391, "grad_norm": 1.3314454555511475, "learning_rate": 6.798e-06, "loss": 0.1808, "step": 2269 }, { "epoch": 5.5103280680437425, "grad_norm": 1.2662187814712524, "learning_rate": 6.801000000000001e-06, "loss": 0.1927, "step": 2270 }, { "epoch": 5.512758201701094, "grad_norm": 0.9742422103881836, "learning_rate": 6.804e-06, "loss": 0.1204, "step": 2271 }, { "epoch": 5.515188335358444, "grad_norm": 1.4330406188964844, "learning_rate": 6.807e-06, "loss": 0.2045, "step": 2272 }, { "epoch": 5.517618469015796, "grad_norm": 1.37572181224823, "learning_rate": 6.81e-06, "loss": 0.1373, "step": 2273 }, { "epoch": 5.520048602673147, "grad_norm": 0.8268419504165649, "learning_rate": 6.813e-06, "loss": 0.111, "step": 2274 }, { "epoch": 5.522478736330498, "grad_norm": 1.1327216625213623, "learning_rate": 6.8160000000000005e-06, "loss": 0.1143, "step": 2275 }, { "epoch": 5.52490886998785, "grad_norm": 2.8308064937591553, "learning_rate": 6.819e-06, "loss": 0.1391, "step": 2276 }, { "epoch": 5.5273390036452, "grad_norm": 1.1655954122543335, "learning_rate": 6.822e-06, "loss": 0.1365, "step": 2277 }, { "epoch": 5.5297691373025515, "grad_norm": 1.2414251565933228, "learning_rate": 6.825000000000001e-06, "loss": 0.1376, "step": 2278 }, { "epoch": 5.532199270959903, "grad_norm": 1.1686598062515259, 
"learning_rate": 6.828e-06, "loss": 0.1246, "step": 2279 }, { "epoch": 5.534629404617254, "grad_norm": 1.4900763034820557, "learning_rate": 6.831e-06, "loss": 0.1224, "step": 2280 }, { "epoch": 5.537059538274605, "grad_norm": 1.0022717714309692, "learning_rate": 6.834e-06, "loss": 0.1355, "step": 2281 }, { "epoch": 5.539489671931956, "grad_norm": 1.6515676975250244, "learning_rate": 6.837e-06, "loss": 0.1426, "step": 2282 }, { "epoch": 5.541919805589307, "grad_norm": 1.01982843875885, "learning_rate": 6.840000000000001e-06, "loss": 0.1228, "step": 2283 }, { "epoch": 5.544349939246659, "grad_norm": 1.2870709896087646, "learning_rate": 6.843e-06, "loss": 0.1095, "step": 2284 }, { "epoch": 5.54678007290401, "grad_norm": 1.2664512395858765, "learning_rate": 6.845999999999999e-06, "loss": 0.1828, "step": 2285 }, { "epoch": 5.54921020656136, "grad_norm": 1.377131462097168, "learning_rate": 6.849e-06, "loss": 0.1781, "step": 2286 }, { "epoch": 5.551640340218712, "grad_norm": 1.1159073114395142, "learning_rate": 6.852e-06, "loss": 0.1268, "step": 2287 }, { "epoch": 5.554070473876063, "grad_norm": 1.266015887260437, "learning_rate": 6.8550000000000004e-06, "loss": 0.1176, "step": 2288 }, { "epoch": 5.5565006075334145, "grad_norm": 1.1414026021957397, "learning_rate": 6.858e-06, "loss": 0.1494, "step": 2289 }, { "epoch": 5.558930741190766, "grad_norm": 1.466949462890625, "learning_rate": 6.861e-06, "loss": 0.1274, "step": 2290 }, { "epoch": 5.561360874848116, "grad_norm": 1.2330424785614014, "learning_rate": 6.864000000000001e-06, "loss": 0.1534, "step": 2291 }, { "epoch": 5.563791008505468, "grad_norm": 1.1983106136322021, "learning_rate": 6.867e-06, "loss": 0.1558, "step": 2292 }, { "epoch": 5.566221142162819, "grad_norm": 1.346745252609253, "learning_rate": 6.87e-06, "loss": 0.1417, "step": 2293 }, { "epoch": 5.56865127582017, "grad_norm": 1.1387920379638672, "learning_rate": 6.873e-06, "loss": 0.1459, "step": 2294 }, { "epoch": 5.571081409477522, "grad_norm": 
1.158453106880188, "learning_rate": 6.876e-06, "loss": 0.1469, "step": 2295 }, { "epoch": 5.573511543134872, "grad_norm": 1.3376647233963013, "learning_rate": 6.8790000000000005e-06, "loss": 0.1765, "step": 2296 }, { "epoch": 5.575941676792223, "grad_norm": 2.1858575344085693, "learning_rate": 6.882e-06, "loss": 0.2119, "step": 2297 }, { "epoch": 5.578371810449575, "grad_norm": 1.1470568180084229, "learning_rate": 6.885e-06, "loss": 0.1283, "step": 2298 }, { "epoch": 5.580801944106926, "grad_norm": 1.010069489479065, "learning_rate": 6.888e-06, "loss": 0.1164, "step": 2299 }, { "epoch": 5.583232077764277, "grad_norm": 1.8236171007156372, "learning_rate": 6.891e-06, "loss": 0.187, "step": 2300 }, { "epoch": 5.585662211421628, "grad_norm": 1.263883113861084, "learning_rate": 6.894e-06, "loss": 0.1433, "step": 2301 }, { "epoch": 5.588092345078979, "grad_norm": 1.4677163362503052, "learning_rate": 6.897e-06, "loss": 0.1321, "step": 2302 }, { "epoch": 5.590522478736331, "grad_norm": 1.7184771299362183, "learning_rate": 6.900000000000001e-06, "loss": 0.1749, "step": 2303 }, { "epoch": 5.592952612393682, "grad_norm": 1.6595700979232788, "learning_rate": 6.903e-06, "loss": 0.1502, "step": 2304 }, { "epoch": 5.595382746051033, "grad_norm": 1.3898805379867554, "learning_rate": 6.906e-06, "loss": 0.1778, "step": 2305 }, { "epoch": 5.597812879708384, "grad_norm": 1.9728281497955322, "learning_rate": 6.909e-06, "loss": 0.1939, "step": 2306 }, { "epoch": 5.600243013365735, "grad_norm": 2.62557315826416, "learning_rate": 6.912e-06, "loss": 0.2155, "step": 2307 }, { "epoch": 5.602673147023086, "grad_norm": 2.724231481552124, "learning_rate": 6.915000000000001e-06, "loss": 0.277, "step": 2308 }, { "epoch": 5.605103280680438, "grad_norm": 2.456185817718506, "learning_rate": 6.9180000000000005e-06, "loss": 0.2334, "step": 2309 }, { "epoch": 5.607533414337789, "grad_norm": 3.6471445560455322, "learning_rate": 6.9209999999999995e-06, "loss": 0.3502, "step": 2310 }, { "epoch": 
5.6099635479951395, "grad_norm": 1.7533049583435059, "learning_rate": 6.924e-06, "loss": 0.5773, "step": 2311 }, { "epoch": 5.612393681652491, "grad_norm": 1.7951691150665283, "learning_rate": 6.927e-06, "loss": 0.4456, "step": 2312 }, { "epoch": 5.614823815309842, "grad_norm": 1.8063887357711792, "learning_rate": 6.9300000000000006e-06, "loss": 0.4331, "step": 2313 }, { "epoch": 5.6172539489671935, "grad_norm": 1.2387535572052002, "learning_rate": 6.933e-06, "loss": 0.3546, "step": 2314 }, { "epoch": 5.619684082624544, "grad_norm": 1.3002561330795288, "learning_rate": 6.936e-06, "loss": 0.2667, "step": 2315 }, { "epoch": 5.622114216281895, "grad_norm": 1.6273181438446045, "learning_rate": 6.939e-06, "loss": 0.2175, "step": 2316 }, { "epoch": 5.624544349939247, "grad_norm": 1.8053040504455566, "learning_rate": 6.942e-06, "loss": 0.2273, "step": 2317 }, { "epoch": 5.626974483596598, "grad_norm": 1.4499192237854004, "learning_rate": 6.945e-06, "loss": 0.185, "step": 2318 }, { "epoch": 5.6294046172539485, "grad_norm": 1.2909514904022217, "learning_rate": 6.948e-06, "loss": 0.219, "step": 2319 }, { "epoch": 5.6318347509113, "grad_norm": 1.1313669681549072, "learning_rate": 6.951e-06, "loss": 0.1674, "step": 2320 }, { "epoch": 5.634264884568651, "grad_norm": 1.1492717266082764, "learning_rate": 6.954000000000001e-06, "loss": 0.1685, "step": 2321 }, { "epoch": 5.6366950182260025, "grad_norm": 1.281572699546814, "learning_rate": 6.957e-06, "loss": 0.1351, "step": 2322 }, { "epoch": 5.639125151883354, "grad_norm": 0.9117563962936401, "learning_rate": 6.96e-06, "loss": 0.1101, "step": 2323 }, { "epoch": 5.641555285540704, "grad_norm": 1.3708006143569946, "learning_rate": 6.963e-06, "loss": 0.1392, "step": 2324 }, { "epoch": 5.643985419198056, "grad_norm": 1.4078247547149658, "learning_rate": 6.966e-06, "loss": 0.1793, "step": 2325 }, { "epoch": 5.646415552855407, "grad_norm": 0.9874652624130249, "learning_rate": 6.9690000000000005e-06, "loss": 0.1331, "step": 2326 }, { 
"epoch": 5.648845686512758, "grad_norm": 1.2181804180145264, "learning_rate": 6.972e-06, "loss": 0.1354, "step": 2327 }, { "epoch": 5.65127582017011, "grad_norm": 1.2360925674438477, "learning_rate": 6.975e-06, "loss": 0.132, "step": 2328 }, { "epoch": 5.65370595382746, "grad_norm": 0.9239486455917358, "learning_rate": 6.978e-06, "loss": 0.1111, "step": 2329 }, { "epoch": 5.6561360874848114, "grad_norm": 1.0718815326690674, "learning_rate": 6.981e-06, "loss": 0.1632, "step": 2330 }, { "epoch": 5.658566221142163, "grad_norm": 1.2067372798919678, "learning_rate": 6.984e-06, "loss": 0.1371, "step": 2331 }, { "epoch": 5.660996354799514, "grad_norm": 1.0261670351028442, "learning_rate": 6.987e-06, "loss": 0.1262, "step": 2332 }, { "epoch": 5.6634264884568655, "grad_norm": 1.640673041343689, "learning_rate": 6.990000000000001e-06, "loss": 0.2214, "step": 2333 }, { "epoch": 5.665856622114216, "grad_norm": 1.4887412786483765, "learning_rate": 6.993000000000001e-06, "loss": 0.1219, "step": 2334 }, { "epoch": 5.668286755771567, "grad_norm": 0.6919817924499512, "learning_rate": 6.996e-06, "loss": 0.0893, "step": 2335 }, { "epoch": 5.670716889428919, "grad_norm": 1.1092469692230225, "learning_rate": 6.999e-06, "loss": 0.1456, "step": 2336 }, { "epoch": 5.67314702308627, "grad_norm": 1.146614909172058, "learning_rate": 7.002e-06, "loss": 0.1104, "step": 2337 }, { "epoch": 5.675577156743621, "grad_norm": 1.747395634651184, "learning_rate": 7.005000000000001e-06, "loss": 0.1549, "step": 2338 }, { "epoch": 5.678007290400972, "grad_norm": 1.3644167184829712, "learning_rate": 7.0080000000000005e-06, "loss": 0.1112, "step": 2339 }, { "epoch": 5.680437424058323, "grad_norm": 1.062320590019226, "learning_rate": 7.011e-06, "loss": 0.1139, "step": 2340 }, { "epoch": 5.682867557715674, "grad_norm": 1.3752378225326538, "learning_rate": 7.014e-06, "loss": 0.1273, "step": 2341 }, { "epoch": 5.685297691373026, "grad_norm": 1.368755578994751, "learning_rate": 7.017e-06, "loss": 0.1488, "step": 
2342 }, { "epoch": 5.687727825030377, "grad_norm": 1.0498442649841309, "learning_rate": 7.0200000000000006e-06, "loss": 0.1086, "step": 2343 }, { "epoch": 5.690157958687728, "grad_norm": 1.2654396295547485, "learning_rate": 7.023e-06, "loss": 0.1277, "step": 2344 }, { "epoch": 5.692588092345079, "grad_norm": 1.7638516426086426, "learning_rate": 7.026e-06, "loss": 0.1684, "step": 2345 }, { "epoch": 5.69501822600243, "grad_norm": 1.4059438705444336, "learning_rate": 7.029000000000001e-06, "loss": 0.1514, "step": 2346 }, { "epoch": 5.697448359659782, "grad_norm": 1.2790364027023315, "learning_rate": 7.032e-06, "loss": 0.1288, "step": 2347 }, { "epoch": 5.699878493317133, "grad_norm": 1.2232571840286255, "learning_rate": 7.0349999999999996e-06, "loss": 0.1271, "step": 2348 }, { "epoch": 5.702308626974483, "grad_norm": 1.4332044124603271, "learning_rate": 7.038e-06, "loss": 0.1347, "step": 2349 }, { "epoch": 5.704738760631835, "grad_norm": 1.1368428468704224, "learning_rate": 7.041e-06, "loss": 0.1219, "step": 2350 }, { "epoch": 5.707168894289186, "grad_norm": 1.6452940702438354, "learning_rate": 7.044000000000001e-06, "loss": 0.1541, "step": 2351 }, { "epoch": 5.709599027946537, "grad_norm": 1.3409892320632935, "learning_rate": 7.0470000000000005e-06, "loss": 0.1712, "step": 2352 }, { "epoch": 5.712029161603888, "grad_norm": 1.5128130912780762, "learning_rate": 7.049999999999999e-06, "loss": 0.1486, "step": 2353 }, { "epoch": 5.714459295261239, "grad_norm": 1.7442835569381714, "learning_rate": 7.053e-06, "loss": 0.1848, "step": 2354 }, { "epoch": 5.7168894289185905, "grad_norm": 2.504667282104492, "learning_rate": 7.056e-06, "loss": 0.1722, "step": 2355 }, { "epoch": 5.719319562575942, "grad_norm": 1.6023551225662231, "learning_rate": 7.0590000000000005e-06, "loss": 0.1845, "step": 2356 }, { "epoch": 5.721749696233293, "grad_norm": 1.4894503355026245, "learning_rate": 7.062e-06, "loss": 0.175, "step": 2357 }, { "epoch": 5.724179829890644, "grad_norm": 
2.4524168968200684, "learning_rate": 7.065e-06, "loss": 0.2252, "step": 2358 }, { "epoch": 5.726609963547995, "grad_norm": 2.015742778778076, "learning_rate": 7.068e-06, "loss": 0.1958, "step": 2359 }, { "epoch": 5.729040097205346, "grad_norm": 2.5480217933654785, "learning_rate": 7.071e-06, "loss": 0.4072, "step": 2360 }, { "epoch": 5.731470230862698, "grad_norm": 2.0951008796691895, "learning_rate": 7.074e-06, "loss": 0.6487, "step": 2361 }, { "epoch": 5.733900364520048, "grad_norm": 1.3698821067810059, "learning_rate": 7.077e-06, "loss": 0.4863, "step": 2362 }, { "epoch": 5.7363304981773995, "grad_norm": 1.256723403930664, "learning_rate": 7.08e-06, "loss": 0.3892, "step": 2363 }, { "epoch": 5.738760631834751, "grad_norm": 1.9482879638671875, "learning_rate": 7.083000000000001e-06, "loss": 0.4115, "step": 2364 }, { "epoch": 5.741190765492102, "grad_norm": 1.3823736906051636, "learning_rate": 7.086e-06, "loss": 0.2801, "step": 2365 }, { "epoch": 5.7436208991494535, "grad_norm": 1.228134274482727, "learning_rate": 7.089e-06, "loss": 0.2814, "step": 2366 }, { "epoch": 5.746051032806804, "grad_norm": 1.4007580280303955, "learning_rate": 7.092e-06, "loss": 0.2431, "step": 2367 }, { "epoch": 5.748481166464155, "grad_norm": 1.3974902629852295, "learning_rate": 7.095e-06, "loss": 0.222, "step": 2368 }, { "epoch": 5.750911300121507, "grad_norm": 1.158044457435608, "learning_rate": 7.0980000000000005e-06, "loss": 0.1798, "step": 2369 }, { "epoch": 5.753341433778858, "grad_norm": 1.0593492984771729, "learning_rate": 7.101e-06, "loss": 0.1799, "step": 2370 }, { "epoch": 5.755771567436209, "grad_norm": 1.4624567031860352, "learning_rate": 7.104e-06, "loss": 0.1874, "step": 2371 }, { "epoch": 5.75820170109356, "grad_norm": 1.2497024536132812, "learning_rate": 7.107e-06, "loss": 0.1501, "step": 2372 }, { "epoch": 5.760631834750911, "grad_norm": 1.2028577327728271, "learning_rate": 7.11e-06, "loss": 0.1633, "step": 2373 }, { "epoch": 5.7630619684082625, "grad_norm": 
0.9734994173049927, "learning_rate": 7.113e-06, "loss": 0.1234, "step": 2374 }, { "epoch": 5.765492102065614, "grad_norm": 1.113978624343872, "learning_rate": 7.116e-06, "loss": 0.1509, "step": 2375 }, { "epoch": 5.767922235722965, "grad_norm": 1.1057958602905273, "learning_rate": 7.119000000000001e-06, "loss": 0.1428, "step": 2376 }, { "epoch": 5.770352369380316, "grad_norm": 0.8489695191383362, "learning_rate": 7.122000000000001e-06, "loss": 0.1194, "step": 2377 }, { "epoch": 5.772782503037667, "grad_norm": 1.0249556303024292, "learning_rate": 7.1249999999999995e-06, "loss": 0.2072, "step": 2378 }, { "epoch": 5.775212636695018, "grad_norm": 0.9625260829925537, "learning_rate": 7.128e-06, "loss": 0.1252, "step": 2379 }, { "epoch": 5.77764277035237, "grad_norm": 1.137239933013916, "learning_rate": 7.131e-06, "loss": 0.1217, "step": 2380 }, { "epoch": 5.780072904009721, "grad_norm": 1.0621031522750854, "learning_rate": 7.134000000000001e-06, "loss": 0.1312, "step": 2381 }, { "epoch": 5.782503037667071, "grad_norm": 1.1129580736160278, "learning_rate": 7.1370000000000004e-06, "loss": 0.1331, "step": 2382 }, { "epoch": 5.784933171324423, "grad_norm": 1.2083688974380493, "learning_rate": 7.14e-06, "loss": 0.137, "step": 2383 }, { "epoch": 5.787363304981774, "grad_norm": 1.0224331617355347, "learning_rate": 7.143e-06, "loss": 0.1404, "step": 2384 }, { "epoch": 5.789793438639125, "grad_norm": 1.1384907960891724, "learning_rate": 7.146e-06, "loss": 0.1318, "step": 2385 }, { "epoch": 5.792223572296477, "grad_norm": 0.8838988542556763, "learning_rate": 7.1490000000000005e-06, "loss": 0.1092, "step": 2386 }, { "epoch": 5.794653705953827, "grad_norm": 1.1468186378479004, "learning_rate": 7.152e-06, "loss": 0.1473, "step": 2387 }, { "epoch": 5.797083839611179, "grad_norm": 1.484224796295166, "learning_rate": 7.155e-06, "loss": 0.1694, "step": 2388 }, { "epoch": 5.79951397326853, "grad_norm": 1.1568318605422974, "learning_rate": 7.158000000000001e-06, "loss": 0.1455, "step": 
2389 }, { "epoch": 5.801944106925881, "grad_norm": 0.9612528085708618, "learning_rate": 7.161e-06, "loss": 0.1206, "step": 2390 }, { "epoch": 5.804374240583232, "grad_norm": 0.9703068733215332, "learning_rate": 7.164e-06, "loss": 0.138, "step": 2391 }, { "epoch": 5.806804374240583, "grad_norm": 0.9849659204483032, "learning_rate": 7.167e-06, "loss": 0.114, "step": 2392 }, { "epoch": 5.809234507897934, "grad_norm": 1.96042799949646, "learning_rate": 7.17e-06, "loss": 0.152, "step": 2393 }, { "epoch": 5.811664641555286, "grad_norm": 1.7020269632339478, "learning_rate": 7.173000000000001e-06, "loss": 0.1216, "step": 2394 }, { "epoch": 5.814094775212637, "grad_norm": 1.5531237125396729, "learning_rate": 7.176e-06, "loss": 0.1275, "step": 2395 }, { "epoch": 5.8165249088699875, "grad_norm": 2.160130262374878, "learning_rate": 7.179e-06, "loss": 0.1781, "step": 2396 }, { "epoch": 5.818955042527339, "grad_norm": 2.0318052768707275, "learning_rate": 7.182e-06, "loss": 0.2153, "step": 2397 }, { "epoch": 5.82138517618469, "grad_norm": 4.17992639541626, "learning_rate": 7.185e-06, "loss": 0.2002, "step": 2398 }, { "epoch": 5.8238153098420415, "grad_norm": 1.413482666015625, "learning_rate": 7.1880000000000005e-06, "loss": 0.1339, "step": 2399 }, { "epoch": 5.826245443499392, "grad_norm": 1.3734371662139893, "learning_rate": 7.191e-06, "loss": 0.1648, "step": 2400 }, { "epoch": 5.828675577156743, "grad_norm": 1.178396224975586, "learning_rate": 7.194000000000001e-06, "loss": 0.1326, "step": 2401 }, { "epoch": 5.831105710814095, "grad_norm": 1.1532950401306152, "learning_rate": 7.197e-06, "loss": 0.1537, "step": 2402 }, { "epoch": 5.833535844471446, "grad_norm": 1.1470763683319092, "learning_rate": 7.2e-06, "loss": 0.1117, "step": 2403 }, { "epoch": 5.835965978128797, "grad_norm": 2.1854493618011475, "learning_rate": 7.203e-06, "loss": 0.2379, "step": 2404 }, { "epoch": 5.838396111786148, "grad_norm": 2.0658953189849854, "learning_rate": 7.206e-06, "loss": 0.1822, "step": 2405 
}, { "epoch": 5.840826245443499, "grad_norm": 2.9296839237213135, "learning_rate": 7.209000000000001e-06, "loss": 0.1919, "step": 2406 }, { "epoch": 5.8432563791008505, "grad_norm": 1.4886764287948608, "learning_rate": 7.2120000000000006e-06, "loss": 0.155, "step": 2407 }, { "epoch": 5.845686512758202, "grad_norm": 2.1209969520568848, "learning_rate": 7.2149999999999995e-06, "loss": 0.2265, "step": 2408 }, { "epoch": 5.848116646415553, "grad_norm": 3.983009099960327, "learning_rate": 7.218e-06, "loss": 0.368, "step": 2409 }, { "epoch": 5.850546780072904, "grad_norm": 3.330080986022949, "learning_rate": 7.221e-06, "loss": 0.2691, "step": 2410 }, { "epoch": 5.852976913730255, "grad_norm": 1.7435846328735352, "learning_rate": 7.224e-06, "loss": 0.5859, "step": 2411 }, { "epoch": 5.855407047387606, "grad_norm": 1.56184720993042, "learning_rate": 7.2270000000000004e-06, "loss": 0.5341, "step": 2412 }, { "epoch": 5.857837181044958, "grad_norm": 1.0259696245193481, "learning_rate": 7.23e-06, "loss": 0.396, "step": 2413 }, { "epoch": 5.860267314702309, "grad_norm": 1.1627187728881836, "learning_rate": 7.233e-06, "loss": 0.3533, "step": 2414 }, { "epoch": 5.8626974483596594, "grad_norm": 0.9543158411979675, "learning_rate": 7.236e-06, "loss": 0.2916, "step": 2415 }, { "epoch": 5.865127582017011, "grad_norm": 1.046298861503601, "learning_rate": 7.239e-06, "loss": 0.2706, "step": 2416 }, { "epoch": 5.867557715674362, "grad_norm": 1.105576992034912, "learning_rate": 7.242e-06, "loss": 0.263, "step": 2417 }, { "epoch": 5.8699878493317135, "grad_norm": 1.015330195426941, "learning_rate": 7.245e-06, "loss": 0.2233, "step": 2418 }, { "epoch": 5.872417982989065, "grad_norm": 0.9940541386604309, "learning_rate": 7.248000000000001e-06, "loss": 0.1745, "step": 2419 }, { "epoch": 5.874848116646415, "grad_norm": 1.068058729171753, "learning_rate": 7.2510000000000005e-06, "loss": 0.1765, "step": 2420 }, { "epoch": 5.877278250303767, "grad_norm": 0.9610331654548645, "learning_rate": 
7.2539999999999995e-06, "loss": 0.1941, "step": 2421 }, { "epoch": 5.879708383961118, "grad_norm": 1.9289917945861816, "learning_rate": 7.257e-06, "loss": 0.1593, "step": 2422 }, { "epoch": 5.882138517618469, "grad_norm": 1.1112984418869019, "learning_rate": 7.26e-06, "loss": 0.1213, "step": 2423 }, { "epoch": 5.884568651275821, "grad_norm": 1.395577311515808, "learning_rate": 7.263000000000001e-06, "loss": 0.1699, "step": 2424 }, { "epoch": 5.886998784933171, "grad_norm": 0.7715302109718323, "learning_rate": 7.266e-06, "loss": 0.1074, "step": 2425 }, { "epoch": 5.889428918590522, "grad_norm": 1.0500315427780151, "learning_rate": 7.269e-06, "loss": 0.1365, "step": 2426 }, { "epoch": 5.891859052247874, "grad_norm": 1.3564391136169434, "learning_rate": 7.272e-06, "loss": 0.1603, "step": 2427 }, { "epoch": 5.894289185905225, "grad_norm": 1.1118571758270264, "learning_rate": 7.275e-06, "loss": 0.1286, "step": 2428 }, { "epoch": 5.896719319562576, "grad_norm": 1.701269507408142, "learning_rate": 7.2780000000000005e-06, "loss": 0.1932, "step": 2429 }, { "epoch": 5.899149453219927, "grad_norm": 1.1899523735046387, "learning_rate": 7.281e-06, "loss": 0.1192, "step": 2430 }, { "epoch": 5.901579586877278, "grad_norm": 0.9410715103149414, "learning_rate": 7.284e-06, "loss": 0.1214, "step": 2431 }, { "epoch": 5.90400972053463, "grad_norm": 1.4337750673294067, "learning_rate": 7.287000000000001e-06, "loss": 0.1307, "step": 2432 }, { "epoch": 5.906439854191981, "grad_norm": 1.0581316947937012, "learning_rate": 7.29e-06, "loss": 0.1085, "step": 2433 }, { "epoch": 5.908869987849331, "grad_norm": 0.8084385395050049, "learning_rate": 7.293e-06, "loss": 0.0995, "step": 2434 }, { "epoch": 5.911300121506683, "grad_norm": 1.1998541355133057, "learning_rate": 7.296e-06, "loss": 0.1016, "step": 2435 }, { "epoch": 5.913730255164034, "grad_norm": 1.7751359939575195, "learning_rate": 7.299e-06, "loss": 0.1568, "step": 2436 }, { "epoch": 5.916160388821385, "grad_norm": 1.2158536911010742, 
"learning_rate": 7.3020000000000006e-06, "loss": 0.1368, "step": 2437 }, { "epoch": 5.918590522478736, "grad_norm": 1.2925602197647095, "learning_rate": 7.305e-06, "loss": 0.1382, "step": 2438 }, { "epoch": 5.921020656136087, "grad_norm": 1.3944109678268433, "learning_rate": 7.308e-06, "loss": 0.1331, "step": 2439 }, { "epoch": 5.9234507897934385, "grad_norm": 1.311216950416565, "learning_rate": 7.311e-06, "loss": 0.1299, "step": 2440 }, { "epoch": 5.92588092345079, "grad_norm": 1.4238319396972656, "learning_rate": 7.314e-06, "loss": 0.159, "step": 2441 }, { "epoch": 5.928311057108141, "grad_norm": 1.291276454925537, "learning_rate": 7.317e-06, "loss": 0.1276, "step": 2442 }, { "epoch": 5.930741190765492, "grad_norm": 1.040182113647461, "learning_rate": 7.32e-06, "loss": 0.1178, "step": 2443 }, { "epoch": 5.933171324422843, "grad_norm": 1.5100957155227661, "learning_rate": 7.323000000000001e-06, "loss": 0.1456, "step": 2444 }, { "epoch": 5.935601458080194, "grad_norm": 1.0542659759521484, "learning_rate": 7.326e-06, "loss": 0.1261, "step": 2445 }, { "epoch": 5.938031591737546, "grad_norm": 1.1874691247940063, "learning_rate": 7.329e-06, "loss": 0.1298, "step": 2446 }, { "epoch": 5.940461725394897, "grad_norm": 1.786170482635498, "learning_rate": 7.332e-06, "loss": 0.1658, "step": 2447 }, { "epoch": 5.9428918590522475, "grad_norm": 1.6263628005981445, "learning_rate": 7.335e-06, "loss": 0.1513, "step": 2448 }, { "epoch": 5.945321992709599, "grad_norm": 1.189917802810669, "learning_rate": 7.338000000000001e-06, "loss": 0.1141, "step": 2449 }, { "epoch": 5.94775212636695, "grad_norm": 1.0844968557357788, "learning_rate": 7.3410000000000005e-06, "loss": 0.1218, "step": 2450 }, { "epoch": 5.9501822600243015, "grad_norm": 1.7961629629135132, "learning_rate": 7.3439999999999995e-06, "loss": 0.1641, "step": 2451 }, { "epoch": 5.952612393681653, "grad_norm": 1.2388882637023926, "learning_rate": 7.347e-06, "loss": 0.1401, "step": 2452 }, { "epoch": 5.955042527339003, 
"grad_norm": 1.3384313583374023, "learning_rate": 7.35e-06, "loss": 0.1304, "step": 2453 }, { "epoch": 5.957472660996355, "grad_norm": 1.3187646865844727, "learning_rate": 7.353000000000001e-06, "loss": 0.1467, "step": 2454 }, { "epoch": 5.959902794653706, "grad_norm": 1.431372046470642, "learning_rate": 7.356e-06, "loss": 0.1404, "step": 2455 }, { "epoch": 5.962332928311057, "grad_norm": 1.9506630897521973, "learning_rate": 7.359e-06, "loss": 0.1866, "step": 2456 }, { "epoch": 5.964763061968409, "grad_norm": 4.0494303703308105, "learning_rate": 7.362e-06, "loss": 0.2132, "step": 2457 }, { "epoch": 5.967193195625759, "grad_norm": 1.9172271490097046, "learning_rate": 7.365e-06, "loss": 0.207, "step": 2458 }, { "epoch": 5.9696233292831105, "grad_norm": 2.0944974422454834, "learning_rate": 7.3680000000000004e-06, "loss": 0.2248, "step": 2459 }, { "epoch": 5.972053462940462, "grad_norm": 3.802898645401001, "learning_rate": 7.371e-06, "loss": 0.4021, "step": 2460 }, { "epoch": 5.974483596597813, "grad_norm": 2.0874524116516113, "learning_rate": 7.374e-06, "loss": 0.4451, "step": 2461 }, { "epoch": 5.9769137302551645, "grad_norm": 1.109311580657959, "learning_rate": 7.377000000000001e-06, "loss": 0.1882, "step": 2462 }, { "epoch": 5.979343863912515, "grad_norm": 0.9869315028190613, "learning_rate": 7.3800000000000005e-06, "loss": 0.1311, "step": 2463 }, { "epoch": 5.981773997569866, "grad_norm": 1.0797368288040161, "learning_rate": 7.383e-06, "loss": 0.1228, "step": 2464 }, { "epoch": 5.984204131227218, "grad_norm": 1.0126235485076904, "learning_rate": 7.386e-06, "loss": 0.1556, "step": 2465 }, { "epoch": 5.986634264884569, "grad_norm": 1.054526686668396, "learning_rate": 7.389e-06, "loss": 0.0934, "step": 2466 }, { "epoch": 5.98906439854192, "grad_norm": 1.1808466911315918, "learning_rate": 7.3920000000000005e-06, "loss": 0.131, "step": 2467 }, { "epoch": 5.991494532199271, "grad_norm": 1.3440055847167969, "learning_rate": 7.395e-06, "loss": 0.1263, "step": 2468 }, { 
"epoch": 5.993924665856622, "grad_norm": 0.9889402389526367, "learning_rate": 7.398000000000001e-06, "loss": 0.1144, "step": 2469 }, { "epoch": 5.996354799513973, "grad_norm": 1.3594046831130981, "learning_rate": 7.401e-06, "loss": 0.1248, "step": 2470 }, { "epoch": 5.998784933171325, "grad_norm": 1.748939871788025, "learning_rate": 7.404e-06, "loss": 0.1933, "step": 2471 }, { "epoch": 6.0, "grad_norm": 2.445232629776001, "learning_rate": 7.407e-06, "loss": 0.1061, "step": 2472 }, { "epoch": 6.002430133657351, "grad_norm": 3.9957594871520996, "learning_rate": 7.41e-06, "loss": 0.6846, "step": 2473 }, { "epoch": 6.004860267314703, "grad_norm": 1.58255136013031, "learning_rate": 7.413e-06, "loss": 0.4426, "step": 2474 }, { "epoch": 6.007290400972053, "grad_norm": 1.9587838649749756, "learning_rate": 7.416000000000001e-06, "loss": 0.4069, "step": 2475 }, { "epoch": 6.0097205346294045, "grad_norm": 1.4534977674484253, "learning_rate": 7.419e-06, "loss": 0.2974, "step": 2476 }, { "epoch": 6.012150668286756, "grad_norm": 1.1732666492462158, "learning_rate": 7.422e-06, "loss": 0.2561, "step": 2477 }, { "epoch": 6.014580801944107, "grad_norm": 1.3534761667251587, "learning_rate": 7.425e-06, "loss": 0.2754, "step": 2478 }, { "epoch": 6.0170109356014585, "grad_norm": 1.51971435546875, "learning_rate": 7.428e-06, "loss": 0.1899, "step": 2479 }, { "epoch": 6.019441069258809, "grad_norm": 0.8675341010093689, "learning_rate": 7.4310000000000005e-06, "loss": 0.1866, "step": 2480 }, { "epoch": 6.02187120291616, "grad_norm": 1.1442005634307861, "learning_rate": 7.434e-06, "loss": 0.2126, "step": 2481 }, { "epoch": 6.024301336573512, "grad_norm": 0.8891043066978455, "learning_rate": 7.437e-06, "loss": 0.1465, "step": 2482 }, { "epoch": 6.026731470230863, "grad_norm": 1.034774661064148, "learning_rate": 7.44e-06, "loss": 0.1583, "step": 2483 }, { "epoch": 6.029161603888213, "grad_norm": 1.112218976020813, "learning_rate": 7.443e-06, "loss": 0.1422, "step": 2484 }, { "epoch": 
6.031591737545565, "grad_norm": 0.9072595834732056, "learning_rate": 7.446e-06, "loss": 0.161, "step": 2485 }, { "epoch": 6.034021871202916, "grad_norm": 0.9732086658477783, "learning_rate": 7.449e-06, "loss": 0.1221, "step": 2486 }, { "epoch": 6.0364520048602675, "grad_norm": 0.881294310092926, "learning_rate": 7.452000000000001e-06, "loss": 0.1269, "step": 2487 }, { "epoch": 6.038882138517619, "grad_norm": 0.8903632164001465, "learning_rate": 7.455e-06, "loss": 0.0919, "step": 2488 }, { "epoch": 6.041312272174969, "grad_norm": 0.7098023295402527, "learning_rate": 7.4579999999999996e-06, "loss": 0.1008, "step": 2489 }, { "epoch": 6.043742405832321, "grad_norm": 1.2895106077194214, "learning_rate": 7.461e-06, "loss": 0.123, "step": 2490 }, { "epoch": 6.046172539489672, "grad_norm": 0.9989758133888245, "learning_rate": 7.464e-06, "loss": 0.1152, "step": 2491 }, { "epoch": 6.048602673147023, "grad_norm": 1.5077311992645264, "learning_rate": 7.467000000000001e-06, "loss": 0.133, "step": 2492 }, { "epoch": 6.051032806804375, "grad_norm": 1.1044929027557373, "learning_rate": 7.4700000000000005e-06, "loss": 0.115, "step": 2493 }, { "epoch": 6.053462940461725, "grad_norm": 1.1434521675109863, "learning_rate": 7.4729999999999994e-06, "loss": 0.1098, "step": 2494 }, { "epoch": 6.055893074119076, "grad_norm": 1.1407063007354736, "learning_rate": 7.476e-06, "loss": 0.1328, "step": 2495 }, { "epoch": 6.058323207776428, "grad_norm": 1.0142170190811157, "learning_rate": 7.479e-06, "loss": 0.103, "step": 2496 }, { "epoch": 6.060753341433779, "grad_norm": 1.3643072843551636, "learning_rate": 7.4820000000000005e-06, "loss": 0.1071, "step": 2497 }, { "epoch": 6.06318347509113, "grad_norm": 0.9564996957778931, "learning_rate": 7.485e-06, "loss": 0.0853, "step": 2498 }, { "epoch": 6.065613608748481, "grad_norm": 1.1559988260269165, "learning_rate": 7.488e-06, "loss": 0.1269, "step": 2499 }, { "epoch": 6.068043742405832, "grad_norm": 1.4079197645187378, "learning_rate": 7.491e-06, 
"loss": 0.1725, "step": 2500 }, { "epoch": 6.070473876063184, "grad_norm": 0.8657966256141663, "learning_rate": 7.494e-06, "loss": 0.0837, "step": 2501 }, { "epoch": 6.072904009720535, "grad_norm": 1.2927523851394653, "learning_rate": 7.497e-06, "loss": 0.0999, "step": 2502 }, { "epoch": 6.075334143377885, "grad_norm": 1.1084388494491577, "learning_rate": 7.5e-06, "loss": 0.1172, "step": 2503 }, { "epoch": 6.077764277035237, "grad_norm": 0.9553362131118774, "learning_rate": 7.503e-06, "loss": 0.1046, "step": 2504 }, { "epoch": 6.080194410692588, "grad_norm": 1.049061894416809, "learning_rate": 7.506e-06, "loss": 0.1074, "step": 2505 }, { "epoch": 6.082624544349939, "grad_norm": 1.2488548755645752, "learning_rate": 7.5090000000000004e-06, "loss": 0.1554, "step": 2506 }, { "epoch": 6.085054678007291, "grad_norm": 1.0002087354660034, "learning_rate": 7.512e-06, "loss": 0.1108, "step": 2507 }, { "epoch": 6.087484811664641, "grad_norm": 1.6147668361663818, "learning_rate": 7.515e-06, "loss": 0.1482, "step": 2508 }, { "epoch": 6.0899149453219925, "grad_norm": 1.184435248374939, "learning_rate": 7.518e-06, "loss": 0.1262, "step": 2509 }, { "epoch": 6.092345078979344, "grad_norm": 1.2692763805389404, "learning_rate": 7.521e-06, "loss": 0.1104, "step": 2510 }, { "epoch": 6.094775212636695, "grad_norm": 2.277301788330078, "learning_rate": 7.524000000000001e-06, "loss": 0.1323, "step": 2511 }, { "epoch": 6.0972053462940465, "grad_norm": 1.1324001550674438, "learning_rate": 7.527000000000001e-06, "loss": 0.1696, "step": 2512 }, { "epoch": 6.099635479951397, "grad_norm": 1.3413573503494263, "learning_rate": 7.53e-06, "loss": 0.1494, "step": 2513 }, { "epoch": 6.102065613608748, "grad_norm": 1.302776575088501, "learning_rate": 7.533e-06, "loss": 0.1419, "step": 2514 }, { "epoch": 6.1044957472661, "grad_norm": 1.5580835342407227, "learning_rate": 7.5359999999999995e-06, "loss": 0.1146, "step": 2515 }, { "epoch": 6.106925880923451, "grad_norm": 1.761512041091919, "learning_rate": 
7.539000000000001e-06, "loss": 0.1726, "step": 2516 }, { "epoch": 6.109356014580802, "grad_norm": 1.2054762840270996, "learning_rate": 7.542000000000001e-06, "loss": 0.1243, "step": 2517 }, { "epoch": 6.111786148238153, "grad_norm": 1.646715760231018, "learning_rate": 7.545000000000001e-06, "loss": 0.1406, "step": 2518 }, { "epoch": 6.114216281895504, "grad_norm": 1.9457513093948364, "learning_rate": 7.5479999999999996e-06, "loss": 0.1811, "step": 2519 }, { "epoch": 6.1166464155528555, "grad_norm": 3.018071413040161, "learning_rate": 7.550999999999999e-06, "loss": 0.2218, "step": 2520 }, { "epoch": 6.119076549210207, "grad_norm": 2.4346566200256348, "learning_rate": 7.554000000000001e-06, "loss": 0.2569, "step": 2521 }, { "epoch": 6.121506682867557, "grad_norm": 3.150421142578125, "learning_rate": 7.557000000000001e-06, "loss": 0.3443, "step": 2522 }, { "epoch": 6.123936816524909, "grad_norm": 2.813784122467041, "learning_rate": 7.5600000000000005e-06, "loss": 0.5048, "step": 2523 }, { "epoch": 6.12636695018226, "grad_norm": 1.720543622970581, "learning_rate": 7.563e-06, "loss": 0.4891, "step": 2524 }, { "epoch": 6.128797083839611, "grad_norm": 1.5230693817138672, "learning_rate": 7.565999999999999e-06, "loss": 0.3409, "step": 2525 }, { "epoch": 6.131227217496963, "grad_norm": 1.672211766242981, "learning_rate": 7.569000000000001e-06, "loss": 0.3044, "step": 2526 }, { "epoch": 6.133657351154313, "grad_norm": 1.2879167795181274, "learning_rate": 7.5720000000000005e-06, "loss": 0.3179, "step": 2527 }, { "epoch": 6.136087484811664, "grad_norm": 1.7009614706039429, "learning_rate": 7.575e-06, "loss": 0.2138, "step": 2528 }, { "epoch": 6.138517618469016, "grad_norm": 1.5900970697402954, "learning_rate": 7.578e-06, "loss": 0.2298, "step": 2529 }, { "epoch": 6.140947752126367, "grad_norm": 1.0948479175567627, "learning_rate": 7.581e-06, "loss": 0.1817, "step": 2530 }, { "epoch": 6.1433778857837185, "grad_norm": 0.8108450174331665, "learning_rate": 7.5840000000000006e-06, 
"loss": 0.1341, "step": 2531 }, { "epoch": 6.145808019441069, "grad_norm": 0.9842482805252075, "learning_rate": 7.587e-06, "loss": 0.1831, "step": 2532 }, { "epoch": 6.14823815309842, "grad_norm": 1.2662967443466187, "learning_rate": 7.59e-06, "loss": 0.1556, "step": 2533 }, { "epoch": 6.150668286755772, "grad_norm": 0.9864842295646667, "learning_rate": 7.593e-06, "loss": 0.1089, "step": 2534 }, { "epoch": 6.153098420413123, "grad_norm": 0.9353387355804443, "learning_rate": 7.596e-06, "loss": 0.1006, "step": 2535 }, { "epoch": 6.155528554070474, "grad_norm": 0.9024695158004761, "learning_rate": 7.599000000000001e-06, "loss": 0.0989, "step": 2536 }, { "epoch": 6.157958687727825, "grad_norm": 0.8266281485557556, "learning_rate": 7.602e-06, "loss": 0.1146, "step": 2537 }, { "epoch": 6.160388821385176, "grad_norm": 0.8349623680114746, "learning_rate": 7.605e-06, "loss": 0.114, "step": 2538 }, { "epoch": 6.162818955042527, "grad_norm": 1.0222914218902588, "learning_rate": 7.608e-06, "loss": 0.1441, "step": 2539 }, { "epoch": 6.165249088699879, "grad_norm": 0.9636932611465454, "learning_rate": 7.611e-06, "loss": 0.1703, "step": 2540 }, { "epoch": 6.167679222357229, "grad_norm": 1.3780335187911987, "learning_rate": 7.614000000000001e-06, "loss": 0.1358, "step": 2541 }, { "epoch": 6.1701093560145805, "grad_norm": 1.0270382165908813, "learning_rate": 7.617000000000001e-06, "loss": 0.1045, "step": 2542 }, { "epoch": 6.172539489671932, "grad_norm": 0.8751576542854309, "learning_rate": 7.62e-06, "loss": 0.0981, "step": 2543 }, { "epoch": 6.174969623329283, "grad_norm": 0.9579005241394043, "learning_rate": 7.623e-06, "loss": 0.1061, "step": 2544 }, { "epoch": 6.177399756986635, "grad_norm": 0.9927552938461304, "learning_rate": 7.6259999999999995e-06, "loss": 0.1078, "step": 2545 }, { "epoch": 6.179829890643985, "grad_norm": 1.028558373451233, "learning_rate": 7.629000000000001e-06, "loss": 0.125, "step": 2546 }, { "epoch": 6.182260024301336, "grad_norm": 1.1229567527770996, 
"learning_rate": 7.632e-06, "loss": 0.1116, "step": 2547 }, { "epoch": 6.184690157958688, "grad_norm": 1.2163889408111572, "learning_rate": 7.635e-06, "loss": 0.1703, "step": 2548 }, { "epoch": 6.187120291616039, "grad_norm": 1.0004339218139648, "learning_rate": 7.638e-06, "loss": 0.112, "step": 2549 }, { "epoch": 6.18955042527339, "grad_norm": 1.0087922811508179, "learning_rate": 7.641e-06, "loss": 0.1331, "step": 2550 }, { "epoch": 6.191980558930741, "grad_norm": 1.0877361297607422, "learning_rate": 7.644000000000002e-06, "loss": 0.1003, "step": 2551 }, { "epoch": 6.194410692588092, "grad_norm": 1.2701025009155273, "learning_rate": 7.647000000000001e-06, "loss": 0.0699, "step": 2552 }, { "epoch": 6.1968408262454435, "grad_norm": 1.3240573406219482, "learning_rate": 7.65e-06, "loss": 0.1733, "step": 2553 }, { "epoch": 6.199270959902795, "grad_norm": 0.9536173939704895, "learning_rate": 7.653e-06, "loss": 0.1186, "step": 2554 }, { "epoch": 6.201701093560146, "grad_norm": 1.3275469541549683, "learning_rate": 7.656e-06, "loss": 0.127, "step": 2555 }, { "epoch": 6.204131227217497, "grad_norm": 0.9676644802093506, "learning_rate": 7.659e-06, "loss": 0.1108, "step": 2556 }, { "epoch": 6.206561360874848, "grad_norm": 1.9753259420394897, "learning_rate": 7.662e-06, "loss": 0.1339, "step": 2557 }, { "epoch": 6.208991494532199, "grad_norm": 1.3191872835159302, "learning_rate": 7.665e-06, "loss": 0.1306, "step": 2558 }, { "epoch": 6.211421628189551, "grad_norm": 1.2839590311050415, "learning_rate": 7.668e-06, "loss": 0.1006, "step": 2559 }, { "epoch": 6.213851761846902, "grad_norm": 1.7084236145019531, "learning_rate": 7.671e-06, "loss": 0.2017, "step": 2560 }, { "epoch": 6.2162818955042525, "grad_norm": 1.226855993270874, "learning_rate": 7.674000000000001e-06, "loss": 0.1175, "step": 2561 }, { "epoch": 6.218712029161604, "grad_norm": 1.30331289768219, "learning_rate": 7.677000000000001e-06, "loss": 0.1106, "step": 2562 }, { "epoch": 6.221142162818955, "grad_norm": 
1.528239130973816, "learning_rate": 7.680000000000001e-06, "loss": 0.1807, "step": 2563 }, { "epoch": 6.2235722964763065, "grad_norm": 1.0510531663894653, "learning_rate": 7.683e-06, "loss": 0.1033, "step": 2564 }, { "epoch": 6.226002430133657, "grad_norm": 1.82988703250885, "learning_rate": 7.685999999999999e-06, "loss": 0.1715, "step": 2565 }, { "epoch": 6.228432563791008, "grad_norm": 1.347122311592102, "learning_rate": 7.688999999999999e-06, "loss": 0.1248, "step": 2566 }, { "epoch": 6.23086269744836, "grad_norm": 4.028000354766846, "learning_rate": 7.692e-06, "loss": 0.1197, "step": 2567 }, { "epoch": 6.233292831105711, "grad_norm": 1.2651748657226562, "learning_rate": 7.695e-06, "loss": 0.1404, "step": 2568 }, { "epoch": 6.235722964763062, "grad_norm": 1.6186350584030151, "learning_rate": 7.698e-06, "loss": 0.2248, "step": 2569 }, { "epoch": 6.238153098420413, "grad_norm": 2.95637845993042, "learning_rate": 7.701e-06, "loss": 0.1565, "step": 2570 }, { "epoch": 6.240583232077764, "grad_norm": 1.81504487991333, "learning_rate": 7.704e-06, "loss": 0.2293, "step": 2571 }, { "epoch": 6.2430133657351154, "grad_norm": 2.78786301612854, "learning_rate": 7.707000000000001e-06, "loss": 0.2432, "step": 2572 }, { "epoch": 6.245443499392467, "grad_norm": 1.7050663232803345, "learning_rate": 7.71e-06, "loss": 0.6047, "step": 2573 }, { "epoch": 6.247873633049818, "grad_norm": 1.8092092275619507, "learning_rate": 7.713e-06, "loss": 0.4569, "step": 2574 }, { "epoch": 6.250303766707169, "grad_norm": 1.075701355934143, "learning_rate": 7.716e-06, "loss": 0.3553, "step": 2575 }, { "epoch": 6.25273390036452, "grad_norm": 1.190012812614441, "learning_rate": 7.719e-06, "loss": 0.2868, "step": 2576 }, { "epoch": 6.255164034021871, "grad_norm": 0.9939693212509155, "learning_rate": 7.722e-06, "loss": 0.2704, "step": 2577 }, { "epoch": 6.257594167679223, "grad_norm": 1.482115626335144, "learning_rate": 7.725e-06, "loss": 0.2086, "step": 2578 }, { "epoch": 6.260024301336573, 
"grad_norm": 1.1257166862487793, "learning_rate": 7.728e-06, "loss": 0.1638, "step": 2579 }, { "epoch": 6.262454434993924, "grad_norm": 1.0325751304626465, "learning_rate": 7.731e-06, "loss": 0.1287, "step": 2580 }, { "epoch": 6.264884568651276, "grad_norm": 1.0881551504135132, "learning_rate": 7.733999999999999e-06, "loss": 0.1534, "step": 2581 }, { "epoch": 6.267314702308627, "grad_norm": 1.4476888179779053, "learning_rate": 7.737e-06, "loss": 0.1592, "step": 2582 }, { "epoch": 6.269744835965978, "grad_norm": 1.1179803609848022, "learning_rate": 7.74e-06, "loss": 0.1209, "step": 2583 }, { "epoch": 6.272174969623329, "grad_norm": 1.0481549501419067, "learning_rate": 7.743e-06, "loss": 0.1147, "step": 2584 }, { "epoch": 6.27460510328068, "grad_norm": 0.9255885481834412, "learning_rate": 7.746e-06, "loss": 0.0936, "step": 2585 }, { "epoch": 6.277035236938032, "grad_norm": 0.9017358422279358, "learning_rate": 7.749e-06, "loss": 0.1089, "step": 2586 }, { "epoch": 6.279465370595383, "grad_norm": 1.0670298337936401, "learning_rate": 7.752000000000001e-06, "loss": 0.1136, "step": 2587 }, { "epoch": 6.281895504252734, "grad_norm": 1.2073012590408325, "learning_rate": 7.755000000000001e-06, "loss": 0.1611, "step": 2588 }, { "epoch": 6.284325637910085, "grad_norm": 0.7330478429794312, "learning_rate": 7.758000000000001e-06, "loss": 0.0978, "step": 2589 }, { "epoch": 6.286755771567436, "grad_norm": 1.2872647047042847, "learning_rate": 7.760999999999999e-06, "loss": 0.1593, "step": 2590 }, { "epoch": 6.289185905224787, "grad_norm": 1.1019185781478882, "learning_rate": 7.763999999999999e-06, "loss": 0.1203, "step": 2591 }, { "epoch": 6.291616038882139, "grad_norm": 1.6933976411819458, "learning_rate": 7.767e-06, "loss": 0.1168, "step": 2592 }, { "epoch": 6.29404617253949, "grad_norm": 0.7906861901283264, "learning_rate": 7.77e-06, "loss": 0.0944, "step": 2593 }, { "epoch": 6.2964763061968405, "grad_norm": 1.0430070161819458, "learning_rate": 7.773e-06, "loss": 0.1076, "step": 
2594 }, { "epoch": 6.298906439854192, "grad_norm": 1.236579179763794, "learning_rate": 7.776e-06, "loss": 0.2178, "step": 2595 }, { "epoch": 6.301336573511543, "grad_norm": 0.8757935166358948, "learning_rate": 7.779e-06, "loss": 0.1132, "step": 2596 }, { "epoch": 6.3037667071688945, "grad_norm": 1.418758749961853, "learning_rate": 7.782000000000001e-06, "loss": 0.1368, "step": 2597 }, { "epoch": 6.306196840826246, "grad_norm": 1.95808744430542, "learning_rate": 7.785000000000001e-06, "loss": 0.117, "step": 2598 }, { "epoch": 6.308626974483596, "grad_norm": 0.954366147518158, "learning_rate": 7.788e-06, "loss": 0.1043, "step": 2599 }, { "epoch": 6.311057108140948, "grad_norm": 1.3449078798294067, "learning_rate": 7.791e-06, "loss": 0.1489, "step": 2600 }, { "epoch": 6.313487241798299, "grad_norm": 1.227600336074829, "learning_rate": 7.794e-06, "loss": 0.1063, "step": 2601 }, { "epoch": 6.31591737545565, "grad_norm": 1.0075424909591675, "learning_rate": 7.797e-06, "loss": 0.1163, "step": 2602 }, { "epoch": 6.318347509113001, "grad_norm": 1.0084861516952515, "learning_rate": 7.8e-06, "loss": 0.0946, "step": 2603 }, { "epoch": 6.320777642770352, "grad_norm": 1.0286283493041992, "learning_rate": 7.803e-06, "loss": 0.1085, "step": 2604 }, { "epoch": 6.3232077764277035, "grad_norm": 1.0763399600982666, "learning_rate": 7.806e-06, "loss": 0.09, "step": 2605 }, { "epoch": 6.325637910085055, "grad_norm": 1.175658106803894, "learning_rate": 7.809e-06, "loss": 0.0989, "step": 2606 }, { "epoch": 6.328068043742406, "grad_norm": 1.37642240524292, "learning_rate": 7.812e-06, "loss": 0.1418, "step": 2607 }, { "epoch": 6.330498177399757, "grad_norm": 1.6965669393539429, "learning_rate": 7.815e-06, "loss": 0.1642, "step": 2608 }, { "epoch": 6.332928311057108, "grad_norm": 1.0205782651901245, "learning_rate": 7.818e-06, "loss": 0.116, "step": 2609 }, { "epoch": 6.335358444714459, "grad_norm": 1.2258301973342896, "learning_rate": 7.821e-06, "loss": 0.1317, "step": 2610 }, { "epoch": 
6.337788578371811, "grad_norm": 1.3246515989303589, "learning_rate": 7.824e-06, "loss": 0.1124, "step": 2611 }, { "epoch": 6.340218712029162, "grad_norm": 1.218748688697815, "learning_rate": 7.827000000000001e-06, "loss": 0.1247, "step": 2612 }, { "epoch": 6.342648845686512, "grad_norm": 1.2858887910842896, "learning_rate": 7.830000000000001e-06, "loss": 0.1462, "step": 2613 }, { "epoch": 6.345078979343864, "grad_norm": 2.407895088195801, "learning_rate": 7.833e-06, "loss": 0.1354, "step": 2614 }, { "epoch": 6.347509113001215, "grad_norm": 1.573325753211975, "learning_rate": 7.836e-06, "loss": 0.1247, "step": 2615 }, { "epoch": 6.3499392466585665, "grad_norm": 1.3426111936569214, "learning_rate": 7.838999999999999e-06, "loss": 0.1563, "step": 2616 }, { "epoch": 6.352369380315917, "grad_norm": 2.069913625717163, "learning_rate": 7.842e-06, "loss": 0.1354, "step": 2617 }, { "epoch": 6.354799513973268, "grad_norm": 2.097393751144409, "learning_rate": 7.845e-06, "loss": 0.2107, "step": 2618 }, { "epoch": 6.35722964763062, "grad_norm": 1.6745574474334717, "learning_rate": 7.848e-06, "loss": 0.1366, "step": 2619 }, { "epoch": 6.359659781287971, "grad_norm": 2.3558013439178467, "learning_rate": 7.851e-06, "loss": 0.1649, "step": 2620 }, { "epoch": 6.362089914945322, "grad_norm": 1.8427201509475708, "learning_rate": 7.854e-06, "loss": 0.1685, "step": 2621 }, { "epoch": 6.364520048602673, "grad_norm": 2.690326452255249, "learning_rate": 7.857000000000001e-06, "loss": 0.2914, "step": 2622 }, { "epoch": 6.366950182260024, "grad_norm": 1.2389570474624634, "learning_rate": 7.860000000000001e-06, "loss": 0.5024, "step": 2623 }, { "epoch": 6.369380315917375, "grad_norm": 1.2510128021240234, "learning_rate": 7.863e-06, "loss": 0.3931, "step": 2624 }, { "epoch": 6.371810449574727, "grad_norm": 1.1972085237503052, "learning_rate": 7.866e-06, "loss": 0.3154, "step": 2625 }, { "epoch": 6.374240583232078, "grad_norm": 0.9239754676818848, "learning_rate": 7.868999999999999e-06, "loss": 
0.2891, "step": 2626 }, { "epoch": 6.3766707168894285, "grad_norm": 1.7559514045715332, "learning_rate": 7.872e-06, "loss": 0.2542, "step": 2627 }, { "epoch": 6.37910085054678, "grad_norm": 1.40970778465271, "learning_rate": 7.875e-06, "loss": 0.2331, "step": 2628 }, { "epoch": 6.381530984204131, "grad_norm": 1.4556331634521484, "learning_rate": 7.878e-06, "loss": 0.2079, "step": 2629 }, { "epoch": 6.383961117861483, "grad_norm": 1.0315810441970825, "learning_rate": 7.881e-06, "loss": 0.1763, "step": 2630 }, { "epoch": 6.386391251518834, "grad_norm": 0.934049665927887, "learning_rate": 7.884e-06, "loss": 0.1671, "step": 2631 }, { "epoch": 6.388821385176184, "grad_norm": 1.5557851791381836, "learning_rate": 7.887000000000001e-06, "loss": 0.141, "step": 2632 }, { "epoch": 6.391251518833536, "grad_norm": 1.0710318088531494, "learning_rate": 7.89e-06, "loss": 0.1517, "step": 2633 }, { "epoch": 6.393681652490887, "grad_norm": 0.971956729888916, "learning_rate": 7.893e-06, "loss": 0.1261, "step": 2634 }, { "epoch": 6.396111786148238, "grad_norm": 1.1212929487228394, "learning_rate": 7.896e-06, "loss": 0.0909, "step": 2635 }, { "epoch": 6.39854191980559, "grad_norm": 0.9451019167900085, "learning_rate": 7.899e-06, "loss": 0.1058, "step": 2636 }, { "epoch": 6.40097205346294, "grad_norm": 1.0464431047439575, "learning_rate": 7.902000000000002e-06, "loss": 0.1275, "step": 2637 }, { "epoch": 6.4034021871202915, "grad_norm": 0.8578687310218811, "learning_rate": 7.905000000000001e-06, "loss": 0.0986, "step": 2638 }, { "epoch": 6.405832320777643, "grad_norm": 1.1993918418884277, "learning_rate": 7.908e-06, "loss": 0.1535, "step": 2639 }, { "epoch": 6.408262454434994, "grad_norm": 1.0648045539855957, "learning_rate": 7.911e-06, "loss": 0.1015, "step": 2640 }, { "epoch": 6.4106925880923455, "grad_norm": 0.8076362013816833, "learning_rate": 7.913999999999999e-06, "loss": 0.0949, "step": 2641 }, { "epoch": 6.413122721749696, "grad_norm": 1.2385286092758179, "learning_rate": 
7.917e-06, "loss": 0.1067, "step": 2642 }, { "epoch": 6.415552855407047, "grad_norm": 1.1172754764556885, "learning_rate": 7.92e-06, "loss": 0.1058, "step": 2643 }, { "epoch": 6.417982989064399, "grad_norm": 1.180301547050476, "learning_rate": 7.923e-06, "loss": 0.1022, "step": 2644 }, { "epoch": 6.42041312272175, "grad_norm": 1.2569429874420166, "learning_rate": 7.926e-06, "loss": 0.1184, "step": 2645 }, { "epoch": 6.4228432563791005, "grad_norm": 1.1167293787002563, "learning_rate": 7.929e-06, "loss": 0.1265, "step": 2646 }, { "epoch": 6.425273390036452, "grad_norm": 0.7889602184295654, "learning_rate": 7.932000000000001e-06, "loss": 0.1007, "step": 2647 }, { "epoch": 6.427703523693803, "grad_norm": 1.0103150606155396, "learning_rate": 7.935000000000001e-06, "loss": 0.131, "step": 2648 }, { "epoch": 6.4301336573511545, "grad_norm": 1.254231333732605, "learning_rate": 7.938000000000001e-06, "loss": 0.1332, "step": 2649 }, { "epoch": 6.432563791008506, "grad_norm": 1.2670990228652954, "learning_rate": 7.941e-06, "loss": 0.1336, "step": 2650 }, { "epoch": 6.434993924665856, "grad_norm": 1.1028050184249878, "learning_rate": 7.943999999999999e-06, "loss": 0.0895, "step": 2651 }, { "epoch": 6.437424058323208, "grad_norm": 1.0860298871994019, "learning_rate": 7.947e-06, "loss": 0.1181, "step": 2652 }, { "epoch": 6.439854191980559, "grad_norm": 1.3077948093414307, "learning_rate": 7.95e-06, "loss": 0.1259, "step": 2653 }, { "epoch": 6.44228432563791, "grad_norm": 1.3216291666030884, "learning_rate": 7.953e-06, "loss": 0.1238, "step": 2654 }, { "epoch": 6.444714459295262, "grad_norm": 1.0802372694015503, "learning_rate": 7.956e-06, "loss": 0.098, "step": 2655 }, { "epoch": 6.447144592952612, "grad_norm": 2.808753252029419, "learning_rate": 7.959e-06, "loss": 0.2667, "step": 2656 }, { "epoch": 6.4495747266099634, "grad_norm": 1.19962739944458, "learning_rate": 7.962000000000001e-06, "loss": 0.122, "step": 2657 }, { "epoch": 6.452004860267315, "grad_norm": 
1.1742205619812012, "learning_rate": 7.965e-06, "loss": 0.1604, "step": 2658 }, { "epoch": 6.454434993924666, "grad_norm": 1.1324875354766846, "learning_rate": 7.968e-06, "loss": 0.127, "step": 2659 }, { "epoch": 6.456865127582017, "grad_norm": 1.4819210767745972, "learning_rate": 7.971e-06, "loss": 0.1879, "step": 2660 }, { "epoch": 6.459295261239368, "grad_norm": 1.1645939350128174, "learning_rate": 7.974e-06, "loss": 0.108, "step": 2661 }, { "epoch": 6.461725394896719, "grad_norm": 1.2013187408447266, "learning_rate": 7.977000000000002e-06, "loss": 0.1187, "step": 2662 }, { "epoch": 6.464155528554071, "grad_norm": 1.3011349439620972, "learning_rate": 7.98e-06, "loss": 0.1289, "step": 2663 }, { "epoch": 6.466585662211422, "grad_norm": 1.2430623769760132, "learning_rate": 7.983e-06, "loss": 0.1455, "step": 2664 }, { "epoch": 6.469015795868772, "grad_norm": 1.0684716701507568, "learning_rate": 7.986e-06, "loss": 0.1124, "step": 2665 }, { "epoch": 6.471445929526124, "grad_norm": 0.9284802675247192, "learning_rate": 7.989e-06, "loss": 0.081, "step": 2666 }, { "epoch": 6.473876063183475, "grad_norm": 1.821504831314087, "learning_rate": 7.992e-06, "loss": 0.1709, "step": 2667 }, { "epoch": 6.476306196840826, "grad_norm": 1.1605758666992188, "learning_rate": 7.995e-06, "loss": 0.1152, "step": 2668 }, { "epoch": 6.478736330498178, "grad_norm": 1.8798537254333496, "learning_rate": 7.998e-06, "loss": 0.1546, "step": 2669 }, { "epoch": 6.481166464155528, "grad_norm": 1.6157398223876953, "learning_rate": 8.001e-06, "loss": 0.1315, "step": 2670 }, { "epoch": 6.48359659781288, "grad_norm": 2.073880195617676, "learning_rate": 8.004e-06, "loss": 0.197, "step": 2671 }, { "epoch": 6.486026731470231, "grad_norm": 4.067700386047363, "learning_rate": 8.007000000000001e-06, "loss": 0.3687, "step": 2672 }, { "epoch": 6.488456865127582, "grad_norm": 1.8081440925598145, "learning_rate": 8.010000000000001e-06, "loss": 0.4995, "step": 2673 }, { "epoch": 6.490886998784934, "grad_norm": 
1.450165867805481, "learning_rate": 8.013000000000001e-06, "loss": 0.4063, "step": 2674 }, { "epoch": 6.493317132442284, "grad_norm": 1.6865510940551758, "learning_rate": 8.016e-06, "loss": 0.3183, "step": 2675 }, { "epoch": 6.495747266099635, "grad_norm": 1.2329670190811157, "learning_rate": 8.018999999999999e-06, "loss": 0.326, "step": 2676 }, { "epoch": 6.498177399756987, "grad_norm": 1.1863807439804077, "learning_rate": 8.022e-06, "loss": 0.2708, "step": 2677 }, { "epoch": 6.500607533414338, "grad_norm": 0.9781352281570435, "learning_rate": 8.025e-06, "loss": 0.243, "step": 2678 }, { "epoch": 6.503037667071689, "grad_norm": 0.8619372844696045, "learning_rate": 8.028e-06, "loss": 0.1992, "step": 2679 }, { "epoch": 6.50546780072904, "grad_norm": 1.0119574069976807, "learning_rate": 8.031e-06, "loss": 0.2042, "step": 2680 }, { "epoch": 6.507897934386391, "grad_norm": 0.980551540851593, "learning_rate": 8.034e-06, "loss": 0.1407, "step": 2681 }, { "epoch": 6.5103280680437425, "grad_norm": 0.9053468704223633, "learning_rate": 8.037000000000001e-06, "loss": 0.139, "step": 2682 }, { "epoch": 6.512758201701094, "grad_norm": 1.0502222776412964, "learning_rate": 8.040000000000001e-06, "loss": 0.155, "step": 2683 }, { "epoch": 6.515188335358444, "grad_norm": 1.0574818849563599, "learning_rate": 8.043e-06, "loss": 0.1402, "step": 2684 }, { "epoch": 6.517618469015796, "grad_norm": 0.7404270172119141, "learning_rate": 8.046e-06, "loss": 0.109, "step": 2685 }, { "epoch": 6.520048602673147, "grad_norm": 1.2764712572097778, "learning_rate": 8.049e-06, "loss": 0.1397, "step": 2686 }, { "epoch": 6.522478736330498, "grad_norm": 0.9023524522781372, "learning_rate": 8.052000000000002e-06, "loss": 0.0928, "step": 2687 }, { "epoch": 6.52490886998785, "grad_norm": 0.9140835404396057, "learning_rate": 8.055e-06, "loss": 0.1212, "step": 2688 }, { "epoch": 6.5273390036452, "grad_norm": 0.924657940864563, "learning_rate": 8.058e-06, "loss": 0.1117, "step": 2689 }, { "epoch": 
6.5297691373025515, "grad_norm": 0.7942074537277222, "learning_rate": 8.061e-06, "loss": 0.1171, "step": 2690 }, { "epoch": 6.532199270959903, "grad_norm": 1.0293172597885132, "learning_rate": 8.064e-06, "loss": 0.1153, "step": 2691 }, { "epoch": 6.534629404617254, "grad_norm": 0.8616093397140503, "learning_rate": 8.067e-06, "loss": 0.1179, "step": 2692 }, { "epoch": 6.537059538274605, "grad_norm": 1.0312994718551636, "learning_rate": 8.07e-06, "loss": 0.1098, "step": 2693 }, { "epoch": 6.539489671931956, "grad_norm": 1.047086238861084, "learning_rate": 8.073e-06, "loss": 0.1375, "step": 2694 }, { "epoch": 6.541919805589307, "grad_norm": 1.1066792011260986, "learning_rate": 8.076e-06, "loss": 0.1312, "step": 2695 }, { "epoch": 6.544349939246659, "grad_norm": 1.0412644147872925, "learning_rate": 8.079e-06, "loss": 0.1224, "step": 2696 }, { "epoch": 6.54678007290401, "grad_norm": 1.0086016654968262, "learning_rate": 8.082e-06, "loss": 0.0807, "step": 2697 }, { "epoch": 6.54921020656136, "grad_norm": 1.1820390224456787, "learning_rate": 8.085000000000001e-06, "loss": 0.1175, "step": 2698 }, { "epoch": 6.551640340218712, "grad_norm": 1.2971707582473755, "learning_rate": 8.088000000000001e-06, "loss": 0.2088, "step": 2699 }, { "epoch": 6.554070473876063, "grad_norm": 0.8938049674034119, "learning_rate": 8.091e-06, "loss": 0.1193, "step": 2700 }, { "epoch": 6.5565006075334145, "grad_norm": 1.0746617317199707, "learning_rate": 8.093999999999999e-06, "loss": 0.1157, "step": 2701 }, { "epoch": 6.558930741190766, "grad_norm": 0.9125484228134155, "learning_rate": 8.096999999999999e-06, "loss": 0.0874, "step": 2702 }, { "epoch": 6.561360874848116, "grad_norm": 0.9175683259963989, "learning_rate": 8.1e-06, "loss": 0.1028, "step": 2703 }, { "epoch": 6.563791008505468, "grad_norm": 1.0474097728729248, "learning_rate": 8.103e-06, "loss": 0.1485, "step": 2704 }, { "epoch": 6.566221142162819, "grad_norm": 0.9356892108917236, "learning_rate": 8.106e-06, "loss": 0.1084, "step": 2705 
}, { "epoch": 6.56865127582017, "grad_norm": 1.7830719947814941, "learning_rate": 8.109e-06, "loss": 0.1261, "step": 2706 }, { "epoch": 6.571081409477522, "grad_norm": 0.9988549947738647, "learning_rate": 8.112e-06, "loss": 0.116, "step": 2707 }, { "epoch": 6.573511543134872, "grad_norm": 1.0400245189666748, "learning_rate": 8.115000000000001e-06, "loss": 0.1063, "step": 2708 }, { "epoch": 6.575941676792223, "grad_norm": 1.0983022451400757, "learning_rate": 8.118000000000001e-06, "loss": 0.1369, "step": 2709 }, { "epoch": 6.578371810449575, "grad_norm": 1.2740463018417358, "learning_rate": 8.121e-06, "loss": 0.1399, "step": 2710 }, { "epoch": 6.580801944106926, "grad_norm": 1.6033657789230347, "learning_rate": 8.124e-06, "loss": 0.1077, "step": 2711 }, { "epoch": 6.583232077764277, "grad_norm": 1.4856244325637817, "learning_rate": 8.126999999999999e-06, "loss": 0.1386, "step": 2712 }, { "epoch": 6.585662211421628, "grad_norm": 1.1714844703674316, "learning_rate": 8.13e-06, "loss": 0.1165, "step": 2713 }, { "epoch": 6.588092345078979, "grad_norm": 2.790815830230713, "learning_rate": 8.133e-06, "loss": 0.1004, "step": 2714 }, { "epoch": 6.590522478736331, "grad_norm": 1.09629225730896, "learning_rate": 8.136e-06, "loss": 0.1386, "step": 2715 }, { "epoch": 6.592952612393682, "grad_norm": 1.3576419353485107, "learning_rate": 8.139e-06, "loss": 0.1677, "step": 2716 }, { "epoch": 6.595382746051033, "grad_norm": 1.445607304573059, "learning_rate": 8.142e-06, "loss": 0.1795, "step": 2717 }, { "epoch": 6.597812879708384, "grad_norm": 1.3237290382385254, "learning_rate": 8.145e-06, "loss": 0.1584, "step": 2718 }, { "epoch": 6.600243013365735, "grad_norm": 1.2572216987609863, "learning_rate": 8.148e-06, "loss": 0.1564, "step": 2719 }, { "epoch": 6.602673147023086, "grad_norm": 2.2331018447875977, "learning_rate": 8.151e-06, "loss": 0.2206, "step": 2720 }, { "epoch": 6.605103280680438, "grad_norm": 1.7731250524520874, "learning_rate": 8.154e-06, "loss": 0.1959, "step": 2721 }, 
{ "epoch": 6.607533414337789, "grad_norm": 2.7507691383361816, "learning_rate": 8.157e-06, "loss": 0.2392, "step": 2722 }, { "epoch": 6.6099635479951395, "grad_norm": 1.5826547145843506, "learning_rate": 8.160000000000001e-06, "loss": 0.5572, "step": 2723 }, { "epoch": 6.612393681652491, "grad_norm": 1.2255803346633911, "learning_rate": 8.163000000000001e-06, "loss": 0.427, "step": 2724 }, { "epoch": 6.614823815309842, "grad_norm": 1.0592894554138184, "learning_rate": 8.166e-06, "loss": 0.3629, "step": 2725 }, { "epoch": 6.6172539489671935, "grad_norm": 1.4010467529296875, "learning_rate": 8.169e-06, "loss": 0.3836, "step": 2726 }, { "epoch": 6.619684082624544, "grad_norm": 1.3565411567687988, "learning_rate": 8.171999999999999e-06, "loss": 0.3604, "step": 2727 }, { "epoch": 6.622114216281895, "grad_norm": 1.8400237560272217, "learning_rate": 8.175e-06, "loss": 0.2421, "step": 2728 }, { "epoch": 6.624544349939247, "grad_norm": 1.4003740549087524, "learning_rate": 8.178e-06, "loss": 0.2462, "step": 2729 }, { "epoch": 6.626974483596598, "grad_norm": 0.8142560720443726, "learning_rate": 8.181e-06, "loss": 0.1679, "step": 2730 }, { "epoch": 6.6294046172539485, "grad_norm": 0.9146862626075745, "learning_rate": 8.184e-06, "loss": 0.1052, "step": 2731 }, { "epoch": 6.6318347509113, "grad_norm": 1.0118476152420044, "learning_rate": 8.187e-06, "loss": 0.1602, "step": 2732 }, { "epoch": 6.634264884568651, "grad_norm": 0.8942214846611023, "learning_rate": 8.190000000000001e-06, "loss": 0.1172, "step": 2733 }, { "epoch": 6.6366950182260025, "grad_norm": 1.4780769348144531, "learning_rate": 8.193000000000001e-06, "loss": 0.1225, "step": 2734 }, { "epoch": 6.639125151883354, "grad_norm": 0.9120915532112122, "learning_rate": 8.196e-06, "loss": 0.1373, "step": 2735 }, { "epoch": 6.641555285540704, "grad_norm": 0.9756389260292053, "learning_rate": 8.199e-06, "loss": 0.1189, "step": 2736 }, { "epoch": 6.643985419198056, "grad_norm": 1.0340502262115479, "learning_rate": 
8.201999999999999e-06, "loss": 0.1568, "step": 2737 }, { "epoch": 6.646415552855407, "grad_norm": 1.010231375694275, "learning_rate": 8.205e-06, "loss": 0.137, "step": 2738 }, { "epoch": 6.648845686512758, "grad_norm": 1.0358771085739136, "learning_rate": 8.208e-06, "loss": 0.1525, "step": 2739 }, { "epoch": 6.65127582017011, "grad_norm": 0.914556622505188, "learning_rate": 8.211e-06, "loss": 0.1219, "step": 2740 }, { "epoch": 6.65370595382746, "grad_norm": 0.8555624485015869, "learning_rate": 8.214e-06, "loss": 0.0725, "step": 2741 }, { "epoch": 6.6561360874848114, "grad_norm": 1.0084551572799683, "learning_rate": 8.217e-06, "loss": 0.0986, "step": 2742 }, { "epoch": 6.658566221142163, "grad_norm": 0.8976922035217285, "learning_rate": 8.220000000000001e-06, "loss": 0.1248, "step": 2743 }, { "epoch": 6.660996354799514, "grad_norm": 0.9310399293899536, "learning_rate": 8.223e-06, "loss": 0.1217, "step": 2744 }, { "epoch": 6.6634264884568655, "grad_norm": 1.0086047649383545, "learning_rate": 8.226e-06, "loss": 0.1196, "step": 2745 }, { "epoch": 6.665856622114216, "grad_norm": 0.7527347803115845, "learning_rate": 8.229e-06, "loss": 0.0829, "step": 2746 }, { "epoch": 6.668286755771567, "grad_norm": 0.7560634016990662, "learning_rate": 8.232e-06, "loss": 0.0866, "step": 2747 }, { "epoch": 6.670716889428919, "grad_norm": 1.206551194190979, "learning_rate": 8.235000000000002e-06, "loss": 0.1142, "step": 2748 }, { "epoch": 6.67314702308627, "grad_norm": 1.0679631233215332, "learning_rate": 8.238e-06, "loss": 0.1401, "step": 2749 }, { "epoch": 6.675577156743621, "grad_norm": 0.8657529354095459, "learning_rate": 8.241e-06, "loss": 0.0917, "step": 2750 }, { "epoch": 6.678007290400972, "grad_norm": 1.1600580215454102, "learning_rate": 8.244e-06, "loss": 0.1126, "step": 2751 }, { "epoch": 6.680437424058323, "grad_norm": 0.9616500735282898, "learning_rate": 8.246999999999999e-06, "loss": 0.1185, "step": 2752 }, { "epoch": 6.682867557715674, "grad_norm": 0.9350200295448303, 
"learning_rate": 8.25e-06, "loss": 0.1146, "step": 2753 }, { "epoch": 6.685297691373026, "grad_norm": 1.2357869148254395, "learning_rate": 8.253e-06, "loss": 0.1455, "step": 2754 }, { "epoch": 6.687727825030377, "grad_norm": 1.968917965888977, "learning_rate": 8.256e-06, "loss": 0.1109, "step": 2755 }, { "epoch": 6.690157958687728, "grad_norm": 1.0182483196258545, "learning_rate": 8.259e-06, "loss": 0.108, "step": 2756 }, { "epoch": 6.692588092345079, "grad_norm": 0.82079017162323, "learning_rate": 8.262e-06, "loss": 0.1003, "step": 2757 }, { "epoch": 6.69501822600243, "grad_norm": 1.4876564741134644, "learning_rate": 8.265000000000001e-06, "loss": 0.1543, "step": 2758 }, { "epoch": 6.697448359659782, "grad_norm": 1.031934142112732, "learning_rate": 8.268000000000001e-06, "loss": 0.1279, "step": 2759 }, { "epoch": 6.699878493317133, "grad_norm": 0.9274235367774963, "learning_rate": 8.271000000000001e-06, "loss": 0.1245, "step": 2760 }, { "epoch": 6.702308626974483, "grad_norm": 1.2403210401535034, "learning_rate": 8.274e-06, "loss": 0.162, "step": 2761 }, { "epoch": 6.704738760631835, "grad_norm": 1.5497483015060425, "learning_rate": 8.276999999999999e-06, "loss": 0.1407, "step": 2762 }, { "epoch": 6.707168894289186, "grad_norm": 1.3189949989318848, "learning_rate": 8.28e-06, "loss": 0.1546, "step": 2763 }, { "epoch": 6.709599027946537, "grad_norm": 1.1431851387023926, "learning_rate": 8.283e-06, "loss": 0.1351, "step": 2764 }, { "epoch": 6.712029161603888, "grad_norm": 0.8902296423912048, "learning_rate": 8.286e-06, "loss": 0.101, "step": 2765 }, { "epoch": 6.714459295261239, "grad_norm": 1.3902565240859985, "learning_rate": 8.289e-06, "loss": 0.2027, "step": 2766 }, { "epoch": 6.7168894289185905, "grad_norm": 1.3561567068099976, "learning_rate": 8.292e-06, "loss": 0.1232, "step": 2767 }, { "epoch": 6.719319562575942, "grad_norm": 1.7476344108581543, "learning_rate": 8.295000000000001e-06, "loss": 0.1609, "step": 2768 }, { "epoch": 6.721749696233293, "grad_norm": 
1.5860440731048584, "learning_rate": 8.298000000000001e-06, "loss": 0.1464, "step": 2769 }, { "epoch": 6.724179829890644, "grad_norm": 1.9687498807907104, "learning_rate": 8.301e-06, "loss": 0.2109, "step": 2770 }, { "epoch": 6.726609963547995, "grad_norm": 2.9089808464050293, "learning_rate": 8.304e-06, "loss": 0.2352, "step": 2771 }, { "epoch": 6.729040097205346, "grad_norm": 2.6260886192321777, "learning_rate": 8.307e-06, "loss": 0.3251, "step": 2772 }, { "epoch": 6.731470230862698, "grad_norm": 1.5856819152832031, "learning_rate": 8.310000000000002e-06, "loss": 0.4717, "step": 2773 }, { "epoch": 6.733900364520048, "grad_norm": 1.8534902334213257, "learning_rate": 8.313e-06, "loss": 0.4446, "step": 2774 }, { "epoch": 6.7363304981773995, "grad_norm": 1.4357357025146484, "learning_rate": 8.316e-06, "loss": 0.3401, "step": 2775 }, { "epoch": 6.738760631834751, "grad_norm": 1.2835822105407715, "learning_rate": 8.319e-06, "loss": 0.3467, "step": 2776 }, { "epoch": 6.741190765492102, "grad_norm": 1.3268494606018066, "learning_rate": 8.322e-06, "loss": 0.2769, "step": 2777 }, { "epoch": 6.7436208991494535, "grad_norm": 1.0214462280273438, "learning_rate": 8.325e-06, "loss": 0.2255, "step": 2778 }, { "epoch": 6.746051032806804, "grad_norm": 0.9694271087646484, "learning_rate": 8.328e-06, "loss": 0.2218, "step": 2779 }, { "epoch": 6.748481166464155, "grad_norm": 0.9460226893424988, "learning_rate": 8.331e-06, "loss": 0.1642, "step": 2780 }, { "epoch": 6.750911300121507, "grad_norm": 1.0299861431121826, "learning_rate": 8.334e-06, "loss": 0.1695, "step": 2781 }, { "epoch": 6.753341433778858, "grad_norm": 0.6670804619789124, "learning_rate": 8.337e-06, "loss": 0.1189, "step": 2782 }, { "epoch": 6.755771567436209, "grad_norm": 1.1075067520141602, "learning_rate": 8.340000000000001e-06, "loss": 0.1274, "step": 2783 }, { "epoch": 6.75820170109356, "grad_norm": 0.7592532634735107, "learning_rate": 8.343000000000001e-06, "loss": 0.1167, "step": 2784 }, { "epoch": 
6.760631834750911, "grad_norm": 1.2473076581954956, "learning_rate": 8.346000000000001e-06, "loss": 0.1611, "step": 2785 }, { "epoch": 6.7630619684082625, "grad_norm": 0.8932879567146301, "learning_rate": 8.349e-06, "loss": 0.1584, "step": 2786 }, { "epoch": 6.765492102065614, "grad_norm": 0.9052709937095642, "learning_rate": 8.351999999999999e-06, "loss": 0.0958, "step": 2787 }, { "epoch": 6.767922235722965, "grad_norm": 0.9275968670845032, "learning_rate": 8.355e-06, "loss": 0.102, "step": 2788 }, { "epoch": 6.770352369380316, "grad_norm": 1.2302567958831787, "learning_rate": 8.358e-06, "loss": 0.1141, "step": 2789 }, { "epoch": 6.772782503037667, "grad_norm": 1.154685139656067, "learning_rate": 8.361e-06, "loss": 0.1423, "step": 2790 }, { "epoch": 6.775212636695018, "grad_norm": 0.9067649841308594, "learning_rate": 8.364e-06, "loss": 0.1175, "step": 2791 }, { "epoch": 6.77764277035237, "grad_norm": 1.1607609987258911, "learning_rate": 8.367e-06, "loss": 0.106, "step": 2792 }, { "epoch": 6.780072904009721, "grad_norm": 0.9886043071746826, "learning_rate": 8.370000000000001e-06, "loss": 0.1041, "step": 2793 }, { "epoch": 6.782503037667071, "grad_norm": 1.112146258354187, "learning_rate": 8.373000000000001e-06, "loss": 0.121, "step": 2794 }, { "epoch": 6.784933171324423, "grad_norm": 1.113596796989441, "learning_rate": 8.376e-06, "loss": 0.1421, "step": 2795 }, { "epoch": 6.787363304981774, "grad_norm": 1.0755425691604614, "learning_rate": 8.379e-06, "loss": 0.1362, "step": 2796 }, { "epoch": 6.789793438639125, "grad_norm": 0.7641807198524475, "learning_rate": 8.382e-06, "loss": 0.1004, "step": 2797 }, { "epoch": 6.792223572296477, "grad_norm": 0.8232617378234863, "learning_rate": 8.385e-06, "loss": 0.0957, "step": 2798 }, { "epoch": 6.794653705953827, "grad_norm": 1.0945161581039429, "learning_rate": 8.388e-06, "loss": 0.1181, "step": 2799 }, { "epoch": 6.797083839611179, "grad_norm": 0.8075026273727417, "learning_rate": 8.391e-06, "loss": 0.0836, "step": 2800 }, 
{ "epoch": 6.79951397326853, "grad_norm": 0.8767585754394531, "learning_rate": 8.394e-06, "loss": 0.0828, "step": 2801 }, { "epoch": 6.801944106925881, "grad_norm": 0.8418838977813721, "learning_rate": 8.397e-06, "loss": 0.1068, "step": 2802 }, { "epoch": 6.804374240583232, "grad_norm": 0.9635752439498901, "learning_rate": 8.400000000000001e-06, "loss": 0.1276, "step": 2803 }, { "epoch": 6.806804374240583, "grad_norm": 0.9590868353843689, "learning_rate": 8.403e-06, "loss": 0.093, "step": 2804 }, { "epoch": 6.809234507897934, "grad_norm": 1.1624865531921387, "learning_rate": 8.406e-06, "loss": 0.1295, "step": 2805 }, { "epoch": 6.811664641555286, "grad_norm": 1.5240155458450317, "learning_rate": 8.409e-06, "loss": 0.1279, "step": 2806 }, { "epoch": 6.814094775212637, "grad_norm": 1.1490607261657715, "learning_rate": 8.412e-06, "loss": 0.0941, "step": 2807 }, { "epoch": 6.8165249088699875, "grad_norm": 0.8667052984237671, "learning_rate": 8.415000000000002e-06, "loss": 0.0963, "step": 2808 }, { "epoch": 6.818955042527339, "grad_norm": 1.0073697566986084, "learning_rate": 8.418000000000001e-06, "loss": 0.1195, "step": 2809 }, { "epoch": 6.82138517618469, "grad_norm": 1.1218255758285522, "learning_rate": 8.421000000000001e-06, "loss": 0.1103, "step": 2810 }, { "epoch": 6.8238153098420415, "grad_norm": 1.2613219022750854, "learning_rate": 8.424e-06, "loss": 0.1351, "step": 2811 }, { "epoch": 6.826245443499392, "grad_norm": 1.0270333290100098, "learning_rate": 8.426999999999999e-06, "loss": 0.1285, "step": 2812 }, { "epoch": 6.828675577156743, "grad_norm": 1.303863763809204, "learning_rate": 8.43e-06, "loss": 0.1129, "step": 2813 }, { "epoch": 6.831105710814095, "grad_norm": 1.0920381546020508, "learning_rate": 8.433e-06, "loss": 0.1204, "step": 2814 }, { "epoch": 6.833535844471446, "grad_norm": 1.026807188987732, "learning_rate": 8.436e-06, "loss": 0.1057, "step": 2815 }, { "epoch": 6.835965978128797, "grad_norm": 1.0310567617416382, "learning_rate": 8.439e-06, "loss": 
0.1155, "step": 2816 }, { "epoch": 6.838396111786148, "grad_norm": 1.4148989915847778, "learning_rate": 8.442e-06, "loss": 0.1641, "step": 2817 }, { "epoch": 6.840826245443499, "grad_norm": 1.2654485702514648, "learning_rate": 8.445e-06, "loss": 0.1154, "step": 2818 }, { "epoch": 6.8432563791008505, "grad_norm": 1.7739887237548828, "learning_rate": 8.448000000000001e-06, "loss": 0.192, "step": 2819 }, { "epoch": 6.845686512758202, "grad_norm": 1.4379448890686035, "learning_rate": 8.451000000000001e-06, "loss": 0.1498, "step": 2820 }, { "epoch": 6.848116646415553, "grad_norm": 1.6476082801818848, "learning_rate": 8.454e-06, "loss": 0.1651, "step": 2821 }, { "epoch": 6.850546780072904, "grad_norm": 2.5447959899902344, "learning_rate": 8.457e-06, "loss": 0.2615, "step": 2822 }, { "epoch": 6.852976913730255, "grad_norm": 1.6710773706436157, "learning_rate": 8.459999999999999e-06, "loss": 0.4624, "step": 2823 }, { "epoch": 6.855407047387606, "grad_norm": 1.0699023008346558, "learning_rate": 8.463e-06, "loss": 0.4066, "step": 2824 }, { "epoch": 6.857837181044958, "grad_norm": 0.9316319823265076, "learning_rate": 8.466e-06, "loss": 0.3294, "step": 2825 }, { "epoch": 6.860267314702309, "grad_norm": 1.0694053173065186, "learning_rate": 8.469e-06, "loss": 0.3297, "step": 2826 }, { "epoch": 6.8626974483596594, "grad_norm": 1.2013037204742432, "learning_rate": 8.472e-06, "loss": 0.3118, "step": 2827 }, { "epoch": 6.865127582017011, "grad_norm": 0.9354962110519409, "learning_rate": 8.475e-06, "loss": 0.2531, "step": 2828 }, { "epoch": 6.867557715674362, "grad_norm": 1.0207432508468628, "learning_rate": 8.478e-06, "loss": 0.2103, "step": 2829 }, { "epoch": 6.8699878493317135, "grad_norm": 0.8556438088417053, "learning_rate": 8.481e-06, "loss": 0.1802, "step": 2830 }, { "epoch": 6.872417982989065, "grad_norm": 0.9612013101577759, "learning_rate": 8.484e-06, "loss": 0.1324, "step": 2831 }, { "epoch": 6.874848116646415, "grad_norm": 1.067575454711914, "learning_rate": 8.487e-06, 
"loss": 0.1277, "step": 2832 }, { "epoch": 6.877278250303767, "grad_norm": 1.2376412153244019, "learning_rate": 8.49e-06, "loss": 0.1384, "step": 2833 }, { "epoch": 6.879708383961118, "grad_norm": 1.098117709159851, "learning_rate": 8.493000000000002e-06, "loss": 0.1439, "step": 2834 }, { "epoch": 6.882138517618469, "grad_norm": 0.856576144695282, "learning_rate": 8.496e-06, "loss": 0.1321, "step": 2835 }, { "epoch": 6.884568651275821, "grad_norm": 0.8298060297966003, "learning_rate": 8.499e-06, "loss": 0.0917, "step": 2836 }, { "epoch": 6.886998784933171, "grad_norm": 1.0139598846435547, "learning_rate": 8.502e-06, "loss": 0.1592, "step": 2837 }, { "epoch": 6.889428918590522, "grad_norm": 1.379341721534729, "learning_rate": 8.504999999999999e-06, "loss": 0.1631, "step": 2838 }, { "epoch": 6.891859052247874, "grad_norm": 0.9342786073684692, "learning_rate": 8.508e-06, "loss": 0.0964, "step": 2839 }, { "epoch": 6.894289185905225, "grad_norm": 1.1103047132492065, "learning_rate": 8.511e-06, "loss": 0.1369, "step": 2840 }, { "epoch": 6.896719319562576, "grad_norm": 0.7830293774604797, "learning_rate": 8.514e-06, "loss": 0.0794, "step": 2841 }, { "epoch": 6.899149453219927, "grad_norm": 0.8479546308517456, "learning_rate": 8.517e-06, "loss": 0.0921, "step": 2842 }, { "epoch": 6.901579586877278, "grad_norm": 0.8597145676612854, "learning_rate": 8.52e-06, "loss": 0.1174, "step": 2843 }, { "epoch": 6.90400972053463, "grad_norm": 0.9553868174552917, "learning_rate": 8.523000000000001e-06, "loss": 0.0971, "step": 2844 }, { "epoch": 6.906439854191981, "grad_norm": 1.1766971349716187, "learning_rate": 8.526000000000001e-06, "loss": 0.1315, "step": 2845 }, { "epoch": 6.908869987849331, "grad_norm": 1.0683263540267944, "learning_rate": 8.529e-06, "loss": 0.1265, "step": 2846 }, { "epoch": 6.911300121506683, "grad_norm": 1.1457170248031616, "learning_rate": 8.532e-06, "loss": 0.0965, "step": 2847 }, { "epoch": 6.913730255164034, "grad_norm": 0.8530668020248413, "learning_rate": 
8.534999999999999e-06, "loss": 0.1014, "step": 2848 }, { "epoch": 6.916160388821385, "grad_norm": 0.8186802268028259, "learning_rate": 8.538e-06, "loss": 0.1139, "step": 2849 }, { "epoch": 6.918590522478736, "grad_norm": 1.112242579460144, "learning_rate": 8.541e-06, "loss": 0.1128, "step": 2850 }, { "epoch": 6.921020656136087, "grad_norm": 1.225403904914856, "learning_rate": 8.544e-06, "loss": 0.1297, "step": 2851 }, { "epoch": 6.9234507897934385, "grad_norm": 0.9512274861335754, "learning_rate": 8.547e-06, "loss": 0.108, "step": 2852 }, { "epoch": 6.92588092345079, "grad_norm": 1.2386040687561035, "learning_rate": 8.55e-06, "loss": 0.11, "step": 2853 }, { "epoch": 6.928311057108141, "grad_norm": 1.2915568351745605, "learning_rate": 8.553000000000001e-06, "loss": 0.1241, "step": 2854 }, { "epoch": 6.930741190765492, "grad_norm": 1.1006078720092773, "learning_rate": 8.556e-06, "loss": 0.1079, "step": 2855 }, { "epoch": 6.933171324422843, "grad_norm": 1.4069461822509766, "learning_rate": 8.559e-06, "loss": 0.1342, "step": 2856 }, { "epoch": 6.935601458080194, "grad_norm": 0.6980803608894348, "learning_rate": 8.562e-06, "loss": 0.0679, "step": 2857 }, { "epoch": 6.938031591737546, "grad_norm": 1.1904902458190918, "learning_rate": 8.565e-06, "loss": 0.1369, "step": 2858 }, { "epoch": 6.940461725394897, "grad_norm": 1.5737961530685425, "learning_rate": 8.568000000000002e-06, "loss": 0.1477, "step": 2859 }, { "epoch": 6.9428918590522475, "grad_norm": 1.1017532348632812, "learning_rate": 8.571e-06, "loss": 0.0966, "step": 2860 }, { "epoch": 6.945321992709599, "grad_norm": 1.400254487991333, "learning_rate": 8.574e-06, "loss": 0.1381, "step": 2861 }, { "epoch": 6.94775212636695, "grad_norm": 1.1474223136901855, "learning_rate": 8.577e-06, "loss": 0.1398, "step": 2862 }, { "epoch": 6.9501822600243015, "grad_norm": 1.1342319250106812, "learning_rate": 8.58e-06, "loss": 0.1155, "step": 2863 }, { "epoch": 6.952612393681653, "grad_norm": 1.1977369785308838, "learning_rate": 
8.583e-06, "loss": 0.1203, "step": 2864 }, { "epoch": 6.955042527339003, "grad_norm": 1.4756098985671997, "learning_rate": 8.586e-06, "loss": 0.1156, "step": 2865 }, { "epoch": 6.957472660996355, "grad_norm": 2.2213549613952637, "learning_rate": 8.589e-06, "loss": 0.1659, "step": 2866 }, { "epoch": 6.959902794653706, "grad_norm": 1.2403497695922852, "learning_rate": 8.592e-06, "loss": 0.0968, "step": 2867 }, { "epoch": 6.962332928311057, "grad_norm": 1.3309614658355713, "learning_rate": 8.595e-06, "loss": 0.1271, "step": 2868 }, { "epoch": 6.964763061968409, "grad_norm": 1.8903179168701172, "learning_rate": 8.598000000000001e-06, "loss": 0.1598, "step": 2869 }, { "epoch": 6.967193195625759, "grad_norm": 1.6888636350631714, "learning_rate": 8.601000000000001e-06, "loss": 0.1655, "step": 2870 }, { "epoch": 6.9696233292831105, "grad_norm": 1.7504202127456665, "learning_rate": 8.604000000000001e-06, "loss": 0.1601, "step": 2871 }, { "epoch": 6.972053462940462, "grad_norm": 2.3783748149871826, "learning_rate": 8.606999999999999e-06, "loss": 0.188, "step": 2872 }, { "epoch": 6.974483596597813, "grad_norm": 1.8887375593185425, "learning_rate": 8.609999999999999e-06, "loss": 0.3745, "step": 2873 }, { "epoch": 6.9769137302551645, "grad_norm": 0.9919509887695312, "learning_rate": 8.613e-06, "loss": 0.1928, "step": 2874 }, { "epoch": 6.979343863912515, "grad_norm": 0.9585666060447693, "learning_rate": 8.616e-06, "loss": 0.1787, "step": 2875 }, { "epoch": 6.981773997569866, "grad_norm": 1.4107111692428589, "learning_rate": 8.619e-06, "loss": 0.1105, "step": 2876 }, { "epoch": 6.984204131227218, "grad_norm": 0.941777229309082, "learning_rate": 8.622e-06, "loss": 0.0937, "step": 2877 }, { "epoch": 6.986634264884569, "grad_norm": 1.066224455833435, "learning_rate": 8.625e-06, "loss": 0.1138, "step": 2878 }, { "epoch": 6.98906439854192, "grad_norm": 0.8554605841636658, "learning_rate": 8.628000000000001e-06, "loss": 0.1108, "step": 2879 }, { "epoch": 6.991494532199271, 
"grad_norm": 0.9030752778053284, "learning_rate": 8.631000000000001e-06, "loss": 0.098, "step": 2880 }, { "epoch": 6.993924665856622, "grad_norm": 0.9093160629272461, "learning_rate": 8.634e-06, "loss": 0.1027, "step": 2881 }, { "epoch": 6.996354799513973, "grad_norm": 1.1995387077331543, "learning_rate": 8.637e-06, "loss": 0.1357, "step": 2882 }, { "epoch": 6.998784933171325, "grad_norm": 1.5893195867538452, "learning_rate": 8.64e-06, "loss": 0.1459, "step": 2883 }, { "epoch": 7.0, "grad_norm": 1.4203251600265503, "learning_rate": 8.643e-06, "loss": 0.0616, "step": 2884 }, { "epoch": 7.002430133657351, "grad_norm": 5.856880187988281, "learning_rate": 8.646e-06, "loss": 0.4961, "step": 2885 }, { "epoch": 7.004860267314703, "grad_norm": 1.4214622974395752, "learning_rate": 8.649e-06, "loss": 0.4153, "step": 2886 }, { "epoch": 7.007290400972053, "grad_norm": 1.5999938249588013, "learning_rate": 8.652e-06, "loss": 0.3318, "step": 2887 }, { "epoch": 7.0097205346294045, "grad_norm": 2.072356700897217, "learning_rate": 8.655e-06, "loss": 0.3178, "step": 2888 }, { "epoch": 7.012150668286756, "grad_norm": 1.1376198530197144, "learning_rate": 8.658e-06, "loss": 0.2482, "step": 2889 }, { "epoch": 7.014580801944107, "grad_norm": 1.4822895526885986, "learning_rate": 8.661e-06, "loss": 0.1997, "step": 2890 }, { "epoch": 7.0170109356014585, "grad_norm": 0.9793977737426758, "learning_rate": 8.664e-06, "loss": 0.1983, "step": 2891 }, { "epoch": 7.019441069258809, "grad_norm": 1.087731122970581, "learning_rate": 8.667e-06, "loss": 0.1441, "step": 2892 }, { "epoch": 7.02187120291616, "grad_norm": 1.8440912961959839, "learning_rate": 8.67e-06, "loss": 0.1915, "step": 2893 }, { "epoch": 7.024301336573512, "grad_norm": 1.076706051826477, "learning_rate": 8.673000000000001e-06, "loss": 0.1367, "step": 2894 }, { "epoch": 7.026731470230863, "grad_norm": 1.4016008377075195, "learning_rate": 8.676000000000001e-06, "loss": 0.1162, "step": 2895 }, { "epoch": 7.029161603888213, "grad_norm": 
1.0298744440078735, "learning_rate": 8.679000000000001e-06, "loss": 0.1127, "step": 2896 }, { "epoch": 7.031591737545565, "grad_norm": 1.1222773790359497, "learning_rate": 8.682e-06, "loss": 0.1058, "step": 2897 }, { "epoch": 7.034021871202916, "grad_norm": 0.9510758519172668, "learning_rate": 8.684999999999999e-06, "loss": 0.1276, "step": 2898 }, { "epoch": 7.0364520048602675, "grad_norm": 0.7224106788635254, "learning_rate": 8.688e-06, "loss": 0.0965, "step": 2899 }, { "epoch": 7.038882138517619, "grad_norm": 0.9061241149902344, "learning_rate": 8.691e-06, "loss": 0.1001, "step": 2900 }, { "epoch": 7.041312272174969, "grad_norm": 1.1837884187698364, "learning_rate": 8.694e-06, "loss": 0.1206, "step": 2901 }, { "epoch": 7.043742405832321, "grad_norm": 1.0564559698104858, "learning_rate": 8.697e-06, "loss": 0.1121, "step": 2902 }, { "epoch": 7.046172539489672, "grad_norm": 0.7840033173561096, "learning_rate": 8.7e-06, "loss": 0.0723, "step": 2903 }, { "epoch": 7.048602673147023, "grad_norm": 0.8745826482772827, "learning_rate": 8.703000000000001e-06, "loss": 0.092, "step": 2904 }, { "epoch": 7.051032806804375, "grad_norm": 0.6893240809440613, "learning_rate": 8.706000000000001e-06, "loss": 0.0804, "step": 2905 }, { "epoch": 7.053462940461725, "grad_norm": 1.2710423469543457, "learning_rate": 8.709e-06, "loss": 0.0956, "step": 2906 }, { "epoch": 7.055893074119076, "grad_norm": 0.8211115002632141, "learning_rate": 8.712e-06, "loss": 0.0777, "step": 2907 }, { "epoch": 7.058323207776428, "grad_norm": 0.9617464542388916, "learning_rate": 8.715e-06, "loss": 0.0831, "step": 2908 }, { "epoch": 7.060753341433779, "grad_norm": 0.8946683406829834, "learning_rate": 8.718e-06, "loss": 0.0887, "step": 2909 }, { "epoch": 7.06318347509113, "grad_norm": 0.9590336084365845, "learning_rate": 8.721e-06, "loss": 0.1165, "step": 2910 }, { "epoch": 7.065613608748481, "grad_norm": 1.164517879486084, "learning_rate": 8.724e-06, "loss": 0.0878, "step": 2911 }, { "epoch": 7.068043742405832, 
"grad_norm": 1.169183611869812, "learning_rate": 8.727e-06, "loss": 0.1098, "step": 2912 }, { "epoch": 7.070473876063184, "grad_norm": 0.9208186864852905, "learning_rate": 8.73e-06, "loss": 0.0733, "step": 2913 }, { "epoch": 7.072904009720535, "grad_norm": 1.0236953496932983, "learning_rate": 8.733000000000001e-06, "loss": 0.1066, "step": 2914 }, { "epoch": 7.075334143377885, "grad_norm": 1.307508111000061, "learning_rate": 8.736e-06, "loss": 0.1128, "step": 2915 }, { "epoch": 7.077764277035237, "grad_norm": 1.0308222770690918, "learning_rate": 8.739e-06, "loss": 0.0726, "step": 2916 }, { "epoch": 7.080194410692588, "grad_norm": 0.9731301665306091, "learning_rate": 8.742e-06, "loss": 0.0943, "step": 2917 }, { "epoch": 7.082624544349939, "grad_norm": 1.2564448118209839, "learning_rate": 8.745e-06, "loss": 0.113, "step": 2918 }, { "epoch": 7.085054678007291, "grad_norm": 0.8944262862205505, "learning_rate": 8.748000000000002e-06, "loss": 0.091, "step": 2919 }, { "epoch": 7.087484811664641, "grad_norm": 1.1345443725585938, "learning_rate": 8.751000000000001e-06, "loss": 0.1218, "step": 2920 }, { "epoch": 7.0899149453219925, "grad_norm": 1.0261553525924683, "learning_rate": 8.754e-06, "loss": 0.1574, "step": 2921 }, { "epoch": 7.092345078979344, "grad_norm": 0.9502443671226501, "learning_rate": 8.757e-06, "loss": 0.0923, "step": 2922 }, { "epoch": 7.094775212636695, "grad_norm": 1.1530680656433105, "learning_rate": 8.759999999999999e-06, "loss": 0.0949, "step": 2923 }, { "epoch": 7.0972053462940465, "grad_norm": 2.3693599700927734, "learning_rate": 8.763e-06, "loss": 0.1255, "step": 2924 }, { "epoch": 7.099635479951397, "grad_norm": 0.9894478917121887, "learning_rate": 8.766e-06, "loss": 0.1026, "step": 2925 }, { "epoch": 7.102065613608748, "grad_norm": 1.162497878074646, "learning_rate": 8.769e-06, "loss": 0.1108, "step": 2926 }, { "epoch": 7.1044957472661, "grad_norm": 0.8137326240539551, "learning_rate": 8.772e-06, "loss": 0.0797, "step": 2927 }, { "epoch": 
7.106925880923451, "grad_norm": 1.4459730386734009, "learning_rate": 8.775e-06, "loss": 0.1482, "step": 2928 }, { "epoch": 7.109356014580802, "grad_norm": 1.5572175979614258, "learning_rate": 8.778000000000001e-06, "loss": 0.1381, "step": 2929 }, { "epoch": 7.111786148238153, "grad_norm": 0.9615602493286133, "learning_rate": 8.781000000000001e-06, "loss": 0.0981, "step": 2930 }, { "epoch": 7.114216281895504, "grad_norm": 1.8974305391311646, "learning_rate": 8.784000000000001e-06, "loss": 0.1405, "step": 2931 }, { "epoch": 7.1166464155528555, "grad_norm": 1.543992280960083, "learning_rate": 8.787e-06, "loss": 0.1638, "step": 2932 }, { "epoch": 7.119076549210207, "grad_norm": 1.7997047901153564, "learning_rate": 8.79e-06, "loss": 0.1516, "step": 2933 }, { "epoch": 7.121506682867557, "grad_norm": 3.8620522022247314, "learning_rate": 8.793e-06, "loss": 0.3206, "step": 2934 }, { "epoch": 7.123936816524909, "grad_norm": 2.319304943084717, "learning_rate": 8.796e-06, "loss": 0.4698, "step": 2935 }, { "epoch": 7.12636695018226, "grad_norm": 1.2053827047348022, "learning_rate": 8.799e-06, "loss": 0.3719, "step": 2936 }, { "epoch": 7.128797083839611, "grad_norm": 0.9871417880058289, "learning_rate": 8.802e-06, "loss": 0.2871, "step": 2937 }, { "epoch": 7.131227217496963, "grad_norm": 1.1194177865982056, "learning_rate": 8.805e-06, "loss": 0.2964, "step": 2938 }, { "epoch": 7.133657351154313, "grad_norm": 1.4288568496704102, "learning_rate": 8.808000000000001e-06, "loss": 0.2319, "step": 2939 }, { "epoch": 7.136087484811664, "grad_norm": 1.1183284521102905, "learning_rate": 8.811000000000001e-06, "loss": 0.2325, "step": 2940 }, { "epoch": 7.138517618469016, "grad_norm": 0.9053753018379211, "learning_rate": 8.814e-06, "loss": 0.1599, "step": 2941 }, { "epoch": 7.140947752126367, "grad_norm": 0.7984399199485779, "learning_rate": 8.817e-06, "loss": 0.1235, "step": 2942 }, { "epoch": 7.1433778857837185, "grad_norm": 0.8424935936927795, "learning_rate": 8.82e-06, "loss": 0.1187, 
"step": 2943 }, { "epoch": 7.145808019441069, "grad_norm": 0.6819348931312561, "learning_rate": 8.823e-06, "loss": 0.153, "step": 2944 }, { "epoch": 7.14823815309842, "grad_norm": 1.5324777364730835, "learning_rate": 8.826000000000002e-06, "loss": 0.1412, "step": 2945 }, { "epoch": 7.150668286755772, "grad_norm": 0.6884765625, "learning_rate": 8.829e-06, "loss": 0.1079, "step": 2946 }, { "epoch": 7.153098420413123, "grad_norm": 0.7203267812728882, "learning_rate": 8.832e-06, "loss": 0.0895, "step": 2947 }, { "epoch": 7.155528554070474, "grad_norm": 0.6836177706718445, "learning_rate": 8.835e-06, "loss": 0.0766, "step": 2948 }, { "epoch": 7.157958687727825, "grad_norm": 0.9848700761795044, "learning_rate": 8.837999999999999e-06, "loss": 0.1052, "step": 2949 }, { "epoch": 7.160388821385176, "grad_norm": 1.3817994594573975, "learning_rate": 8.841e-06, "loss": 0.1192, "step": 2950 }, { "epoch": 7.162818955042527, "grad_norm": 0.9070823788642883, "learning_rate": 8.844e-06, "loss": 0.0678, "step": 2951 }, { "epoch": 7.165249088699879, "grad_norm": 0.8449195027351379, "learning_rate": 8.847e-06, "loss": 0.125, "step": 2952 }, { "epoch": 7.167679222357229, "grad_norm": 0.7241218090057373, "learning_rate": 8.85e-06, "loss": 0.0774, "step": 2953 }, { "epoch": 7.1701093560145805, "grad_norm": 0.8624741435050964, "learning_rate": 8.853e-06, "loss": 0.1112, "step": 2954 }, { "epoch": 7.172539489671932, "grad_norm": 0.8470218777656555, "learning_rate": 8.856000000000001e-06, "loss": 0.0974, "step": 2955 }, { "epoch": 7.174969623329283, "grad_norm": 1.2881578207015991, "learning_rate": 8.859000000000001e-06, "loss": 0.1249, "step": 2956 }, { "epoch": 7.177399756986635, "grad_norm": 1.1506344079971313, "learning_rate": 8.862000000000001e-06, "loss": 0.0864, "step": 2957 }, { "epoch": 7.179829890643985, "grad_norm": 0.9672608375549316, "learning_rate": 8.864999999999999e-06, "loss": 0.1031, "step": 2958 }, { "epoch": 7.182260024301336, "grad_norm": 0.6598014831542969, 
"learning_rate": 8.867999999999999e-06, "loss": 0.0601, "step": 2959 }, { "epoch": 7.184690157958688, "grad_norm": 1.1408588886260986, "learning_rate": 8.871e-06, "loss": 0.1317, "step": 2960 }, { "epoch": 7.187120291616039, "grad_norm": 0.7867428660392761, "learning_rate": 8.874e-06, "loss": 0.0791, "step": 2961 }, { "epoch": 7.18955042527339, "grad_norm": 1.0272084474563599, "learning_rate": 8.877e-06, "loss": 0.1117, "step": 2962 }, { "epoch": 7.191980558930741, "grad_norm": 0.9876877665519714, "learning_rate": 8.88e-06, "loss": 0.0927, "step": 2963 }, { "epoch": 7.194410692588092, "grad_norm": 1.1509242057800293, "learning_rate": 8.883e-06, "loss": 0.1045, "step": 2964 }, { "epoch": 7.1968408262454435, "grad_norm": 0.9499573707580566, "learning_rate": 8.886000000000001e-06, "loss": 0.0918, "step": 2965 }, { "epoch": 7.199270959902795, "grad_norm": 1.175283670425415, "learning_rate": 8.889e-06, "loss": 0.1299, "step": 2966 }, { "epoch": 7.201701093560146, "grad_norm": 0.9863525032997131, "learning_rate": 8.892e-06, "loss": 0.095, "step": 2967 }, { "epoch": 7.204131227217497, "grad_norm": 0.7780662775039673, "learning_rate": 8.895e-06, "loss": 0.0711, "step": 2968 }, { "epoch": 7.206561360874848, "grad_norm": 0.9458411931991577, "learning_rate": 8.898e-06, "loss": 0.0873, "step": 2969 }, { "epoch": 7.208991494532199, "grad_norm": 1.6590903997421265, "learning_rate": 8.901e-06, "loss": 0.1152, "step": 2970 }, { "epoch": 7.211421628189551, "grad_norm": 0.7419745326042175, "learning_rate": 8.904e-06, "loss": 0.0733, "step": 2971 }, { "epoch": 7.213851761846902, "grad_norm": 1.8009010553359985, "learning_rate": 8.907e-06, "loss": 0.1225, "step": 2972 }, { "epoch": 7.2162818955042525, "grad_norm": 1.1042439937591553, "learning_rate": 8.91e-06, "loss": 0.1283, "step": 2973 }, { "epoch": 7.218712029161604, "grad_norm": 0.9816156029701233, "learning_rate": 8.913e-06, "loss": 0.1062, "step": 2974 }, { "epoch": 7.221142162818955, "grad_norm": 1.4051513671875, 
"learning_rate": 8.916e-06, "loss": 0.1222, "step": 2975 }, { "epoch": 7.2235722964763065, "grad_norm": 1.1887025833129883, "learning_rate": 8.919e-06, "loss": 0.1266, "step": 2976 }, { "epoch": 7.226002430133657, "grad_norm": 1.2337183952331543, "learning_rate": 8.922e-06, "loss": 0.1099, "step": 2977 }, { "epoch": 7.228432563791008, "grad_norm": 1.154975175857544, "learning_rate": 8.925e-06, "loss": 0.1157, "step": 2978 }, { "epoch": 7.23086269744836, "grad_norm": 1.1439259052276611, "learning_rate": 8.928e-06, "loss": 0.1207, "step": 2979 }, { "epoch": 7.233292831105711, "grad_norm": 1.1952955722808838, "learning_rate": 8.931000000000001e-06, "loss": 0.1285, "step": 2980 }, { "epoch": 7.235722964763062, "grad_norm": 2.1316328048706055, "learning_rate": 8.934000000000001e-06, "loss": 0.1358, "step": 2981 }, { "epoch": 7.238153098420413, "grad_norm": 1.6464402675628662, "learning_rate": 8.937000000000001e-06, "loss": 0.1714, "step": 2982 }, { "epoch": 7.240583232077764, "grad_norm": 1.4761812686920166, "learning_rate": 8.939999999999999e-06, "loss": 0.163, "step": 2983 }, { "epoch": 7.2430133657351154, "grad_norm": 2.2565860748291016, "learning_rate": 8.942999999999999e-06, "loss": 0.2656, "step": 2984 }, { "epoch": 7.245443499392467, "grad_norm": 1.422861099243164, "learning_rate": 8.946e-06, "loss": 0.4376, "step": 2985 }, { "epoch": 7.247873633049818, "grad_norm": 2.0698020458221436, "learning_rate": 8.949e-06, "loss": 0.4202, "step": 2986 }, { "epoch": 7.250303766707169, "grad_norm": 1.1127262115478516, "learning_rate": 8.952e-06, "loss": 0.3118, "step": 2987 }, { "epoch": 7.25273390036452, "grad_norm": 0.9697991013526917, "learning_rate": 8.955e-06, "loss": 0.2546, "step": 2988 }, { "epoch": 7.255164034021871, "grad_norm": 1.1440421342849731, "learning_rate": 8.958e-06, "loss": 0.2748, "step": 2989 }, { "epoch": 7.257594167679223, "grad_norm": 1.2681821584701538, "learning_rate": 8.961000000000001e-06, "loss": 0.2585, "step": 2990 }, { "epoch": 
7.260024301336573, "grad_norm": 0.7745319604873657, "learning_rate": 8.964000000000001e-06, "loss": 0.1728, "step": 2991 }, { "epoch": 7.262454434993924, "grad_norm": 0.9151167869567871, "learning_rate": 8.967e-06, "loss": 0.1532, "step": 2992 }, { "epoch": 7.264884568651276, "grad_norm": 1.122431755065918, "learning_rate": 8.97e-06, "loss": 0.1457, "step": 2993 }, { "epoch": 7.267314702308627, "grad_norm": 0.9717774987220764, "learning_rate": 8.973e-06, "loss": 0.103, "step": 2994 }, { "epoch": 7.269744835965978, "grad_norm": 0.9632847905158997, "learning_rate": 8.976e-06, "loss": 0.0932, "step": 2995 }, { "epoch": 7.272174969623329, "grad_norm": 0.9347562789916992, "learning_rate": 8.979e-06, "loss": 0.1252, "step": 2996 }, { "epoch": 7.27460510328068, "grad_norm": 0.9451033473014832, "learning_rate": 8.982e-06, "loss": 0.1013, "step": 2997 }, { "epoch": 7.277035236938032, "grad_norm": 0.9914412498474121, "learning_rate": 8.985e-06, "loss": 0.1262, "step": 2998 }, { "epoch": 7.279465370595383, "grad_norm": 0.8981965184211731, "learning_rate": 8.988e-06, "loss": 0.1316, "step": 2999 }, { "epoch": 7.281895504252734, "grad_norm": 1.2751671075820923, "learning_rate": 8.991e-06, "loss": 0.099, "step": 3000 }, { "epoch": 7.281895504252734, "eval_cer": 0.10641715524857441, "eval_loss": 0.3206274211406708, "eval_runtime": 8.0838, "eval_samples_per_second": 12.494, "eval_steps_per_second": 0.495, "eval_wer": 0.33289241622574955, "step": 3000 }, { "epoch": 7.284325637910085, "grad_norm": 0.6211936473846436, "learning_rate": 8.994e-06, "loss": 0.0823, "step": 3001 }, { "epoch": 7.286755771567436, "grad_norm": 0.8595839738845825, "learning_rate": 8.997e-06, "loss": 0.1219, "step": 3002 }, { "epoch": 7.289185905224787, "grad_norm": 0.8607363700866699, "learning_rate": 9e-06, "loss": 0.0864, "step": 3003 }, { "epoch": 7.291616038882139, "grad_norm": 0.8218222856521606, "learning_rate": 9.003e-06, "loss": 0.095, "step": 3004 }, { "epoch": 7.29404617253949, "grad_norm": 
0.8917497992515564, "learning_rate": 9.006000000000002e-06, "loss": 0.0929, "step": 3005 }, { "epoch": 7.2964763061968405, "grad_norm": 0.656795859336853, "learning_rate": 9.009000000000001e-06, "loss": 0.0785, "step": 3006 }, { "epoch": 7.298906439854192, "grad_norm": 0.873046875, "learning_rate": 9.012e-06, "loss": 0.0888, "step": 3007 }, { "epoch": 7.301336573511543, "grad_norm": 1.1846240758895874, "learning_rate": 9.015e-06, "loss": 0.1246, "step": 3008 }, { "epoch": 7.3037667071688945, "grad_norm": 0.7245266437530518, "learning_rate": 9.017999999999999e-06, "loss": 0.0698, "step": 3009 }, { "epoch": 7.306196840826246, "grad_norm": 0.732732355594635, "learning_rate": 9.021e-06, "loss": 0.0809, "step": 3010 }, { "epoch": 7.308626974483596, "grad_norm": 0.7844884395599365, "learning_rate": 9.024e-06, "loss": 0.1282, "step": 3011 }, { "epoch": 7.311057108140948, "grad_norm": 0.8465268015861511, "learning_rate": 9.027e-06, "loss": 0.0908, "step": 3012 }, { "epoch": 7.313487241798299, "grad_norm": 1.2491686344146729, "learning_rate": 9.03e-06, "loss": 0.1257, "step": 3013 }, { "epoch": 7.31591737545565, "grad_norm": 0.9567793607711792, "learning_rate": 9.033e-06, "loss": 0.0755, "step": 3014 }, { "epoch": 7.318347509113001, "grad_norm": 1.014614224433899, "learning_rate": 9.036000000000001e-06, "loss": 0.0734, "step": 3015 }, { "epoch": 7.320777642770352, "grad_norm": 1.5539456605911255, "learning_rate": 9.039000000000001e-06, "loss": 0.1215, "step": 3016 }, { "epoch": 7.3232077764277035, "grad_norm": 0.9245264530181885, "learning_rate": 9.042e-06, "loss": 0.056, "step": 3017 }, { "epoch": 7.325637910085055, "grad_norm": 0.957251787185669, "learning_rate": 9.045e-06, "loss": 0.1004, "step": 3018 }, { "epoch": 7.328068043742406, "grad_norm": 1.2421276569366455, "learning_rate": 9.048e-06, "loss": 0.1298, "step": 3019 }, { "epoch": 7.330498177399757, "grad_norm": 0.8746510744094849, "learning_rate": 9.051e-06, "loss": 0.0983, "step": 3020 }, { "epoch": 
7.332928311057108, "grad_norm": 1.5925912857055664, "learning_rate": 9.054e-06, "loss": 0.1166, "step": 3021 }, { "epoch": 7.335358444714459, "grad_norm": 0.880885899066925, "learning_rate": 9.057e-06, "loss": 0.0949, "step": 3022 }, { "epoch": 7.337788578371811, "grad_norm": 0.9118296504020691, "learning_rate": 9.06e-06, "loss": 0.08, "step": 3023 }, { "epoch": 7.340218712029162, "grad_norm": 0.9778973460197449, "learning_rate": 9.063e-06, "loss": 0.0896, "step": 3024 }, { "epoch": 7.342648845686512, "grad_norm": 1.2184462547302246, "learning_rate": 9.066000000000001e-06, "loss": 0.1115, "step": 3025 }, { "epoch": 7.345078979343864, "grad_norm": 1.3282058238983154, "learning_rate": 9.069e-06, "loss": 0.1207, "step": 3026 }, { "epoch": 7.347509113001215, "grad_norm": 1.2112979888916016, "learning_rate": 9.072e-06, "loss": 0.1021, "step": 3027 }, { "epoch": 7.3499392466585665, "grad_norm": 2.2904045581817627, "learning_rate": 9.075e-06, "loss": 0.1514, "step": 3028 }, { "epoch": 7.352369380315917, "grad_norm": 1.296786904335022, "learning_rate": 9.078e-06, "loss": 0.1164, "step": 3029 }, { "epoch": 7.354799513973268, "grad_norm": 1.448795199394226, "learning_rate": 9.081000000000002e-06, "loss": 0.1479, "step": 3030 }, { "epoch": 7.35722964763062, "grad_norm": 1.8632888793945312, "learning_rate": 9.084000000000001e-06, "loss": 0.1875, "step": 3031 }, { "epoch": 7.359659781287971, "grad_norm": 1.2794158458709717, "learning_rate": 9.087e-06, "loss": 0.1324, "step": 3032 }, { "epoch": 7.362089914945322, "grad_norm": 1.8570164442062378, "learning_rate": 9.09e-06, "loss": 0.1633, "step": 3033 }, { "epoch": 7.364520048602673, "grad_norm": 2.111524820327759, "learning_rate": 9.093e-06, "loss": 0.1806, "step": 3034 }, { "epoch": 7.366950182260024, "grad_norm": 2.781816005706787, "learning_rate": 9.096e-06, "loss": 0.4666, "step": 3035 }, { "epoch": 7.369380315917375, "grad_norm": 1.5373930931091309, "learning_rate": 9.099e-06, "loss": 0.3783, "step": 3036 }, { "epoch": 
7.371810449574727, "grad_norm": 1.1655811071395874, "learning_rate": 9.102e-06, "loss": 0.3049, "step": 3037 }, { "epoch": 7.374240583232078, "grad_norm": 1.1625198125839233, "learning_rate": 9.105e-06, "loss": 0.2932, "step": 3038 }, { "epoch": 7.3766707168894285, "grad_norm": 1.4432224035263062, "learning_rate": 9.108e-06, "loss": 0.246, "step": 3039 }, { "epoch": 7.37910085054678, "grad_norm": 1.9667658805847168, "learning_rate": 9.111000000000001e-06, "loss": 0.2315, "step": 3040 }, { "epoch": 7.381530984204131, "grad_norm": 1.4907883405685425, "learning_rate": 9.114000000000001e-06, "loss": 0.1988, "step": 3041 }, { "epoch": 7.383961117861483, "grad_norm": 1.0739716291427612, "learning_rate": 9.117000000000001e-06, "loss": 0.1508, "step": 3042 }, { "epoch": 7.386391251518834, "grad_norm": 0.7794821262359619, "learning_rate": 9.12e-06, "loss": 0.1112, "step": 3043 }, { "epoch": 7.388821385176184, "grad_norm": 1.6269255876541138, "learning_rate": 9.122999999999999e-06, "loss": 0.1216, "step": 3044 }, { "epoch": 7.391251518833536, "grad_norm": 1.0620756149291992, "learning_rate": 9.126e-06, "loss": 0.1359, "step": 3045 }, { "epoch": 7.393681652490887, "grad_norm": 0.9234948754310608, "learning_rate": 9.129e-06, "loss": 0.0923, "step": 3046 }, { "epoch": 7.396111786148238, "grad_norm": 0.8823694586753845, "learning_rate": 9.132e-06, "loss": 0.1093, "step": 3047 }, { "epoch": 7.39854191980559, "grad_norm": 1.4210247993469238, "learning_rate": 9.135e-06, "loss": 0.0922, "step": 3048 }, { "epoch": 7.40097205346294, "grad_norm": 0.8747981786727905, "learning_rate": 9.138e-06, "loss": 0.1174, "step": 3049 }, { "epoch": 7.4034021871202915, "grad_norm": 1.0417293310165405, "learning_rate": 9.141000000000001e-06, "loss": 0.113, "step": 3050 }, { "epoch": 7.405832320777643, "grad_norm": 0.8701476454734802, "learning_rate": 9.144000000000001e-06, "loss": 0.1278, "step": 3051 }, { "epoch": 7.408262454434994, "grad_norm": 0.8739352822303772, "learning_rate": 9.147e-06, 
"loss": 0.1054, "step": 3052 }, { "epoch": 7.4106925880923455, "grad_norm": 0.7663303017616272, "learning_rate": 9.15e-06, "loss": 0.0743, "step": 3053 }, { "epoch": 7.413122721749696, "grad_norm": 0.8385040163993835, "learning_rate": 9.153e-06, "loss": 0.1082, "step": 3054 }, { "epoch": 7.415552855407047, "grad_norm": 0.7535604238510132, "learning_rate": 9.156000000000002e-06, "loss": 0.0883, "step": 3055 }, { "epoch": 7.417982989064399, "grad_norm": 1.030684232711792, "learning_rate": 9.159e-06, "loss": 0.2201, "step": 3056 }, { "epoch": 7.42041312272175, "grad_norm": 0.8547562956809998, "learning_rate": 9.162e-06, "loss": 0.0783, "step": 3057 }, { "epoch": 7.4228432563791005, "grad_norm": 0.8849889636039734, "learning_rate": 9.165e-06, "loss": 0.1296, "step": 3058 }, { "epoch": 7.425273390036452, "grad_norm": 0.928966760635376, "learning_rate": 9.168e-06, "loss": 0.1036, "step": 3059 }, { "epoch": 7.427703523693803, "grad_norm": 0.6816655397415161, "learning_rate": 9.171e-06, "loss": 0.0839, "step": 3060 }, { "epoch": 7.4301336573511545, "grad_norm": 1.3581254482269287, "learning_rate": 9.174e-06, "loss": 0.1137, "step": 3061 }, { "epoch": 7.432563791008506, "grad_norm": 0.8864293694496155, "learning_rate": 9.177e-06, "loss": 0.0979, "step": 3062 }, { "epoch": 7.434993924665856, "grad_norm": 1.6426750421524048, "learning_rate": 9.18e-06, "loss": 0.0899, "step": 3063 }, { "epoch": 7.437424058323208, "grad_norm": 0.9464544057846069, "learning_rate": 9.183e-06, "loss": 0.1048, "step": 3064 }, { "epoch": 7.439854191980559, "grad_norm": 0.8637104034423828, "learning_rate": 9.186000000000001e-06, "loss": 0.1118, "step": 3065 }, { "epoch": 7.44228432563791, "grad_norm": 0.7853138446807861, "learning_rate": 9.189000000000001e-06, "loss": 0.0962, "step": 3066 }, { "epoch": 7.444714459295262, "grad_norm": 1.6852295398712158, "learning_rate": 9.192000000000001e-06, "loss": 0.0931, "step": 3067 }, { "epoch": 7.447144592952612, "grad_norm": 0.8753742575645447, 
"learning_rate": 9.195000000000001e-06, "loss": 0.0895, "step": 3068 }, { "epoch": 7.4495747266099634, "grad_norm": 1.7016512155532837, "learning_rate": 9.197999999999999e-06, "loss": 0.0969, "step": 3069 }, { "epoch": 7.452004860267315, "grad_norm": 0.8179618716239929, "learning_rate": 9.200999999999999e-06, "loss": 0.0849, "step": 3070 }, { "epoch": 7.454434993924666, "grad_norm": 0.9793505072593689, "learning_rate": 9.204e-06, "loss": 0.079, "step": 3071 }, { "epoch": 7.456865127582017, "grad_norm": 1.028232455253601, "learning_rate": 9.207e-06, "loss": 0.1073, "step": 3072 }, { "epoch": 7.459295261239368, "grad_norm": 1.0248374938964844, "learning_rate": 9.21e-06, "loss": 0.111, "step": 3073 }, { "epoch": 7.461725394896719, "grad_norm": 0.8237469792366028, "learning_rate": 9.213e-06, "loss": 0.0873, "step": 3074 }, { "epoch": 7.464155528554071, "grad_norm": 1.2362836599349976, "learning_rate": 9.216e-06, "loss": 0.0925, "step": 3075 }, { "epoch": 7.466585662211422, "grad_norm": 1.4543476104736328, "learning_rate": 9.219000000000001e-06, "loss": 0.0885, "step": 3076 }, { "epoch": 7.469015795868772, "grad_norm": 1.2483289241790771, "learning_rate": 9.222e-06, "loss": 0.1113, "step": 3077 }, { "epoch": 7.471445929526124, "grad_norm": 1.663267970085144, "learning_rate": 9.225e-06, "loss": 0.1419, "step": 3078 }, { "epoch": 7.473876063183475, "grad_norm": 1.323998212814331, "learning_rate": 9.228e-06, "loss": 0.1395, "step": 3079 }, { "epoch": 7.476306196840826, "grad_norm": 1.0778498649597168, "learning_rate": 9.231e-06, "loss": 0.0957, "step": 3080 }, { "epoch": 7.478736330498178, "grad_norm": 1.847100853919983, "learning_rate": 9.234e-06, "loss": 0.1839, "step": 3081 }, { "epoch": 7.481166464155528, "grad_norm": 2.118206262588501, "learning_rate": 9.237e-06, "loss": 0.2125, "step": 3082 }, { "epoch": 7.48359659781288, "grad_norm": 1.3314286470413208, "learning_rate": 9.24e-06, "loss": 0.1486, "step": 3083 }, { "epoch": 7.486026731470231, "grad_norm": 
3.2769556045532227, "learning_rate": 9.243e-06, "loss": 0.2122, "step": 3084 }, { "epoch": 7.488456865127582, "grad_norm": 1.9519426822662354, "learning_rate": 9.246e-06, "loss": 0.3931, "step": 3085 }, { "epoch": 7.490886998784934, "grad_norm": 1.4838125705718994, "learning_rate": 9.249e-06, "loss": 0.3727, "step": 3086 }, { "epoch": 7.493317132442284, "grad_norm": 1.4563932418823242, "learning_rate": 9.252e-06, "loss": 0.3462, "step": 3087 }, { "epoch": 7.495747266099635, "grad_norm": 1.315729022026062, "learning_rate": 9.255e-06, "loss": 0.2824, "step": 3088 }, { "epoch": 7.498177399756987, "grad_norm": 1.2923448085784912, "learning_rate": 9.258e-06, "loss": 0.2037, "step": 3089 }, { "epoch": 7.500607533414338, "grad_norm": 1.586430549621582, "learning_rate": 9.261e-06, "loss": 0.2329, "step": 3090 }, { "epoch": 7.503037667071689, "grad_norm": 1.247545599937439, "learning_rate": 9.264000000000001e-06, "loss": 0.1881, "step": 3091 }, { "epoch": 7.50546780072904, "grad_norm": 0.8823976516723633, "learning_rate": 9.267000000000001e-06, "loss": 0.1226, "step": 3092 }, { "epoch": 7.507897934386391, "grad_norm": 0.6534527540206909, "learning_rate": 9.27e-06, "loss": 0.1213, "step": 3093 }, { "epoch": 7.5103280680437425, "grad_norm": 0.8903974294662476, "learning_rate": 9.272999999999999e-06, "loss": 0.1081, "step": 3094 }, { "epoch": 7.512758201701094, "grad_norm": 1.0630383491516113, "learning_rate": 9.275999999999999e-06, "loss": 0.1371, "step": 3095 }, { "epoch": 7.515188335358444, "grad_norm": 0.8589605689048767, "learning_rate": 9.279e-06, "loss": 0.1195, "step": 3096 }, { "epoch": 7.517618469015796, "grad_norm": 1.0938260555267334, "learning_rate": 9.282e-06, "loss": 0.118, "step": 3097 }, { "epoch": 7.520048602673147, "grad_norm": 1.12555730342865, "learning_rate": 9.285e-06, "loss": 0.1072, "step": 3098 }, { "epoch": 7.522478736330498, "grad_norm": 0.8477674722671509, "learning_rate": 9.288e-06, "loss": 0.1122, "step": 3099 }, { "epoch": 7.52490886998785, 
"grad_norm": 0.7759634256362915, "learning_rate": 9.291e-06, "loss": 0.0826, "step": 3100 }, { "epoch": 7.5273390036452, "grad_norm": 1.0126651525497437, "learning_rate": 9.294000000000001e-06, "loss": 0.1271, "step": 3101 }, { "epoch": 7.5297691373025515, "grad_norm": 0.9378772377967834, "learning_rate": 9.297000000000001e-06, "loss": 0.0791, "step": 3102 }, { "epoch": 7.532199270959903, "grad_norm": 0.8418811559677124, "learning_rate": 9.3e-06, "loss": 0.1321, "step": 3103 }, { "epoch": 7.534629404617254, "grad_norm": 0.7987210154533386, "learning_rate": 9.303e-06, "loss": 0.0889, "step": 3104 }, { "epoch": 7.537059538274605, "grad_norm": 0.9183957576751709, "learning_rate": 9.306e-06, "loss": 0.1017, "step": 3105 }, { "epoch": 7.539489671931956, "grad_norm": 0.7660580277442932, "learning_rate": 9.309e-06, "loss": 0.089, "step": 3106 }, { "epoch": 7.541919805589307, "grad_norm": 0.6337129473686218, "learning_rate": 9.312e-06, "loss": 0.0806, "step": 3107 }, { "epoch": 7.544349939246659, "grad_norm": 2.01876163482666, "learning_rate": 9.315e-06, "loss": 0.0863, "step": 3108 }, { "epoch": 7.54678007290401, "grad_norm": 0.8291923403739929, "learning_rate": 9.318e-06, "loss": 0.0877, "step": 3109 }, { "epoch": 7.54921020656136, "grad_norm": 0.9739956259727478, "learning_rate": 9.321e-06, "loss": 0.0838, "step": 3110 }, { "epoch": 7.551640340218712, "grad_norm": 1.762833595275879, "learning_rate": 9.324000000000001e-06, "loss": 0.1179, "step": 3111 }, { "epoch": 7.554070473876063, "grad_norm": 0.9236059188842773, "learning_rate": 9.327e-06, "loss": 0.0962, "step": 3112 }, { "epoch": 7.5565006075334145, "grad_norm": 0.8870382905006409, "learning_rate": 9.33e-06, "loss": 0.09, "step": 3113 }, { "epoch": 7.558930741190766, "grad_norm": 0.7698293924331665, "learning_rate": 9.333e-06, "loss": 0.0743, "step": 3114 }, { "epoch": 7.561360874848116, "grad_norm": 1.0521949529647827, "learning_rate": 9.336e-06, "loss": 0.0733, "step": 3115 }, { "epoch": 7.563791008505468, 
"grad_norm": 1.1116786003112793, "learning_rate": 9.339000000000002e-06, "loss": 0.1295, "step": 3116 }, { "epoch": 7.566221142162819, "grad_norm": 1.3611721992492676, "learning_rate": 9.342000000000001e-06, "loss": 0.1118, "step": 3117 }, { "epoch": 7.56865127582017, "grad_norm": 1.1114692687988281, "learning_rate": 9.345e-06, "loss": 0.1049, "step": 3118 }, { "epoch": 7.571081409477522, "grad_norm": 1.3498514890670776, "learning_rate": 9.348e-06, "loss": 0.1204, "step": 3119 }, { "epoch": 7.573511543134872, "grad_norm": 1.4940922260284424, "learning_rate": 9.350999999999999e-06, "loss": 0.1386, "step": 3120 }, { "epoch": 7.575941676792223, "grad_norm": 1.0957454442977905, "learning_rate": 9.354e-06, "loss": 0.093, "step": 3121 }, { "epoch": 7.578371810449575, "grad_norm": 1.4814560413360596, "learning_rate": 9.357e-06, "loss": 0.1857, "step": 3122 }, { "epoch": 7.580801944106926, "grad_norm": 1.0598191022872925, "learning_rate": 9.36e-06, "loss": 0.1108, "step": 3123 }, { "epoch": 7.583232077764277, "grad_norm": 0.9928390979766846, "learning_rate": 9.363e-06, "loss": 0.147, "step": 3124 }, { "epoch": 7.585662211421628, "grad_norm": 0.7033221125602722, "learning_rate": 9.366e-06, "loss": 0.0679, "step": 3125 }, { "epoch": 7.588092345078979, "grad_norm": 1.2424790859222412, "learning_rate": 9.369000000000001e-06, "loss": 0.1199, "step": 3126 }, { "epoch": 7.590522478736331, "grad_norm": 2.135963201522827, "learning_rate": 9.372000000000001e-06, "loss": 0.1222, "step": 3127 }, { "epoch": 7.592952612393682, "grad_norm": 1.173579454421997, "learning_rate": 9.375000000000001e-06, "loss": 0.1444, "step": 3128 }, { "epoch": 7.595382746051033, "grad_norm": 1.4241389036178589, "learning_rate": 9.378e-06, "loss": 0.1094, "step": 3129 }, { "epoch": 7.597812879708384, "grad_norm": 1.273918628692627, "learning_rate": 9.380999999999999e-06, "loss": 0.1102, "step": 3130 }, { "epoch": 7.600243013365735, "grad_norm": 1.9201279878616333, "learning_rate": 9.384e-06, "loss": 0.1216, 
"step": 3131 }, { "epoch": 7.602673147023086, "grad_norm": 2.8961169719696045, "learning_rate": 9.387e-06, "loss": 0.1746, "step": 3132 }, { "epoch": 7.605103280680438, "grad_norm": 1.8930540084838867, "learning_rate": 9.39e-06, "loss": 0.1423, "step": 3133 }, { "epoch": 7.607533414337789, "grad_norm": 2.9658944606781006, "learning_rate": 9.393e-06, "loss": 0.2767, "step": 3134 }, { "epoch": 7.6099635479951395, "grad_norm": 2.426393985748291, "learning_rate": 9.396e-06, "loss": 0.5069, "step": 3135 }, { "epoch": 7.612393681652491, "grad_norm": 1.3090547323226929, "learning_rate": 9.399000000000001e-06, "loss": 0.4092, "step": 3136 }, { "epoch": 7.614823815309842, "grad_norm": 1.2531012296676636, "learning_rate": 9.402e-06, "loss": 0.2848, "step": 3137 }, { "epoch": 7.6172539489671935, "grad_norm": 1.2696424722671509, "learning_rate": 9.405e-06, "loss": 0.2851, "step": 3138 }, { "epoch": 7.619684082624544, "grad_norm": 1.2969313859939575, "learning_rate": 9.408e-06, "loss": 0.2911, "step": 3139 }, { "epoch": 7.622114216281895, "grad_norm": 1.3704447746276855, "learning_rate": 9.411e-06, "loss": 0.222, "step": 3140 }, { "epoch": 7.624544349939247, "grad_norm": 1.7212305068969727, "learning_rate": 9.414000000000002e-06, "loss": 0.1817, "step": 3141 }, { "epoch": 7.626974483596598, "grad_norm": 1.107580304145813, "learning_rate": 9.417e-06, "loss": 0.1496, "step": 3142 }, { "epoch": 7.6294046172539485, "grad_norm": 0.7416384816169739, "learning_rate": 9.42e-06, "loss": 0.1115, "step": 3143 }, { "epoch": 7.6318347509113, "grad_norm": 0.8354853391647339, "learning_rate": 9.423e-06, "loss": 0.1273, "step": 3144 }, { "epoch": 7.634264884568651, "grad_norm": 1.0029125213623047, "learning_rate": 9.426e-06, "loss": 0.1234, "step": 3145 }, { "epoch": 7.6366950182260025, "grad_norm": 0.9235063195228577, "learning_rate": 9.429e-06, "loss": 0.0935, "step": 3146 }, { "epoch": 7.639125151883354, "grad_norm": 0.7247021198272705, "learning_rate": 9.432e-06, "loss": 0.0714, "step": 
3147 }, { "epoch": 7.641555285540704, "grad_norm": 0.7022574543952942, "learning_rate": 9.435e-06, "loss": 0.0947, "step": 3148 }, { "epoch": 7.643985419198056, "grad_norm": 0.8142540454864502, "learning_rate": 9.438e-06, "loss": 0.0818, "step": 3149 }, { "epoch": 7.646415552855407, "grad_norm": 1.1042571067810059, "learning_rate": 9.441e-06, "loss": 0.0933, "step": 3150 }, { "epoch": 7.648845686512758, "grad_norm": 0.9954770803451538, "learning_rate": 9.444000000000001e-06, "loss": 0.0971, "step": 3151 }, { "epoch": 7.65127582017011, "grad_norm": 0.9318314790725708, "learning_rate": 9.447000000000001e-06, "loss": 0.0972, "step": 3152 }, { "epoch": 7.65370595382746, "grad_norm": 1.28435480594635, "learning_rate": 9.450000000000001e-06, "loss": 0.0746, "step": 3153 }, { "epoch": 7.6561360874848114, "grad_norm": 0.7693827152252197, "learning_rate": 9.453e-06, "loss": 0.0971, "step": 3154 }, { "epoch": 7.658566221142163, "grad_norm": 0.8223510980606079, "learning_rate": 9.455999999999999e-06, "loss": 0.0748, "step": 3155 }, { "epoch": 7.660996354799514, "grad_norm": 0.9231458902359009, "learning_rate": 9.459e-06, "loss": 0.0922, "step": 3156 }, { "epoch": 7.6634264884568655, "grad_norm": 0.8688566088676453, "learning_rate": 9.462e-06, "loss": 0.0872, "step": 3157 }, { "epoch": 7.665856622114216, "grad_norm": 0.9654849171638489, "learning_rate": 9.465e-06, "loss": 0.0863, "step": 3158 }, { "epoch": 7.668286755771567, "grad_norm": 1.088117003440857, "learning_rate": 9.468e-06, "loss": 0.1037, "step": 3159 }, { "epoch": 7.670716889428919, "grad_norm": 0.9309152364730835, "learning_rate": 9.471e-06, "loss": 0.1161, "step": 3160 }, { "epoch": 7.67314702308627, "grad_norm": 1.0514986515045166, "learning_rate": 9.474000000000001e-06, "loss": 0.096, "step": 3161 }, { "epoch": 7.675577156743621, "grad_norm": 1.020918369293213, "learning_rate": 9.477000000000001e-06, "loss": 0.1084, "step": 3162 }, { "epoch": 7.678007290400972, "grad_norm": 0.8568511605262756, "learning_rate": 
9.48e-06, "loss": 0.0675, "step": 3163 }, { "epoch": 7.680437424058323, "grad_norm": 0.8808151483535767, "learning_rate": 9.483e-06, "loss": 0.0826, "step": 3164 }, { "epoch": 7.682867557715674, "grad_norm": 0.9286811351776123, "learning_rate": 9.486e-06, "loss": 0.1035, "step": 3165 }, { "epoch": 7.685297691373026, "grad_norm": 1.0524616241455078, "learning_rate": 9.489000000000002e-06, "loss": 0.103, "step": 3166 }, { "epoch": 7.687727825030377, "grad_norm": 0.8885053396224976, "learning_rate": 9.492e-06, "loss": 0.0906, "step": 3167 }, { "epoch": 7.690157958687728, "grad_norm": 1.3393508195877075, "learning_rate": 9.495e-06, "loss": 0.1398, "step": 3168 }, { "epoch": 7.692588092345079, "grad_norm": 1.04470694065094, "learning_rate": 9.498e-06, "loss": 0.0843, "step": 3169 }, { "epoch": 7.69501822600243, "grad_norm": 1.0290112495422363, "learning_rate": 9.501e-06, "loss": 0.0964, "step": 3170 }, { "epoch": 7.697448359659782, "grad_norm": 0.9464157819747925, "learning_rate": 9.504e-06, "loss": 0.0868, "step": 3171 }, { "epoch": 7.699878493317133, "grad_norm": 1.0754444599151611, "learning_rate": 9.507e-06, "loss": 0.102, "step": 3172 }, { "epoch": 7.702308626974483, "grad_norm": 1.4318864345550537, "learning_rate": 9.51e-06, "loss": 0.1288, "step": 3173 }, { "epoch": 7.704738760631835, "grad_norm": 1.030702829360962, "learning_rate": 9.513e-06, "loss": 0.0877, "step": 3174 }, { "epoch": 7.707168894289186, "grad_norm": 1.0353349447250366, "learning_rate": 9.516e-06, "loss": 0.1052, "step": 3175 }, { "epoch": 7.709599027946537, "grad_norm": 1.4953701496124268, "learning_rate": 9.519000000000002e-06, "loss": 0.1464, "step": 3176 }, { "epoch": 7.712029161603888, "grad_norm": 1.2841112613677979, "learning_rate": 9.522000000000001e-06, "loss": 0.1343, "step": 3177 }, { "epoch": 7.714459295261239, "grad_norm": 1.085141658782959, "learning_rate": 9.525000000000001e-06, "loss": 0.0925, "step": 3178 }, { "epoch": 7.7168894289185905, "grad_norm": 1.2454344034194946, 
"learning_rate": 9.528e-06, "loss": 0.0949, "step": 3179 }, { "epoch": 7.719319562575942, "grad_norm": 1.5862658023834229, "learning_rate": 9.530999999999999e-06, "loss": 0.1423, "step": 3180 }, { "epoch": 7.721749696233293, "grad_norm": 1.4244749546051025, "learning_rate": 9.534e-06, "loss": 0.1283, "step": 3181 }, { "epoch": 7.724179829890644, "grad_norm": 6.449389457702637, "learning_rate": 9.537e-06, "loss": 0.2088, "step": 3182 }, { "epoch": 7.726609963547995, "grad_norm": 2.4289400577545166, "learning_rate": 9.54e-06, "loss": 0.1807, "step": 3183 }, { "epoch": 7.729040097205346, "grad_norm": 2.2510762214660645, "learning_rate": 9.543e-06, "loss": 0.1809, "step": 3184 }, { "epoch": 7.731470230862698, "grad_norm": 1.9133301973342896, "learning_rate": 9.546e-06, "loss": 0.5278, "step": 3185 }, { "epoch": 7.733900364520048, "grad_norm": 1.1558294296264648, "learning_rate": 9.549000000000001e-06, "loss": 0.3992, "step": 3186 }, { "epoch": 7.7363304981773995, "grad_norm": 1.3731507062911987, "learning_rate": 9.552000000000001e-06, "loss": 0.3365, "step": 3187 }, { "epoch": 7.738760631834751, "grad_norm": 1.0149041414260864, "learning_rate": 9.555e-06, "loss": 0.3175, "step": 3188 }, { "epoch": 7.741190765492102, "grad_norm": 0.9348776936531067, "learning_rate": 9.558e-06, "loss": 0.2372, "step": 3189 }, { "epoch": 7.7436208991494535, "grad_norm": 1.0618232488632202, "learning_rate": 9.561e-06, "loss": 0.1855, "step": 3190 }, { "epoch": 7.746051032806804, "grad_norm": 1.063045620918274, "learning_rate": 9.564e-06, "loss": 0.1816, "step": 3191 }, { "epoch": 7.748481166464155, "grad_norm": 2.252105474472046, "learning_rate": 9.567e-06, "loss": 0.1611, "step": 3192 }, { "epoch": 7.750911300121507, "grad_norm": 1.076424241065979, "learning_rate": 9.57e-06, "loss": 0.1579, "step": 3193 }, { "epoch": 7.753341433778858, "grad_norm": 0.7357202768325806, "learning_rate": 9.573e-06, "loss": 0.1452, "step": 3194 }, { "epoch": 7.755771567436209, "grad_norm": 1.0315216779708862, 
"learning_rate": 9.576e-06, "loss": 0.1469, "step": 3195 }, { "epoch": 7.75820170109356, "grad_norm": 0.913920521736145, "learning_rate": 9.579e-06, "loss": 0.1145, "step": 3196 }, { "epoch": 7.760631834750911, "grad_norm": 1.2942973375320435, "learning_rate": 9.582e-06, "loss": 0.0984, "step": 3197 }, { "epoch": 7.7630619684082625, "grad_norm": 0.7444298267364502, "learning_rate": 9.585e-06, "loss": 0.1068, "step": 3198 }, { "epoch": 7.765492102065614, "grad_norm": 0.7679409980773926, "learning_rate": 9.588e-06, "loss": 0.1247, "step": 3199 }, { "epoch": 7.767922235722965, "grad_norm": 0.7420872449874878, "learning_rate": 9.591e-06, "loss": 0.0821, "step": 3200 }, { "epoch": 7.770352369380316, "grad_norm": 0.6239394545555115, "learning_rate": 9.594e-06, "loss": 0.0786, "step": 3201 }, { "epoch": 7.772782503037667, "grad_norm": 0.953916609287262, "learning_rate": 9.597000000000001e-06, "loss": 0.0828, "step": 3202 }, { "epoch": 7.775212636695018, "grad_norm": 0.8140321969985962, "learning_rate": 9.600000000000001e-06, "loss": 0.1063, "step": 3203 }, { "epoch": 7.77764277035237, "grad_norm": 0.8141579031944275, "learning_rate": 9.603e-06, "loss": 0.0972, "step": 3204 }, { "epoch": 7.780072904009721, "grad_norm": 1.0308319330215454, "learning_rate": 9.606e-06, "loss": 0.1006, "step": 3205 }, { "epoch": 7.782503037667071, "grad_norm": 1.2496254444122314, "learning_rate": 9.608999999999999e-06, "loss": 0.0806, "step": 3206 }, { "epoch": 7.784933171324423, "grad_norm": 1.0232502222061157, "learning_rate": 9.612e-06, "loss": 0.0882, "step": 3207 }, { "epoch": 7.787363304981774, "grad_norm": 0.8003423810005188, "learning_rate": 9.615e-06, "loss": 0.0705, "step": 3208 }, { "epoch": 7.789793438639125, "grad_norm": 0.9211364388465881, "learning_rate": 9.618e-06, "loss": 0.1028, "step": 3209 }, { "epoch": 7.792223572296477, "grad_norm": 1.054533839225769, "learning_rate": 9.621e-06, "loss": 0.098, "step": 3210 }, { "epoch": 7.794653705953827, "grad_norm": 0.9522476196289062, 
"learning_rate": 9.624e-06, "loss": 0.086, "step": 3211 }, { "epoch": 7.797083839611179, "grad_norm": 1.0615030527114868, "learning_rate": 9.627000000000001e-06, "loss": 0.0887, "step": 3212 }, { "epoch": 7.79951397326853, "grad_norm": 1.0815606117248535, "learning_rate": 9.630000000000001e-06, "loss": 0.1059, "step": 3213 }, { "epoch": 7.801944106925881, "grad_norm": 0.8378172516822815, "learning_rate": 9.633e-06, "loss": 0.0814, "step": 3214 }, { "epoch": 7.804374240583232, "grad_norm": 0.7711126804351807, "learning_rate": 9.636e-06, "loss": 0.0758, "step": 3215 }, { "epoch": 7.806804374240583, "grad_norm": 0.8795079588890076, "learning_rate": 9.638999999999999e-06, "loss": 0.1035, "step": 3216 }, { "epoch": 7.809234507897934, "grad_norm": 0.9190886616706848, "learning_rate": 9.642e-06, "loss": 0.1152, "step": 3217 }, { "epoch": 7.811664641555286, "grad_norm": 1.1144553422927856, "learning_rate": 9.645e-06, "loss": 0.1047, "step": 3218 }, { "epoch": 7.814094775212637, "grad_norm": 2.0324270725250244, "learning_rate": 9.648e-06, "loss": 0.2729, "step": 3219 }, { "epoch": 7.8165249088699875, "grad_norm": 1.0972973108291626, "learning_rate": 9.651e-06, "loss": 0.1498, "step": 3220 }, { "epoch": 7.818955042527339, "grad_norm": 0.7112351059913635, "learning_rate": 9.654e-06, "loss": 0.08, "step": 3221 }, { "epoch": 7.82138517618469, "grad_norm": 0.8359589576721191, "learning_rate": 9.657000000000001e-06, "loss": 0.0797, "step": 3222 }, { "epoch": 7.8238153098420415, "grad_norm": 0.9083468317985535, "learning_rate": 9.66e-06, "loss": 0.1071, "step": 3223 }, { "epoch": 7.826245443499392, "grad_norm": 1.088517189025879, "learning_rate": 9.663e-06, "loss": 0.1077, "step": 3224 }, { "epoch": 7.828675577156743, "grad_norm": 1.483981966972351, "learning_rate": 9.666e-06, "loss": 0.135, "step": 3225 }, { "epoch": 7.831105710814095, "grad_norm": 1.1003167629241943, "learning_rate": 9.669e-06, "loss": 0.1146, "step": 3226 }, { "epoch": 7.833535844471446, "grad_norm": 
1.1634995937347412, "learning_rate": 9.672000000000002e-06, "loss": 0.1141, "step": 3227 }, { "epoch": 7.835965978128797, "grad_norm": 1.2582167387008667, "learning_rate": 9.675e-06, "loss": 0.0914, "step": 3228 }, { "epoch": 7.838396111786148, "grad_norm": 1.337365746498108, "learning_rate": 9.678e-06, "loss": 0.1232, "step": 3229 }, { "epoch": 7.840826245443499, "grad_norm": 1.3862096071243286, "learning_rate": 9.681e-06, "loss": 0.1209, "step": 3230 }, { "epoch": 7.8432563791008505, "grad_norm": 1.4054635763168335, "learning_rate": 9.683999999999999e-06, "loss": 0.1446, "step": 3231 }, { "epoch": 7.845686512758202, "grad_norm": 1.6446105241775513, "learning_rate": 9.687e-06, "loss": 0.1511, "step": 3232 }, { "epoch": 7.848116646415553, "grad_norm": 2.9457879066467285, "learning_rate": 9.69e-06, "loss": 0.2403, "step": 3233 }, { "epoch": 7.850546780072904, "grad_norm": 2.2214934825897217, "learning_rate": 9.693e-06, "loss": 0.2007, "step": 3234 }, { "epoch": 7.852976913730255, "grad_norm": 1.7208398580551147, "learning_rate": 9.696e-06, "loss": 0.5032, "step": 3235 }, { "epoch": 7.855407047387606, "grad_norm": 1.1740220785140991, "learning_rate": 9.699e-06, "loss": 0.3139, "step": 3236 }, { "epoch": 7.857837181044958, "grad_norm": 0.7495171427726746, "learning_rate": 9.702000000000001e-06, "loss": 0.2797, "step": 3237 }, { "epoch": 7.860267314702309, "grad_norm": 1.0284812450408936, "learning_rate": 9.705000000000001e-06, "loss": 0.3148, "step": 3238 }, { "epoch": 7.8626974483596594, "grad_norm": 1.0995818376541138, "learning_rate": 9.708000000000001e-06, "loss": 0.2886, "step": 3239 }, { "epoch": 7.865127582017011, "grad_norm": 1.4504302740097046, "learning_rate": 9.711e-06, "loss": 0.2081, "step": 3240 }, { "epoch": 7.867557715674362, "grad_norm": 0.9019107818603516, "learning_rate": 9.713999999999999e-06, "loss": 0.1328, "step": 3241 }, { "epoch": 7.8699878493317135, "grad_norm": 1.0842934846878052, "learning_rate": 9.717e-06, "loss": 0.1998, "step": 3242 }, { 
"epoch": 7.872417982989065, "grad_norm": 0.6515651941299438, "learning_rate": 9.72e-06, "loss": 0.1145, "step": 3243 }, { "epoch": 7.874848116646415, "grad_norm": 1.177318811416626, "learning_rate": 9.723e-06, "loss": 0.1248, "step": 3244 }, { "epoch": 7.877278250303767, "grad_norm": 1.023370385169983, "learning_rate": 9.726e-06, "loss": 0.1387, "step": 3245 }, { "epoch": 7.879708383961118, "grad_norm": 0.904001772403717, "learning_rate": 9.729e-06, "loss": 0.1246, "step": 3246 }, { "epoch": 7.882138517618469, "grad_norm": 0.8978198170661926, "learning_rate": 9.732000000000001e-06, "loss": 0.115, "step": 3247 }, { "epoch": 7.884568651275821, "grad_norm": 0.7234178781509399, "learning_rate": 9.735e-06, "loss": 0.0761, "step": 3248 }, { "epoch": 7.886998784933171, "grad_norm": 0.8736076951026917, "learning_rate": 9.738e-06, "loss": 0.0907, "step": 3249 }, { "epoch": 7.889428918590522, "grad_norm": 0.8493151664733887, "learning_rate": 9.741e-06, "loss": 0.1045, "step": 3250 }, { "epoch": 7.891859052247874, "grad_norm": 0.7556934952735901, "learning_rate": 9.744e-06, "loss": 0.1023, "step": 3251 }, { "epoch": 7.894289185905225, "grad_norm": 0.7892445921897888, "learning_rate": 9.747000000000002e-06, "loss": 0.1202, "step": 3252 }, { "epoch": 7.896719319562576, "grad_norm": 0.8341344594955444, "learning_rate": 9.75e-06, "loss": 0.0847, "step": 3253 }, { "epoch": 7.899149453219927, "grad_norm": 1.1463717222213745, "learning_rate": 9.753e-06, "loss": 0.1116, "step": 3254 }, { "epoch": 7.901579586877278, "grad_norm": 0.8181086182594299, "learning_rate": 9.756e-06, "loss": 0.0835, "step": 3255 }, { "epoch": 7.90400972053463, "grad_norm": 0.5666205883026123, "learning_rate": 9.759e-06, "loss": 0.0722, "step": 3256 }, { "epoch": 7.906439854191981, "grad_norm": 0.8691707849502563, "learning_rate": 9.762e-06, "loss": 0.1004, "step": 3257 }, { "epoch": 7.908869987849331, "grad_norm": 0.8382599353790283, "learning_rate": 9.765e-06, "loss": 0.0977, "step": 3258 }, { "epoch": 
7.911300121506683, "grad_norm": 0.6189960837364197, "learning_rate": 9.768e-06, "loss": 0.077, "step": 3259 }, { "epoch": 7.913730255164034, "grad_norm": 1.2717177867889404, "learning_rate": 9.771e-06, "loss": 0.2173, "step": 3260 }, { "epoch": 7.916160388821385, "grad_norm": 0.7077528238296509, "learning_rate": 9.774e-06, "loss": 0.0686, "step": 3261 }, { "epoch": 7.918590522478736, "grad_norm": 1.3194481134414673, "learning_rate": 9.777000000000001e-06, "loss": 0.1217, "step": 3262 }, { "epoch": 7.921020656136087, "grad_norm": 0.8063385486602783, "learning_rate": 9.780000000000001e-06, "loss": 0.1099, "step": 3263 }, { "epoch": 7.9234507897934385, "grad_norm": 0.8069854974746704, "learning_rate": 9.783000000000001e-06, "loss": 0.0777, "step": 3264 }, { "epoch": 7.92588092345079, "grad_norm": 0.7131829261779785, "learning_rate": 9.785999999999999e-06, "loss": 0.0666, "step": 3265 }, { "epoch": 7.928311057108141, "grad_norm": 0.9050866365432739, "learning_rate": 9.788999999999999e-06, "loss": 0.0977, "step": 3266 }, { "epoch": 7.930741190765492, "grad_norm": 0.8365057706832886, "learning_rate": 9.792e-06, "loss": 0.086, "step": 3267 }, { "epoch": 7.933171324422843, "grad_norm": 0.8722661733627319, "learning_rate": 9.795e-06, "loss": 0.0718, "step": 3268 }, { "epoch": 7.935601458080194, "grad_norm": 1.1128402948379517, "learning_rate": 9.798e-06, "loss": 0.112, "step": 3269 }, { "epoch": 7.938031591737546, "grad_norm": 1.1257323026657104, "learning_rate": 9.801e-06, "loss": 0.1009, "step": 3270 }, { "epoch": 7.940461725394897, "grad_norm": 0.9065591096878052, "learning_rate": 9.804e-06, "loss": 0.0815, "step": 3271 }, { "epoch": 7.9428918590522475, "grad_norm": 1.0503095388412476, "learning_rate": 9.807000000000001e-06, "loss": 0.1081, "step": 3272 }, { "epoch": 7.945321992709599, "grad_norm": 0.6638137698173523, "learning_rate": 9.810000000000001e-06, "loss": 0.0593, "step": 3273 }, { "epoch": 7.94775212636695, "grad_norm": 3.2394073009490967, "learning_rate": 
9.813e-06, "loss": 0.1004, "step": 3274 }, { "epoch": 7.9501822600243015, "grad_norm": 2.0023393630981445, "learning_rate": 9.816e-06, "loss": 0.1049, "step": 3275 }, { "epoch": 7.952612393681653, "grad_norm": 1.6208677291870117, "learning_rate": 9.819e-06, "loss": 0.0957, "step": 3276 }, { "epoch": 7.955042527339003, "grad_norm": 1.3668400049209595, "learning_rate": 9.822e-06, "loss": 0.1238, "step": 3277 }, { "epoch": 7.957472660996355, "grad_norm": 1.233834147453308, "learning_rate": 9.825e-06, "loss": 0.1251, "step": 3278 }, { "epoch": 7.959902794653706, "grad_norm": 1.131894826889038, "learning_rate": 9.828e-06, "loss": 0.0992, "step": 3279 }, { "epoch": 7.962332928311057, "grad_norm": 1.8776366710662842, "learning_rate": 9.831e-06, "loss": 0.1455, "step": 3280 }, { "epoch": 7.964763061968409, "grad_norm": 1.2550761699676514, "learning_rate": 9.834e-06, "loss": 0.1497, "step": 3281 }, { "epoch": 7.967193195625759, "grad_norm": 1.6061393022537231, "learning_rate": 9.837000000000001e-06, "loss": 0.1309, "step": 3282 }, { "epoch": 7.9696233292831105, "grad_norm": 1.6731425523757935, "learning_rate": 9.84e-06, "loss": 0.1631, "step": 3283 }, { "epoch": 7.972053462940462, "grad_norm": 2.453242778778076, "learning_rate": 9.843e-06, "loss": 0.2251, "step": 3284 }, { "epoch": 7.974483596597813, "grad_norm": 1.9778401851654053, "learning_rate": 9.846e-06, "loss": 0.3701, "step": 3285 }, { "epoch": 7.9769137302551645, "grad_norm": 0.7609272003173828, "learning_rate": 9.849e-06, "loss": 0.1805, "step": 3286 }, { "epoch": 7.979343863912515, "grad_norm": 1.0395481586456299, "learning_rate": 9.852000000000002e-06, "loss": 0.1021, "step": 3287 }, { "epoch": 7.981773997569866, "grad_norm": 0.9108520150184631, "learning_rate": 9.855000000000001e-06, "loss": 0.1036, "step": 3288 }, { "epoch": 7.984204131227218, "grad_norm": 1.2403968572616577, "learning_rate": 9.858000000000001e-06, "loss": 0.0888, "step": 3289 }, { "epoch": 7.986634264884569, "grad_norm": 1.2000263929367065, 
"learning_rate": 9.861e-06, "loss": 0.1081, "step": 3290 }, { "epoch": 7.98906439854192, "grad_norm": 0.8078727722167969, "learning_rate": 9.863999999999999e-06, "loss": 0.0738, "step": 3291 }, { "epoch": 7.991494532199271, "grad_norm": 0.6826702952384949, "learning_rate": 9.867e-06, "loss": 0.0764, "step": 3292 }, { "epoch": 7.993924665856622, "grad_norm": 1.3415156602859497, "learning_rate": 9.87e-06, "loss": 0.1348, "step": 3293 }, { "epoch": 7.996354799513973, "grad_norm": 1.1132245063781738, "learning_rate": 9.873e-06, "loss": 0.1197, "step": 3294 }, { "epoch": 7.998784933171325, "grad_norm": 2.374213218688965, "learning_rate": 9.876e-06, "loss": 0.1226, "step": 3295 }, { "epoch": 8.0, "grad_norm": 1.6520699262619019, "learning_rate": 9.879e-06, "loss": 0.1038, "step": 3296 }, { "epoch": 8.00243013365735, "grad_norm": 7.138794422149658, "learning_rate": 9.882000000000001e-06, "loss": 0.5592, "step": 3297 }, { "epoch": 8.004860267314703, "grad_norm": 2.6149611473083496, "learning_rate": 9.885000000000001e-06, "loss": 0.3903, "step": 3298 }, { "epoch": 8.007290400972053, "grad_norm": 1.095265507698059, "learning_rate": 9.888000000000001e-06, "loss": 0.2999, "step": 3299 }, { "epoch": 8.009720534629405, "grad_norm": 1.5966033935546875, "learning_rate": 9.891e-06, "loss": 0.2592, "step": 3300 }, { "epoch": 8.012150668286756, "grad_norm": 1.1147688627243042, "learning_rate": 9.894e-06, "loss": 0.1898, "step": 3301 }, { "epoch": 8.014580801944106, "grad_norm": 1.1593531370162964, "learning_rate": 9.897e-06, "loss": 0.2413, "step": 3302 }, { "epoch": 8.017010935601458, "grad_norm": 1.3676508665084839, "learning_rate": 9.9e-06, "loss": 0.1664, "step": 3303 }, { "epoch": 8.019441069258809, "grad_norm": 0.9491952657699585, "learning_rate": 9.903e-06, "loss": 0.1488, "step": 3304 }, { "epoch": 8.021871202916161, "grad_norm": 1.1188209056854248, "learning_rate": 9.906e-06, "loss": 0.1698, "step": 3305 }, { "epoch": 8.024301336573512, "grad_norm": 0.7801015973091125, 
"learning_rate": 9.909e-06, "loss": 0.1181, "step": 3306 }, { "epoch": 8.026731470230862, "grad_norm": 0.6904942989349365, "learning_rate": 9.912000000000001e-06, "loss": 0.12, "step": 3307 }, { "epoch": 8.029161603888214, "grad_norm": 2.442054033279419, "learning_rate": 9.915e-06, "loss": 0.098, "step": 3308 }, { "epoch": 8.031591737545565, "grad_norm": 0.8011068105697632, "learning_rate": 9.918e-06, "loss": 0.0864, "step": 3309 }, { "epoch": 8.034021871202917, "grad_norm": 0.9428938031196594, "learning_rate": 9.921e-06, "loss": 0.0964, "step": 3310 }, { "epoch": 8.036452004860267, "grad_norm": 0.7288050055503845, "learning_rate": 9.924e-06, "loss": 0.092, "step": 3311 }, { "epoch": 8.038882138517618, "grad_norm": 0.8311866521835327, "learning_rate": 9.927000000000002e-06, "loss": 0.1009, "step": 3312 }, { "epoch": 8.04131227217497, "grad_norm": 1.0125364065170288, "learning_rate": 9.930000000000001e-06, "loss": 0.0901, "step": 3313 }, { "epoch": 8.04374240583232, "grad_norm": 0.925116240978241, "learning_rate": 9.933e-06, "loss": 0.0787, "step": 3314 }, { "epoch": 8.046172539489673, "grad_norm": 0.8153257966041565, "learning_rate": 9.936e-06, "loss": 0.1127, "step": 3315 }, { "epoch": 8.048602673147023, "grad_norm": 1.320761799812317, "learning_rate": 9.939e-06, "loss": 0.1028, "step": 3316 }, { "epoch": 8.051032806804374, "grad_norm": 0.7591928839683533, "learning_rate": 9.941999999999999e-06, "loss": 0.0662, "step": 3317 }, { "epoch": 8.053462940461726, "grad_norm": 0.7747924327850342, "learning_rate": 9.945e-06, "loss": 0.0727, "step": 3318 }, { "epoch": 8.055893074119076, "grad_norm": 0.8821365237236023, "learning_rate": 9.948e-06, "loss": 0.0842, "step": 3319 }, { "epoch": 8.058323207776427, "grad_norm": 0.650090754032135, "learning_rate": 9.951e-06, "loss": 0.0877, "step": 3320 }, { "epoch": 8.060753341433779, "grad_norm": 0.6751341819763184, "learning_rate": 9.954e-06, "loss": 0.0718, "step": 3321 }, { "epoch": 8.06318347509113, "grad_norm": 
0.8449902534484863, "learning_rate": 9.957e-06, "loss": 0.0761, "step": 3322 }, { "epoch": 8.065613608748482, "grad_norm": 0.6883044838905334, "learning_rate": 9.960000000000001e-06, "loss": 0.0781, "step": 3323 }, { "epoch": 8.068043742405832, "grad_norm": 0.8260672688484192, "learning_rate": 9.963000000000001e-06, "loss": 0.0718, "step": 3324 }, { "epoch": 8.070473876063183, "grad_norm": 0.753498911857605, "learning_rate": 9.966e-06, "loss": 0.0626, "step": 3325 }, { "epoch": 8.072904009720535, "grad_norm": 0.9489023089408875, "learning_rate": 9.969e-06, "loss": 0.0805, "step": 3326 }, { "epoch": 8.075334143377885, "grad_norm": 0.8876113891601562, "learning_rate": 9.971999999999999e-06, "loss": 0.0768, "step": 3327 }, { "epoch": 8.077764277035238, "grad_norm": 0.8805519342422485, "learning_rate": 9.975e-06, "loss": 0.0794, "step": 3328 }, { "epoch": 8.080194410692588, "grad_norm": 0.9154880046844482, "learning_rate": 9.978e-06, "loss": 0.082, "step": 3329 }, { "epoch": 8.082624544349938, "grad_norm": 0.9973198175430298, "learning_rate": 9.981e-06, "loss": 0.1144, "step": 3330 }, { "epoch": 8.08505467800729, "grad_norm": 0.7392475605010986, "learning_rate": 9.984e-06, "loss": 0.0716, "step": 3331 }, { "epoch": 8.087484811664641, "grad_norm": 1.0053342580795288, "learning_rate": 9.987e-06, "loss": 0.0924, "step": 3332 }, { "epoch": 8.089914945321993, "grad_norm": 1.1457382440567017, "learning_rate": 9.990000000000001e-06, "loss": 0.1168, "step": 3333 }, { "epoch": 8.092345078979344, "grad_norm": 1.0653412342071533, "learning_rate": 9.993e-06, "loss": 0.1054, "step": 3334 }, { "epoch": 8.094775212636694, "grad_norm": 1.0976039171218872, "learning_rate": 9.996e-06, "loss": 0.1219, "step": 3335 }, { "epoch": 8.097205346294047, "grad_norm": 1.0033286809921265, "learning_rate": 9.999e-06, "loss": 0.0916, "step": 3336 }, { "epoch": 8.099635479951397, "grad_norm": 1.0651520490646362, "learning_rate": 1.0002e-05, "loss": 0.0816, "step": 3337 }, { "epoch": 8.10206561360875, 
"grad_norm": 1.0197631120681763, "learning_rate": 1.0005000000000002e-05, "loss": 0.0907, "step": 3338 }, { "epoch": 8.1044957472661, "grad_norm": 1.0743993520736694, "learning_rate": 1.0008e-05, "loss": 0.0765, "step": 3339 }, { "epoch": 8.10692588092345, "grad_norm": 1.6139205694198608, "learning_rate": 1.0011e-05, "loss": 0.151, "step": 3340 }, { "epoch": 8.109356014580802, "grad_norm": 1.2055065631866455, "learning_rate": 1.0014e-05, "loss": 0.1074, "step": 3341 }, { "epoch": 8.111786148238153, "grad_norm": 1.3420093059539795, "learning_rate": 1.0016999999999999e-05, "loss": 0.1153, "step": 3342 }, { "epoch": 8.114216281895505, "grad_norm": 0.917809247970581, "learning_rate": 1.002e-05, "loss": 0.0896, "step": 3343 }, { "epoch": 8.116646415552855, "grad_norm": 1.7452410459518433, "learning_rate": 1.0023e-05, "loss": 0.1368, "step": 3344 }, { "epoch": 8.119076549210206, "grad_norm": 1.7893091440200806, "learning_rate": 1.0026e-05, "loss": 0.1202, "step": 3345 }, { "epoch": 8.121506682867558, "grad_norm": 2.703795909881592, "learning_rate": 1.0029e-05, "loss": 0.2271, "step": 3346 }, { "epoch": 8.123936816524909, "grad_norm": 2.3293251991271973, "learning_rate": 1.0032e-05, "loss": 0.4814, "step": 3347 }, { "epoch": 8.12636695018226, "grad_norm": 1.5195180177688599, "learning_rate": 1.0035000000000001e-05, "loss": 0.371, "step": 3348 }, { "epoch": 8.128797083839611, "grad_norm": 1.0532809495925903, "learning_rate": 1.0038000000000001e-05, "loss": 0.2676, "step": 3349 }, { "epoch": 8.131227217496962, "grad_norm": 1.7559527158737183, "learning_rate": 1.0041000000000001e-05, "loss": 0.2835, "step": 3350 }, { "epoch": 8.133657351154314, "grad_norm": 1.1560602188110352, "learning_rate": 1.0043999999999999e-05, "loss": 0.2383, "step": 3351 }, { "epoch": 8.136087484811664, "grad_norm": 0.9639642238616943, "learning_rate": 1.0046999999999999e-05, "loss": 0.1856, "step": 3352 }, { "epoch": 8.138517618469017, "grad_norm": 0.8354083895683289, "learning_rate": 1.005e-05, 
"loss": 0.1498, "step": 3353 }, { "epoch": 8.140947752126367, "grad_norm": 0.700840175151825, "learning_rate": 1.0053e-05, "loss": 0.142, "step": 3354 }, { "epoch": 8.143377885783718, "grad_norm": 0.7386534810066223, "learning_rate": 1.0056e-05, "loss": 0.1214, "step": 3355 }, { "epoch": 8.14580801944107, "grad_norm": 1.7404584884643555, "learning_rate": 1.0059e-05, "loss": 0.1398, "step": 3356 }, { "epoch": 8.14823815309842, "grad_norm": 0.7013133764266968, "learning_rate": 1.0062e-05, "loss": 0.1029, "step": 3357 }, { "epoch": 8.15066828675577, "grad_norm": 0.7254215478897095, "learning_rate": 1.0065000000000001e-05, "loss": 0.095, "step": 3358 }, { "epoch": 8.153098420413123, "grad_norm": 0.611047089099884, "learning_rate": 1.0068e-05, "loss": 0.0711, "step": 3359 }, { "epoch": 8.155528554070473, "grad_norm": 0.9504143595695496, "learning_rate": 1.0071e-05, "loss": 0.0822, "step": 3360 }, { "epoch": 8.157958687727826, "grad_norm": 0.7732507586479187, "learning_rate": 1.0074e-05, "loss": 0.0956, "step": 3361 }, { "epoch": 8.160388821385176, "grad_norm": 0.9526719450950623, "learning_rate": 1.0077e-05, "loss": 0.0814, "step": 3362 }, { "epoch": 8.162818955042527, "grad_norm": 0.694277286529541, "learning_rate": 1.008e-05, "loss": 0.0935, "step": 3363 }, { "epoch": 8.165249088699879, "grad_norm": 0.7406065464019775, "learning_rate": 1.0083e-05, "loss": 0.0758, "step": 3364 }, { "epoch": 8.16767922235723, "grad_norm": 0.7371125221252441, "learning_rate": 1.0086e-05, "loss": 0.0605, "step": 3365 }, { "epoch": 8.170109356014581, "grad_norm": 0.9215192198753357, "learning_rate": 1.0089e-05, "loss": 0.0968, "step": 3366 }, { "epoch": 8.172539489671932, "grad_norm": 0.7721760272979736, "learning_rate": 1.0092e-05, "loss": 0.0803, "step": 3367 }, { "epoch": 8.174969623329282, "grad_norm": 0.7593980431556702, "learning_rate": 1.0095e-05, "loss": 0.0772, "step": 3368 }, { "epoch": 8.177399756986635, "grad_norm": 0.7838420867919922, "learning_rate": 1.0098e-05, "loss": 
0.073, "step": 3369 }, { "epoch": 8.179829890643985, "grad_norm": 1.2056387662887573, "learning_rate": 1.0101e-05, "loss": 0.1407, "step": 3370 }, { "epoch": 8.182260024301337, "grad_norm": 0.9442618489265442, "learning_rate": 1.0104e-05, "loss": 0.0749, "step": 3371 }, { "epoch": 8.184690157958688, "grad_norm": 1.990085482597351, "learning_rate": 1.0107e-05, "loss": 0.0615, "step": 3372 }, { "epoch": 8.187120291616038, "grad_norm": 0.8726717233657837, "learning_rate": 1.0110000000000001e-05, "loss": 0.0741, "step": 3373 }, { "epoch": 8.18955042527339, "grad_norm": 0.8089768290519714, "learning_rate": 1.0113000000000001e-05, "loss": 0.0619, "step": 3374 }, { "epoch": 8.19198055893074, "grad_norm": 0.8940693140029907, "learning_rate": 1.0116000000000001e-05, "loss": 0.089, "step": 3375 }, { "epoch": 8.194410692588093, "grad_norm": 0.6259778141975403, "learning_rate": 1.0119e-05, "loss": 0.0576, "step": 3376 }, { "epoch": 8.196840826245444, "grad_norm": 1.098593831062317, "learning_rate": 1.0121999999999999e-05, "loss": 0.1133, "step": 3377 }, { "epoch": 8.199270959902794, "grad_norm": 0.821115255355835, "learning_rate": 1.0125e-05, "loss": 0.0891, "step": 3378 }, { "epoch": 8.201701093560146, "grad_norm": 0.9128598570823669, "learning_rate": 1.0128e-05, "loss": 0.0777, "step": 3379 }, { "epoch": 8.204131227217497, "grad_norm": 1.3114391565322876, "learning_rate": 1.0131e-05, "loss": 0.085, "step": 3380 }, { "epoch": 8.206561360874849, "grad_norm": 2.8821959495544434, "learning_rate": 1.0134e-05, "loss": 0.2075, "step": 3381 }, { "epoch": 8.2089914945322, "grad_norm": 2.593290090560913, "learning_rate": 1.0137e-05, "loss": 0.1069, "step": 3382 }, { "epoch": 8.21142162818955, "grad_norm": 1.1286592483520508, "learning_rate": 1.0140000000000001e-05, "loss": 0.1006, "step": 3383 }, { "epoch": 8.213851761846902, "grad_norm": 0.9853768944740295, "learning_rate": 1.0143000000000001e-05, "loss": 0.0719, "step": 3384 }, { "epoch": 8.216281895504252, "grad_norm": 
0.8320932388305664, "learning_rate": 1.0146e-05, "loss": 0.0805, "step": 3385 }, { "epoch": 8.218712029161605, "grad_norm": 1.2076342105865479, "learning_rate": 1.0149e-05, "loss": 0.0892, "step": 3386 }, { "epoch": 8.221142162818955, "grad_norm": 1.0470387935638428, "learning_rate": 1.0152e-05, "loss": 0.0789, "step": 3387 }, { "epoch": 8.223572296476306, "grad_norm": 1.1636884212493896, "learning_rate": 1.0155e-05, "loss": 0.078, "step": 3388 }, { "epoch": 8.226002430133658, "grad_norm": 0.99946528673172, "learning_rate": 1.0158e-05, "loss": 0.0895, "step": 3389 }, { "epoch": 8.228432563791008, "grad_norm": 2.200887441635132, "learning_rate": 1.0161e-05, "loss": 0.1202, "step": 3390 }, { "epoch": 8.23086269744836, "grad_norm": 0.9406114220619202, "learning_rate": 1.0164e-05, "loss": 0.0983, "step": 3391 }, { "epoch": 8.233292831105711, "grad_norm": 1.2179685831069946, "learning_rate": 1.0167e-05, "loss": 0.1242, "step": 3392 }, { "epoch": 8.235722964763061, "grad_norm": 1.6630107164382935, "learning_rate": 1.0170000000000001e-05, "loss": 0.1305, "step": 3393 }, { "epoch": 8.238153098420414, "grad_norm": 1.171414852142334, "learning_rate": 1.0173e-05, "loss": 0.0835, "step": 3394 }, { "epoch": 8.240583232077764, "grad_norm": 1.8438276052474976, "learning_rate": 1.0176e-05, "loss": 0.1907, "step": 3395 }, { "epoch": 8.243013365735115, "grad_norm": 2.665879726409912, "learning_rate": 1.0179e-05, "loss": 0.2199, "step": 3396 }, { "epoch": 8.245443499392467, "grad_norm": 2.4092888832092285, "learning_rate": 1.0182e-05, "loss": 0.4623, "step": 3397 }, { "epoch": 8.247873633049817, "grad_norm": 0.8743352890014648, "learning_rate": 1.0185000000000002e-05, "loss": 0.3162, "step": 3398 }, { "epoch": 8.25030376670717, "grad_norm": 0.8364173173904419, "learning_rate": 1.0188000000000001e-05, "loss": 0.2817, "step": 3399 }, { "epoch": 8.25273390036452, "grad_norm": 1.4031034708023071, "learning_rate": 1.0191e-05, "loss": 0.2663, "step": 3400 }, { "epoch": 8.25516403402187, 
"grad_norm": 1.069007396697998, "learning_rate": 1.0194e-05, "loss": 0.2507, "step": 3401 }, { "epoch": 8.257594167679223, "grad_norm": 0.7592563629150391, "learning_rate": 1.0196999999999999e-05, "loss": 0.1461, "step": 3402 }, { "epoch": 8.260024301336573, "grad_norm": 1.1186316013336182, "learning_rate": 1.02e-05, "loss": 0.1594, "step": 3403 }, { "epoch": 8.262454434993925, "grad_norm": 1.1932214498519897, "learning_rate": 1.0203e-05, "loss": 0.1298, "step": 3404 }, { "epoch": 8.264884568651276, "grad_norm": 0.9672958850860596, "learning_rate": 1.0206e-05, "loss": 0.1379, "step": 3405 }, { "epoch": 8.267314702308626, "grad_norm": 0.609164297580719, "learning_rate": 1.0209e-05, "loss": 0.1001, "step": 3406 }, { "epoch": 8.269744835965978, "grad_norm": 0.6356159448623657, "learning_rate": 1.0212e-05, "loss": 0.1033, "step": 3407 }, { "epoch": 8.272174969623329, "grad_norm": 1.0680944919586182, "learning_rate": 1.0215000000000001e-05, "loss": 0.1153, "step": 3408 }, { "epoch": 8.274605103280681, "grad_norm": 0.782227635383606, "learning_rate": 1.0218000000000001e-05, "loss": 0.1087, "step": 3409 }, { "epoch": 8.277035236938032, "grad_norm": 0.6446893215179443, "learning_rate": 1.0221000000000001e-05, "loss": 0.0692, "step": 3410 }, { "epoch": 8.279465370595382, "grad_norm": 0.704138457775116, "learning_rate": 1.0224e-05, "loss": 0.0986, "step": 3411 }, { "epoch": 8.281895504252734, "grad_norm": 1.0197941064834595, "learning_rate": 1.0227e-05, "loss": 0.1012, "step": 3412 }, { "epoch": 8.284325637910085, "grad_norm": 0.9315072298049927, "learning_rate": 1.023e-05, "loss": 0.0923, "step": 3413 }, { "epoch": 8.286755771567437, "grad_norm": 1.138037085533142, "learning_rate": 1.0233e-05, "loss": 0.0998, "step": 3414 }, { "epoch": 8.289185905224787, "grad_norm": 0.808312177658081, "learning_rate": 1.0236e-05, "loss": 0.0883, "step": 3415 }, { "epoch": 8.291616038882138, "grad_norm": 0.7769222259521484, "learning_rate": 1.0239e-05, "loss": 0.068, "step": 3416 }, { 
"epoch": 8.29404617253949, "grad_norm": 0.5653959512710571, "learning_rate": 1.0242e-05, "loss": 0.0567, "step": 3417 }, { "epoch": 8.29647630619684, "grad_norm": 0.8083851933479309, "learning_rate": 1.0245000000000001e-05, "loss": 0.0723, "step": 3418 }, { "epoch": 8.298906439854193, "grad_norm": 0.8489545583724976, "learning_rate": 1.0248e-05, "loss": 0.1098, "step": 3419 }, { "epoch": 8.301336573511543, "grad_norm": 0.5909852981567383, "learning_rate": 1.0251e-05, "loss": 0.0566, "step": 3420 }, { "epoch": 8.303766707168894, "grad_norm": 0.6080713272094727, "learning_rate": 1.0254e-05, "loss": 0.079, "step": 3421 }, { "epoch": 8.306196840826246, "grad_norm": 0.7256085872650146, "learning_rate": 1.0257e-05, "loss": 0.0723, "step": 3422 }, { "epoch": 8.308626974483596, "grad_norm": 0.9664333462715149, "learning_rate": 1.0260000000000002e-05, "loss": 0.1196, "step": 3423 }, { "epoch": 8.311057108140949, "grad_norm": 1.2209690809249878, "learning_rate": 1.0263000000000002e-05, "loss": 0.0957, "step": 3424 }, { "epoch": 8.313487241798299, "grad_norm": 0.8502681255340576, "learning_rate": 1.0266e-05, "loss": 0.083, "step": 3425 }, { "epoch": 8.31591737545565, "grad_norm": 0.7487565279006958, "learning_rate": 1.0269e-05, "loss": 0.0675, "step": 3426 }, { "epoch": 8.318347509113002, "grad_norm": 0.7225878834724426, "learning_rate": 1.0272e-05, "loss": 0.0695, "step": 3427 }, { "epoch": 8.320777642770352, "grad_norm": 0.9178133010864258, "learning_rate": 1.0275e-05, "loss": 0.1014, "step": 3428 }, { "epoch": 8.323207776427704, "grad_norm": 0.9607024192810059, "learning_rate": 1.0278e-05, "loss": 0.0905, "step": 3429 }, { "epoch": 8.325637910085055, "grad_norm": 0.8545099496841431, "learning_rate": 1.0281e-05, "loss": 0.0937, "step": 3430 }, { "epoch": 8.328068043742405, "grad_norm": 1.2912176847457886, "learning_rate": 1.0284e-05, "loss": 0.1123, "step": 3431 }, { "epoch": 8.330498177399758, "grad_norm": 0.9292144775390625, "learning_rate": 1.0287e-05, "loss": 0.0921, 
"step": 3432 }, { "epoch": 8.332928311057108, "grad_norm": 2.0643205642700195, "learning_rate": 1.0290000000000001e-05, "loss": 0.2073, "step": 3433 }, { "epoch": 8.335358444714458, "grad_norm": 0.8849003314971924, "learning_rate": 1.0293000000000001e-05, "loss": 0.084, "step": 3434 }, { "epoch": 8.33778857837181, "grad_norm": 1.1366183757781982, "learning_rate": 1.0296000000000001e-05, "loss": 0.1046, "step": 3435 }, { "epoch": 8.340218712029161, "grad_norm": 1.1191729307174683, "learning_rate": 1.0299e-05, "loss": 0.0938, "step": 3436 }, { "epoch": 8.342648845686513, "grad_norm": 1.7082915306091309, "learning_rate": 1.0301999999999999e-05, "loss": 0.1008, "step": 3437 }, { "epoch": 8.345078979343864, "grad_norm": 1.4962348937988281, "learning_rate": 1.0305e-05, "loss": 0.0894, "step": 3438 }, { "epoch": 8.347509113001214, "grad_norm": 1.5547372102737427, "learning_rate": 1.0308e-05, "loss": 0.1391, "step": 3439 }, { "epoch": 8.349939246658566, "grad_norm": 0.951866626739502, "learning_rate": 1.0311e-05, "loss": 0.0727, "step": 3440 }, { "epoch": 8.352369380315917, "grad_norm": 0.9278546571731567, "learning_rate": 1.0314e-05, "loss": 0.0997, "step": 3441 }, { "epoch": 8.35479951397327, "grad_norm": 1.3074963092803955, "learning_rate": 1.0317e-05, "loss": 0.1045, "step": 3442 }, { "epoch": 8.35722964763062, "grad_norm": 1.1573572158813477, "learning_rate": 1.032e-05, "loss": 0.1302, "step": 3443 }, { "epoch": 8.35965978128797, "grad_norm": 1.6086348295211792, "learning_rate": 1.0323000000000001e-05, "loss": 0.1479, "step": 3444 }, { "epoch": 8.362089914945322, "grad_norm": 1.7546278238296509, "learning_rate": 1.0326e-05, "loss": 0.168, "step": 3445 }, { "epoch": 8.364520048602673, "grad_norm": 2.673367738723755, "learning_rate": 1.0329e-05, "loss": 0.1883, "step": 3446 }, { "epoch": 8.366950182260025, "grad_norm": 2.2090859413146973, "learning_rate": 1.0332e-05, "loss": 0.4294, "step": 3447 }, { "epoch": 8.369380315917375, "grad_norm": 1.1766200065612793, 
"learning_rate": 1.0335e-05, "loss": 0.3613, "step": 3448 }, { "epoch": 8.371810449574726, "grad_norm": 0.9572736024856567, "learning_rate": 1.0338e-05, "loss": 0.293, "step": 3449 }, { "epoch": 8.374240583232078, "grad_norm": 1.133418083190918, "learning_rate": 1.0341e-05, "loss": 0.2633, "step": 3450 }, { "epoch": 8.376670716889429, "grad_norm": 0.9965869784355164, "learning_rate": 1.0344e-05, "loss": 0.2491, "step": 3451 }, { "epoch": 8.37910085054678, "grad_norm": 1.2730618715286255, "learning_rate": 1.0347e-05, "loss": 0.1926, "step": 3452 }, { "epoch": 8.381530984204131, "grad_norm": 0.980194091796875, "learning_rate": 1.035e-05, "loss": 0.2077, "step": 3453 }, { "epoch": 8.383961117861482, "grad_norm": 1.1543320417404175, "learning_rate": 1.0353e-05, "loss": 0.1467, "step": 3454 }, { "epoch": 8.386391251518834, "grad_norm": 1.498732089996338, "learning_rate": 1.0356e-05, "loss": 0.1572, "step": 3455 }, { "epoch": 8.388821385176184, "grad_norm": 0.8581307530403137, "learning_rate": 1.0359e-05, "loss": 0.1121, "step": 3456 }, { "epoch": 8.391251518833537, "grad_norm": 1.1963766813278198, "learning_rate": 1.0362e-05, "loss": 0.0995, "step": 3457 }, { "epoch": 8.393681652490887, "grad_norm": 3.1727123260498047, "learning_rate": 1.0365e-05, "loss": 0.1173, "step": 3458 }, { "epoch": 8.396111786148237, "grad_norm": 0.6057112812995911, "learning_rate": 1.0368000000000001e-05, "loss": 0.0632, "step": 3459 }, { "epoch": 8.39854191980559, "grad_norm": 0.8342182040214539, "learning_rate": 1.0371000000000001e-05, "loss": 0.0975, "step": 3460 }, { "epoch": 8.40097205346294, "grad_norm": 0.8255597352981567, "learning_rate": 1.0374000000000001e-05, "loss": 0.1072, "step": 3461 }, { "epoch": 8.403402187120292, "grad_norm": 0.6392787098884583, "learning_rate": 1.0376999999999999e-05, "loss": 0.0709, "step": 3462 }, { "epoch": 8.405832320777643, "grad_norm": 0.7596831917762756, "learning_rate": 1.0379999999999999e-05, "loss": 0.1112, "step": 3463 }, { "epoch": 
8.408262454434993, "grad_norm": 0.7485291361808777, "learning_rate": 1.0383e-05, "loss": 0.1067, "step": 3464 }, { "epoch": 8.410692588092346, "grad_norm": 0.7450830340385437, "learning_rate": 1.0386e-05, "loss": 0.0654, "step": 3465 }, { "epoch": 8.413122721749696, "grad_norm": 0.7806029319763184, "learning_rate": 1.0389e-05, "loss": 0.0861, "step": 3466 }, { "epoch": 8.415552855407048, "grad_norm": 0.7042810916900635, "learning_rate": 1.0392e-05, "loss": 0.0615, "step": 3467 }, { "epoch": 8.417982989064399, "grad_norm": 0.5624070763587952, "learning_rate": 1.0395e-05, "loss": 0.0675, "step": 3468 }, { "epoch": 8.42041312272175, "grad_norm": 0.8562686443328857, "learning_rate": 1.0398000000000001e-05, "loss": 0.0838, "step": 3469 }, { "epoch": 8.422843256379101, "grad_norm": 0.48260498046875, "learning_rate": 1.0401000000000001e-05, "loss": 0.0542, "step": 3470 }, { "epoch": 8.425273390036452, "grad_norm": 0.7684537768363953, "learning_rate": 1.0404e-05, "loss": 0.0633, "step": 3471 }, { "epoch": 8.427703523693804, "grad_norm": 0.6516621112823486, "learning_rate": 1.0407e-05, "loss": 0.0614, "step": 3472 }, { "epoch": 8.430133657351154, "grad_norm": 0.9535280466079712, "learning_rate": 1.041e-05, "loss": 0.1377, "step": 3473 }, { "epoch": 8.432563791008505, "grad_norm": 1.1930365562438965, "learning_rate": 1.0413e-05, "loss": 0.0883, "step": 3474 }, { "epoch": 8.434993924665857, "grad_norm": 0.8221307992935181, "learning_rate": 1.0416e-05, "loss": 0.066, "step": 3475 }, { "epoch": 8.437424058323208, "grad_norm": 0.923721432685852, "learning_rate": 1.0419e-05, "loss": 0.0814, "step": 3476 }, { "epoch": 8.439854191980558, "grad_norm": 0.9146966338157654, "learning_rate": 1.0422e-05, "loss": 0.0693, "step": 3477 }, { "epoch": 8.44228432563791, "grad_norm": 0.8255177140235901, "learning_rate": 1.0425e-05, "loss": 0.0677, "step": 3478 }, { "epoch": 8.44471445929526, "grad_norm": 1.2606055736541748, "learning_rate": 1.0428e-05, "loss": 0.0678, "step": 3479 }, { "epoch": 
8.447144592952613, "grad_norm": 0.9394828081130981, "learning_rate": 1.0431e-05, "loss": 0.0797, "step": 3480 }, { "epoch": 8.449574726609963, "grad_norm": 1.031982421875, "learning_rate": 1.0434e-05, "loss": 0.0909, "step": 3481 }, { "epoch": 8.452004860267314, "grad_norm": 0.8763875365257263, "learning_rate": 1.0437e-05, "loss": 0.0825, "step": 3482 }, { "epoch": 8.454434993924666, "grad_norm": 1.0307679176330566, "learning_rate": 1.044e-05, "loss": 0.1128, "step": 3483 }, { "epoch": 8.456865127582017, "grad_norm": 0.8055126667022705, "learning_rate": 1.0443000000000001e-05, "loss": 0.0692, "step": 3484 }, { "epoch": 8.459295261239369, "grad_norm": 0.9382497072219849, "learning_rate": 1.0446000000000001e-05, "loss": 0.095, "step": 3485 }, { "epoch": 8.46172539489672, "grad_norm": 0.9474718570709229, "learning_rate": 1.0449e-05, "loss": 0.0766, "step": 3486 }, { "epoch": 8.46415552855407, "grad_norm": 0.8343275189399719, "learning_rate": 1.0452e-05, "loss": 0.0845, "step": 3487 }, { "epoch": 8.466585662211422, "grad_norm": 0.9707762002944946, "learning_rate": 1.0454999999999999e-05, "loss": 0.0942, "step": 3488 }, { "epoch": 8.469015795868772, "grad_norm": 1.3272227048873901, "learning_rate": 1.0458e-05, "loss": 0.0734, "step": 3489 }, { "epoch": 8.471445929526125, "grad_norm": 1.2156968116760254, "learning_rate": 1.0461e-05, "loss": 0.1493, "step": 3490 }, { "epoch": 8.473876063183475, "grad_norm": 1.270708441734314, "learning_rate": 1.0464e-05, "loss": 0.0884, "step": 3491 }, { "epoch": 8.476306196840826, "grad_norm": 1.043908715248108, "learning_rate": 1.0467e-05, "loss": 0.1071, "step": 3492 }, { "epoch": 8.478736330498178, "grad_norm": 2.288722515106201, "learning_rate": 1.047e-05, "loss": 0.1036, "step": 3493 }, { "epoch": 8.481166464155528, "grad_norm": 2.1226513385772705, "learning_rate": 1.0473000000000001e-05, "loss": 0.1218, "step": 3494 }, { "epoch": 8.48359659781288, "grad_norm": 1.8582367897033691, "learning_rate": 1.0476000000000001e-05, "loss": 
0.1829, "step": 3495 }, { "epoch": 8.486026731470231, "grad_norm": 3.509232759475708, "learning_rate": 1.0479e-05, "loss": 0.3275, "step": 3496 }, { "epoch": 8.488456865127581, "grad_norm": 3.1661152839660645, "learning_rate": 1.0482e-05, "loss": 0.4861, "step": 3497 }, { "epoch": 8.490886998784934, "grad_norm": 1.298532247543335, "learning_rate": 1.0485e-05, "loss": 0.3878, "step": 3498 }, { "epoch": 8.493317132442284, "grad_norm": 1.1581006050109863, "learning_rate": 1.0488e-05, "loss": 0.3581, "step": 3499 }, { "epoch": 8.495747266099636, "grad_norm": 1.3970917463302612, "learning_rate": 1.0491e-05, "loss": 0.282, "step": 3500 }, { "epoch": 8.498177399756987, "grad_norm": 1.7504945993423462, "learning_rate": 1.0494e-05, "loss": 0.351, "step": 3501 }, { "epoch": 8.500607533414337, "grad_norm": 1.0662323236465454, "learning_rate": 1.0497e-05, "loss": 0.1955, "step": 3502 }, { "epoch": 8.50303766707169, "grad_norm": 0.9828105568885803, "learning_rate": 1.05e-05, "loss": 0.2278, "step": 3503 }, { "epoch": 8.50546780072904, "grad_norm": 0.7627030611038208, "learning_rate": 1.0503000000000001e-05, "loss": 0.1591, "step": 3504 }, { "epoch": 8.507897934386392, "grad_norm": 0.7521127462387085, "learning_rate": 1.0506e-05, "loss": 0.1052, "step": 3505 }, { "epoch": 8.510328068043743, "grad_norm": 0.8349062204360962, "learning_rate": 1.0509e-05, "loss": 0.0897, "step": 3506 }, { "epoch": 8.512758201701093, "grad_norm": 0.827431857585907, "learning_rate": 1.0512e-05, "loss": 0.0997, "step": 3507 }, { "epoch": 8.515188335358445, "grad_norm": 0.7806961536407471, "learning_rate": 1.0515e-05, "loss": 0.0973, "step": 3508 }, { "epoch": 8.517618469015796, "grad_norm": 0.7897675633430481, "learning_rate": 1.0518000000000002e-05, "loss": 0.1154, "step": 3509 }, { "epoch": 8.520048602673146, "grad_norm": 0.8918806314468384, "learning_rate": 1.0521000000000001e-05, "loss": 0.0791, "step": 3510 }, { "epoch": 8.522478736330498, "grad_norm": 0.6043018698692322, "learning_rate": 
1.0524e-05, "loss": 0.0744, "step": 3511 }, { "epoch": 8.524908869987849, "grad_norm": 0.7331991195678711, "learning_rate": 1.0527e-05, "loss": 0.0747, "step": 3512 }, { "epoch": 8.527339003645201, "grad_norm": 0.6114159822463989, "learning_rate": 1.0529999999999999e-05, "loss": 0.0622, "step": 3513 }, { "epoch": 8.529769137302551, "grad_norm": 0.6618512272834778, "learning_rate": 1.0533e-05, "loss": 0.0857, "step": 3514 }, { "epoch": 8.532199270959904, "grad_norm": 0.8325181007385254, "learning_rate": 1.0536e-05, "loss": 0.0685, "step": 3515 }, { "epoch": 8.534629404617254, "grad_norm": 0.7010734677314758, "learning_rate": 1.0539e-05, "loss": 0.0895, "step": 3516 }, { "epoch": 8.537059538274605, "grad_norm": 1.066476583480835, "learning_rate": 1.0542e-05, "loss": 0.1043, "step": 3517 }, { "epoch": 8.539489671931957, "grad_norm": 0.8672319650650024, "learning_rate": 1.0545e-05, "loss": 0.076, "step": 3518 }, { "epoch": 8.541919805589307, "grad_norm": 1.1523585319519043, "learning_rate": 1.0548000000000001e-05, "loss": 0.1817, "step": 3519 }, { "epoch": 8.544349939246658, "grad_norm": 0.6571928262710571, "learning_rate": 1.0551000000000001e-05, "loss": 0.0715, "step": 3520 }, { "epoch": 8.54678007290401, "grad_norm": 0.998899519443512, "learning_rate": 1.0554000000000001e-05, "loss": 0.0884, "step": 3521 }, { "epoch": 8.54921020656136, "grad_norm": 1.1835187673568726, "learning_rate": 1.0557e-05, "loss": 0.1749, "step": 3522 }, { "epoch": 8.551640340218713, "grad_norm": 0.7644861936569214, "learning_rate": 1.0559999999999999e-05, "loss": 0.072, "step": 3523 }, { "epoch": 8.554070473876063, "grad_norm": 0.6960921883583069, "learning_rate": 1.0563e-05, "loss": 0.078, "step": 3524 }, { "epoch": 8.556500607533414, "grad_norm": 0.7844071984291077, "learning_rate": 1.0566e-05, "loss": 0.0847, "step": 3525 }, { "epoch": 8.558930741190766, "grad_norm": 0.9127982258796692, "learning_rate": 1.0569e-05, "loss": 0.1036, "step": 3526 }, { "epoch": 8.561360874848116, "grad_norm": 
0.855091392993927, "learning_rate": 1.0572e-05, "loss": 0.0868, "step": 3527 }, { "epoch": 8.563791008505468, "grad_norm": 0.8794645667076111, "learning_rate": 1.0575e-05, "loss": 0.0787, "step": 3528 }, { "epoch": 8.566221142162819, "grad_norm": 0.8907430171966553, "learning_rate": 1.0578000000000001e-05, "loss": 0.1016, "step": 3529 }, { "epoch": 8.56865127582017, "grad_norm": 0.7788780331611633, "learning_rate": 1.0581e-05, "loss": 0.0815, "step": 3530 }, { "epoch": 8.571081409477522, "grad_norm": 1.152103304862976, "learning_rate": 1.0584e-05, "loss": 0.103, "step": 3531 }, { "epoch": 8.573511543134872, "grad_norm": 1.2630947828292847, "learning_rate": 1.0587e-05, "loss": 0.1778, "step": 3532 }, { "epoch": 8.575941676792224, "grad_norm": 0.9309837222099304, "learning_rate": 1.059e-05, "loss": 0.0752, "step": 3533 }, { "epoch": 8.578371810449575, "grad_norm": 1.4545365571975708, "learning_rate": 1.0593000000000002e-05, "loss": 0.0812, "step": 3534 }, { "epoch": 8.580801944106925, "grad_norm": 0.8539435863494873, "learning_rate": 1.0596e-05, "loss": 0.0891, "step": 3535 }, { "epoch": 8.583232077764277, "grad_norm": 1.3060234785079956, "learning_rate": 1.0599e-05, "loss": 0.1245, "step": 3536 }, { "epoch": 8.585662211421628, "grad_norm": 1.510538935661316, "learning_rate": 1.0602e-05, "loss": 0.0934, "step": 3537 }, { "epoch": 8.58809234507898, "grad_norm": 1.1173698902130127, "learning_rate": 1.0605e-05, "loss": 0.1314, "step": 3538 }, { "epoch": 8.59052247873633, "grad_norm": 1.0071083307266235, "learning_rate": 1.0608e-05, "loss": 0.0746, "step": 3539 }, { "epoch": 8.592952612393681, "grad_norm": 0.9657981395721436, "learning_rate": 1.0611e-05, "loss": 0.0895, "step": 3540 }, { "epoch": 8.595382746051033, "grad_norm": 1.2430858612060547, "learning_rate": 1.0614e-05, "loss": 0.1139, "step": 3541 }, { "epoch": 8.597812879708384, "grad_norm": 1.8318941593170166, "learning_rate": 1.0617e-05, "loss": 0.0952, "step": 3542 }, { "epoch": 8.600243013365736, "grad_norm": 
1.7141155004501343, "learning_rate": 1.062e-05, "loss": 0.1155, "step": 3543 }, { "epoch": 8.602673147023086, "grad_norm": 2.105135440826416, "learning_rate": 1.0623000000000001e-05, "loss": 0.1155, "step": 3544 }, { "epoch": 8.605103280680437, "grad_norm": 1.3104794025421143, "learning_rate": 1.0626000000000001e-05, "loss": 0.1307, "step": 3545 }, { "epoch": 8.607533414337789, "grad_norm": 1.9472349882125854, "learning_rate": 1.0629000000000001e-05, "loss": 0.1877, "step": 3546 }, { "epoch": 8.60996354799514, "grad_norm": 2.208508014678955, "learning_rate": 1.0632000000000001e-05, "loss": 0.4003, "step": 3547 }, { "epoch": 8.612393681652492, "grad_norm": 0.9619177579879761, "learning_rate": 1.0634999999999999e-05, "loss": 0.2969, "step": 3548 }, { "epoch": 8.614823815309842, "grad_norm": 1.0388904809951782, "learning_rate": 1.0638e-05, "loss": 0.3056, "step": 3549 }, { "epoch": 8.617253948967193, "grad_norm": 1.6510990858078003, "learning_rate": 1.0641e-05, "loss": 0.2501, "step": 3550 }, { "epoch": 8.619684082624545, "grad_norm": 0.9474871754646301, "learning_rate": 1.0644e-05, "loss": 0.186, "step": 3551 }, { "epoch": 8.622114216281895, "grad_norm": 0.896022379398346, "learning_rate": 1.0647e-05, "loss": 0.1778, "step": 3552 }, { "epoch": 8.624544349939246, "grad_norm": 1.0468806028366089, "learning_rate": 1.065e-05, "loss": 0.1421, "step": 3553 }, { "epoch": 8.626974483596598, "grad_norm": 0.7803558111190796, "learning_rate": 1.0653000000000001e-05, "loss": 0.1216, "step": 3554 }, { "epoch": 8.629404617253948, "grad_norm": 0.9084542393684387, "learning_rate": 1.0656000000000001e-05, "loss": 0.1389, "step": 3555 }, { "epoch": 8.6318347509113, "grad_norm": 0.6450456380844116, "learning_rate": 1.0659e-05, "loss": 0.1073, "step": 3556 }, { "epoch": 8.634264884568651, "grad_norm": 0.6236186623573303, "learning_rate": 1.0662e-05, "loss": 0.0876, "step": 3557 }, { "epoch": 8.636695018226002, "grad_norm": 0.7553118467330933, "learning_rate": 1.0665e-05, "loss": 0.0715, 
"step": 3558 }, { "epoch": 8.639125151883354, "grad_norm": 0.8512172102928162, "learning_rate": 1.0668000000000002e-05, "loss": 0.1233, "step": 3559 }, { "epoch": 8.641555285540704, "grad_norm": 0.8153476715087891, "learning_rate": 1.0671e-05, "loss": 0.1027, "step": 3560 }, { "epoch": 8.643985419198057, "grad_norm": 0.7584511041641235, "learning_rate": 1.0674e-05, "loss": 0.0884, "step": 3561 }, { "epoch": 8.646415552855407, "grad_norm": 0.8908721804618835, "learning_rate": 1.0677e-05, "loss": 0.0905, "step": 3562 }, { "epoch": 8.648845686512757, "grad_norm": 0.7374231219291687, "learning_rate": 1.068e-05, "loss": 0.0883, "step": 3563 }, { "epoch": 8.65127582017011, "grad_norm": 1.097661018371582, "learning_rate": 1.0683000000000001e-05, "loss": 0.1222, "step": 3564 }, { "epoch": 8.65370595382746, "grad_norm": 0.6581023335456848, "learning_rate": 1.0686e-05, "loss": 0.0636, "step": 3565 }, { "epoch": 8.656136087484812, "grad_norm": 0.7916021943092346, "learning_rate": 1.0689e-05, "loss": 0.0912, "step": 3566 }, { "epoch": 8.658566221142163, "grad_norm": 0.6695494055747986, "learning_rate": 1.0692e-05, "loss": 0.0558, "step": 3567 }, { "epoch": 8.660996354799513, "grad_norm": 0.7838827967643738, "learning_rate": 1.0695e-05, "loss": 0.087, "step": 3568 }, { "epoch": 8.663426488456865, "grad_norm": 0.6640971302986145, "learning_rate": 1.0698e-05, "loss": 0.0705, "step": 3569 }, { "epoch": 8.665856622114216, "grad_norm": 0.6184972524642944, "learning_rate": 1.0701000000000001e-05, "loss": 0.0756, "step": 3570 }, { "epoch": 8.668286755771568, "grad_norm": 0.7081942558288574, "learning_rate": 1.0704000000000001e-05, "loss": 0.0624, "step": 3571 }, { "epoch": 8.670716889428919, "grad_norm": 0.6558104753494263, "learning_rate": 1.0707e-05, "loss": 0.0761, "step": 3572 }, { "epoch": 8.673147023086269, "grad_norm": 0.9832576513290405, "learning_rate": 1.0709999999999999e-05, "loss": 0.0824, "step": 3573 }, { "epoch": 8.675577156743621, "grad_norm": 0.8671634197235107, 
"learning_rate": 1.0712999999999999e-05, "loss": 0.0883, "step": 3574 }, { "epoch": 8.678007290400972, "grad_norm": 0.8938408493995667, "learning_rate": 1.0716e-05, "loss": 0.0844, "step": 3575 }, { "epoch": 8.680437424058324, "grad_norm": 0.9574825763702393, "learning_rate": 1.0719e-05, "loss": 0.0564, "step": 3576 }, { "epoch": 8.682867557715674, "grad_norm": 1.077308177947998, "learning_rate": 1.0722e-05, "loss": 0.075, "step": 3577 }, { "epoch": 8.685297691373025, "grad_norm": 1.2828798294067383, "learning_rate": 1.0725e-05, "loss": 0.0975, "step": 3578 }, { "epoch": 8.687727825030377, "grad_norm": 0.9457767605781555, "learning_rate": 1.0728e-05, "loss": 0.0713, "step": 3579 }, { "epoch": 8.690157958687728, "grad_norm": 1.3570295572280884, "learning_rate": 1.0731000000000001e-05, "loss": 0.0781, "step": 3580 }, { "epoch": 8.69258809234508, "grad_norm": 1.603902816772461, "learning_rate": 1.0734000000000001e-05, "loss": 0.0981, "step": 3581 }, { "epoch": 8.69501822600243, "grad_norm": 1.1562210321426392, "learning_rate": 1.0737e-05, "loss": 0.0802, "step": 3582 }, { "epoch": 8.69744835965978, "grad_norm": 0.6103928685188293, "learning_rate": 1.074e-05, "loss": 0.0539, "step": 3583 }, { "epoch": 8.699878493317133, "grad_norm": 0.9930464029312134, "learning_rate": 1.0743e-05, "loss": 0.0588, "step": 3584 }, { "epoch": 8.702308626974483, "grad_norm": 0.9710842967033386, "learning_rate": 1.0746e-05, "loss": 0.0893, "step": 3585 }, { "epoch": 8.704738760631834, "grad_norm": 1.3183330297470093, "learning_rate": 1.0749e-05, "loss": 0.0982, "step": 3586 }, { "epoch": 8.707168894289186, "grad_norm": 1.0589584112167358, "learning_rate": 1.0752e-05, "loss": 0.1249, "step": 3587 }, { "epoch": 8.709599027946537, "grad_norm": 0.8317046761512756, "learning_rate": 1.0755e-05, "loss": 0.0463, "step": 3588 }, { "epoch": 8.712029161603889, "grad_norm": 1.0411224365234375, "learning_rate": 1.0758e-05, "loss": 0.1084, "step": 3589 }, { "epoch": 8.71445929526124, "grad_norm": 
1.1228159666061401, "learning_rate": 1.0761e-05, "loss": 0.0918, "step": 3590 }, { "epoch": 8.716889428918591, "grad_norm": 1.6488016843795776, "learning_rate": 1.0764e-05, "loss": 0.097, "step": 3591 }, { "epoch": 8.719319562575942, "grad_norm": 1.1296926736831665, "learning_rate": 1.0767e-05, "loss": 0.0878, "step": 3592 }, { "epoch": 8.721749696233292, "grad_norm": 1.2776216268539429, "learning_rate": 1.077e-05, "loss": 0.101, "step": 3593 }, { "epoch": 8.724179829890645, "grad_norm": 1.7436091899871826, "learning_rate": 1.0773e-05, "loss": 0.1507, "step": 3594 }, { "epoch": 8.726609963547995, "grad_norm": 1.2203038930892944, "learning_rate": 1.0776000000000002e-05, "loss": 0.1045, "step": 3595 }, { "epoch": 8.729040097205345, "grad_norm": 1.7163102626800537, "learning_rate": 1.0779000000000001e-05, "loss": 0.1778, "step": 3596 }, { "epoch": 8.731470230862698, "grad_norm": 1.7791814804077148, "learning_rate": 1.0782e-05, "loss": 0.3988, "step": 3597 }, { "epoch": 8.733900364520048, "grad_norm": 1.113782286643982, "learning_rate": 1.0785e-05, "loss": 0.3103, "step": 3598 }, { "epoch": 8.7363304981774, "grad_norm": 0.7477917075157166, "learning_rate": 1.0787999999999999e-05, "loss": 0.2533, "step": 3599 }, { "epoch": 8.73876063183475, "grad_norm": 0.9131085276603699, "learning_rate": 1.0791e-05, "loss": 0.3139, "step": 3600 }, { "epoch": 8.741190765492101, "grad_norm": 1.0526750087738037, "learning_rate": 1.0794e-05, "loss": 0.2223, "step": 3601 }, { "epoch": 8.743620899149454, "grad_norm": 1.1096440553665161, "learning_rate": 1.0797e-05, "loss": 0.1992, "step": 3602 }, { "epoch": 8.746051032806804, "grad_norm": 0.8725631237030029, "learning_rate": 1.08e-05, "loss": 0.1202, "step": 3603 }, { "epoch": 8.748481166464156, "grad_norm": 0.7813929915428162, "learning_rate": 1.0803e-05, "loss": 0.1126, "step": 3604 }, { "epoch": 8.750911300121507, "grad_norm": 0.606411337852478, "learning_rate": 1.0806000000000001e-05, "loss": 0.0994, "step": 3605 }, { "epoch": 
8.753341433778857, "grad_norm": 0.7085534930229187, "learning_rate": 1.0809000000000001e-05, "loss": 0.135, "step": 3606 }, { "epoch": 8.75577156743621, "grad_norm": 0.6607227325439453, "learning_rate": 1.0812e-05, "loss": 0.1065, "step": 3607 }, { "epoch": 8.75820170109356, "grad_norm": 0.6759333610534668, "learning_rate": 1.0815e-05, "loss": 0.0763, "step": 3608 }, { "epoch": 8.760631834750912, "grad_norm": 0.9977691173553467, "learning_rate": 1.0817999999999999e-05, "loss": 0.0775, "step": 3609 }, { "epoch": 8.763061968408262, "grad_norm": 0.796722948551178, "learning_rate": 1.0821e-05, "loss": 0.098, "step": 3610 }, { "epoch": 8.765492102065613, "grad_norm": 0.7802160382270813, "learning_rate": 1.0824e-05, "loss": 0.0822, "step": 3611 }, { "epoch": 8.767922235722965, "grad_norm": 0.897199273109436, "learning_rate": 1.0827e-05, "loss": 0.1311, "step": 3612 }, { "epoch": 8.770352369380316, "grad_norm": 0.6595299243927002, "learning_rate": 1.083e-05, "loss": 0.0834, "step": 3613 }, { "epoch": 8.772782503037668, "grad_norm": 0.717093825340271, "learning_rate": 1.0833e-05, "loss": 0.0947, "step": 3614 }, { "epoch": 8.775212636695018, "grad_norm": 0.6246829628944397, "learning_rate": 1.0836000000000001e-05, "loss": 0.0606, "step": 3615 }, { "epoch": 8.777642770352369, "grad_norm": 0.6699481010437012, "learning_rate": 1.0839e-05, "loss": 0.0717, "step": 3616 }, { "epoch": 8.780072904009721, "grad_norm": 0.7305915951728821, "learning_rate": 1.0842e-05, "loss": 0.0656, "step": 3617 }, { "epoch": 8.782503037667071, "grad_norm": 1.12618887424469, "learning_rate": 1.0845e-05, "loss": 0.0884, "step": 3618 }, { "epoch": 8.784933171324424, "grad_norm": 0.720097005367279, "learning_rate": 1.0848e-05, "loss": 0.0841, "step": 3619 }, { "epoch": 8.787363304981774, "grad_norm": 0.8564953207969666, "learning_rate": 1.0851000000000002e-05, "loss": 0.0799, "step": 3620 }, { "epoch": 8.789793438639125, "grad_norm": 0.7422672510147095, "learning_rate": 1.0854e-05, "loss": 0.0692, 
"step": 3621 }, { "epoch": 8.792223572296477, "grad_norm": 0.7274274230003357, "learning_rate": 1.0857e-05, "loss": 0.0784, "step": 3622 }, { "epoch": 8.794653705953827, "grad_norm": 1.1418190002441406, "learning_rate": 1.086e-05, "loss": 0.0928, "step": 3623 }, { "epoch": 8.79708383961118, "grad_norm": 0.6944081783294678, "learning_rate": 1.0863e-05, "loss": 0.0726, "step": 3624 }, { "epoch": 8.79951397326853, "grad_norm": 0.7944489121437073, "learning_rate": 1.0866e-05, "loss": 0.0625, "step": 3625 }, { "epoch": 8.80194410692588, "grad_norm": 0.9365942478179932, "learning_rate": 1.0869e-05, "loss": 0.0729, "step": 3626 }, { "epoch": 8.804374240583233, "grad_norm": 1.0071407556533813, "learning_rate": 1.0872e-05, "loss": 0.1066, "step": 3627 }, { "epoch": 8.806804374240583, "grad_norm": 0.8206190466880798, "learning_rate": 1.0875e-05, "loss": 0.0704, "step": 3628 }, { "epoch": 8.809234507897933, "grad_norm": 1.5316742658615112, "learning_rate": 1.0878e-05, "loss": 0.0744, "step": 3629 }, { "epoch": 8.811664641555286, "grad_norm": 0.7422362565994263, "learning_rate": 1.0881000000000001e-05, "loss": 0.0751, "step": 3630 }, { "epoch": 8.814094775212636, "grad_norm": 1.4961318969726562, "learning_rate": 1.0884000000000001e-05, "loss": 0.0697, "step": 3631 }, { "epoch": 8.816524908869988, "grad_norm": 1.0824501514434814, "learning_rate": 1.0887000000000001e-05, "loss": 0.0712, "step": 3632 }, { "epoch": 8.818955042527339, "grad_norm": 1.2299058437347412, "learning_rate": 1.089e-05, "loss": 0.0924, "step": 3633 }, { "epoch": 8.821385176184691, "grad_norm": 1.1616430282592773, "learning_rate": 1.0892999999999999e-05, "loss": 0.0601, "step": 3634 }, { "epoch": 8.823815309842042, "grad_norm": 0.9664846658706665, "learning_rate": 1.0896e-05, "loss": 0.1062, "step": 3635 }, { "epoch": 8.826245443499392, "grad_norm": 1.4107543230056763, "learning_rate": 1.0899e-05, "loss": 0.1195, "step": 3636 }, { "epoch": 8.828675577156744, "grad_norm": 1.1036816835403442, "learning_rate": 
1.0902e-05, "loss": 0.1087, "step": 3637 }, { "epoch": 8.831105710814095, "grad_norm": 1.2652158737182617, "learning_rate": 1.0905e-05, "loss": 0.0917, "step": 3638 }, { "epoch": 8.833535844471445, "grad_norm": 1.4770082235336304, "learning_rate": 1.0908e-05, "loss": 0.1323, "step": 3639 }, { "epoch": 8.835965978128797, "grad_norm": 0.797332227230072, "learning_rate": 1.0911000000000001e-05, "loss": 0.0688, "step": 3640 }, { "epoch": 8.838396111786148, "grad_norm": 1.3993229866027832, "learning_rate": 1.0914000000000001e-05, "loss": 0.1107, "step": 3641 }, { "epoch": 8.8408262454435, "grad_norm": 1.2294425964355469, "learning_rate": 1.0917e-05, "loss": 0.1385, "step": 3642 }, { "epoch": 8.84325637910085, "grad_norm": 1.1293728351593018, "learning_rate": 1.092e-05, "loss": 0.1299, "step": 3643 }, { "epoch": 8.845686512758201, "grad_norm": 1.5230646133422852, "learning_rate": 1.0923e-05, "loss": 0.1578, "step": 3644 }, { "epoch": 8.848116646415553, "grad_norm": 2.0029795169830322, "learning_rate": 1.0926000000000002e-05, "loss": 0.1243, "step": 3645 }, { "epoch": 8.850546780072904, "grad_norm": 3.6453468799591064, "learning_rate": 1.0929e-05, "loss": 0.1776, "step": 3646 }, { "epoch": 8.852976913730256, "grad_norm": 1.633599877357483, "learning_rate": 1.0932e-05, "loss": 0.4582, "step": 3647 }, { "epoch": 8.855407047387606, "grad_norm": 1.2198071479797363, "learning_rate": 1.0935e-05, "loss": 0.3329, "step": 3648 }, { "epoch": 8.857837181044957, "grad_norm": 0.933480978012085, "learning_rate": 1.0938e-05, "loss": 0.2852, "step": 3649 }, { "epoch": 8.860267314702309, "grad_norm": 1.0350075960159302, "learning_rate": 1.0941e-05, "loss": 0.2422, "step": 3650 }, { "epoch": 8.86269744835966, "grad_norm": 1.1329631805419922, "learning_rate": 1.0944e-05, "loss": 0.2515, "step": 3651 }, { "epoch": 8.865127582017012, "grad_norm": 1.1061662435531616, "learning_rate": 1.0947e-05, "loss": 0.1867, "step": 3652 }, { "epoch": 8.867557715674362, "grad_norm": 1.1829988956451416, 
"learning_rate": 1.095e-05, "loss": 0.1742, "step": 3653 }, { "epoch": 8.869987849331713, "grad_norm": 1.2147653102874756, "learning_rate": 1.0953e-05, "loss": 0.155, "step": 3654 }, { "epoch": 8.872417982989065, "grad_norm": 0.6594895124435425, "learning_rate": 1.0956000000000001e-05, "loss": 0.1056, "step": 3655 }, { "epoch": 8.874848116646415, "grad_norm": 0.9235714673995972, "learning_rate": 1.0959000000000001e-05, "loss": 0.1282, "step": 3656 }, { "epoch": 8.877278250303767, "grad_norm": 0.589444637298584, "learning_rate": 1.0962000000000001e-05, "loss": 0.0923, "step": 3657 }, { "epoch": 8.879708383961118, "grad_norm": 0.8475097417831421, "learning_rate": 1.0965e-05, "loss": 0.1073, "step": 3658 }, { "epoch": 8.882138517618468, "grad_norm": 0.8400071263313293, "learning_rate": 1.0967999999999999e-05, "loss": 0.0871, "step": 3659 }, { "epoch": 8.88456865127582, "grad_norm": 0.6567978262901306, "learning_rate": 1.0971e-05, "loss": 0.0852, "step": 3660 }, { "epoch": 8.886998784933171, "grad_norm": 0.6677161455154419, "learning_rate": 1.0974e-05, "loss": 0.0718, "step": 3661 }, { "epoch": 8.889428918590523, "grad_norm": 0.6420778632164001, "learning_rate": 1.0977e-05, "loss": 0.0617, "step": 3662 }, { "epoch": 8.891859052247874, "grad_norm": 0.8281939029693604, "learning_rate": 1.098e-05, "loss": 0.1005, "step": 3663 }, { "epoch": 8.894289185905224, "grad_norm": 0.8518216013908386, "learning_rate": 1.0983e-05, "loss": 0.0892, "step": 3664 }, { "epoch": 8.896719319562576, "grad_norm": 0.7387720942497253, "learning_rate": 1.0986000000000001e-05, "loss": 0.0814, "step": 3665 }, { "epoch": 8.899149453219927, "grad_norm": 0.703399658203125, "learning_rate": 1.0989000000000001e-05, "loss": 0.0709, "step": 3666 }, { "epoch": 8.90157958687728, "grad_norm": 0.6388988494873047, "learning_rate": 1.0992e-05, "loss": 0.0824, "step": 3667 }, { "epoch": 8.90400972053463, "grad_norm": 0.764024555683136, "learning_rate": 1.0995e-05, "loss": 0.1039, "step": 3668 }, { "epoch": 
8.90643985419198, "grad_norm": 1.1038377285003662, "learning_rate": 1.0998e-05, "loss": 0.103, "step": 3669 }, { "epoch": 8.908869987849332, "grad_norm": 0.9863934516906738, "learning_rate": 1.1001e-05, "loss": 0.1258, "step": 3670 }, { "epoch": 8.911300121506683, "grad_norm": 0.686062753200531, "learning_rate": 1.1004e-05, "loss": 0.0953, "step": 3671 }, { "epoch": 8.913730255164033, "grad_norm": 0.7013913989067078, "learning_rate": 1.1007e-05, "loss": 0.0797, "step": 3672 }, { "epoch": 8.916160388821385, "grad_norm": 0.9058762192726135, "learning_rate": 1.101e-05, "loss": 0.1114, "step": 3673 }, { "epoch": 8.918590522478736, "grad_norm": 0.887830376625061, "learning_rate": 1.1013e-05, "loss": 0.0833, "step": 3674 }, { "epoch": 8.921020656136088, "grad_norm": 0.8628110289573669, "learning_rate": 1.1016000000000001e-05, "loss": 0.0988, "step": 3675 }, { "epoch": 8.923450789793439, "grad_norm": 0.8226523995399475, "learning_rate": 1.1019e-05, "loss": 0.0742, "step": 3676 }, { "epoch": 8.925880923450789, "grad_norm": 0.6768421530723572, "learning_rate": 1.1022e-05, "loss": 0.0783, "step": 3677 }, { "epoch": 8.928311057108141, "grad_norm": 0.8569360971450806, "learning_rate": 1.1025e-05, "loss": 0.0978, "step": 3678 }, { "epoch": 8.930741190765492, "grad_norm": 0.9821699857711792, "learning_rate": 1.1028e-05, "loss": 0.1051, "step": 3679 }, { "epoch": 8.933171324422844, "grad_norm": 0.7366060018539429, "learning_rate": 1.1031000000000002e-05, "loss": 0.0684, "step": 3680 }, { "epoch": 8.935601458080194, "grad_norm": 0.8040265440940857, "learning_rate": 1.1034000000000001e-05, "loss": 0.1068, "step": 3681 }, { "epoch": 8.938031591737545, "grad_norm": 0.992822527885437, "learning_rate": 1.1037000000000001e-05, "loss": 0.0957, "step": 3682 }, { "epoch": 8.940461725394897, "grad_norm": 1.0125216245651245, "learning_rate": 1.104e-05, "loss": 0.1079, "step": 3683 }, { "epoch": 8.942891859052247, "grad_norm": 1.3157879114151, "learning_rate": 1.1042999999999999e-05, "loss": 
0.0836, "step": 3684 }, { "epoch": 8.9453219927096, "grad_norm": 0.8167380690574646, "learning_rate": 1.1046e-05, "loss": 0.0609, "step": 3685 }, { "epoch": 8.94775212636695, "grad_norm": 0.9202518463134766, "learning_rate": 1.1049e-05, "loss": 0.0801, "step": 3686 }, { "epoch": 8.9501822600243, "grad_norm": 1.4293988943099976, "learning_rate": 1.1052e-05, "loss": 0.1213, "step": 3687 }, { "epoch": 8.952612393681653, "grad_norm": 1.0583962202072144, "learning_rate": 1.1055e-05, "loss": 0.1076, "step": 3688 }, { "epoch": 8.955042527339003, "grad_norm": 0.6290149688720703, "learning_rate": 1.1058e-05, "loss": 0.0559, "step": 3689 }, { "epoch": 8.957472660996356, "grad_norm": 1.0508028268814087, "learning_rate": 1.1061000000000001e-05, "loss": 0.0874, "step": 3690 }, { "epoch": 8.959902794653706, "grad_norm": 1.4907796382904053, "learning_rate": 1.1064000000000001e-05, "loss": 0.126, "step": 3691 }, { "epoch": 8.962332928311056, "grad_norm": 1.2985432147979736, "learning_rate": 1.1067000000000001e-05, "loss": 0.1145, "step": 3692 }, { "epoch": 8.964763061968409, "grad_norm": 1.6109005212783813, "learning_rate": 1.107e-05, "loss": 0.1834, "step": 3693 }, { "epoch": 8.96719319562576, "grad_norm": 2.2065436840057373, "learning_rate": 1.1073e-05, "loss": 0.1318, "step": 3694 }, { "epoch": 8.969623329283111, "grad_norm": 1.427951455116272, "learning_rate": 1.1075999999999999e-05, "loss": 0.1097, "step": 3695 }, { "epoch": 8.972053462940462, "grad_norm": 2.2420494556427, "learning_rate": 1.1079e-05, "loss": 0.214, "step": 3696 }, { "epoch": 8.974483596597812, "grad_norm": 1.0923840999603271, "learning_rate": 1.1082e-05, "loss": 0.3105, "step": 3697 }, { "epoch": 8.976913730255164, "grad_norm": 0.9252915382385254, "learning_rate": 1.1085e-05, "loss": 0.1766, "step": 3698 }, { "epoch": 8.979343863912515, "grad_norm": 0.8117088079452515, "learning_rate": 1.1088e-05, "loss": 0.1003, "step": 3699 }, { "epoch": 8.981773997569867, "grad_norm": 0.675574779510498, "learning_rate": 
1.1091e-05, "loss": 0.0871, "step": 3700 }, { "epoch": 8.984204131227218, "grad_norm": 0.7631402611732483, "learning_rate": 1.1094e-05, "loss": 0.0945, "step": 3701 }, { "epoch": 8.986634264884568, "grad_norm": 0.6563335061073303, "learning_rate": 1.1097e-05, "loss": 0.1011, "step": 3702 }, { "epoch": 8.98906439854192, "grad_norm": 0.7775347828865051, "learning_rate": 1.11e-05, "loss": 0.085, "step": 3703 }, { "epoch": 8.99149453219927, "grad_norm": 1.5119030475616455, "learning_rate": 1.1103e-05, "loss": 0.0828, "step": 3704 }, { "epoch": 8.993924665856621, "grad_norm": 0.8301509618759155, "learning_rate": 1.1106e-05, "loss": 0.0779, "step": 3705 }, { "epoch": 8.996354799513973, "grad_norm": 1.787467360496521, "learning_rate": 1.1109000000000002e-05, "loss": 0.1251, "step": 3706 }, { "epoch": 8.998784933171324, "grad_norm": 1.4546087980270386, "learning_rate": 1.1112e-05, "loss": 0.1222, "step": 3707 }, { "epoch": 9.0, "grad_norm": 2.7163820266723633, "learning_rate": 1.1115e-05, "loss": 0.137, "step": 3708 }, { "epoch": 9.00243013365735, "grad_norm": 3.4846715927124023, "learning_rate": 1.1118e-05, "loss": 0.3893, "step": 3709 }, { "epoch": 9.004860267314703, "grad_norm": 1.0197054147720337, "learning_rate": 1.1120999999999999e-05, "loss": 0.3032, "step": 3710 }, { "epoch": 9.007290400972053, "grad_norm": 1.4979338645935059, "learning_rate": 1.1124e-05, "loss": 0.2649, "step": 3711 }, { "epoch": 9.009720534629405, "grad_norm": 1.9787566661834717, "learning_rate": 1.1127e-05, "loss": 0.2945, "step": 3712 }, { "epoch": 9.012150668286756, "grad_norm": 1.6814604997634888, "learning_rate": 1.113e-05, "loss": 0.2162, "step": 3713 }, { "epoch": 9.014580801944106, "grad_norm": 1.087492823600769, "learning_rate": 1.1133e-05, "loss": 0.1813, "step": 3714 }, { "epoch": 9.017010935601458, "grad_norm": 1.107271432876587, "learning_rate": 1.1136e-05, "loss": 0.1844, "step": 3715 }, { "epoch": 9.019441069258809, "grad_norm": 1.0408809185028076, "learning_rate": 
1.1139000000000001e-05, "loss": 0.1255, "step": 3716 }, { "epoch": 9.021871202916161, "grad_norm": 0.8358994722366333, "learning_rate": 1.1142000000000001e-05, "loss": 0.093, "step": 3717 }, { "epoch": 9.024301336573512, "grad_norm": 0.7608299851417542, "learning_rate": 1.1145000000000001e-05, "loss": 0.0997, "step": 3718 }, { "epoch": 9.026731470230862, "grad_norm": 0.7022808790206909, "learning_rate": 1.1148e-05, "loss": 0.0829, "step": 3719 }, { "epoch": 9.029161603888214, "grad_norm": 0.8234644532203674, "learning_rate": 1.1150999999999999e-05, "loss": 0.0788, "step": 3720 }, { "epoch": 9.031591737545565, "grad_norm": 0.8457633852958679, "learning_rate": 1.1154e-05, "loss": 0.0728, "step": 3721 }, { "epoch": 9.034021871202917, "grad_norm": 0.868418276309967, "learning_rate": 1.1157e-05, "loss": 0.0803, "step": 3722 }, { "epoch": 9.036452004860267, "grad_norm": 0.6367961168289185, "learning_rate": 1.116e-05, "loss": 0.0896, "step": 3723 }, { "epoch": 9.038882138517618, "grad_norm": 0.8289440870285034, "learning_rate": 1.1163e-05, "loss": 0.069, "step": 3724 }, { "epoch": 9.04131227217497, "grad_norm": 1.0133695602416992, "learning_rate": 1.1166e-05, "loss": 0.0751, "step": 3725 }, { "epoch": 9.04374240583232, "grad_norm": 0.6605246067047119, "learning_rate": 1.1169000000000001e-05, "loss": 0.0873, "step": 3726 }, { "epoch": 9.046172539489673, "grad_norm": 0.523106038570404, "learning_rate": 1.1172e-05, "loss": 0.0404, "step": 3727 }, { "epoch": 9.048602673147023, "grad_norm": 0.676879346370697, "learning_rate": 1.1175e-05, "loss": 0.0618, "step": 3728 }, { "epoch": 9.051032806804374, "grad_norm": 0.6788640022277832, "learning_rate": 1.1178e-05, "loss": 0.0625, "step": 3729 }, { "epoch": 9.053462940461726, "grad_norm": 0.8673667311668396, "learning_rate": 1.1181e-05, "loss": 0.0701, "step": 3730 }, { "epoch": 9.055893074119076, "grad_norm": 1.1171890497207642, "learning_rate": 1.1184000000000002e-05, "loss": 0.1592, "step": 3731 }, { "epoch": 9.058323207776427, 
"grad_norm": 0.9817842841148376, "learning_rate": 1.1187e-05, "loss": 0.0825, "step": 3732 }, { "epoch": 9.060753341433779, "grad_norm": 0.6724960803985596, "learning_rate": 1.119e-05, "loss": 0.0758, "step": 3733 }, { "epoch": 9.06318347509113, "grad_norm": 0.8495436906814575, "learning_rate": 1.1193e-05, "loss": 0.0678, "step": 3734 }, { "epoch": 9.065613608748482, "grad_norm": 0.7473029494285583, "learning_rate": 1.1196e-05, "loss": 0.0631, "step": 3735 }, { "epoch": 9.068043742405832, "grad_norm": 0.7243484258651733, "learning_rate": 1.1199e-05, "loss": 0.0712, "step": 3736 }, { "epoch": 9.070473876063183, "grad_norm": 0.7826787233352661, "learning_rate": 1.1202e-05, "loss": 0.062, "step": 3737 }, { "epoch": 9.072904009720535, "grad_norm": 0.5779997706413269, "learning_rate": 1.1205e-05, "loss": 0.0524, "step": 3738 }, { "epoch": 9.075334143377885, "grad_norm": 1.0816570520401, "learning_rate": 1.1208e-05, "loss": 0.1035, "step": 3739 }, { "epoch": 9.077764277035238, "grad_norm": 0.897916853427887, "learning_rate": 1.1211e-05, "loss": 0.0607, "step": 3740 }, { "epoch": 9.080194410692588, "grad_norm": 0.6661940813064575, "learning_rate": 1.1214000000000001e-05, "loss": 0.0774, "step": 3741 }, { "epoch": 9.082624544349938, "grad_norm": 0.48887166380882263, "learning_rate": 1.1217000000000001e-05, "loss": 0.0436, "step": 3742 }, { "epoch": 9.08505467800729, "grad_norm": 1.5490739345550537, "learning_rate": 1.1220000000000001e-05, "loss": 0.2173, "step": 3743 }, { "epoch": 9.087484811664641, "grad_norm": 0.9912199378013611, "learning_rate": 1.1222999999999999e-05, "loss": 0.1044, "step": 3744 }, { "epoch": 9.089914945321993, "grad_norm": 0.7175644636154175, "learning_rate": 1.1225999999999999e-05, "loss": 0.0787, "step": 3745 }, { "epoch": 9.092345078979344, "grad_norm": 0.940506637096405, "learning_rate": 1.1229e-05, "loss": 0.0801, "step": 3746 }, { "epoch": 9.094775212636694, "grad_norm": 0.720362663269043, "learning_rate": 1.1232e-05, "loss": 0.0763, "step": 
3747 }, { "epoch": 9.097205346294047, "grad_norm": 0.9404833316802979, "learning_rate": 1.1235e-05, "loss": 0.0952, "step": 3748 }, { "epoch": 9.099635479951397, "grad_norm": 0.913378119468689, "learning_rate": 1.1238e-05, "loss": 0.0889, "step": 3749 }, { "epoch": 9.10206561360875, "grad_norm": 1.1006996631622314, "learning_rate": 1.1241e-05, "loss": 0.0917, "step": 3750 }, { "epoch": 9.1044957472661, "grad_norm": 0.8441502451896667, "learning_rate": 1.1244000000000001e-05, "loss": 0.0718, "step": 3751 }, { "epoch": 9.10692588092345, "grad_norm": 0.9150943160057068, "learning_rate": 1.1247000000000001e-05, "loss": 0.0722, "step": 3752 }, { "epoch": 9.109356014580802, "grad_norm": 1.0148392915725708, "learning_rate": 1.125e-05, "loss": 0.0909, "step": 3753 }, { "epoch": 9.111786148238153, "grad_norm": 0.9082624912261963, "learning_rate": 1.1253e-05, "loss": 0.0569, "step": 3754 }, { "epoch": 9.114216281895505, "grad_norm": 1.0408775806427002, "learning_rate": 1.1256e-05, "loss": 0.0837, "step": 3755 }, { "epoch": 9.116646415552855, "grad_norm": 2.237964630126953, "learning_rate": 1.1259e-05, "loss": 0.1335, "step": 3756 }, { "epoch": 9.119076549210206, "grad_norm": 1.5449855327606201, "learning_rate": 1.1262e-05, "loss": 0.0994, "step": 3757 }, { "epoch": 9.121506682867558, "grad_norm": 2.122748851776123, "learning_rate": 1.1265e-05, "loss": 0.2512, "step": 3758 }, { "epoch": 9.123936816524909, "grad_norm": 2.38661527633667, "learning_rate": 1.1268e-05, "loss": 0.4305, "step": 3759 }, { "epoch": 9.12636695018226, "grad_norm": 1.3037171363830566, "learning_rate": 1.1271e-05, "loss": 0.3927, "step": 3760 }, { "epoch": 9.128797083839611, "grad_norm": 0.8623318672180176, "learning_rate": 1.1274e-05, "loss": 0.2793, "step": 3761 }, { "epoch": 9.131227217496962, "grad_norm": 1.2745212316513062, "learning_rate": 1.1277e-05, "loss": 0.2984, "step": 3762 }, { "epoch": 9.133657351154314, "grad_norm": 1.1779829263687134, "learning_rate": 1.128e-05, "loss": 0.2066, "step": 
3763 }, { "epoch": 9.136087484811664, "grad_norm": 0.7833021879196167, "learning_rate": 1.1283e-05, "loss": 0.1665, "step": 3764 }, { "epoch": 9.138517618469017, "grad_norm": 0.9147211909294128, "learning_rate": 1.1286e-05, "loss": 0.1676, "step": 3765 }, { "epoch": 9.140947752126367, "grad_norm": 1.3323650360107422, "learning_rate": 1.1289000000000002e-05, "loss": 0.1513, "step": 3766 }, { "epoch": 9.143377885783718, "grad_norm": 0.8814960718154907, "learning_rate": 1.1292000000000001e-05, "loss": 0.1366, "step": 3767 }, { "epoch": 9.14580801944107, "grad_norm": 0.7930541634559631, "learning_rate": 1.1295000000000001e-05, "loss": 0.1507, "step": 3768 }, { "epoch": 9.14823815309842, "grad_norm": 0.5671955347061157, "learning_rate": 1.1298e-05, "loss": 0.0988, "step": 3769 }, { "epoch": 9.15066828675577, "grad_norm": 1.022989273071289, "learning_rate": 1.1300999999999999e-05, "loss": 0.114, "step": 3770 }, { "epoch": 9.153098420413123, "grad_norm": 0.5511711835861206, "learning_rate": 1.1304e-05, "loss": 0.0591, "step": 3771 }, { "epoch": 9.155528554070473, "grad_norm": 0.9521755576133728, "learning_rate": 1.1307e-05, "loss": 0.0829, "step": 3772 }, { "epoch": 9.157958687727826, "grad_norm": 0.8364288210868835, "learning_rate": 1.131e-05, "loss": 0.0979, "step": 3773 }, { "epoch": 9.160388821385176, "grad_norm": 0.6764402985572815, "learning_rate": 1.1313e-05, "loss": 0.0834, "step": 3774 }, { "epoch": 9.162818955042527, "grad_norm": 0.6855900287628174, "learning_rate": 1.1316e-05, "loss": 0.0928, "step": 3775 }, { "epoch": 9.165249088699879, "grad_norm": 1.4880623817443848, "learning_rate": 1.1319000000000001e-05, "loss": 0.0742, "step": 3776 }, { "epoch": 9.16767922235723, "grad_norm": 0.7344149351119995, "learning_rate": 1.1322000000000001e-05, "loss": 0.0724, "step": 3777 }, { "epoch": 9.170109356014581, "grad_norm": 0.796861469745636, "learning_rate": 1.1325e-05, "loss": 0.0622, "step": 3778 }, { "epoch": 9.172539489671932, "grad_norm": 0.7043129205703735, 
"learning_rate": 1.1328e-05, "loss": 0.0625, "step": 3779 }, { "epoch": 9.174969623329282, "grad_norm": 0.707977831363678, "learning_rate": 1.1331e-05, "loss": 0.0543, "step": 3780 }, { "epoch": 9.177399756986635, "grad_norm": 0.6993305683135986, "learning_rate": 1.1334e-05, "loss": 0.0675, "step": 3781 }, { "epoch": 9.179829890643985, "grad_norm": 0.7897548675537109, "learning_rate": 1.1337e-05, "loss": 0.0676, "step": 3782 }, { "epoch": 9.182260024301337, "grad_norm": 0.6236314177513123, "learning_rate": 1.134e-05, "loss": 0.05, "step": 3783 }, { "epoch": 9.184690157958688, "grad_norm": 0.6343854069709778, "learning_rate": 1.1343e-05, "loss": 0.0607, "step": 3784 }, { "epoch": 9.187120291616038, "grad_norm": 0.7269455790519714, "learning_rate": 1.1346e-05, "loss": 0.0683, "step": 3785 }, { "epoch": 9.18955042527339, "grad_norm": 0.5160521268844604, "learning_rate": 1.1349000000000001e-05, "loss": 0.057, "step": 3786 }, { "epoch": 9.19198055893074, "grad_norm": 0.77953040599823, "learning_rate": 1.1352e-05, "loss": 0.0534, "step": 3787 }, { "epoch": 9.194410692588093, "grad_norm": 0.6657785177230835, "learning_rate": 1.1355e-05, "loss": 0.0641, "step": 3788 }, { "epoch": 9.196840826245444, "grad_norm": 0.9522373676300049, "learning_rate": 1.1358e-05, "loss": 0.0745, "step": 3789 }, { "epoch": 9.199270959902794, "grad_norm": 0.7558581829071045, "learning_rate": 1.1361e-05, "loss": 0.0678, "step": 3790 }, { "epoch": 9.201701093560146, "grad_norm": 0.9826641082763672, "learning_rate": 1.1364000000000002e-05, "loss": 0.0886, "step": 3791 }, { "epoch": 9.204131227217497, "grad_norm": 0.8829717636108398, "learning_rate": 1.1367000000000001e-05, "loss": 0.0769, "step": 3792 }, { "epoch": 9.206561360874849, "grad_norm": 0.7927889823913574, "learning_rate": 1.137e-05, "loss": 0.0634, "step": 3793 }, { "epoch": 9.2089914945322, "grad_norm": 0.8799672722816467, "learning_rate": 1.1373e-05, "loss": 0.1239, "step": 3794 }, { "epoch": 9.21142162818955, "grad_norm": 
0.8656009435653687, "learning_rate": 1.1376e-05, "loss": 0.0703, "step": 3795 }, { "epoch": 9.213851761846902, "grad_norm": 1.0130431652069092, "learning_rate": 1.1379e-05, "loss": 0.1153, "step": 3796 }, { "epoch": 9.216281895504252, "grad_norm": 0.9249078035354614, "learning_rate": 1.1382e-05, "loss": 0.056, "step": 3797 }, { "epoch": 9.218712029161605, "grad_norm": 1.094120979309082, "learning_rate": 1.1385e-05, "loss": 0.0714, "step": 3798 }, { "epoch": 9.221142162818955, "grad_norm": 1.5473953485488892, "learning_rate": 1.1388e-05, "loss": 0.0737, "step": 3799 }, { "epoch": 9.223572296476306, "grad_norm": 1.065179705619812, "learning_rate": 1.1391e-05, "loss": 0.0723, "step": 3800 }, { "epoch": 9.226002430133658, "grad_norm": 1.0675091743469238, "learning_rate": 1.1394000000000001e-05, "loss": 0.0877, "step": 3801 }, { "epoch": 9.228432563791008, "grad_norm": 0.8774566650390625, "learning_rate": 1.1397000000000001e-05, "loss": 0.0679, "step": 3802 }, { "epoch": 9.23086269744836, "grad_norm": 1.1582714319229126, "learning_rate": 1.1400000000000001e-05, "loss": 0.0725, "step": 3803 }, { "epoch": 9.233292831105711, "grad_norm": 1.1541523933410645, "learning_rate": 1.1403e-05, "loss": 0.0952, "step": 3804 }, { "epoch": 9.235722964763061, "grad_norm": 1.4048404693603516, "learning_rate": 1.1406e-05, "loss": 0.0962, "step": 3805 }, { "epoch": 9.238153098420414, "grad_norm": 1.172943115234375, "learning_rate": 1.1409e-05, "loss": 0.1178, "step": 3806 }, { "epoch": 9.240583232077764, "grad_norm": 1.7869070768356323, "learning_rate": 1.1412e-05, "loss": 0.194, "step": 3807 }, { "epoch": 9.243013365735115, "grad_norm": 2.359499216079712, "learning_rate": 1.1415e-05, "loss": 0.1422, "step": 3808 }, { "epoch": 9.245443499392467, "grad_norm": 1.787642240524292, "learning_rate": 1.1418e-05, "loss": 0.4083, "step": 3809 }, { "epoch": 9.247873633049817, "grad_norm": 0.9641139507293701, "learning_rate": 1.1421e-05, "loss": 0.2998, "step": 3810 }, { "epoch": 9.25030376670717, 
"grad_norm": 1.19845712184906, "learning_rate": 1.1424000000000001e-05, "loss": 0.2642, "step": 3811 }, { "epoch": 9.25273390036452, "grad_norm": 1.0352765321731567, "learning_rate": 1.1427000000000001e-05, "loss": 0.2395, "step": 3812 }, { "epoch": 9.25516403402187, "grad_norm": 1.0660028457641602, "learning_rate": 1.143e-05, "loss": 0.2422, "step": 3813 }, { "epoch": 9.257594167679223, "grad_norm": 0.8300871253013611, "learning_rate": 1.1433e-05, "loss": 0.1704, "step": 3814 }, { "epoch": 9.260024301336573, "grad_norm": 0.8198728561401367, "learning_rate": 1.1436e-05, "loss": 0.1613, "step": 3815 }, { "epoch": 9.262454434993925, "grad_norm": 0.7764173746109009, "learning_rate": 1.1439e-05, "loss": 0.1191, "step": 3816 }, { "epoch": 9.264884568651276, "grad_norm": 0.9708309769630432, "learning_rate": 1.1442000000000002e-05, "loss": 0.1115, "step": 3817 }, { "epoch": 9.267314702308626, "grad_norm": 0.7730659246444702, "learning_rate": 1.1445e-05, "loss": 0.091, "step": 3818 }, { "epoch": 9.269744835965978, "grad_norm": 0.6850373148918152, "learning_rate": 1.1448e-05, "loss": 0.0867, "step": 3819 }, { "epoch": 9.272174969623329, "grad_norm": 0.9154075980186462, "learning_rate": 1.1451e-05, "loss": 0.1104, "step": 3820 }, { "epoch": 9.274605103280681, "grad_norm": 0.6720854640007019, "learning_rate": 1.1453999999999999e-05, "loss": 0.0695, "step": 3821 }, { "epoch": 9.277035236938032, "grad_norm": 0.5737730264663696, "learning_rate": 1.1457e-05, "loss": 0.0729, "step": 3822 }, { "epoch": 9.279465370595382, "grad_norm": 0.6851139068603516, "learning_rate": 1.146e-05, "loss": 0.0813, "step": 3823 }, { "epoch": 9.281895504252734, "grad_norm": 0.5478085875511169, "learning_rate": 1.1463e-05, "loss": 0.0553, "step": 3824 }, { "epoch": 9.284325637910085, "grad_norm": 1.5650413036346436, "learning_rate": 1.1466e-05, "loss": 0.0796, "step": 3825 }, { "epoch": 9.286755771567437, "grad_norm": 0.6345076560974121, "learning_rate": 1.1469e-05, "loss": 0.0514, "step": 3826 }, { 
"epoch": 9.289185905224787, "grad_norm": 0.5771114826202393, "learning_rate": 1.1472000000000001e-05, "loss": 0.0749, "step": 3827 }, { "epoch": 9.291616038882138, "grad_norm": 0.8014042973518372, "learning_rate": 1.1475000000000001e-05, "loss": 0.0726, "step": 3828 }, { "epoch": 9.29404617253949, "grad_norm": 0.7651469111442566, "learning_rate": 1.1478000000000001e-05, "loss": 0.0661, "step": 3829 }, { "epoch": 9.29647630619684, "grad_norm": 0.7142805457115173, "learning_rate": 1.1480999999999999e-05, "loss": 0.0631, "step": 3830 }, { "epoch": 9.298906439854193, "grad_norm": 0.6141322255134583, "learning_rate": 1.1483999999999999e-05, "loss": 0.074, "step": 3831 }, { "epoch": 9.301336573511543, "grad_norm": 0.8092586994171143, "learning_rate": 1.1487e-05, "loss": 0.0828, "step": 3832 }, { "epoch": 9.303766707168894, "grad_norm": 0.829997718334198, "learning_rate": 1.149e-05, "loss": 0.1086, "step": 3833 }, { "epoch": 9.306196840826246, "grad_norm": 0.8644275069236755, "learning_rate": 1.1493e-05, "loss": 0.0693, "step": 3834 }, { "epoch": 9.308626974483596, "grad_norm": 0.6772448420524597, "learning_rate": 1.1496e-05, "loss": 0.0667, "step": 3835 }, { "epoch": 9.311057108140949, "grad_norm": 0.6158614158630371, "learning_rate": 1.1499e-05, "loss": 0.0607, "step": 3836 }, { "epoch": 9.313487241798299, "grad_norm": 0.9566076397895813, "learning_rate": 1.1502000000000001e-05, "loss": 0.0939, "step": 3837 }, { "epoch": 9.31591737545565, "grad_norm": 0.7387718558311462, "learning_rate": 1.1505e-05, "loss": 0.0583, "step": 3838 }, { "epoch": 9.318347509113002, "grad_norm": 1.390122652053833, "learning_rate": 1.1508e-05, "loss": 0.073, "step": 3839 }, { "epoch": 9.320777642770352, "grad_norm": 0.9123455286026001, "learning_rate": 1.1511e-05, "loss": 0.0942, "step": 3840 }, { "epoch": 9.323207776427704, "grad_norm": 1.3035674095153809, "learning_rate": 1.1514e-05, "loss": 0.0632, "step": 3841 }, { "epoch": 9.325637910085055, "grad_norm": 0.7785066366195679, 
"learning_rate": 1.1517e-05, "loss": 0.0778, "step": 3842 }, { "epoch": 9.328068043742405, "grad_norm": 0.8373407125473022, "learning_rate": 1.152e-05, "loss": 0.0708, "step": 3843 }, { "epoch": 9.330498177399758, "grad_norm": 0.8052675127983093, "learning_rate": 1.1523e-05, "loss": 0.0626, "step": 3844 }, { "epoch": 9.332928311057108, "grad_norm": 1.0576874017715454, "learning_rate": 1.1526e-05, "loss": 0.0904, "step": 3845 }, { "epoch": 9.335358444714458, "grad_norm": 0.6464706659317017, "learning_rate": 1.1529e-05, "loss": 0.0585, "step": 3846 }, { "epoch": 9.33778857837181, "grad_norm": 0.8763131499290466, "learning_rate": 1.1532e-05, "loss": 0.0843, "step": 3847 }, { "epoch": 9.340218712029161, "grad_norm": 0.763068437576294, "learning_rate": 1.1535e-05, "loss": 0.0858, "step": 3848 }, { "epoch": 9.342648845686513, "grad_norm": 1.411829948425293, "learning_rate": 1.1538e-05, "loss": 0.1042, "step": 3849 }, { "epoch": 9.345078979343864, "grad_norm": 0.6550055146217346, "learning_rate": 1.1541e-05, "loss": 0.0509, "step": 3850 }, { "epoch": 9.347509113001214, "grad_norm": 0.7170867919921875, "learning_rate": 1.1544e-05, "loss": 0.0582, "step": 3851 }, { "epoch": 9.349939246658566, "grad_norm": 1.8897705078125, "learning_rate": 1.1547000000000001e-05, "loss": 0.1452, "step": 3852 }, { "epoch": 9.352369380315917, "grad_norm": 1.131097435951233, "learning_rate": 1.1550000000000001e-05, "loss": 0.0831, "step": 3853 }, { "epoch": 9.35479951397327, "grad_norm": 1.082171082496643, "learning_rate": 1.1553000000000001e-05, "loss": 0.0952, "step": 3854 }, { "epoch": 9.35722964763062, "grad_norm": 1.5456652641296387, "learning_rate": 1.1555999999999999e-05, "loss": 0.0868, "step": 3855 }, { "epoch": 9.35965978128797, "grad_norm": 1.4530287981033325, "learning_rate": 1.1558999999999999e-05, "loss": 0.0834, "step": 3856 }, { "epoch": 9.362089914945322, "grad_norm": 1.3778501749038696, "learning_rate": 1.1562e-05, "loss": 0.1037, "step": 3857 }, { "epoch": 9.364520048602673, 
"grad_norm": 1.5676705837249756, "learning_rate": 1.1565e-05, "loss": 0.146, "step": 3858 }, { "epoch": 9.366950182260025, "grad_norm": 1.3544514179229736, "learning_rate": 1.1568e-05, "loss": 0.3987, "step": 3859 }, { "epoch": 9.369380315917375, "grad_norm": 0.8129224181175232, "learning_rate": 1.1571e-05, "loss": 0.2829, "step": 3860 }, { "epoch": 9.371810449574726, "grad_norm": 1.1629010438919067, "learning_rate": 1.1574e-05, "loss": 0.2976, "step": 3861 }, { "epoch": 9.374240583232078, "grad_norm": 0.9790984988212585, "learning_rate": 1.1577000000000001e-05, "loss": 0.2232, "step": 3862 }, { "epoch": 9.376670716889429, "grad_norm": 1.4014487266540527, "learning_rate": 1.1580000000000001e-05, "loss": 0.1829, "step": 3863 }, { "epoch": 9.37910085054678, "grad_norm": 0.8855032920837402, "learning_rate": 1.1583e-05, "loss": 0.1761, "step": 3864 }, { "epoch": 9.381530984204131, "grad_norm": 0.9171119928359985, "learning_rate": 1.1586e-05, "loss": 0.161, "step": 3865 }, { "epoch": 9.383961117861482, "grad_norm": 0.666663408279419, "learning_rate": 1.1589e-05, "loss": 0.11, "step": 3866 }, { "epoch": 9.386391251518834, "grad_norm": 0.7494332790374756, "learning_rate": 1.1592e-05, "loss": 0.0965, "step": 3867 }, { "epoch": 9.388821385176184, "grad_norm": 1.3334908485412598, "learning_rate": 1.1595e-05, "loss": 0.138, "step": 3868 }, { "epoch": 9.391251518833537, "grad_norm": 0.8285418152809143, "learning_rate": 1.1598e-05, "loss": 0.0966, "step": 3869 }, { "epoch": 9.393681652490887, "grad_norm": 1.1959677934646606, "learning_rate": 1.1601e-05, "loss": 0.1252, "step": 3870 }, { "epoch": 9.396111786148237, "grad_norm": 0.6083600521087646, "learning_rate": 1.1604e-05, "loss": 0.0814, "step": 3871 }, { "epoch": 9.39854191980559, "grad_norm": 0.6831820011138916, "learning_rate": 1.1607000000000001e-05, "loss": 0.0686, "step": 3872 }, { "epoch": 9.40097205346294, "grad_norm": 0.6046907901763916, "learning_rate": 1.161e-05, "loss": 0.0545, "step": 3873 }, { "epoch": 
9.403402187120292, "grad_norm": 0.7212254405021667, "learning_rate": 1.1613e-05, "loss": 0.0727, "step": 3874 }, { "epoch": 9.405832320777643, "grad_norm": 0.7563762068748474, "learning_rate": 1.1616e-05, "loss": 0.0694, "step": 3875 }, { "epoch": 9.408262454434993, "grad_norm": 0.7118474245071411, "learning_rate": 1.1619e-05, "loss": 0.0761, "step": 3876 }, { "epoch": 9.410692588092346, "grad_norm": 0.7550501227378845, "learning_rate": 1.1622000000000002e-05, "loss": 0.0809, "step": 3877 }, { "epoch": 9.413122721749696, "grad_norm": 0.8778311610221863, "learning_rate": 1.1625000000000001e-05, "loss": 0.0762, "step": 3878 }, { "epoch": 9.415552855407048, "grad_norm": 1.0638527870178223, "learning_rate": 1.1628e-05, "loss": 0.1051, "step": 3879 }, { "epoch": 9.417982989064399, "grad_norm": 0.6909265518188477, "learning_rate": 1.1631e-05, "loss": 0.0817, "step": 3880 }, { "epoch": 9.42041312272175, "grad_norm": 0.8358462452888489, "learning_rate": 1.1633999999999999e-05, "loss": 0.0685, "step": 3881 }, { "epoch": 9.422843256379101, "grad_norm": 0.8550371527671814, "learning_rate": 1.1637e-05, "loss": 0.0902, "step": 3882 }, { "epoch": 9.425273390036452, "grad_norm": 0.6507645845413208, "learning_rate": 1.164e-05, "loss": 0.0645, "step": 3883 }, { "epoch": 9.427703523693804, "grad_norm": 0.6183695793151855, "learning_rate": 1.1643e-05, "loss": 0.0748, "step": 3884 }, { "epoch": 9.430133657351154, "grad_norm": 0.5408215522766113, "learning_rate": 1.1646e-05, "loss": 0.0594, "step": 3885 }, { "epoch": 9.432563791008505, "grad_norm": 1.0458916425704956, "learning_rate": 1.1649e-05, "loss": 0.1071, "step": 3886 }, { "epoch": 9.434993924665857, "grad_norm": 0.7156410813331604, "learning_rate": 1.1652000000000001e-05, "loss": 0.0797, "step": 3887 }, { "epoch": 9.437424058323208, "grad_norm": 0.5950073599815369, "learning_rate": 1.1655000000000001e-05, "loss": 0.058, "step": 3888 }, { "epoch": 9.439854191980558, "grad_norm": 0.6003885269165039, "learning_rate": 
1.1658000000000001e-05, "loss": 0.0442, "step": 3889 }, { "epoch": 9.44228432563791, "grad_norm": 0.9502792358398438, "learning_rate": 1.1661e-05, "loss": 0.0781, "step": 3890 }, { "epoch": 9.44471445929526, "grad_norm": 1.05560302734375, "learning_rate": 1.1664e-05, "loss": 0.0781, "step": 3891 }, { "epoch": 9.447144592952613, "grad_norm": 0.9556542634963989, "learning_rate": 1.1667e-05, "loss": 0.0728, "step": 3892 }, { "epoch": 9.449574726609963, "grad_norm": 1.083723783493042, "learning_rate": 1.167e-05, "loss": 0.1098, "step": 3893 }, { "epoch": 9.452004860267314, "grad_norm": 0.864305853843689, "learning_rate": 1.1673e-05, "loss": 0.0506, "step": 3894 }, { "epoch": 9.454434993924666, "grad_norm": 0.8985621929168701, "learning_rate": 1.1676e-05, "loss": 0.0743, "step": 3895 }, { "epoch": 9.456865127582017, "grad_norm": 0.7533180713653564, "learning_rate": 1.1679e-05, "loss": 0.0494, "step": 3896 }, { "epoch": 9.459295261239369, "grad_norm": 1.325729489326477, "learning_rate": 1.1682000000000001e-05, "loss": 0.0784, "step": 3897 }, { "epoch": 9.46172539489672, "grad_norm": 0.9489179253578186, "learning_rate": 1.1685e-05, "loss": 0.0646, "step": 3898 }, { "epoch": 9.46415552855407, "grad_norm": 0.8399462699890137, "learning_rate": 1.1688e-05, "loss": 0.0671, "step": 3899 }, { "epoch": 9.466585662211422, "grad_norm": 1.775354027748108, "learning_rate": 1.1691e-05, "loss": 0.0915, "step": 3900 }, { "epoch": 9.469015795868772, "grad_norm": 0.990739643573761, "learning_rate": 1.1694e-05, "loss": 0.0867, "step": 3901 }, { "epoch": 9.471445929526125, "grad_norm": 1.0265594720840454, "learning_rate": 1.1697000000000002e-05, "loss": 0.0985, "step": 3902 }, { "epoch": 9.473876063183475, "grad_norm": 1.1219139099121094, "learning_rate": 1.1700000000000001e-05, "loss": 0.087, "step": 3903 }, { "epoch": 9.476306196840826, "grad_norm": 1.2440447807312012, "learning_rate": 1.1703e-05, "loss": 0.1179, "step": 3904 }, { "epoch": 9.478736330498178, "grad_norm": 
1.9269241094589233, "learning_rate": 1.1706e-05, "loss": 0.1208, "step": 3905 }, { "epoch": 9.481166464155528, "grad_norm": 1.254377841949463, "learning_rate": 1.1709e-05, "loss": 0.14, "step": 3906 }, { "epoch": 9.48359659781288, "grad_norm": 1.8283107280731201, "learning_rate": 1.1712e-05, "loss": 0.1384, "step": 3907 }, { "epoch": 9.486026731470231, "grad_norm": 3.596926689147949, "learning_rate": 1.1715e-05, "loss": 0.1841, "step": 3908 }, { "epoch": 9.488456865127581, "grad_norm": 1.3228574991226196, "learning_rate": 1.1718e-05, "loss": 0.4082, "step": 3909 }, { "epoch": 9.490886998784934, "grad_norm": 0.8849658966064453, "learning_rate": 1.1721e-05, "loss": 0.3735, "step": 3910 }, { "epoch": 9.493317132442284, "grad_norm": 0.6221359968185425, "learning_rate": 1.1724e-05, "loss": 0.2139, "step": 3911 }, { "epoch": 9.495747266099636, "grad_norm": 1.0277059078216553, "learning_rate": 1.1727000000000001e-05, "loss": 0.2549, "step": 3912 }, { "epoch": 9.498177399756987, "grad_norm": 1.0335015058517456, "learning_rate": 1.1730000000000001e-05, "loss": 0.2703, "step": 3913 }, { "epoch": 9.500607533414337, "grad_norm": 0.864609956741333, "learning_rate": 1.1733000000000001e-05, "loss": 0.176, "step": 3914 }, { "epoch": 9.50303766707169, "grad_norm": 0.8929049968719482, "learning_rate": 1.1736e-05, "loss": 0.1402, "step": 3915 }, { "epoch": 9.50546780072904, "grad_norm": 0.8698409199714661, "learning_rate": 1.1738999999999999e-05, "loss": 0.1422, "step": 3916 }, { "epoch": 9.507897934386392, "grad_norm": 0.6067090630531311, "learning_rate": 1.1742e-05, "loss": 0.1464, "step": 3917 }, { "epoch": 9.510328068043743, "grad_norm": 0.5859969258308411, "learning_rate": 1.1745e-05, "loss": 0.0787, "step": 3918 }, { "epoch": 9.512758201701093, "grad_norm": 1.0456345081329346, "learning_rate": 1.1748e-05, "loss": 0.0629, "step": 3919 }, { "epoch": 9.515188335358445, "grad_norm": 0.7424594163894653, "learning_rate": 1.1751e-05, "loss": 0.0682, "step": 3920 }, { "epoch": 
9.517618469015796, "grad_norm": 0.8495199084281921, "learning_rate": 1.1754e-05, "loss": 0.0839, "step": 3921 }, { "epoch": 9.520048602673146, "grad_norm": 0.8076930642127991, "learning_rate": 1.1757000000000001e-05, "loss": 0.0974, "step": 3922 }, { "epoch": 9.522478736330498, "grad_norm": 0.5520391464233398, "learning_rate": 1.1760000000000001e-05, "loss": 0.05, "step": 3923 }, { "epoch": 9.524908869987849, "grad_norm": 0.8230535387992859, "learning_rate": 1.1763e-05, "loss": 0.0736, "step": 3924 }, { "epoch": 9.527339003645201, "grad_norm": 0.6057299971580505, "learning_rate": 1.1766e-05, "loss": 0.0659, "step": 3925 }, { "epoch": 9.529769137302551, "grad_norm": 0.9586207866668701, "learning_rate": 1.1769e-05, "loss": 0.082, "step": 3926 }, { "epoch": 9.532199270959904, "grad_norm": 0.6856425404548645, "learning_rate": 1.1772000000000002e-05, "loss": 0.0509, "step": 3927 }, { "epoch": 9.534629404617254, "grad_norm": 0.8518305420875549, "learning_rate": 1.1775000000000002e-05, "loss": 0.0703, "step": 3928 }, { "epoch": 9.537059538274605, "grad_norm": 0.677503764629364, "learning_rate": 1.1778e-05, "loss": 0.0639, "step": 3929 }, { "epoch": 9.539489671931957, "grad_norm": 0.6887807250022888, "learning_rate": 1.1781e-05, "loss": 0.073, "step": 3930 }, { "epoch": 9.541919805589307, "grad_norm": 0.918113648891449, "learning_rate": 1.1784e-05, "loss": 0.0513, "step": 3931 }, { "epoch": 9.544349939246658, "grad_norm": 0.6493898034095764, "learning_rate": 1.1787e-05, "loss": 0.0513, "step": 3932 }, { "epoch": 9.54678007290401, "grad_norm": 0.6792699098587036, "learning_rate": 1.179e-05, "loss": 0.0631, "step": 3933 }, { "epoch": 9.54921020656136, "grad_norm": 1.259799838066101, "learning_rate": 1.1793e-05, "loss": 0.1056, "step": 3934 }, { "epoch": 9.551640340218713, "grad_norm": 1.1137750148773193, "learning_rate": 1.1796e-05, "loss": 0.12, "step": 3935 }, { "epoch": 9.554070473876063, "grad_norm": 0.9289217591285706, "learning_rate": 1.1799e-05, "loss": 0.0831, 
"step": 3936 }, { "epoch": 9.556500607533414, "grad_norm": 0.6468031406402588, "learning_rate": 1.1802000000000002e-05, "loss": 0.0525, "step": 3937 }, { "epoch": 9.558930741190766, "grad_norm": 0.67312091588974, "learning_rate": 1.1805000000000001e-05, "loss": 0.0583, "step": 3938 }, { "epoch": 9.561360874848116, "grad_norm": 0.8866440653800964, "learning_rate": 1.1808000000000001e-05, "loss": 0.0887, "step": 3939 }, { "epoch": 9.563791008505468, "grad_norm": 0.5108437538146973, "learning_rate": 1.1811000000000001e-05, "loss": 0.0385, "step": 3940 }, { "epoch": 9.566221142162819, "grad_norm": 0.7453426718711853, "learning_rate": 1.1813999999999999e-05, "loss": 0.0665, "step": 3941 }, { "epoch": 9.56865127582017, "grad_norm": 0.7549096941947937, "learning_rate": 1.1816999999999999e-05, "loss": 0.064, "step": 3942 }, { "epoch": 9.571081409477522, "grad_norm": 0.9895404577255249, "learning_rate": 1.182e-05, "loss": 0.0831, "step": 3943 }, { "epoch": 9.573511543134872, "grad_norm": 0.8211608529090881, "learning_rate": 1.1823e-05, "loss": 0.0909, "step": 3944 }, { "epoch": 9.575941676792224, "grad_norm": 0.6897784471511841, "learning_rate": 1.1826e-05, "loss": 0.0694, "step": 3945 }, { "epoch": 9.578371810449575, "grad_norm": 0.9691817760467529, "learning_rate": 1.1829e-05, "loss": 0.08, "step": 3946 }, { "epoch": 9.580801944106925, "grad_norm": 1.0135992765426636, "learning_rate": 1.1832e-05, "loss": 0.1015, "step": 3947 }, { "epoch": 9.583232077764277, "grad_norm": 0.726894736289978, "learning_rate": 1.1835000000000001e-05, "loss": 0.0634, "step": 3948 }, { "epoch": 9.585662211421628, "grad_norm": 0.837372362613678, "learning_rate": 1.1838e-05, "loss": 0.0722, "step": 3949 }, { "epoch": 9.58809234507898, "grad_norm": 0.8475009799003601, "learning_rate": 1.1841e-05, "loss": 0.0665, "step": 3950 }, { "epoch": 9.59052247873633, "grad_norm": 0.8714566826820374, "learning_rate": 1.1844e-05, "loss": 0.0787, "step": 3951 }, { "epoch": 9.592952612393681, "grad_norm": 
1.5556869506835938, "learning_rate": 1.1847e-05, "loss": 0.1261, "step": 3952 }, { "epoch": 9.595382746051033, "grad_norm": 0.7213168740272522, "learning_rate": 1.185e-05, "loss": 0.0462, "step": 3953 }, { "epoch": 9.597812879708384, "grad_norm": 1.1129196882247925, "learning_rate": 1.1853e-05, "loss": 0.0982, "step": 3954 }, { "epoch": 9.600243013365736, "grad_norm": 0.984826385974884, "learning_rate": 1.1856e-05, "loss": 0.0831, "step": 3955 }, { "epoch": 9.602673147023086, "grad_norm": 2.0334653854370117, "learning_rate": 1.1859e-05, "loss": 0.1833, "step": 3956 }, { "epoch": 9.605103280680437, "grad_norm": 1.111237645149231, "learning_rate": 1.1862e-05, "loss": 0.0795, "step": 3957 }, { "epoch": 9.607533414337789, "grad_norm": 1.9143145084381104, "learning_rate": 1.1865e-05, "loss": 0.1915, "step": 3958 }, { "epoch": 9.60996354799514, "grad_norm": 1.4873487949371338, "learning_rate": 1.1868e-05, "loss": 0.4058, "step": 3959 }, { "epoch": 9.612393681652492, "grad_norm": 0.8647413849830627, "learning_rate": 1.1871e-05, "loss": 0.3158, "step": 3960 }, { "epoch": 9.614823815309842, "grad_norm": 1.2146745920181274, "learning_rate": 1.1874e-05, "loss": 0.2684, "step": 3961 }, { "epoch": 9.617253948967193, "grad_norm": 1.5019580125808716, "learning_rate": 1.1877e-05, "loss": 0.2674, "step": 3962 }, { "epoch": 9.619684082624545, "grad_norm": 0.8174078464508057, "learning_rate": 1.1880000000000001e-05, "loss": 0.183, "step": 3963 }, { "epoch": 9.622114216281895, "grad_norm": 0.8374642729759216, "learning_rate": 1.1883000000000001e-05, "loss": 0.1981, "step": 3964 }, { "epoch": 9.624544349939246, "grad_norm": 0.9992530941963196, "learning_rate": 1.1886e-05, "loss": 0.1212, "step": 3965 }, { "epoch": 9.626974483596598, "grad_norm": 0.6600720882415771, "learning_rate": 1.1889e-05, "loss": 0.116, "step": 3966 }, { "epoch": 9.629404617253948, "grad_norm": 0.5735585689544678, "learning_rate": 1.1891999999999999e-05, "loss": 0.0926, "step": 3967 }, { "epoch": 9.6318347509113, 
"grad_norm": 0.7419699430465698, "learning_rate": 1.1895e-05, "loss": 0.1042, "step": 3968 }, { "epoch": 9.634264884568651, "grad_norm": 0.6410108804702759, "learning_rate": 1.1898e-05, "loss": 0.0898, "step": 3969 }, { "epoch": 9.636695018226002, "grad_norm": 1.2784897089004517, "learning_rate": 1.1901e-05, "loss": 0.0807, "step": 3970 }, { "epoch": 9.639125151883354, "grad_norm": 0.8338420987129211, "learning_rate": 1.1904e-05, "loss": 0.0903, "step": 3971 }, { "epoch": 9.641555285540704, "grad_norm": 0.6453339457511902, "learning_rate": 1.1907e-05, "loss": 0.0866, "step": 3972 }, { "epoch": 9.643985419198057, "grad_norm": 0.6549910306930542, "learning_rate": 1.1910000000000001e-05, "loss": 0.0589, "step": 3973 }, { "epoch": 9.646415552855407, "grad_norm": 0.808856189250946, "learning_rate": 1.1913000000000001e-05, "loss": 0.0879, "step": 3974 }, { "epoch": 9.648845686512757, "grad_norm": 0.9454911351203918, "learning_rate": 1.1916e-05, "loss": 0.1018, "step": 3975 }, { "epoch": 9.65127582017011, "grad_norm": 0.6782306432723999, "learning_rate": 1.1919e-05, "loss": 0.0631, "step": 3976 }, { "epoch": 9.65370595382746, "grad_norm": 0.8668819069862366, "learning_rate": 1.1922e-05, "loss": 0.067, "step": 3977 }, { "epoch": 9.656136087484812, "grad_norm": 1.100614070892334, "learning_rate": 1.1925e-05, "loss": 0.0644, "step": 3978 }, { "epoch": 9.658566221142163, "grad_norm": 0.5416662096977234, "learning_rate": 1.1928e-05, "loss": 0.0449, "step": 3979 }, { "epoch": 9.660996354799513, "grad_norm": 1.6774275302886963, "learning_rate": 1.1931e-05, "loss": 0.1227, "step": 3980 }, { "epoch": 9.663426488456865, "grad_norm": 0.5479196906089783, "learning_rate": 1.1934e-05, "loss": 0.0603, "step": 3981 }, { "epoch": 9.665856622114216, "grad_norm": 0.5983408093452454, "learning_rate": 1.1937e-05, "loss": 0.0568, "step": 3982 }, { "epoch": 9.668286755771568, "grad_norm": 0.7951889038085938, "learning_rate": 1.1940000000000001e-05, "loss": 0.0641, "step": 3983 }, { "epoch": 
9.670716889428919, "grad_norm": 0.744257926940918, "learning_rate": 1.1943e-05, "loss": 0.1056, "step": 3984 }, { "epoch": 9.673147023086269, "grad_norm": 0.871935248374939, "learning_rate": 1.1946e-05, "loss": 0.0835, "step": 3985 }, { "epoch": 9.675577156743621, "grad_norm": 0.8176775574684143, "learning_rate": 1.1949e-05, "loss": 0.0627, "step": 3986 }, { "epoch": 9.678007290400972, "grad_norm": 0.612061619758606, "learning_rate": 1.1952e-05, "loss": 0.0514, "step": 3987 }, { "epoch": 9.680437424058324, "grad_norm": 0.7113280296325684, "learning_rate": 1.1955000000000002e-05, "loss": 0.0753, "step": 3988 }, { "epoch": 9.682867557715674, "grad_norm": 0.9346372485160828, "learning_rate": 1.1958000000000001e-05, "loss": 0.0693, "step": 3989 }, { "epoch": 9.685297691373025, "grad_norm": 1.0244131088256836, "learning_rate": 1.1961e-05, "loss": 0.0869, "step": 3990 }, { "epoch": 9.687727825030377, "grad_norm": 1.9670886993408203, "learning_rate": 1.1964e-05, "loss": 0.1181, "step": 3991 }, { "epoch": 9.690157958687728, "grad_norm": 1.0022163391113281, "learning_rate": 1.1966999999999999e-05, "loss": 0.0818, "step": 3992 }, { "epoch": 9.69258809234508, "grad_norm": 0.947953999042511, "learning_rate": 1.197e-05, "loss": 0.1054, "step": 3993 }, { "epoch": 9.69501822600243, "grad_norm": 0.7512049674987793, "learning_rate": 1.1973e-05, "loss": 0.0728, "step": 3994 }, { "epoch": 9.69744835965978, "grad_norm": 0.8740143179893494, "learning_rate": 1.1976e-05, "loss": 0.0585, "step": 3995 }, { "epoch": 9.699878493317133, "grad_norm": 0.7359869480133057, "learning_rate": 1.1979e-05, "loss": 0.0661, "step": 3996 }, { "epoch": 9.702308626974483, "grad_norm": 0.9360350370407104, "learning_rate": 1.1982e-05, "loss": 0.098, "step": 3997 }, { "epoch": 9.704738760631834, "grad_norm": 0.854098379611969, "learning_rate": 1.1985000000000001e-05, "loss": 0.0488, "step": 3998 }, { "epoch": 9.707168894289186, "grad_norm": 0.9849023222923279, "learning_rate": 1.1988000000000001e-05, "loss": 
0.1064, "step": 3999 }, { "epoch": 9.709599027946537, "grad_norm": 0.6695330142974854, "learning_rate": 1.1991000000000001e-05, "loss": 0.0731, "step": 4000 }, { "epoch": 9.709599027946537, "eval_cer": 0.10031322785318449, "eval_loss": 0.2828277349472046, "eval_runtime": 8.2642, "eval_samples_per_second": 12.221, "eval_steps_per_second": 0.484, "eval_wer": 0.31966490299823636, "step": 4000 }, { "epoch": 9.712029161603889, "grad_norm": 0.7885348796844482, "learning_rate": 1.1994e-05, "loss": 0.0924, "step": 4001 }, { "epoch": 9.71445929526124, "grad_norm": 0.8784909844398499, "learning_rate": 1.1996999999999999e-05, "loss": 0.077, "step": 4002 }, { "epoch": 9.716889428918591, "grad_norm": 1.095613718032837, "learning_rate": 1.2e-05, "loss": 0.0933, "step": 4003 }, { "epoch": 9.719319562575942, "grad_norm": 1.3100472688674927, "learning_rate": 1.2003e-05, "loss": 0.129, "step": 4004 }, { "epoch": 9.721749696233292, "grad_norm": 1.9011716842651367, "learning_rate": 1.2006e-05, "loss": 0.1495, "step": 4005 }, { "epoch": 9.724179829890645, "grad_norm": 1.7603285312652588, "learning_rate": 1.2009e-05, "loss": 0.1024, "step": 4006 }, { "epoch": 9.726609963547995, "grad_norm": 1.633337140083313, "learning_rate": 1.2012e-05, "loss": 0.1306, "step": 4007 }, { "epoch": 9.729040097205345, "grad_norm": 2.0464460849761963, "learning_rate": 1.2015000000000001e-05, "loss": 0.2001, "step": 4008 }, { "epoch": 9.731470230862698, "grad_norm": 2.059093475341797, "learning_rate": 1.2018e-05, "loss": 0.4094, "step": 4009 }, { "epoch": 9.733900364520048, "grad_norm": 1.6223270893096924, "learning_rate": 1.2021e-05, "loss": 0.3842, "step": 4010 }, { "epoch": 9.7363304981774, "grad_norm": 0.846379816532135, "learning_rate": 1.2024e-05, "loss": 0.2441, "step": 4011 }, { "epoch": 9.73876063183475, "grad_norm": 0.8960883021354675, "learning_rate": 1.2027e-05, "loss": 0.2198, "step": 4012 }, { "epoch": 9.741190765492101, "grad_norm": 1.2174370288848877, "learning_rate": 1.2030000000000002e-05, 
"loss": 0.2762, "step": 4013 }, { "epoch": 9.743620899149454, "grad_norm": 0.7714695930480957, "learning_rate": 1.2033000000000002e-05, "loss": 0.1558, "step": 4014 }, { "epoch": 9.746051032806804, "grad_norm": 0.7243995666503906, "learning_rate": 1.2036e-05, "loss": 0.141, "step": 4015 }, { "epoch": 9.748481166464156, "grad_norm": 0.9650675654411316, "learning_rate": 1.2039e-05, "loss": 0.172, "step": 4016 }, { "epoch": 9.750911300121507, "grad_norm": 0.8113147020339966, "learning_rate": 1.2042e-05, "loss": 0.1011, "step": 4017 }, { "epoch": 9.753341433778857, "grad_norm": 0.6532836556434631, "learning_rate": 1.2045e-05, "loss": 0.095, "step": 4018 }, { "epoch": 9.75577156743621, "grad_norm": 0.9101742506027222, "learning_rate": 1.2048e-05, "loss": 0.1035, "step": 4019 }, { "epoch": 9.75820170109356, "grad_norm": 0.5480958223342896, "learning_rate": 1.2051e-05, "loss": 0.0802, "step": 4020 }, { "epoch": 9.760631834750912, "grad_norm": 0.8144893050193787, "learning_rate": 1.2054e-05, "loss": 0.0722, "step": 4021 }, { "epoch": 9.763061968408262, "grad_norm": 0.6094329953193665, "learning_rate": 1.2057e-05, "loss": 0.0894, "step": 4022 }, { "epoch": 9.765492102065613, "grad_norm": 1.0910437107086182, "learning_rate": 1.2060000000000001e-05, "loss": 0.0894, "step": 4023 }, { "epoch": 9.767922235722965, "grad_norm": 0.7106759548187256, "learning_rate": 1.2063000000000001e-05, "loss": 0.0732, "step": 4024 }, { "epoch": 9.770352369380316, "grad_norm": 0.6392402648925781, "learning_rate": 1.2066000000000001e-05, "loss": 0.1038, "step": 4025 }, { "epoch": 9.772782503037668, "grad_norm": 0.6430336236953735, "learning_rate": 1.2069e-05, "loss": 0.0596, "step": 4026 }, { "epoch": 9.775212636695018, "grad_norm": 0.7856960892677307, "learning_rate": 1.2071999999999999e-05, "loss": 0.1272, "step": 4027 }, { "epoch": 9.777642770352369, "grad_norm": 1.0492945909500122, "learning_rate": 1.2075e-05, "loss": 0.0671, "step": 4028 }, { "epoch": 9.780072904009721, "grad_norm": 
0.66923588514328, "learning_rate": 1.2078e-05, "loss": 0.0879, "step": 4029 }, { "epoch": 9.782503037667071, "grad_norm": 0.7457530498504639, "learning_rate": 1.2081e-05, "loss": 0.0592, "step": 4030 }, { "epoch": 9.784933171324424, "grad_norm": 0.8737735748291016, "learning_rate": 1.2084e-05, "loss": 0.1045, "step": 4031 }, { "epoch": 9.787363304981774, "grad_norm": 0.5311715006828308, "learning_rate": 1.2087e-05, "loss": 0.0503, "step": 4032 }, { "epoch": 9.789793438639125, "grad_norm": 0.7600491642951965, "learning_rate": 1.2090000000000001e-05, "loss": 0.0726, "step": 4033 }, { "epoch": 9.792223572296477, "grad_norm": 1.3316890001296997, "learning_rate": 1.2093000000000001e-05, "loss": 0.1671, "step": 4034 }, { "epoch": 9.794653705953827, "grad_norm": 0.6906312108039856, "learning_rate": 1.2096e-05, "loss": 0.0524, "step": 4035 }, { "epoch": 9.79708383961118, "grad_norm": 0.7611827850341797, "learning_rate": 1.2099e-05, "loss": 0.0712, "step": 4036 }, { "epoch": 9.79951397326853, "grad_norm": 0.7847610116004944, "learning_rate": 1.2102e-05, "loss": 0.0895, "step": 4037 }, { "epoch": 9.80194410692588, "grad_norm": 0.5028907656669617, "learning_rate": 1.2105000000000002e-05, "loss": 0.0438, "step": 4038 }, { "epoch": 9.804374240583233, "grad_norm": 0.7710916996002197, "learning_rate": 1.2108e-05, "loss": 0.0528, "step": 4039 }, { "epoch": 9.806804374240583, "grad_norm": 1.0897681713104248, "learning_rate": 1.2111e-05, "loss": 0.088, "step": 4040 }, { "epoch": 9.809234507897933, "grad_norm": 0.8172774314880371, "learning_rate": 1.2114e-05, "loss": 0.0826, "step": 4041 }, { "epoch": 9.811664641555286, "grad_norm": 1.1620759963989258, "learning_rate": 1.2117e-05, "loss": 0.0753, "step": 4042 }, { "epoch": 9.814094775212636, "grad_norm": 0.9349687695503235, "learning_rate": 1.2120000000000001e-05, "loss": 0.0775, "step": 4043 }, { "epoch": 9.816524908869988, "grad_norm": 0.7666258811950684, "learning_rate": 1.2123e-05, "loss": 0.0754, "step": 4044 }, { "epoch": 
9.818955042527339, "grad_norm": 0.7745054960250854, "learning_rate": 1.2126e-05, "loss": 0.07, "step": 4045 }, { "epoch": 9.821385176184691, "grad_norm": 0.8087015151977539, "learning_rate": 1.2129e-05, "loss": 0.1043, "step": 4046 }, { "epoch": 9.823815309842042, "grad_norm": 0.6293290853500366, "learning_rate": 1.2132e-05, "loss": 0.0653, "step": 4047 }, { "epoch": 9.826245443499392, "grad_norm": 0.9123288989067078, "learning_rate": 1.2135000000000002e-05, "loss": 0.0876, "step": 4048 }, { "epoch": 9.828675577156744, "grad_norm": 0.8321337103843689, "learning_rate": 1.2138000000000001e-05, "loss": 0.0689, "step": 4049 }, { "epoch": 9.831105710814095, "grad_norm": 0.874648928642273, "learning_rate": 1.2141000000000001e-05, "loss": 0.0885, "step": 4050 }, { "epoch": 9.833535844471445, "grad_norm": 0.8251922726631165, "learning_rate": 1.2144e-05, "loss": 0.0479, "step": 4051 }, { "epoch": 9.835965978128797, "grad_norm": 1.0854803323745728, "learning_rate": 1.2146999999999999e-05, "loss": 0.1031, "step": 4052 }, { "epoch": 9.838396111786148, "grad_norm": 1.1135716438293457, "learning_rate": 1.215e-05, "loss": 0.1195, "step": 4053 }, { "epoch": 9.8408262454435, "grad_norm": 0.9583134651184082, "learning_rate": 1.2153e-05, "loss": 0.0671, "step": 4054 }, { "epoch": 9.84325637910085, "grad_norm": 2.4061269760131836, "learning_rate": 1.2156e-05, "loss": 0.1269, "step": 4055 }, { "epoch": 9.845686512758201, "grad_norm": 1.4005557298660278, "learning_rate": 1.2159e-05, "loss": 0.1181, "step": 4056 }, { "epoch": 9.848116646415553, "grad_norm": 1.3643583059310913, "learning_rate": 1.2162e-05, "loss": 0.1019, "step": 4057 }, { "epoch": 9.850546780072904, "grad_norm": 1.7559082508087158, "learning_rate": 1.2165000000000001e-05, "loss": 0.1293, "step": 4058 }, { "epoch": 9.852976913730256, "grad_norm": 2.5779097080230713, "learning_rate": 1.2168000000000001e-05, "loss": 0.4769, "step": 4059 }, { "epoch": 9.855407047387606, "grad_norm": 0.8714258074760437, "learning_rate": 
1.2171000000000001e-05, "loss": 0.2864, "step": 4060 }, { "epoch": 9.857837181044957, "grad_norm": 0.8575035929679871, "learning_rate": 1.2174e-05, "loss": 0.2809, "step": 4061 }, { "epoch": 9.860267314702309, "grad_norm": 0.9362747073173523, "learning_rate": 1.2177e-05, "loss": 0.1828, "step": 4062 }, { "epoch": 9.86269744835966, "grad_norm": 1.0450187921524048, "learning_rate": 1.2180000000000002e-05, "loss": 0.2097, "step": 4063 }, { "epoch": 9.865127582017012, "grad_norm": 0.8687231540679932, "learning_rate": 1.2183e-05, "loss": 0.1615, "step": 4064 }, { "epoch": 9.867557715674362, "grad_norm": 1.0193885564804077, "learning_rate": 1.2186e-05, "loss": 0.1516, "step": 4065 }, { "epoch": 9.869987849331713, "grad_norm": 0.9945717453956604, "learning_rate": 1.2189e-05, "loss": 0.1411, "step": 4066 }, { "epoch": 9.872417982989065, "grad_norm": 0.7079204320907593, "learning_rate": 1.2192e-05, "loss": 0.0923, "step": 4067 }, { "epoch": 9.874848116646415, "grad_norm": 0.6559435725212097, "learning_rate": 1.2195e-05, "loss": 0.0975, "step": 4068 }, { "epoch": 9.877278250303767, "grad_norm": 0.8030732870101929, "learning_rate": 1.2198e-05, "loss": 0.0865, "step": 4069 }, { "epoch": 9.879708383961118, "grad_norm": 0.7096171379089355, "learning_rate": 1.2201e-05, "loss": 0.0974, "step": 4070 }, { "epoch": 9.882138517618468, "grad_norm": 0.7418208718299866, "learning_rate": 1.2204e-05, "loss": 0.0923, "step": 4071 }, { "epoch": 9.88456865127582, "grad_norm": 0.9177073240280151, "learning_rate": 1.2207e-05, "loss": 0.0821, "step": 4072 }, { "epoch": 9.886998784933171, "grad_norm": 0.7734960317611694, "learning_rate": 1.221e-05, "loss": 0.0836, "step": 4073 }, { "epoch": 9.889428918590523, "grad_norm": 1.1521797180175781, "learning_rate": 1.2213000000000001e-05, "loss": 0.0946, "step": 4074 }, { "epoch": 9.891859052247874, "grad_norm": 0.8857397437095642, "learning_rate": 1.2216000000000001e-05, "loss": 0.0935, "step": 4075 }, { "epoch": 9.894289185905224, "grad_norm": 
0.9266164898872375, "learning_rate": 1.2219e-05, "loss": 0.1069, "step": 4076 }, { "epoch": 9.896719319562576, "grad_norm": 0.5907802581787109, "learning_rate": 1.2222e-05, "loss": 0.0498, "step": 4077 }, { "epoch": 9.899149453219927, "grad_norm": 0.9312072396278381, "learning_rate": 1.2224999999999999e-05, "loss": 0.0836, "step": 4078 }, { "epoch": 9.90157958687728, "grad_norm": 0.7070134282112122, "learning_rate": 1.2228e-05, "loss": 0.0667, "step": 4079 }, { "epoch": 9.90400972053463, "grad_norm": 0.6562714576721191, "learning_rate": 1.2231e-05, "loss": 0.0744, "step": 4080 }, { "epoch": 9.90643985419198, "grad_norm": 0.7530447840690613, "learning_rate": 1.2234e-05, "loss": 0.0614, "step": 4081 }, { "epoch": 9.908869987849332, "grad_norm": 0.6723609566688538, "learning_rate": 1.2237e-05, "loss": 0.0549, "step": 4082 }, { "epoch": 9.911300121506683, "grad_norm": 0.8016909956932068, "learning_rate": 1.224e-05, "loss": 0.069, "step": 4083 }, { "epoch": 9.913730255164033, "grad_norm": 0.7288452982902527, "learning_rate": 1.2243000000000001e-05, "loss": 0.059, "step": 4084 }, { "epoch": 9.916160388821385, "grad_norm": 0.8134220242500305, "learning_rate": 1.2246000000000001e-05, "loss": 0.0532, "step": 4085 }, { "epoch": 9.918590522478736, "grad_norm": 0.703211784362793, "learning_rate": 1.2249e-05, "loss": 0.0427, "step": 4086 }, { "epoch": 9.921020656136088, "grad_norm": 0.751253604888916, "learning_rate": 1.2252e-05, "loss": 0.057, "step": 4087 }, { "epoch": 9.923450789793439, "grad_norm": 0.8336164951324463, "learning_rate": 1.2254999999999999e-05, "loss": 0.0842, "step": 4088 }, { "epoch": 9.925880923450789, "grad_norm": 0.698402464389801, "learning_rate": 1.2258e-05, "loss": 0.072, "step": 4089 }, { "epoch": 9.928311057108141, "grad_norm": 0.649729311466217, "learning_rate": 1.2261e-05, "loss": 0.0693, "step": 4090 }, { "epoch": 9.930741190765492, "grad_norm": 0.7681673765182495, "learning_rate": 1.2264e-05, "loss": 0.0625, "step": 4091 }, { "epoch": 
9.933171324422844, "grad_norm": 0.8698141574859619, "learning_rate": 1.2267e-05, "loss": 0.0743, "step": 4092 }, { "epoch": 9.935601458080194, "grad_norm": 1.3191531896591187, "learning_rate": 1.227e-05, "loss": 0.0914, "step": 4093 }, { "epoch": 9.938031591737545, "grad_norm": 0.8680530786514282, "learning_rate": 1.2273000000000001e-05, "loss": 0.0691, "step": 4094 }, { "epoch": 9.940461725394897, "grad_norm": 0.8309135437011719, "learning_rate": 1.2276e-05, "loss": 0.0771, "step": 4095 }, { "epoch": 9.942891859052247, "grad_norm": 1.6422141790390015, "learning_rate": 1.2279e-05, "loss": 0.0729, "step": 4096 }, { "epoch": 9.9453219927096, "grad_norm": 0.8026317358016968, "learning_rate": 1.2282e-05, "loss": 0.0713, "step": 4097 }, { "epoch": 9.94775212636695, "grad_norm": 1.2384305000305176, "learning_rate": 1.2285e-05, "loss": 0.0851, "step": 4098 }, { "epoch": 9.9501822600243, "grad_norm": 0.7599031925201416, "learning_rate": 1.2288000000000002e-05, "loss": 0.0882, "step": 4099 }, { "epoch": 9.952612393681653, "grad_norm": 0.8594346046447754, "learning_rate": 1.2291000000000001e-05, "loss": 0.0814, "step": 4100 }, { "epoch": 9.955042527339003, "grad_norm": 0.8090288639068604, "learning_rate": 1.2294e-05, "loss": 0.0694, "step": 4101 }, { "epoch": 9.957472660996356, "grad_norm": 1.1138737201690674, "learning_rate": 1.2297e-05, "loss": 0.0941, "step": 4102 }, { "epoch": 9.959902794653706, "grad_norm": 1.2144750356674194, "learning_rate": 1.2299999999999999e-05, "loss": 0.1097, "step": 4103 }, { "epoch": 9.962332928311056, "grad_norm": 1.0028700828552246, "learning_rate": 1.2303e-05, "loss": 0.0817, "step": 4104 }, { "epoch": 9.964763061968409, "grad_norm": 1.0064047574996948, "learning_rate": 1.2306e-05, "loss": 0.0875, "step": 4105 }, { "epoch": 9.96719319562576, "grad_norm": 1.1194124221801758, "learning_rate": 1.2309e-05, "loss": 0.1229, "step": 4106 }, { "epoch": 9.969623329283111, "grad_norm": 1.451069712638855, "learning_rate": 1.2312e-05, "loss": 0.1115, 
"step": 4107 }, { "epoch": 9.972053462940462, "grad_norm": 2.1380860805511475, "learning_rate": 1.2315e-05, "loss": 0.2424, "step": 4108 }, { "epoch": 9.974483596597812, "grad_norm": 1.512285828590393, "learning_rate": 1.2318000000000001e-05, "loss": 0.3118, "step": 4109 }, { "epoch": 9.976913730255164, "grad_norm": 0.8621230125427246, "learning_rate": 1.2321000000000001e-05, "loss": 0.1657, "step": 4110 }, { "epoch": 9.979343863912515, "grad_norm": 0.8132374286651611, "learning_rate": 1.2324000000000001e-05, "loss": 0.0813, "step": 4111 }, { "epoch": 9.981773997569867, "grad_norm": 0.5776856541633606, "learning_rate": 1.2327e-05, "loss": 0.0787, "step": 4112 }, { "epoch": 9.984204131227218, "grad_norm": 0.71916663646698, "learning_rate": 1.2329999999999999e-05, "loss": 0.0768, "step": 4113 }, { "epoch": 9.986634264884568, "grad_norm": 0.8414753079414368, "learning_rate": 1.2333e-05, "loss": 0.086, "step": 4114 }, { "epoch": 9.98906439854192, "grad_norm": 0.729172945022583, "learning_rate": 1.2336e-05, "loss": 0.0623, "step": 4115 }, { "epoch": 9.99149453219927, "grad_norm": 0.9441060423851013, "learning_rate": 1.2339e-05, "loss": 0.096, "step": 4116 }, { "epoch": 9.993924665856621, "grad_norm": 0.7184531092643738, "learning_rate": 1.2342e-05, "loss": 0.0779, "step": 4117 }, { "epoch": 9.996354799513973, "grad_norm": 0.8321018815040588, "learning_rate": 1.2345e-05, "loss": 0.0857, "step": 4118 }, { "epoch": 9.998784933171324, "grad_norm": 1.0477559566497803, "learning_rate": 1.2348000000000001e-05, "loss": 0.1019, "step": 4119 }, { "epoch": 10.0, "grad_norm": 1.0691829919815063, "learning_rate": 1.2351e-05, "loss": 0.0757, "step": 4120 }, { "epoch": 10.00243013365735, "grad_norm": 3.846013069152832, "learning_rate": 1.2354e-05, "loss": 0.4014, "step": 4121 }, { "epoch": 10.004860267314703, "grad_norm": 1.2860736846923828, "learning_rate": 1.2357e-05, "loss": 0.2978, "step": 4122 }, { "epoch": 10.007290400972053, "grad_norm": 1.1688238382339478, "learning_rate": 
1.236e-05, "loss": 0.2357, "step": 4123 }, { "epoch": 10.009720534629405, "grad_norm": 0.7874366641044617, "learning_rate": 1.2363000000000002e-05, "loss": 0.2261, "step": 4124 }, { "epoch": 10.012150668286756, "grad_norm": 0.767211377620697, "learning_rate": 1.2366e-05, "loss": 0.1688, "step": 4125 }, { "epoch": 10.014580801944106, "grad_norm": 0.753541111946106, "learning_rate": 1.2369e-05, "loss": 0.1476, "step": 4126 }, { "epoch": 10.017010935601458, "grad_norm": 0.8497649431228638, "learning_rate": 1.2372e-05, "loss": 0.1204, "step": 4127 }, { "epoch": 10.019441069258809, "grad_norm": 0.6650218963623047, "learning_rate": 1.2375e-05, "loss": 0.1099, "step": 4128 }, { "epoch": 10.021871202916161, "grad_norm": 1.1516717672348022, "learning_rate": 1.2378e-05, "loss": 0.1051, "step": 4129 }, { "epoch": 10.024301336573512, "grad_norm": 0.5846656560897827, "learning_rate": 1.2381e-05, "loss": 0.109, "step": 4130 }, { "epoch": 10.026731470230862, "grad_norm": 0.9949701428413391, "learning_rate": 1.2384e-05, "loss": 0.0998, "step": 4131 }, { "epoch": 10.029161603888214, "grad_norm": 0.6824644804000854, "learning_rate": 1.2387e-05, "loss": 0.0862, "step": 4132 }, { "epoch": 10.031591737545565, "grad_norm": 0.760601282119751, "learning_rate": 1.239e-05, "loss": 0.0839, "step": 4133 }, { "epoch": 10.034021871202917, "grad_norm": 0.6288650631904602, "learning_rate": 1.2393000000000001e-05, "loss": 0.0554, "step": 4134 }, { "epoch": 10.036452004860267, "grad_norm": 0.6393222808837891, "learning_rate": 1.2396000000000001e-05, "loss": 0.0655, "step": 4135 }, { "epoch": 10.038882138517618, "grad_norm": 0.5841144919395447, "learning_rate": 1.2399000000000001e-05, "loss": 0.0611, "step": 4136 }, { "epoch": 10.04131227217497, "grad_norm": 0.681143581867218, "learning_rate": 1.2402e-05, "loss": 0.0741, "step": 4137 }, { "epoch": 10.04374240583232, "grad_norm": 0.7326239347457886, "learning_rate": 1.2404999999999999e-05, "loss": 0.1297, "step": 4138 }, { "epoch": 
10.046172539489673, "grad_norm": 0.6161578893661499, "learning_rate": 1.2408e-05, "loss": 0.0588, "step": 4139 }, { "epoch": 10.048602673147023, "grad_norm": 0.7169691324234009, "learning_rate": 1.2411e-05, "loss": 0.0903, "step": 4140 }, { "epoch": 10.051032806804374, "grad_norm": 0.6986242532730103, "learning_rate": 1.2414e-05, "loss": 0.0696, "step": 4141 }, { "epoch": 10.053462940461726, "grad_norm": 0.7029225826263428, "learning_rate": 1.2417e-05, "loss": 0.0711, "step": 4142 }, { "epoch": 10.055893074119076, "grad_norm": 0.9191884398460388, "learning_rate": 1.242e-05, "loss": 0.0876, "step": 4143 }, { "epoch": 10.058323207776427, "grad_norm": 0.5137796998023987, "learning_rate": 1.2423000000000001e-05, "loss": 0.0507, "step": 4144 }, { "epoch": 10.060753341433779, "grad_norm": 0.6931435465812683, "learning_rate": 1.2426000000000001e-05, "loss": 0.095, "step": 4145 }, { "epoch": 10.06318347509113, "grad_norm": 0.6814424991607666, "learning_rate": 1.2429e-05, "loss": 0.0574, "step": 4146 }, { "epoch": 10.065613608748482, "grad_norm": 0.8171980381011963, "learning_rate": 1.2432e-05, "loss": 0.137, "step": 4147 }, { "epoch": 10.068043742405832, "grad_norm": 0.7642788887023926, "learning_rate": 1.2435e-05, "loss": 0.0583, "step": 4148 }, { "epoch": 10.070473876063183, "grad_norm": 0.6366555094718933, "learning_rate": 1.2438000000000002e-05, "loss": 0.0521, "step": 4149 }, { "epoch": 10.072904009720535, "grad_norm": 0.937148928642273, "learning_rate": 1.2441e-05, "loss": 0.0564, "step": 4150 }, { "epoch": 10.075334143377885, "grad_norm": 0.7228671312332153, "learning_rate": 1.2444e-05, "loss": 0.0567, "step": 4151 }, { "epoch": 10.077764277035238, "grad_norm": 1.0396548509597778, "learning_rate": 1.2447e-05, "loss": 0.0442, "step": 4152 }, { "epoch": 10.080194410692588, "grad_norm": 0.8016345500946045, "learning_rate": 1.245e-05, "loss": 0.0497, "step": 4153 }, { "epoch": 10.082624544349938, "grad_norm": 0.7056156396865845, "learning_rate": 1.2453000000000001e-05, 
"loss": 0.0579, "step": 4154 }, { "epoch": 10.08505467800729, "grad_norm": 0.6976181864738464, "learning_rate": 1.2456e-05, "loss": 0.0514, "step": 4155 }, { "epoch": 10.087484811664641, "grad_norm": 0.7447927594184875, "learning_rate": 1.2459e-05, "loss": 0.0788, "step": 4156 }, { "epoch": 10.089914945321993, "grad_norm": 1.8395562171936035, "learning_rate": 1.2462e-05, "loss": 0.1239, "step": 4157 }, { "epoch": 10.092345078979344, "grad_norm": 1.1037226915359497, "learning_rate": 1.2465e-05, "loss": 0.0521, "step": 4158 }, { "epoch": 10.094775212636694, "grad_norm": 0.7947566509246826, "learning_rate": 1.2468000000000002e-05, "loss": 0.0751, "step": 4159 }, { "epoch": 10.097205346294047, "grad_norm": 0.9924338459968567, "learning_rate": 1.2471000000000001e-05, "loss": 0.0774, "step": 4160 }, { "epoch": 10.099635479951397, "grad_norm": 0.9319251179695129, "learning_rate": 1.2474000000000001e-05, "loss": 0.0824, "step": 4161 }, { "epoch": 10.10206561360875, "grad_norm": 0.6886150240898132, "learning_rate": 1.2477e-05, "loss": 0.0432, "step": 4162 }, { "epoch": 10.1044957472661, "grad_norm": 0.5785420536994934, "learning_rate": 1.2479999999999999e-05, "loss": 0.0552, "step": 4163 }, { "epoch": 10.10692588092345, "grad_norm": 0.7354112267494202, "learning_rate": 1.2483e-05, "loss": 0.0949, "step": 4164 }, { "epoch": 10.109356014580802, "grad_norm": 0.8507426381111145, "learning_rate": 1.2486e-05, "loss": 0.0668, "step": 4165 }, { "epoch": 10.111786148238153, "grad_norm": 0.9731454849243164, "learning_rate": 1.2489e-05, "loss": 0.096, "step": 4166 }, { "epoch": 10.114216281895505, "grad_norm": 1.1207867860794067, "learning_rate": 1.2492e-05, "loss": 0.0804, "step": 4167 }, { "epoch": 10.116646415552855, "grad_norm": 1.4569486379623413, "learning_rate": 1.2495e-05, "loss": 0.0999, "step": 4168 }, { "epoch": 10.119076549210206, "grad_norm": 1.3076386451721191, "learning_rate": 1.2498000000000001e-05, "loss": 0.1249, "step": 4169 }, { "epoch": 10.121506682867558, 
"grad_norm": 1.98964524269104, "learning_rate": 1.2501000000000001e-05, "loss": 0.1718, "step": 4170 }, { "epoch": 10.123936816524909, "grad_norm": 1.30082106590271, "learning_rate": 1.2504000000000001e-05, "loss": 0.3714, "step": 4171 }, { "epoch": 10.12636695018226, "grad_norm": 0.9123899340629578, "learning_rate": 1.2507e-05, "loss": 0.2898, "step": 4172 }, { "epoch": 10.128797083839611, "grad_norm": 0.9613159894943237, "learning_rate": 1.251e-05, "loss": 0.2298, "step": 4173 }, { "epoch": 10.131227217496962, "grad_norm": 1.0202299356460571, "learning_rate": 1.2513e-05, "loss": 0.2218, "step": 4174 }, { "epoch": 10.133657351154314, "grad_norm": 0.8749375939369202, "learning_rate": 1.2516e-05, "loss": 0.2939, "step": 4175 }, { "epoch": 10.136087484811664, "grad_norm": 0.7908825278282166, "learning_rate": 1.2519e-05, "loss": 0.1481, "step": 4176 }, { "epoch": 10.138517618469017, "grad_norm": 1.1506491899490356, "learning_rate": 1.2522e-05, "loss": 0.1392, "step": 4177 }, { "epoch": 10.140947752126367, "grad_norm": 0.8160725831985474, "learning_rate": 1.2525e-05, "loss": 0.0958, "step": 4178 }, { "epoch": 10.143377885783718, "grad_norm": 0.6868699789047241, "learning_rate": 1.2528000000000001e-05, "loss": 0.0928, "step": 4179 }, { "epoch": 10.14580801944107, "grad_norm": 0.6016743779182434, "learning_rate": 1.2531e-05, "loss": 0.0795, "step": 4180 }, { "epoch": 10.14823815309842, "grad_norm": 0.5461216568946838, "learning_rate": 1.2534e-05, "loss": 0.072, "step": 4181 }, { "epoch": 10.15066828675577, "grad_norm": 1.049648404121399, "learning_rate": 1.2537e-05, "loss": 0.0857, "step": 4182 }, { "epoch": 10.153098420413123, "grad_norm": 0.7132783532142639, "learning_rate": 1.254e-05, "loss": 0.0732, "step": 4183 }, { "epoch": 10.155528554070473, "grad_norm": 0.5842369198799133, "learning_rate": 1.2543000000000002e-05, "loss": 0.0677, "step": 4184 }, { "epoch": 10.157958687727826, "grad_norm": 0.5733945965766907, "learning_rate": 1.2546000000000002e-05, "loss": 
0.0701, "step": 4185 }, { "epoch": 10.160388821385176, "grad_norm": 0.60147625207901, "learning_rate": 1.2549000000000001e-05, "loss": 0.0701, "step": 4186 }, { "epoch": 10.162818955042527, "grad_norm": 0.6172634363174438, "learning_rate": 1.2552e-05, "loss": 0.0707, "step": 4187 }, { "epoch": 10.165249088699879, "grad_norm": 0.707711935043335, "learning_rate": 1.2555e-05, "loss": 0.0736, "step": 4188 }, { "epoch": 10.16767922235723, "grad_norm": 0.4868810772895813, "learning_rate": 1.2558e-05, "loss": 0.0473, "step": 4189 }, { "epoch": 10.170109356014581, "grad_norm": 0.5305838584899902, "learning_rate": 1.2561e-05, "loss": 0.0571, "step": 4190 }, { "epoch": 10.172539489671932, "grad_norm": 0.7500386834144592, "learning_rate": 1.2564e-05, "loss": 0.0693, "step": 4191 }, { "epoch": 10.174969623329282, "grad_norm": 0.6053832769393921, "learning_rate": 1.2567e-05, "loss": 0.0608, "step": 4192 }, { "epoch": 10.177399756986635, "grad_norm": 0.5559926629066467, "learning_rate": 1.257e-05, "loss": 0.0626, "step": 4193 }, { "epoch": 10.179829890643985, "grad_norm": 0.781096875667572, "learning_rate": 1.2573e-05, "loss": 0.0525, "step": 4194 }, { "epoch": 10.182260024301337, "grad_norm": 0.7093937397003174, "learning_rate": 1.2576000000000001e-05, "loss": 0.0704, "step": 4195 }, { "epoch": 10.184690157958688, "grad_norm": 0.7365634441375732, "learning_rate": 1.2579000000000001e-05, "loss": 0.0895, "step": 4196 }, { "epoch": 10.187120291616038, "grad_norm": 0.6916201710700989, "learning_rate": 1.2582e-05, "loss": 0.0674, "step": 4197 }, { "epoch": 10.18955042527339, "grad_norm": 0.6920248866081238, "learning_rate": 1.2585e-05, "loss": 0.0745, "step": 4198 }, { "epoch": 10.19198055893074, "grad_norm": 0.9182626605033875, "learning_rate": 1.2587999999999999e-05, "loss": 0.0863, "step": 4199 }, { "epoch": 10.194410692588093, "grad_norm": 0.6545429825782776, "learning_rate": 1.2591e-05, "loss": 0.0513, "step": 4200 }, { "epoch": 10.196840826245444, "grad_norm": 
0.611048698425293, "learning_rate": 1.2594e-05, "loss": 0.0666, "step": 4201 }, { "epoch": 10.199270959902794, "grad_norm": 0.6800433993339539, "learning_rate": 1.2597e-05, "loss": 0.0789, "step": 4202 }, { "epoch": 10.201701093560146, "grad_norm": 0.6321026682853699, "learning_rate": 1.26e-05, "loss": 0.0558, "step": 4203 }, { "epoch": 10.204131227217497, "grad_norm": 0.7163286209106445, "learning_rate": 1.2603e-05, "loss": 0.0695, "step": 4204 }, { "epoch": 10.206561360874849, "grad_norm": 0.7351863384246826, "learning_rate": 1.2606000000000001e-05, "loss": 0.0634, "step": 4205 }, { "epoch": 10.2089914945322, "grad_norm": 0.5209522247314453, "learning_rate": 1.2609e-05, "loss": 0.0455, "step": 4206 }, { "epoch": 10.21142162818955, "grad_norm": 0.6908193826675415, "learning_rate": 1.2612e-05, "loss": 0.0545, "step": 4207 }, { "epoch": 10.213851761846902, "grad_norm": 0.8099454045295715, "learning_rate": 1.2615e-05, "loss": 0.0786, "step": 4208 }, { "epoch": 10.216281895504252, "grad_norm": 1.126659631729126, "learning_rate": 1.2618e-05, "loss": 0.0758, "step": 4209 }, { "epoch": 10.218712029161605, "grad_norm": 0.7557768821716309, "learning_rate": 1.2621000000000002e-05, "loss": 0.0493, "step": 4210 }, { "epoch": 10.221142162818955, "grad_norm": 0.8970341682434082, "learning_rate": 1.2624e-05, "loss": 0.0643, "step": 4211 }, { "epoch": 10.223572296476306, "grad_norm": 1.2505683898925781, "learning_rate": 1.2627e-05, "loss": 0.0572, "step": 4212 }, { "epoch": 10.226002430133658, "grad_norm": 0.9959913492202759, "learning_rate": 1.263e-05, "loss": 0.0669, "step": 4213 }, { "epoch": 10.228432563791008, "grad_norm": 1.2058637142181396, "learning_rate": 1.2633e-05, "loss": 0.0868, "step": 4214 }, { "epoch": 10.23086269744836, "grad_norm": 0.834153950214386, "learning_rate": 1.2636e-05, "loss": 0.0698, "step": 4215 }, { "epoch": 10.233292831105711, "grad_norm": 0.9296355247497559, "learning_rate": 1.2639e-05, "loss": 0.0612, "step": 4216 }, { "epoch": 
10.235722964763061, "grad_norm": 1.1855281591415405, "learning_rate": 1.2642e-05, "loss": 0.093, "step": 4217 }, { "epoch": 10.238153098420414, "grad_norm": 1.6592620611190796, "learning_rate": 1.2645e-05, "loss": 0.1207, "step": 4218 }, { "epoch": 10.240583232077764, "grad_norm": 2.3727903366088867, "learning_rate": 1.2648e-05, "loss": 0.1224, "step": 4219 }, { "epoch": 10.243013365735115, "grad_norm": 2.0129363536834717, "learning_rate": 1.2651000000000001e-05, "loss": 0.228, "step": 4220 }, { "epoch": 10.245443499392467, "grad_norm": 1.055010199546814, "learning_rate": 1.2654000000000001e-05, "loss": 0.3331, "step": 4221 }, { "epoch": 10.247873633049817, "grad_norm": 0.8363828063011169, "learning_rate": 1.2657000000000001e-05, "loss": 0.3075, "step": 4222 }, { "epoch": 10.25030376670717, "grad_norm": 0.6751115322113037, "learning_rate": 1.2659999999999999e-05, "loss": 0.2477, "step": 4223 }, { "epoch": 10.25273390036452, "grad_norm": 0.641815185546875, "learning_rate": 1.2662999999999999e-05, "loss": 0.2114, "step": 4224 }, { "epoch": 10.25516403402187, "grad_norm": 0.7786287069320679, "learning_rate": 1.2666e-05, "loss": 0.1967, "step": 4225 }, { "epoch": 10.257594167679223, "grad_norm": 0.8264980316162109, "learning_rate": 1.2669e-05, "loss": 0.1764, "step": 4226 }, { "epoch": 10.260024301336573, "grad_norm": 0.7733672261238098, "learning_rate": 1.2672e-05, "loss": 0.1733, "step": 4227 }, { "epoch": 10.262454434993925, "grad_norm": 0.7003387212753296, "learning_rate": 1.2675e-05, "loss": 0.094, "step": 4228 }, { "epoch": 10.264884568651276, "grad_norm": 0.908356249332428, "learning_rate": 1.2678e-05, "loss": 0.1073, "step": 4229 }, { "epoch": 10.267314702308626, "grad_norm": 0.7386147975921631, "learning_rate": 1.2681000000000001e-05, "loss": 0.1077, "step": 4230 }, { "epoch": 10.269744835965978, "grad_norm": 0.5690781474113464, "learning_rate": 1.2684000000000001e-05, "loss": 0.0701, "step": 4231 }, { "epoch": 10.272174969623329, "grad_norm": 
0.7902666330337524, "learning_rate": 1.2687e-05, "loss": 0.0913, "step": 4232 }, { "epoch": 10.274605103280681, "grad_norm": 0.5892894268035889, "learning_rate": 1.269e-05, "loss": 0.064, "step": 4233 }, { "epoch": 10.277035236938032, "grad_norm": 0.7023167014122009, "learning_rate": 1.2693e-05, "loss": 0.0913, "step": 4234 }, { "epoch": 10.279465370595382, "grad_norm": 0.570422351360321, "learning_rate": 1.2696000000000002e-05, "loss": 0.0711, "step": 4235 }, { "epoch": 10.281895504252734, "grad_norm": 0.7939373254776001, "learning_rate": 1.2699e-05, "loss": 0.0709, "step": 4236 }, { "epoch": 10.284325637910085, "grad_norm": 0.6070003509521484, "learning_rate": 1.2702e-05, "loss": 0.0804, "step": 4237 }, { "epoch": 10.286755771567437, "grad_norm": 0.8488731384277344, "learning_rate": 1.2705e-05, "loss": 0.0951, "step": 4238 }, { "epoch": 10.289185905224787, "grad_norm": 0.6660283803939819, "learning_rate": 1.2708e-05, "loss": 0.0688, "step": 4239 }, { "epoch": 10.291616038882138, "grad_norm": 0.4576435089111328, "learning_rate": 1.2711e-05, "loss": 0.057, "step": 4240 }, { "epoch": 10.29404617253949, "grad_norm": 0.675860583782196, "learning_rate": 1.2714e-05, "loss": 0.0711, "step": 4241 }, { "epoch": 10.29647630619684, "grad_norm": 0.6125268340110779, "learning_rate": 1.2717e-05, "loss": 0.0729, "step": 4242 }, { "epoch": 10.298906439854193, "grad_norm": 0.7078626751899719, "learning_rate": 1.272e-05, "loss": 0.0678, "step": 4243 }, { "epoch": 10.301336573511543, "grad_norm": 0.5567758679389954, "learning_rate": 1.2723e-05, "loss": 0.0506, "step": 4244 }, { "epoch": 10.303766707168894, "grad_norm": 0.572856605052948, "learning_rate": 1.2726000000000001e-05, "loss": 0.0516, "step": 4245 }, { "epoch": 10.306196840826246, "grad_norm": 0.5600025653839111, "learning_rate": 1.2729000000000001e-05, "loss": 0.0497, "step": 4246 }, { "epoch": 10.308626974483596, "grad_norm": 0.705685019493103, "learning_rate": 1.2732000000000001e-05, "loss": 0.0584, "step": 4247 }, { 
"epoch": 10.311057108140949, "grad_norm": 0.6902216076850891, "learning_rate": 1.2735e-05, "loss": 0.074, "step": 4248 }, { "epoch": 10.313487241798299, "grad_norm": 0.5484153032302856, "learning_rate": 1.2737999999999999e-05, "loss": 0.0508, "step": 4249 }, { "epoch": 10.31591737545565, "grad_norm": 0.6271480917930603, "learning_rate": 1.2741e-05, "loss": 0.0561, "step": 4250 }, { "epoch": 10.318347509113002, "grad_norm": 1.204410195350647, "learning_rate": 1.2744e-05, "loss": 0.0928, "step": 4251 }, { "epoch": 10.320777642770352, "grad_norm": 1.716626763343811, "learning_rate": 1.2747e-05, "loss": 0.0807, "step": 4252 }, { "epoch": 10.323207776427704, "grad_norm": 0.6136549115180969, "learning_rate": 1.275e-05, "loss": 0.0473, "step": 4253 }, { "epoch": 10.325637910085055, "grad_norm": 0.6752810478210449, "learning_rate": 1.2753e-05, "loss": 0.0527, "step": 4254 }, { "epoch": 10.328068043742405, "grad_norm": 0.783905565738678, "learning_rate": 1.2756000000000001e-05, "loss": 0.0795, "step": 4255 }, { "epoch": 10.330498177399758, "grad_norm": 0.9270179271697998, "learning_rate": 1.2759000000000001e-05, "loss": 0.0889, "step": 4256 }, { "epoch": 10.332928311057108, "grad_norm": 0.8607389330863953, "learning_rate": 1.2762e-05, "loss": 0.0625, "step": 4257 }, { "epoch": 10.335358444714458, "grad_norm": 0.835658073425293, "learning_rate": 1.2765e-05, "loss": 0.042, "step": 4258 }, { "epoch": 10.33778857837181, "grad_norm": 0.786562979221344, "learning_rate": 1.2768e-05, "loss": 0.0572, "step": 4259 }, { "epoch": 10.340218712029161, "grad_norm": 0.8859125375747681, "learning_rate": 1.2771e-05, "loss": 0.1109, "step": 4260 }, { "epoch": 10.342648845686513, "grad_norm": 1.0422643423080444, "learning_rate": 1.2774e-05, "loss": 0.0685, "step": 4261 }, { "epoch": 10.345078979343864, "grad_norm": 0.6217149496078491, "learning_rate": 1.2777e-05, "loss": 0.062, "step": 4262 }, { "epoch": 10.347509113001214, "grad_norm": 0.5805233120918274, "learning_rate": 1.278e-05, "loss": 
0.0496, "step": 4263 }, { "epoch": 10.349939246658566, "grad_norm": 1.00359308719635, "learning_rate": 1.2783e-05, "loss": 0.108, "step": 4264 }, { "epoch": 10.352369380315917, "grad_norm": 1.1292548179626465, "learning_rate": 1.2786000000000001e-05, "loss": 0.09, "step": 4265 }, { "epoch": 10.35479951397327, "grad_norm": 1.075920820236206, "learning_rate": 1.2789e-05, "loss": 0.1014, "step": 4266 }, { "epoch": 10.35722964763062, "grad_norm": 1.2080352306365967, "learning_rate": 1.2792e-05, "loss": 0.106, "step": 4267 }, { "epoch": 10.35965978128797, "grad_norm": 1.2264249324798584, "learning_rate": 1.2795e-05, "loss": 0.0795, "step": 4268 }, { "epoch": 10.362089914945322, "grad_norm": 2.155771017074585, "learning_rate": 1.2798e-05, "loss": 0.1223, "step": 4269 }, { "epoch": 10.364520048602673, "grad_norm": 1.9630595445632935, "learning_rate": 1.2801000000000002e-05, "loss": 0.1488, "step": 4270 }, { "epoch": 10.366950182260025, "grad_norm": 2.131197452545166, "learning_rate": 1.2804000000000001e-05, "loss": 0.3799, "step": 4271 }, { "epoch": 10.369380315917375, "grad_norm": 1.0859004259109497, "learning_rate": 1.2807000000000001e-05, "loss": 0.3064, "step": 4272 }, { "epoch": 10.371810449574726, "grad_norm": 0.8567915558815002, "learning_rate": 1.281e-05, "loss": 0.2724, "step": 4273 }, { "epoch": 10.374240583232078, "grad_norm": 0.8366007208824158, "learning_rate": 1.2812999999999999e-05, "loss": 0.192, "step": 4274 }, { "epoch": 10.376670716889429, "grad_norm": 1.114949345588684, "learning_rate": 1.2816e-05, "loss": 0.2008, "step": 4275 }, { "epoch": 10.37910085054678, "grad_norm": 0.9048606157302856, "learning_rate": 1.2819e-05, "loss": 0.1356, "step": 4276 }, { "epoch": 10.381530984204131, "grad_norm": 1.2858774662017822, "learning_rate": 1.2822e-05, "loss": 0.1796, "step": 4277 }, { "epoch": 10.383961117861482, "grad_norm": 0.8448458909988403, "learning_rate": 1.2825e-05, "loss": 0.1408, "step": 4278 }, { "epoch": 10.386391251518834, "grad_norm": 
0.670690655708313, "learning_rate": 1.2828e-05, "loss": 0.0945, "step": 4279 }, { "epoch": 10.388821385176184, "grad_norm": 0.5129196643829346, "learning_rate": 1.2831000000000001e-05, "loss": 0.0734, "step": 4280 }, { "epoch": 10.391251518833537, "grad_norm": 0.633441686630249, "learning_rate": 1.2834000000000001e-05, "loss": 0.0648, "step": 4281 }, { "epoch": 10.393681652490887, "grad_norm": 0.6331435441970825, "learning_rate": 1.2837000000000001e-05, "loss": 0.0657, "step": 4282 }, { "epoch": 10.396111786148237, "grad_norm": 0.8106446266174316, "learning_rate": 1.284e-05, "loss": 0.0897, "step": 4283 }, { "epoch": 10.39854191980559, "grad_norm": 0.7609205842018127, "learning_rate": 1.2843e-05, "loss": 0.078, "step": 4284 }, { "epoch": 10.40097205346294, "grad_norm": 0.8587610721588135, "learning_rate": 1.2846e-05, "loss": 0.0614, "step": 4285 }, { "epoch": 10.403402187120292, "grad_norm": 0.8214881420135498, "learning_rate": 1.2849e-05, "loss": 0.0547, "step": 4286 }, { "epoch": 10.405832320777643, "grad_norm": 0.6494355797767639, "learning_rate": 1.2852e-05, "loss": 0.0641, "step": 4287 }, { "epoch": 10.408262454434993, "grad_norm": 0.5112002491950989, "learning_rate": 1.2855e-05, "loss": 0.0645, "step": 4288 }, { "epoch": 10.410692588092346, "grad_norm": 0.9755772352218628, "learning_rate": 1.2858e-05, "loss": 0.0658, "step": 4289 }, { "epoch": 10.413122721749696, "grad_norm": 0.5253199338912964, "learning_rate": 1.2861000000000001e-05, "loss": 0.0554, "step": 4290 }, { "epoch": 10.415552855407048, "grad_norm": 0.529179036617279, "learning_rate": 1.2864e-05, "loss": 0.0452, "step": 4291 }, { "epoch": 10.417982989064399, "grad_norm": 0.8799165487289429, "learning_rate": 1.2867e-05, "loss": 0.0807, "step": 4292 }, { "epoch": 10.42041312272175, "grad_norm": 0.999807596206665, "learning_rate": 1.287e-05, "loss": 0.1562, "step": 4293 }, { "epoch": 10.422843256379101, "grad_norm": 0.8203343749046326, "learning_rate": 1.2873e-05, "loss": 0.0776, "step": 4294 }, { 
"epoch": 10.425273390036452, "grad_norm": 0.6402435898780823, "learning_rate": 1.2876000000000002e-05, "loss": 0.0649, "step": 4295 }, { "epoch": 10.427703523693804, "grad_norm": 0.5687727928161621, "learning_rate": 1.2879000000000002e-05, "loss": 0.0527, "step": 4296 }, { "epoch": 10.430133657351154, "grad_norm": 0.7009173631668091, "learning_rate": 1.2882e-05, "loss": 0.0683, "step": 4297 }, { "epoch": 10.432563791008505, "grad_norm": 0.7958319187164307, "learning_rate": 1.2885e-05, "loss": 0.0437, "step": 4298 }, { "epoch": 10.434993924665857, "grad_norm": 0.6361892819404602, "learning_rate": 1.2888e-05, "loss": 0.0695, "step": 4299 }, { "epoch": 10.437424058323208, "grad_norm": 0.6560825109481812, "learning_rate": 1.2891e-05, "loss": 0.0506, "step": 4300 }, { "epoch": 10.439854191980558, "grad_norm": 1.0755890607833862, "learning_rate": 1.2894e-05, "loss": 0.0731, "step": 4301 }, { "epoch": 10.44228432563791, "grad_norm": 0.6724491715431213, "learning_rate": 1.2897e-05, "loss": 0.0623, "step": 4302 }, { "epoch": 10.44471445929526, "grad_norm": 0.9407429099082947, "learning_rate": 1.29e-05, "loss": 0.055, "step": 4303 }, { "epoch": 10.447144592952613, "grad_norm": 0.7460927963256836, "learning_rate": 1.2903e-05, "loss": 0.0626, "step": 4304 }, { "epoch": 10.449574726609963, "grad_norm": 0.694618284702301, "learning_rate": 1.2906000000000001e-05, "loss": 0.0826, "step": 4305 }, { "epoch": 10.452004860267314, "grad_norm": 0.7978695631027222, "learning_rate": 1.2909000000000001e-05, "loss": 0.0921, "step": 4306 }, { "epoch": 10.454434993924666, "grad_norm": 0.8356127142906189, "learning_rate": 1.2912000000000001e-05, "loss": 0.0656, "step": 4307 }, { "epoch": 10.456865127582017, "grad_norm": 0.9445888996124268, "learning_rate": 1.2915000000000001e-05, "loss": 0.0537, "step": 4308 }, { "epoch": 10.459295261239369, "grad_norm": 0.9544053077697754, "learning_rate": 1.2917999999999999e-05, "loss": 0.0955, "step": 4309 }, { "epoch": 10.46172539489672, "grad_norm": 
0.7196928262710571, "learning_rate": 1.2921e-05, "loss": 0.0525, "step": 4310 }, { "epoch": 10.46415552855407, "grad_norm": 0.9055981040000916, "learning_rate": 1.2924e-05, "loss": 0.0619, "step": 4311 }, { "epoch": 10.466585662211422, "grad_norm": 0.9249035120010376, "learning_rate": 1.2927e-05, "loss": 0.0691, "step": 4312 }, { "epoch": 10.469015795868772, "grad_norm": 1.795716404914856, "learning_rate": 1.293e-05, "loss": 0.102, "step": 4313 }, { "epoch": 10.471445929526125, "grad_norm": 1.007365345954895, "learning_rate": 1.2933e-05, "loss": 0.1438, "step": 4314 }, { "epoch": 10.473876063183475, "grad_norm": 0.847745418548584, "learning_rate": 1.2936000000000001e-05, "loss": 0.0822, "step": 4315 }, { "epoch": 10.476306196840826, "grad_norm": 1.1423629522323608, "learning_rate": 1.2939000000000001e-05, "loss": 0.0835, "step": 4316 }, { "epoch": 10.478736330498178, "grad_norm": 0.748823344707489, "learning_rate": 1.2942e-05, "loss": 0.0861, "step": 4317 }, { "epoch": 10.481166464155528, "grad_norm": 0.7640035152435303, "learning_rate": 1.2945e-05, "loss": 0.081, "step": 4318 }, { "epoch": 10.48359659781288, "grad_norm": 1.3828707933425903, "learning_rate": 1.2948e-05, "loss": 0.0977, "step": 4319 }, { "epoch": 10.486026731470231, "grad_norm": 2.235599994659424, "learning_rate": 1.2951e-05, "loss": 0.2295, "step": 4320 }, { "epoch": 10.488456865127581, "grad_norm": 1.7217297554016113, "learning_rate": 1.2954000000000002e-05, "loss": 0.3746, "step": 4321 }, { "epoch": 10.490886998784934, "grad_norm": 1.1380938291549683, "learning_rate": 1.2957e-05, "loss": 0.3089, "step": 4322 }, { "epoch": 10.493317132442284, "grad_norm": 0.9486186504364014, "learning_rate": 1.296e-05, "loss": 0.2493, "step": 4323 }, { "epoch": 10.495747266099636, "grad_norm": 0.9247182607650757, "learning_rate": 1.2963e-05, "loss": 0.2632, "step": 4324 }, { "epoch": 10.498177399756987, "grad_norm": 0.8593354821205139, "learning_rate": 1.2966e-05, "loss": 0.196, "step": 4325 }, { "epoch": 
10.500607533414337, "grad_norm": 0.956272542476654, "learning_rate": 1.2969e-05, "loss": 0.1668, "step": 4326 }, { "epoch": 10.50303766707169, "grad_norm": 0.62644362449646, "learning_rate": 1.2972e-05, "loss": 0.127, "step": 4327 }, { "epoch": 10.50546780072904, "grad_norm": 1.4610912799835205, "learning_rate": 1.2975e-05, "loss": 0.1544, "step": 4328 }, { "epoch": 10.507897934386392, "grad_norm": 0.8671621084213257, "learning_rate": 1.2978e-05, "loss": 0.1169, "step": 4329 }, { "epoch": 10.510328068043743, "grad_norm": 0.8207100629806519, "learning_rate": 1.2981e-05, "loss": 0.1406, "step": 4330 }, { "epoch": 10.512758201701093, "grad_norm": 0.6328557729721069, "learning_rate": 1.2984000000000001e-05, "loss": 0.0793, "step": 4331 }, { "epoch": 10.515188335358445, "grad_norm": 0.639218270778656, "learning_rate": 1.2987000000000001e-05, "loss": 0.0772, "step": 4332 }, { "epoch": 10.517618469015796, "grad_norm": 0.7359764575958252, "learning_rate": 1.2990000000000001e-05, "loss": 0.0672, "step": 4333 }, { "epoch": 10.520048602673146, "grad_norm": 0.7601892352104187, "learning_rate": 1.2992999999999999e-05, "loss": 0.0876, "step": 4334 }, { "epoch": 10.522478736330498, "grad_norm": 0.6847090721130371, "learning_rate": 1.2995999999999999e-05, "loss": 0.0822, "step": 4335 }, { "epoch": 10.524908869987849, "grad_norm": 0.5369909405708313, "learning_rate": 1.2999e-05, "loss": 0.0527, "step": 4336 }, { "epoch": 10.527339003645201, "grad_norm": 0.771131694316864, "learning_rate": 1.3002e-05, "loss": 0.0926, "step": 4337 }, { "epoch": 10.529769137302551, "grad_norm": 0.7128024697303772, "learning_rate": 1.3005e-05, "loss": 0.0867, "step": 4338 }, { "epoch": 10.532199270959904, "grad_norm": 0.4846346080303192, "learning_rate": 1.3008e-05, "loss": 0.0468, "step": 4339 }, { "epoch": 10.534629404617254, "grad_norm": 0.78702312707901, "learning_rate": 1.3011e-05, "loss": 0.0585, "step": 4340 }, { "epoch": 10.537059538274605, "grad_norm": 0.7211533784866333, "learning_rate": 
1.3014000000000001e-05, "loss": 0.0408, "step": 4341 }, { "epoch": 10.539489671931957, "grad_norm": 0.7701517343521118, "learning_rate": 1.3017000000000001e-05, "loss": 0.0679, "step": 4342 }, { "epoch": 10.541919805589307, "grad_norm": 0.7890806794166565, "learning_rate": 1.302e-05, "loss": 0.0857, "step": 4343 }, { "epoch": 10.544349939246658, "grad_norm": 0.6884579062461853, "learning_rate": 1.3023e-05, "loss": 0.0718, "step": 4344 }, { "epoch": 10.54678007290401, "grad_norm": 0.7437359094619751, "learning_rate": 1.3026e-05, "loss": 0.0657, "step": 4345 }, { "epoch": 10.54921020656136, "grad_norm": 1.01282799243927, "learning_rate": 1.3029e-05, "loss": 0.1491, "step": 4346 }, { "epoch": 10.551640340218713, "grad_norm": 0.6589197516441345, "learning_rate": 1.3032e-05, "loss": 0.055, "step": 4347 }, { "epoch": 10.554070473876063, "grad_norm": 0.7649976015090942, "learning_rate": 1.3035e-05, "loss": 0.0626, "step": 4348 }, { "epoch": 10.556500607533414, "grad_norm": 0.5895805358886719, "learning_rate": 1.3038e-05, "loss": 0.0531, "step": 4349 }, { "epoch": 10.558930741190766, "grad_norm": 0.5259085297584534, "learning_rate": 1.3041e-05, "loss": 0.041, "step": 4350 }, { "epoch": 10.561360874848116, "grad_norm": 0.6405687928199768, "learning_rate": 1.3044e-05, "loss": 0.048, "step": 4351 }, { "epoch": 10.563791008505468, "grad_norm": 0.52789705991745, "learning_rate": 1.3047e-05, "loss": 0.0429, "step": 4352 }, { "epoch": 10.566221142162819, "grad_norm": 0.8160581588745117, "learning_rate": 1.305e-05, "loss": 0.0853, "step": 4353 }, { "epoch": 10.56865127582017, "grad_norm": 0.8227992057800293, "learning_rate": 1.3053e-05, "loss": 0.0725, "step": 4354 }, { "epoch": 10.571081409477522, "grad_norm": 1.0421736240386963, "learning_rate": 1.3056e-05, "loss": 0.073, "step": 4355 }, { "epoch": 10.573511543134872, "grad_norm": 1.3437597751617432, "learning_rate": 1.3059000000000002e-05, "loss": 0.1306, "step": 4356 }, { "epoch": 10.575941676792224, "grad_norm": 
0.6587740182876587, "learning_rate": 1.3062000000000001e-05, "loss": 0.0529, "step": 4357 }, { "epoch": 10.578371810449575, "grad_norm": 0.741858959197998, "learning_rate": 1.3065000000000001e-05, "loss": 0.0616, "step": 4358 }, { "epoch": 10.580801944106925, "grad_norm": 1.4503802061080933, "learning_rate": 1.3068e-05, "loss": 0.0845, "step": 4359 }, { "epoch": 10.583232077764277, "grad_norm": 0.654259443283081, "learning_rate": 1.3070999999999999e-05, "loss": 0.0443, "step": 4360 }, { "epoch": 10.585662211421628, "grad_norm": 0.6310545802116394, "learning_rate": 1.3074e-05, "loss": 0.0466, "step": 4361 }, { "epoch": 10.58809234507898, "grad_norm": 1.2866687774658203, "learning_rate": 1.3077e-05, "loss": 0.0829, "step": 4362 }, { "epoch": 10.59052247873633, "grad_norm": 0.8888977766036987, "learning_rate": 1.308e-05, "loss": 0.0622, "step": 4363 }, { "epoch": 10.592952612393681, "grad_norm": 0.969586193561554, "learning_rate": 1.3083e-05, "loss": 0.0518, "step": 4364 }, { "epoch": 10.595382746051033, "grad_norm": 1.0620267391204834, "learning_rate": 1.3086e-05, "loss": 0.0762, "step": 4365 }, { "epoch": 10.597812879708384, "grad_norm": 1.7358733415603638, "learning_rate": 1.3089000000000001e-05, "loss": 0.0914, "step": 4366 }, { "epoch": 10.600243013365736, "grad_norm": 1.1123356819152832, "learning_rate": 1.3092000000000001e-05, "loss": 0.1086, "step": 4367 }, { "epoch": 10.602673147023086, "grad_norm": 1.1562339067459106, "learning_rate": 1.3095e-05, "loss": 0.1098, "step": 4368 }, { "epoch": 10.605103280680437, "grad_norm": 1.5881913900375366, "learning_rate": 1.3098e-05, "loss": 0.1589, "step": 4369 }, { "epoch": 10.607533414337789, "grad_norm": 1.5586732625961304, "learning_rate": 1.3101e-05, "loss": 0.1057, "step": 4370 }, { "epoch": 10.60996354799514, "grad_norm": 1.2185837030410767, "learning_rate": 1.3104e-05, "loss": 0.361, "step": 4371 }, { "epoch": 10.612393681652492, "grad_norm": 0.8692100048065186, "learning_rate": 1.3107e-05, "loss": 0.3078, "step": 
4372 }, { "epoch": 10.614823815309842, "grad_norm": 1.6888052225112915, "learning_rate": 1.311e-05, "loss": 0.2749, "step": 4373 }, { "epoch": 10.617253948967193, "grad_norm": 0.9070069193840027, "learning_rate": 1.3113e-05, "loss": 0.2239, "step": 4374 }, { "epoch": 10.619684082624545, "grad_norm": 0.6581593155860901, "learning_rate": 1.3116e-05, "loss": 0.1949, "step": 4375 }, { "epoch": 10.622114216281895, "grad_norm": 0.8060280680656433, "learning_rate": 1.3119000000000001e-05, "loss": 0.1714, "step": 4376 }, { "epoch": 10.624544349939246, "grad_norm": 1.148512601852417, "learning_rate": 1.3122e-05, "loss": 0.1284, "step": 4377 }, { "epoch": 10.626974483596598, "grad_norm": 0.7194034457206726, "learning_rate": 1.3125e-05, "loss": 0.1401, "step": 4378 }, { "epoch": 10.629404617253948, "grad_norm": 0.8178122043609619, "learning_rate": 1.3128e-05, "loss": 0.1094, "step": 4379 }, { "epoch": 10.6318347509113, "grad_norm": 0.8332579731941223, "learning_rate": 1.3131e-05, "loss": 0.0994, "step": 4380 }, { "epoch": 10.634264884568651, "grad_norm": 0.7070622444152832, "learning_rate": 1.3134000000000002e-05, "loss": 0.0947, "step": 4381 }, { "epoch": 10.636695018226002, "grad_norm": 0.6354169845581055, "learning_rate": 1.3137000000000001e-05, "loss": 0.0765, "step": 4382 }, { "epoch": 10.639125151883354, "grad_norm": 0.3872576355934143, "learning_rate": 1.314e-05, "loss": 0.0553, "step": 4383 }, { "epoch": 10.641555285540704, "grad_norm": 0.559376060962677, "learning_rate": 1.3143e-05, "loss": 0.0537, "step": 4384 }, { "epoch": 10.643985419198057, "grad_norm": 0.9283326864242554, "learning_rate": 1.3146e-05, "loss": 0.0953, "step": 4385 }, { "epoch": 10.646415552855407, "grad_norm": 0.4918272793292999, "learning_rate": 1.3149e-05, "loss": 0.052, "step": 4386 }, { "epoch": 10.648845686512757, "grad_norm": 0.8072372674942017, "learning_rate": 1.3152e-05, "loss": 0.0769, "step": 4387 }, { "epoch": 10.65127582017011, "grad_norm": 1.29289972782135, "learning_rate": 
1.3155e-05, "loss": 0.0837, "step": 4388 }, { "epoch": 10.65370595382746, "grad_norm": 0.666766345500946, "learning_rate": 1.3158e-05, "loss": 0.0676, "step": 4389 }, { "epoch": 10.656136087484812, "grad_norm": 0.7967982888221741, "learning_rate": 1.3161e-05, "loss": 0.0774, "step": 4390 }, { "epoch": 10.658566221142163, "grad_norm": 1.2714201211929321, "learning_rate": 1.3164000000000001e-05, "loss": 0.0938, "step": 4391 }, { "epoch": 10.660996354799513, "grad_norm": 0.808298647403717, "learning_rate": 1.3167000000000001e-05, "loss": 0.0711, "step": 4392 }, { "epoch": 10.663426488456865, "grad_norm": 0.730484127998352, "learning_rate": 1.3170000000000001e-05, "loss": 0.0527, "step": 4393 }, { "epoch": 10.665856622114216, "grad_norm": 0.4928746819496155, "learning_rate": 1.3173e-05, "loss": 0.0595, "step": 4394 }, { "epoch": 10.668286755771568, "grad_norm": 0.547423243522644, "learning_rate": 1.3175999999999999e-05, "loss": 0.0461, "step": 4395 }, { "epoch": 10.670716889428919, "grad_norm": 0.5674730539321899, "learning_rate": 1.3179e-05, "loss": 0.0754, "step": 4396 }, { "epoch": 10.673147023086269, "grad_norm": 0.5812348127365112, "learning_rate": 1.3182e-05, "loss": 0.0566, "step": 4397 }, { "epoch": 10.675577156743621, "grad_norm": 0.585222601890564, "learning_rate": 1.3185e-05, "loss": 0.0575, "step": 4398 }, { "epoch": 10.678007290400972, "grad_norm": 0.7932372689247131, "learning_rate": 1.3188e-05, "loss": 0.067, "step": 4399 }, { "epoch": 10.680437424058324, "grad_norm": 0.5495865941047668, "learning_rate": 1.3191e-05, "loss": 0.0434, "step": 4400 }, { "epoch": 10.682867557715674, "grad_norm": 0.6916366815567017, "learning_rate": 1.3194000000000001e-05, "loss": 0.0505, "step": 4401 }, { "epoch": 10.685297691373025, "grad_norm": 0.6764870285987854, "learning_rate": 1.3197000000000001e-05, "loss": 0.0717, "step": 4402 }, { "epoch": 10.687727825030377, "grad_norm": 0.9756735563278198, "learning_rate": 1.32e-05, "loss": 0.045, "step": 4403 }, { "epoch": 
10.690157958687728, "grad_norm": 0.8913256525993347, "learning_rate": 1.3203e-05, "loss": 0.0625, "step": 4404 }, { "epoch": 10.69258809234508, "grad_norm": 1.184391975402832, "learning_rate": 1.3206e-05, "loss": 0.076, "step": 4405 }, { "epoch": 10.69501822600243, "grad_norm": 0.8700153231620789, "learning_rate": 1.3209000000000002e-05, "loss": 0.0461, "step": 4406 }, { "epoch": 10.69744835965978, "grad_norm": 0.8494783043861389, "learning_rate": 1.3212000000000002e-05, "loss": 0.0622, "step": 4407 }, { "epoch": 10.699878493317133, "grad_norm": 1.1681592464447021, "learning_rate": 1.3215e-05, "loss": 0.1293, "step": 4408 }, { "epoch": 10.702308626974483, "grad_norm": 1.0699896812438965, "learning_rate": 1.3218e-05, "loss": 0.0579, "step": 4409 }, { "epoch": 10.704738760631834, "grad_norm": 0.8277936577796936, "learning_rate": 1.3221e-05, "loss": 0.0628, "step": 4410 }, { "epoch": 10.707168894289186, "grad_norm": 0.9110010266304016, "learning_rate": 1.3224e-05, "loss": 0.0808, "step": 4411 }, { "epoch": 10.709599027946537, "grad_norm": 0.7841404676437378, "learning_rate": 1.3227e-05, "loss": 0.0737, "step": 4412 }, { "epoch": 10.712029161603889, "grad_norm": 0.5445214509963989, "learning_rate": 1.323e-05, "loss": 0.0376, "step": 4413 }, { "epoch": 10.71445929526124, "grad_norm": 0.9355319142341614, "learning_rate": 1.3233e-05, "loss": 0.0623, "step": 4414 }, { "epoch": 10.716889428918591, "grad_norm": 0.9036315679550171, "learning_rate": 1.3236e-05, "loss": 0.0653, "step": 4415 }, { "epoch": 10.719319562575942, "grad_norm": 0.900381326675415, "learning_rate": 1.3239000000000001e-05, "loss": 0.0725, "step": 4416 }, { "epoch": 10.721749696233292, "grad_norm": 1.2659591436386108, "learning_rate": 1.3242000000000001e-05, "loss": 0.0975, "step": 4417 }, { "epoch": 10.724179829890645, "grad_norm": 1.1742018461227417, "learning_rate": 1.3245000000000001e-05, "loss": 0.113, "step": 4418 }, { "epoch": 10.726609963547995, "grad_norm": 1.2912795543670654, "learning_rate": 
1.3248000000000001e-05, "loss": 0.0851, "step": 4419 }, { "epoch": 10.729040097205345, "grad_norm": 1.831681489944458, "learning_rate": 1.3250999999999999e-05, "loss": 0.1599, "step": 4420 }, { "epoch": 10.731470230862698, "grad_norm": 1.6239972114562988, "learning_rate": 1.3254e-05, "loss": 0.4532, "step": 4421 }, { "epoch": 10.733900364520048, "grad_norm": 0.7172882556915283, "learning_rate": 1.3257e-05, "loss": 0.2666, "step": 4422 }, { "epoch": 10.7363304981774, "grad_norm": 0.6648383140563965, "learning_rate": 1.326e-05, "loss": 0.21, "step": 4423 }, { "epoch": 10.73876063183475, "grad_norm": 1.112903356552124, "learning_rate": 1.3263e-05, "loss": 0.2442, "step": 4424 }, { "epoch": 10.741190765492101, "grad_norm": 0.8260800838470459, "learning_rate": 1.3266e-05, "loss": 0.1881, "step": 4425 }, { "epoch": 10.743620899149454, "grad_norm": 0.8411895036697388, "learning_rate": 1.3269000000000001e-05, "loss": 0.1647, "step": 4426 }, { "epoch": 10.746051032806804, "grad_norm": 0.9271567463874817, "learning_rate": 1.3272000000000001e-05, "loss": 0.1357, "step": 4427 }, { "epoch": 10.748481166464156, "grad_norm": 0.6786314845085144, "learning_rate": 1.3275e-05, "loss": 0.1185, "step": 4428 }, { "epoch": 10.750911300121507, "grad_norm": 0.6914088726043701, "learning_rate": 1.3278e-05, "loss": 0.1495, "step": 4429 }, { "epoch": 10.753341433778857, "grad_norm": 0.6400651335716248, "learning_rate": 1.3281e-05, "loss": 0.0722, "step": 4430 }, { "epoch": 10.75577156743621, "grad_norm": 0.8184458017349243, "learning_rate": 1.3284000000000002e-05, "loss": 0.0886, "step": 4431 }, { "epoch": 10.75820170109356, "grad_norm": 0.7162454128265381, "learning_rate": 1.3287e-05, "loss": 0.0699, "step": 4432 }, { "epoch": 10.760631834750912, "grad_norm": 0.6394118666648865, "learning_rate": 1.329e-05, "loss": 0.0906, "step": 4433 }, { "epoch": 10.763061968408262, "grad_norm": 0.5048874616622925, "learning_rate": 1.3293e-05, "loss": 0.0562, "step": 4434 }, { "epoch": 10.765492102065613, 
"grad_norm": 0.6418811082839966, "learning_rate": 1.3296e-05, "loss": 0.0672, "step": 4435 }, { "epoch": 10.767922235722965, "grad_norm": 0.6769041419029236, "learning_rate": 1.3299000000000001e-05, "loss": 0.0741, "step": 4436 }, { "epoch": 10.770352369380316, "grad_norm": 0.961323618888855, "learning_rate": 1.3302e-05, "loss": 0.0725, "step": 4437 }, { "epoch": 10.772782503037668, "grad_norm": 0.5555863976478577, "learning_rate": 1.3305e-05, "loss": 0.047, "step": 4438 }, { "epoch": 10.775212636695018, "grad_norm": 0.864205539226532, "learning_rate": 1.3308e-05, "loss": 0.0673, "step": 4439 }, { "epoch": 10.777642770352369, "grad_norm": 0.6051881909370422, "learning_rate": 1.3311e-05, "loss": 0.0481, "step": 4440 }, { "epoch": 10.780072904009721, "grad_norm": 0.6664055585861206, "learning_rate": 1.3314e-05, "loss": 0.0529, "step": 4441 }, { "epoch": 10.782503037667071, "grad_norm": 0.7148986458778381, "learning_rate": 1.3317000000000001e-05, "loss": 0.0757, "step": 4442 }, { "epoch": 10.784933171324424, "grad_norm": 0.5184940695762634, "learning_rate": 1.3320000000000001e-05, "loss": 0.0394, "step": 4443 }, { "epoch": 10.787363304981774, "grad_norm": 0.5213060975074768, "learning_rate": 1.3323000000000001e-05, "loss": 0.0438, "step": 4444 }, { "epoch": 10.789793438639125, "grad_norm": 1.277317762374878, "learning_rate": 1.3325999999999999e-05, "loss": 0.0569, "step": 4445 }, { "epoch": 10.792223572296477, "grad_norm": 1.3535833358764648, "learning_rate": 1.3328999999999999e-05, "loss": 0.0494, "step": 4446 }, { "epoch": 10.794653705953827, "grad_norm": 0.674156665802002, "learning_rate": 1.3332e-05, "loss": 0.0626, "step": 4447 }, { "epoch": 10.79708383961118, "grad_norm": 0.6604660153388977, "learning_rate": 1.3335e-05, "loss": 0.0861, "step": 4448 }, { "epoch": 10.79951397326853, "grad_norm": 0.716681182384491, "learning_rate": 1.3338e-05, "loss": 0.0583, "step": 4449 }, { "epoch": 10.80194410692588, "grad_norm": 0.7251180410385132, "learning_rate": 1.3341e-05, 
"loss": 0.0596, "step": 4450 }, { "epoch": 10.804374240583233, "grad_norm": 0.8702787160873413, "learning_rate": 1.3344e-05, "loss": 0.1041, "step": 4451 }, { "epoch": 10.806804374240583, "grad_norm": 0.7300487160682678, "learning_rate": 1.3347000000000001e-05, "loss": 0.0703, "step": 4452 }, { "epoch": 10.809234507897933, "grad_norm": 0.5890774726867676, "learning_rate": 1.3350000000000001e-05, "loss": 0.0595, "step": 4453 }, { "epoch": 10.811664641555286, "grad_norm": 3.577747106552124, "learning_rate": 1.3353e-05, "loss": 0.158, "step": 4454 }, { "epoch": 10.814094775212636, "grad_norm": 0.8091457486152649, "learning_rate": 1.3356e-05, "loss": 0.0574, "step": 4455 }, { "epoch": 10.816524908869988, "grad_norm": 1.1938974857330322, "learning_rate": 1.3359e-05, "loss": 0.1012, "step": 4456 }, { "epoch": 10.818955042527339, "grad_norm": 0.8709457516670227, "learning_rate": 1.3362e-05, "loss": 0.0894, "step": 4457 }, { "epoch": 10.821385176184691, "grad_norm": 0.8922865986824036, "learning_rate": 1.3365e-05, "loss": 0.0636, "step": 4458 }, { "epoch": 10.823815309842042, "grad_norm": 0.8535493016242981, "learning_rate": 1.3368e-05, "loss": 0.0722, "step": 4459 }, { "epoch": 10.826245443499392, "grad_norm": 0.6876110434532166, "learning_rate": 1.3371e-05, "loss": 0.0845, "step": 4460 }, { "epoch": 10.828675577156744, "grad_norm": 0.7087515592575073, "learning_rate": 1.3374e-05, "loss": 0.0605, "step": 4461 }, { "epoch": 10.831105710814095, "grad_norm": 1.3405842781066895, "learning_rate": 1.3377e-05, "loss": 0.0796, "step": 4462 }, { "epoch": 10.833535844471445, "grad_norm": 0.8772732019424438, "learning_rate": 1.338e-05, "loss": 0.0495, "step": 4463 }, { "epoch": 10.835965978128797, "grad_norm": 0.7503268718719482, "learning_rate": 1.3383e-05, "loss": 0.0674, "step": 4464 }, { "epoch": 10.838396111786148, "grad_norm": 1.51255464553833, "learning_rate": 1.3386e-05, "loss": 0.0909, "step": 4465 }, { "epoch": 10.8408262454435, "grad_norm": 0.7475121021270752, 
"learning_rate": 1.3389e-05, "loss": 0.0694, "step": 4466 }, { "epoch": 10.84325637910085, "grad_norm": 1.355352520942688, "learning_rate": 1.3392000000000002e-05, "loss": 0.0973, "step": 4467 }, { "epoch": 10.845686512758201, "grad_norm": 2.552509069442749, "learning_rate": 1.3395000000000001e-05, "loss": 0.1434, "step": 4468 }, { "epoch": 10.848116646415553, "grad_norm": 1.5646576881408691, "learning_rate": 1.3398e-05, "loss": 0.1235, "step": 4469 }, { "epoch": 10.850546780072904, "grad_norm": 2.17215633392334, "learning_rate": 1.3401e-05, "loss": 0.1817, "step": 4470 }, { "epoch": 10.852976913730256, "grad_norm": 2.2424368858337402, "learning_rate": 1.3403999999999999e-05, "loss": 0.4332, "step": 4471 }, { "epoch": 10.855407047387606, "grad_norm": 0.9427192807197571, "learning_rate": 1.3407e-05, "loss": 0.3126, "step": 4472 }, { "epoch": 10.857837181044957, "grad_norm": 0.8806581497192383, "learning_rate": 1.341e-05, "loss": 0.2266, "step": 4473 }, { "epoch": 10.860267314702309, "grad_norm": 0.7554003596305847, "learning_rate": 1.3413e-05, "loss": 0.2234, "step": 4474 }, { "epoch": 10.86269744835966, "grad_norm": 0.8483538627624512, "learning_rate": 1.3416e-05, "loss": 0.2015, "step": 4475 }, { "epoch": 10.865127582017012, "grad_norm": 0.6396898031234741, "learning_rate": 1.3419e-05, "loss": 0.1622, "step": 4476 }, { "epoch": 10.867557715674362, "grad_norm": 0.8639312386512756, "learning_rate": 1.3422000000000001e-05, "loss": 0.1471, "step": 4477 }, { "epoch": 10.869987849331713, "grad_norm": 0.6714649796485901, "learning_rate": 1.3425000000000001e-05, "loss": 0.1009, "step": 4478 }, { "epoch": 10.872417982989065, "grad_norm": 0.5057902932167053, "learning_rate": 1.3428000000000001e-05, "loss": 0.0805, "step": 4479 }, { "epoch": 10.874848116646415, "grad_norm": 0.6815232038497925, "learning_rate": 1.3431e-05, "loss": 0.0745, "step": 4480 }, { "epoch": 10.877278250303767, "grad_norm": 1.0225533246994019, "learning_rate": 1.3433999999999999e-05, "loss": 0.1337, 
"step": 4481 }, { "epoch": 10.879708383961118, "grad_norm": 0.48655983805656433, "learning_rate": 1.3437e-05, "loss": 0.0582, "step": 4482 }, { "epoch": 10.882138517618468, "grad_norm": 0.9081035852432251, "learning_rate": 1.344e-05, "loss": 0.052, "step": 4483 }, { "epoch": 10.88456865127582, "grad_norm": 0.6728028655052185, "learning_rate": 1.3443e-05, "loss": 0.0669, "step": 4484 }, { "epoch": 10.886998784933171, "grad_norm": 0.5696329474449158, "learning_rate": 1.3446e-05, "loss": 0.072, "step": 4485 }, { "epoch": 10.889428918590523, "grad_norm": 0.5811458826065063, "learning_rate": 1.3449e-05, "loss": 0.0556, "step": 4486 }, { "epoch": 10.891859052247874, "grad_norm": 0.6191088557243347, "learning_rate": 1.3452000000000001e-05, "loss": 0.0529, "step": 4487 }, { "epoch": 10.894289185905224, "grad_norm": 0.6167824864387512, "learning_rate": 1.3455e-05, "loss": 0.0894, "step": 4488 }, { "epoch": 10.896719319562576, "grad_norm": 0.8451576232910156, "learning_rate": 1.3458e-05, "loss": 0.0434, "step": 4489 }, { "epoch": 10.899149453219927, "grad_norm": 0.5339174270629883, "learning_rate": 1.3461e-05, "loss": 0.0452, "step": 4490 }, { "epoch": 10.90157958687728, "grad_norm": 0.5258620381355286, "learning_rate": 1.3464e-05, "loss": 0.0371, "step": 4491 }, { "epoch": 10.90400972053463, "grad_norm": 0.6882297992706299, "learning_rate": 1.3467000000000002e-05, "loss": 0.0725, "step": 4492 }, { "epoch": 10.90643985419198, "grad_norm": 0.5179954767227173, "learning_rate": 1.3470000000000001e-05, "loss": 0.0471, "step": 4493 }, { "epoch": 10.908869987849332, "grad_norm": 0.649039089679718, "learning_rate": 1.3473e-05, "loss": 0.0592, "step": 4494 }, { "epoch": 10.911300121506683, "grad_norm": 0.8501604795455933, "learning_rate": 1.3476e-05, "loss": 0.0748, "step": 4495 }, { "epoch": 10.913730255164033, "grad_norm": 0.577048122882843, "learning_rate": 1.3479e-05, "loss": 0.0572, "step": 4496 }, { "epoch": 10.916160388821385, "grad_norm": 0.7322097420692444, "learning_rate": 
1.3482e-05, "loss": 0.0503, "step": 4497 }, { "epoch": 10.918590522478736, "grad_norm": 1.179318904876709, "learning_rate": 1.3485e-05, "loss": 0.075, "step": 4498 }, { "epoch": 10.921020656136088, "grad_norm": 0.746444046497345, "learning_rate": 1.3488e-05, "loss": 0.0555, "step": 4499 }, { "epoch": 10.923450789793439, "grad_norm": 0.5798726081848145, "learning_rate": 1.3491e-05, "loss": 0.062, "step": 4500 }, { "epoch": 10.925880923450789, "grad_norm": 0.8185794949531555, "learning_rate": 1.3494e-05, "loss": 0.0681, "step": 4501 }, { "epoch": 10.928311057108141, "grad_norm": 1.2488787174224854, "learning_rate": 1.3497000000000001e-05, "loss": 0.0589, "step": 4502 }, { "epoch": 10.930741190765492, "grad_norm": 0.6553597450256348, "learning_rate": 1.3500000000000001e-05, "loss": 0.0657, "step": 4503 }, { "epoch": 10.933171324422844, "grad_norm": 0.8491834402084351, "learning_rate": 1.3503000000000001e-05, "loss": 0.0839, "step": 4504 }, { "epoch": 10.935601458080194, "grad_norm": 0.8780386447906494, "learning_rate": 1.3506e-05, "loss": 0.0638, "step": 4505 }, { "epoch": 10.938031591737545, "grad_norm": 1.1382180452346802, "learning_rate": 1.3508999999999999e-05, "loss": 0.0642, "step": 4506 }, { "epoch": 10.940461725394897, "grad_norm": 1.193527340888977, "learning_rate": 1.3512e-05, "loss": 0.0929, "step": 4507 }, { "epoch": 10.942891859052247, "grad_norm": 0.7274829149246216, "learning_rate": 1.3515e-05, "loss": 0.0691, "step": 4508 }, { "epoch": 10.9453219927096, "grad_norm": 1.0001554489135742, "learning_rate": 1.3518e-05, "loss": 0.0855, "step": 4509 }, { "epoch": 10.94775212636695, "grad_norm": 0.912079930305481, "learning_rate": 1.3521e-05, "loss": 0.0818, "step": 4510 }, { "epoch": 10.9501822600243, "grad_norm": 0.6816204786300659, "learning_rate": 1.3524e-05, "loss": 0.074, "step": 4511 }, { "epoch": 10.952612393681653, "grad_norm": 0.8792251348495483, "learning_rate": 1.3527000000000001e-05, "loss": 0.094, "step": 4512 }, { "epoch": 10.955042527339003, 
"grad_norm": 0.7014767527580261, "learning_rate": 1.3530000000000001e-05, "loss": 0.0546, "step": 4513 }, { "epoch": 10.957472660996356, "grad_norm": 0.5847507119178772, "learning_rate": 1.3533e-05, "loss": 0.0518, "step": 4514 }, { "epoch": 10.959902794653706, "grad_norm": 1.2501832246780396, "learning_rate": 1.3536e-05, "loss": 0.0619, "step": 4515 }, { "epoch": 10.962332928311056, "grad_norm": 1.3429285287857056, "learning_rate": 1.3539e-05, "loss": 0.0769, "step": 4516 }, { "epoch": 10.964763061968409, "grad_norm": 1.2715150117874146, "learning_rate": 1.3542000000000002e-05, "loss": 0.0856, "step": 4517 }, { "epoch": 10.96719319562576, "grad_norm": 1.359774112701416, "learning_rate": 1.3545e-05, "loss": 0.0731, "step": 4518 }, { "epoch": 10.969623329283111, "grad_norm": 1.680776596069336, "learning_rate": 1.3548e-05, "loss": 0.1267, "step": 4519 }, { "epoch": 10.972053462940462, "grad_norm": 1.7724626064300537, "learning_rate": 1.3551e-05, "loss": 0.1487, "step": 4520 }, { "epoch": 10.974483596597812, "grad_norm": 0.947708010673523, "learning_rate": 1.3554e-05, "loss": 0.2607, "step": 4521 }, { "epoch": 10.976913730255164, "grad_norm": 0.9355026483535767, "learning_rate": 1.3557e-05, "loss": 0.131, "step": 4522 }, { "epoch": 10.979343863912515, "grad_norm": 0.6311138868331909, "learning_rate": 1.356e-05, "loss": 0.0805, "step": 4523 }, { "epoch": 10.981773997569867, "grad_norm": 0.7788707613945007, "learning_rate": 1.3563e-05, "loss": 0.0994, "step": 4524 }, { "epoch": 10.984204131227218, "grad_norm": 0.5767011642456055, "learning_rate": 1.3566e-05, "loss": 0.0589, "step": 4525 }, { "epoch": 10.986634264884568, "grad_norm": 0.6148889064788818, "learning_rate": 1.3569e-05, "loss": 0.0575, "step": 4526 }, { "epoch": 10.98906439854192, "grad_norm": 0.6818249821662903, "learning_rate": 1.3572000000000002e-05, "loss": 0.066, "step": 4527 }, { "epoch": 10.99149453219927, "grad_norm": 1.1136940717697144, "learning_rate": 1.3575000000000001e-05, "loss": 0.0738, "step": 
4528 }, { "epoch": 10.993924665856621, "grad_norm": 0.8918789625167847, "learning_rate": 1.3578000000000001e-05, "loss": 0.0798, "step": 4529 }, { "epoch": 10.996354799513973, "grad_norm": 0.9335149526596069, "learning_rate": 1.3581000000000001e-05, "loss": 0.0745, "step": 4530 }, { "epoch": 10.998784933171324, "grad_norm": 1.0536930561065674, "learning_rate": 1.3583999999999999e-05, "loss": 0.1051, "step": 4531 }, { "epoch": 11.0, "grad_norm": 1.0888304710388184, "learning_rate": 1.3587e-05, "loss": 0.0581, "step": 4532 }, { "epoch": 11.00243013365735, "grad_norm": 1.0281765460968018, "learning_rate": 1.359e-05, "loss": 0.3131, "step": 4533 }, { "epoch": 11.004860267314703, "grad_norm": 0.8474702835083008, "learning_rate": 1.3593e-05, "loss": 0.2456, "step": 4534 }, { "epoch": 11.007290400972053, "grad_norm": 0.6644335985183716, "learning_rate": 1.3596e-05, "loss": 0.2207, "step": 4535 }, { "epoch": 11.009720534629405, "grad_norm": 0.8094347715377808, "learning_rate": 1.3599e-05, "loss": 0.194, "step": 4536 }, { "epoch": 11.012150668286756, "grad_norm": 0.6056820750236511, "learning_rate": 1.3602000000000001e-05, "loss": 0.1434, "step": 4537 }, { "epoch": 11.014580801944106, "grad_norm": 0.8686061501502991, "learning_rate": 1.3605000000000001e-05, "loss": 0.1401, "step": 4538 }, { "epoch": 11.017010935601458, "grad_norm": 0.7036507725715637, "learning_rate": 1.3608e-05, "loss": 0.1049, "step": 4539 }, { "epoch": 11.019441069258809, "grad_norm": 0.7280768156051636, "learning_rate": 1.3611e-05, "loss": 0.0932, "step": 4540 }, { "epoch": 11.021871202916161, "grad_norm": 1.7508724927902222, "learning_rate": 1.3614e-05, "loss": 0.1572, "step": 4541 }, { "epoch": 11.024301336573512, "grad_norm": 1.2561818361282349, "learning_rate": 1.3617000000000002e-05, "loss": 0.0966, "step": 4542 }, { "epoch": 11.026731470230862, "grad_norm": 1.1103229522705078, "learning_rate": 1.362e-05, "loss": 0.0916, "step": 4543 }, { "epoch": 11.029161603888214, "grad_norm": 
0.6898956894874573, "learning_rate": 1.3623e-05, "loss": 0.0718, "step": 4544 }, { "epoch": 11.031591737545565, "grad_norm": 0.7787789106369019, "learning_rate": 1.3626e-05, "loss": 0.0794, "step": 4545 }, { "epoch": 11.034021871202917, "grad_norm": 0.6480409502983093, "learning_rate": 1.3629e-05, "loss": 0.0671, "step": 4546 }, { "epoch": 11.036452004860267, "grad_norm": 0.7971064448356628, "learning_rate": 1.3632000000000001e-05, "loss": 0.0578, "step": 4547 }, { "epoch": 11.038882138517618, "grad_norm": 0.7847037315368652, "learning_rate": 1.3635e-05, "loss": 0.0599, "step": 4548 }, { "epoch": 11.04131227217497, "grad_norm": 0.9049919247627258, "learning_rate": 1.3638e-05, "loss": 0.0761, "step": 4549 }, { "epoch": 11.04374240583232, "grad_norm": 0.743338942527771, "learning_rate": 1.3641e-05, "loss": 0.0893, "step": 4550 }, { "epoch": 11.046172539489673, "grad_norm": 0.4288365840911865, "learning_rate": 1.3644e-05, "loss": 0.0456, "step": 4551 }, { "epoch": 11.048602673147023, "grad_norm": 0.6377533078193665, "learning_rate": 1.3647000000000002e-05, "loss": 0.0775, "step": 4552 }, { "epoch": 11.051032806804374, "grad_norm": 0.47764870524406433, "learning_rate": 1.3650000000000001e-05, "loss": 0.0426, "step": 4553 }, { "epoch": 11.053462940461726, "grad_norm": 0.6602082848548889, "learning_rate": 1.3653000000000001e-05, "loss": 0.0735, "step": 4554 }, { "epoch": 11.055893074119076, "grad_norm": 0.5588088631629944, "learning_rate": 1.3656e-05, "loss": 0.0579, "step": 4555 }, { "epoch": 11.058323207776427, "grad_norm": 0.6690793633460999, "learning_rate": 1.3659e-05, "loss": 0.0621, "step": 4556 }, { "epoch": 11.060753341433779, "grad_norm": 0.5920650362968445, "learning_rate": 1.3662e-05, "loss": 0.0429, "step": 4557 }, { "epoch": 11.06318347509113, "grad_norm": 0.5815759301185608, "learning_rate": 1.3665e-05, "loss": 0.0427, "step": 4558 }, { "epoch": 11.065613608748482, "grad_norm": 0.6155309677124023, "learning_rate": 1.3668e-05, "loss": 0.0958, "step": 4559 
}, { "epoch": 11.068043742405832, "grad_norm": 0.5718246698379517, "learning_rate": 1.3671e-05, "loss": 0.0592, "step": 4560 }, { "epoch": 11.070473876063183, "grad_norm": 0.701897919178009, "learning_rate": 1.3674e-05, "loss": 0.0668, "step": 4561 }, { "epoch": 11.072904009720535, "grad_norm": 0.921239972114563, "learning_rate": 1.3677000000000001e-05, "loss": 0.0633, "step": 4562 }, { "epoch": 11.075334143377885, "grad_norm": 0.8597447872161865, "learning_rate": 1.3680000000000001e-05, "loss": 0.0454, "step": 4563 }, { "epoch": 11.077764277035238, "grad_norm": 0.5339418053627014, "learning_rate": 1.3683000000000001e-05, "loss": 0.0464, "step": 4564 }, { "epoch": 11.080194410692588, "grad_norm": 0.5611770749092102, "learning_rate": 1.3686e-05, "loss": 0.0468, "step": 4565 }, { "epoch": 11.082624544349938, "grad_norm": 0.4580084979534149, "learning_rate": 1.3689e-05, "loss": 0.0453, "step": 4566 }, { "epoch": 11.08505467800729, "grad_norm": 0.6452357769012451, "learning_rate": 1.3691999999999999e-05, "loss": 0.0498, "step": 4567 }, { "epoch": 11.087484811664641, "grad_norm": 0.6216208934783936, "learning_rate": 1.3695e-05, "loss": 0.0647, "step": 4568 }, { "epoch": 11.089914945321993, "grad_norm": 0.6424198150634766, "learning_rate": 1.3698e-05, "loss": 0.0526, "step": 4569 }, { "epoch": 11.092345078979344, "grad_norm": 0.704156756401062, "learning_rate": 1.3701e-05, "loss": 0.0525, "step": 4570 }, { "epoch": 11.094775212636694, "grad_norm": 0.791461706161499, "learning_rate": 1.3704e-05, "loss": 0.0634, "step": 4571 }, { "epoch": 11.097205346294047, "grad_norm": 0.8407659530639648, "learning_rate": 1.3707e-05, "loss": 0.0673, "step": 4572 }, { "epoch": 11.099635479951397, "grad_norm": 1.3231325149536133, "learning_rate": 1.3710000000000001e-05, "loss": 0.0711, "step": 4573 }, { "epoch": 11.10206561360875, "grad_norm": 1.2092708349227905, "learning_rate": 1.3713e-05, "loss": 0.0691, "step": 4574 }, { "epoch": 11.1044957472661, "grad_norm": 0.7800148725509644, 
"learning_rate": 1.3716e-05, "loss": 0.0586, "step": 4575 }, { "epoch": 11.10692588092345, "grad_norm": 0.9578167796134949, "learning_rate": 1.3719e-05, "loss": 0.0892, "step": 4576 }, { "epoch": 11.109356014580802, "grad_norm": 1.1285959482192993, "learning_rate": 1.3722e-05, "loss": 0.0636, "step": 4577 }, { "epoch": 11.111786148238153, "grad_norm": 0.8453039526939392, "learning_rate": 1.3725000000000002e-05, "loss": 0.0619, "step": 4578 }, { "epoch": 11.114216281895505, "grad_norm": 0.7351916432380676, "learning_rate": 1.3728000000000001e-05, "loss": 0.0571, "step": 4579 }, { "epoch": 11.116646415552855, "grad_norm": 1.7297348976135254, "learning_rate": 1.3731e-05, "loss": 0.1145, "step": 4580 }, { "epoch": 11.119076549210206, "grad_norm": 0.9004336595535278, "learning_rate": 1.3734e-05, "loss": 0.0506, "step": 4581 }, { "epoch": 11.121506682867558, "grad_norm": 1.772289752960205, "learning_rate": 1.3736999999999999e-05, "loss": 0.0844, "step": 4582 }, { "epoch": 11.123936816524909, "grad_norm": 2.315690040588379, "learning_rate": 1.374e-05, "loss": 0.3826, "step": 4583 }, { "epoch": 11.12636695018226, "grad_norm": 0.9074113368988037, "learning_rate": 1.3743e-05, "loss": 0.2739, "step": 4584 }, { "epoch": 11.128797083839611, "grad_norm": 0.8022343516349792, "learning_rate": 1.3746e-05, "loss": 0.2191, "step": 4585 }, { "epoch": 11.131227217496962, "grad_norm": 0.7742899656295776, "learning_rate": 1.3749e-05, "loss": 0.2227, "step": 4586 }, { "epoch": 11.133657351154314, "grad_norm": 0.7375879287719727, "learning_rate": 1.3752e-05, "loss": 0.1742, "step": 4587 }, { "epoch": 11.136087484811664, "grad_norm": 0.6958557367324829, "learning_rate": 1.3755000000000001e-05, "loss": 0.1474, "step": 4588 }, { "epoch": 11.138517618469017, "grad_norm": 0.929663896560669, "learning_rate": 1.3758000000000001e-05, "loss": 0.1602, "step": 4589 }, { "epoch": 11.140947752126367, "grad_norm": 0.6001430153846741, "learning_rate": 1.3761000000000001e-05, "loss": 0.1201, "step": 4590 
}, { "epoch": 11.143377885783718, "grad_norm": 0.5025187134742737, "learning_rate": 1.3764e-05, "loss": 0.0889, "step": 4591 }, { "epoch": 11.14580801944107, "grad_norm": 0.6207374930381775, "learning_rate": 1.3766999999999999e-05, "loss": 0.0818, "step": 4592 }, { "epoch": 11.14823815309842, "grad_norm": 0.6416305303573608, "learning_rate": 1.377e-05, "loss": 0.0855, "step": 4593 }, { "epoch": 11.15066828675577, "grad_norm": 0.46895766258239746, "learning_rate": 1.3773e-05, "loss": 0.0558, "step": 4594 }, { "epoch": 11.153098420413123, "grad_norm": 0.6025093793869019, "learning_rate": 1.3776e-05, "loss": 0.0545, "step": 4595 }, { "epoch": 11.155528554070473, "grad_norm": 0.6176912188529968, "learning_rate": 1.3779e-05, "loss": 0.0737, "step": 4596 }, { "epoch": 11.157958687727826, "grad_norm": 0.5274832248687744, "learning_rate": 1.3782e-05, "loss": 0.0682, "step": 4597 }, { "epoch": 11.160388821385176, "grad_norm": 0.6564781665802002, "learning_rate": 1.3785000000000001e-05, "loss": 0.0978, "step": 4598 }, { "epoch": 11.162818955042527, "grad_norm": 0.5945263504981995, "learning_rate": 1.3788e-05, "loss": 0.0511, "step": 4599 }, { "epoch": 11.165249088699879, "grad_norm": 0.5203038454055786, "learning_rate": 1.3791e-05, "loss": 0.0662, "step": 4600 }, { "epoch": 11.16767922235723, "grad_norm": 0.5969316363334656, "learning_rate": 1.3794e-05, "loss": 0.0355, "step": 4601 }, { "epoch": 11.170109356014581, "grad_norm": 0.556174635887146, "learning_rate": 1.3797e-05, "loss": 0.0397, "step": 4602 }, { "epoch": 11.172539489671932, "grad_norm": 0.667084813117981, "learning_rate": 1.3800000000000002e-05, "loss": 0.0434, "step": 4603 }, { "epoch": 11.174969623329282, "grad_norm": 0.8143032193183899, "learning_rate": 1.3803e-05, "loss": 0.0564, "step": 4604 }, { "epoch": 11.177399756986635, "grad_norm": 0.5248196721076965, "learning_rate": 1.3806e-05, "loss": 0.0437, "step": 4605 }, { "epoch": 11.179829890643985, "grad_norm": 0.5297779440879822, "learning_rate": 
1.3809e-05, "loss": 0.0383, "step": 4606 }, { "epoch": 11.182260024301337, "grad_norm": 0.4939192831516266, "learning_rate": 1.3812e-05, "loss": 0.0462, "step": 4607 }, { "epoch": 11.184690157958688, "grad_norm": 0.76321941614151, "learning_rate": 1.3815e-05, "loss": 0.0583, "step": 4608 }, { "epoch": 11.187120291616038, "grad_norm": 0.8514441847801208, "learning_rate": 1.3818e-05, "loss": 0.0731, "step": 4609 }, { "epoch": 11.18955042527339, "grad_norm": 0.6526675820350647, "learning_rate": 1.3821e-05, "loss": 0.0566, "step": 4610 }, { "epoch": 11.19198055893074, "grad_norm": 0.38682085275650024, "learning_rate": 1.3824e-05, "loss": 0.0208, "step": 4611 }, { "epoch": 11.194410692588093, "grad_norm": 0.4840641915798187, "learning_rate": 1.3827e-05, "loss": 0.0399, "step": 4612 }, { "epoch": 11.196840826245444, "grad_norm": 1.3953269720077515, "learning_rate": 1.3830000000000001e-05, "loss": 0.0712, "step": 4613 }, { "epoch": 11.199270959902794, "grad_norm": 1.2040568590164185, "learning_rate": 1.3833000000000001e-05, "loss": 0.0537, "step": 4614 }, { "epoch": 11.201701093560146, "grad_norm": 0.6573865413665771, "learning_rate": 1.3836000000000001e-05, "loss": 0.0446, "step": 4615 }, { "epoch": 11.204131227217497, "grad_norm": 0.7755523324012756, "learning_rate": 1.3839e-05, "loss": 0.0799, "step": 4616 }, { "epoch": 11.206561360874849, "grad_norm": 0.5833380222320557, "learning_rate": 1.3841999999999999e-05, "loss": 0.0497, "step": 4617 }, { "epoch": 11.2089914945322, "grad_norm": 0.7119238972663879, "learning_rate": 1.3845e-05, "loss": 0.0443, "step": 4618 }, { "epoch": 11.21142162818955, "grad_norm": 1.1883728504180908, "learning_rate": 1.3848e-05, "loss": 0.0823, "step": 4619 }, { "epoch": 11.213851761846902, "grad_norm": 1.1367583274841309, "learning_rate": 1.3851e-05, "loss": 0.059, "step": 4620 }, { "epoch": 11.216281895504252, "grad_norm": 0.6308059096336365, "learning_rate": 1.3854e-05, "loss": 0.0604, "step": 4621 }, { "epoch": 11.218712029161605, 
"grad_norm": 0.7637712359428406, "learning_rate": 1.3857e-05, "loss": 0.0543, "step": 4622 }, { "epoch": 11.221142162818955, "grad_norm": 1.1337486505508423, "learning_rate": 1.3860000000000001e-05, "loss": 0.0779, "step": 4623 }, { "epoch": 11.223572296476306, "grad_norm": 1.53800368309021, "learning_rate": 1.3863000000000001e-05, "loss": 0.0718, "step": 4624 }, { "epoch": 11.226002430133658, "grad_norm": 0.6230471730232239, "learning_rate": 1.3866e-05, "loss": 0.044, "step": 4625 }, { "epoch": 11.228432563791008, "grad_norm": 0.8204089403152466, "learning_rate": 1.3869e-05, "loss": 0.0586, "step": 4626 }, { "epoch": 11.23086269744836, "grad_norm": 1.0017032623291016, "learning_rate": 1.3872e-05, "loss": 0.0659, "step": 4627 }, { "epoch": 11.233292831105711, "grad_norm": 0.6710302233695984, "learning_rate": 1.3875000000000002e-05, "loss": 0.0696, "step": 4628 }, { "epoch": 11.235722964763061, "grad_norm": 1.001184344291687, "learning_rate": 1.3878e-05, "loss": 0.0772, "step": 4629 }, { "epoch": 11.238153098420414, "grad_norm": 1.0150865316390991, "learning_rate": 1.3881e-05, "loss": 0.0865, "step": 4630 }, { "epoch": 11.240583232077764, "grad_norm": 1.273235559463501, "learning_rate": 1.3884e-05, "loss": 0.1087, "step": 4631 }, { "epoch": 11.243013365735115, "grad_norm": 3.661667823791504, "learning_rate": 1.3887e-05, "loss": 0.2423, "step": 4632 }, { "epoch": 11.245443499392467, "grad_norm": 3.9475340843200684, "learning_rate": 1.389e-05, "loss": 0.4439, "step": 4633 }, { "epoch": 11.247873633049817, "grad_norm": 1.0987355709075928, "learning_rate": 1.3893e-05, "loss": 0.3289, "step": 4634 }, { "epoch": 11.25030376670717, "grad_norm": 0.9708209037780762, "learning_rate": 1.3896e-05, "loss": 0.2021, "step": 4635 }, { "epoch": 11.25273390036452, "grad_norm": 1.1106432676315308, "learning_rate": 1.3899e-05, "loss": 0.2242, "step": 4636 }, { "epoch": 11.25516403402187, "grad_norm": 0.8857357501983643, "learning_rate": 1.3902e-05, "loss": 0.1905, "step": 4637 }, { 
"epoch": 11.257594167679223, "grad_norm": 0.853345513343811, "learning_rate": 1.3905000000000002e-05, "loss": 0.1444, "step": 4638 }, { "epoch": 11.260024301336573, "grad_norm": 0.9103320240974426, "learning_rate": 1.3908000000000001e-05, "loss": 0.1526, "step": 4639 }, { "epoch": 11.262454434993925, "grad_norm": 0.546514093875885, "learning_rate": 1.3911000000000001e-05, "loss": 0.1027, "step": 4640 }, { "epoch": 11.264884568651276, "grad_norm": 0.8560022115707397, "learning_rate": 1.3914e-05, "loss": 0.1173, "step": 4641 }, { "epoch": 11.267314702308626, "grad_norm": 0.5508618950843811, "learning_rate": 1.3916999999999999e-05, "loss": 0.068, "step": 4642 }, { "epoch": 11.269744835965978, "grad_norm": 0.6848469376564026, "learning_rate": 1.392e-05, "loss": 0.0874, "step": 4643 }, { "epoch": 11.272174969623329, "grad_norm": 0.6672700643539429, "learning_rate": 1.3923e-05, "loss": 0.0858, "step": 4644 }, { "epoch": 11.274605103280681, "grad_norm": 0.8009377121925354, "learning_rate": 1.3926e-05, "loss": 0.0704, "step": 4645 }, { "epoch": 11.277035236938032, "grad_norm": 0.4635750651359558, "learning_rate": 1.3929e-05, "loss": 0.0461, "step": 4646 }, { "epoch": 11.279465370595382, "grad_norm": 0.5466441512107849, "learning_rate": 1.3932e-05, "loss": 0.059, "step": 4647 }, { "epoch": 11.281895504252734, "grad_norm": 0.6967992186546326, "learning_rate": 1.3935000000000001e-05, "loss": 0.0653, "step": 4648 }, { "epoch": 11.284325637910085, "grad_norm": 0.5540141463279724, "learning_rate": 1.3938000000000001e-05, "loss": 0.0543, "step": 4649 }, { "epoch": 11.286755771567437, "grad_norm": 0.6891055107116699, "learning_rate": 1.3941000000000001e-05, "loss": 0.0635, "step": 4650 }, { "epoch": 11.289185905224787, "grad_norm": 0.550738513469696, "learning_rate": 1.3944e-05, "loss": 0.0543, "step": 4651 }, { "epoch": 11.291616038882138, "grad_norm": 0.6936060190200806, "learning_rate": 1.3947e-05, "loss": 0.052, "step": 4652 }, { "epoch": 11.29404617253949, "grad_norm": 
0.7960584759712219, "learning_rate": 1.395e-05, "loss": 0.0695, "step": 4653 }, { "epoch": 11.29647630619684, "grad_norm": 0.6379053592681885, "learning_rate": 1.3953e-05, "loss": 0.0509, "step": 4654 }, { "epoch": 11.298906439854193, "grad_norm": 0.6077910661697388, "learning_rate": 1.3956e-05, "loss": 0.052, "step": 4655 }, { "epoch": 11.301336573511543, "grad_norm": 0.6549823880195618, "learning_rate": 1.3959e-05, "loss": 0.1398, "step": 4656 }, { "epoch": 11.303766707168894, "grad_norm": 0.5672200918197632, "learning_rate": 1.3962e-05, "loss": 0.0304, "step": 4657 }, { "epoch": 11.306196840826246, "grad_norm": 0.5149309039115906, "learning_rate": 1.3965000000000001e-05, "loss": 0.0439, "step": 4658 }, { "epoch": 11.308626974483596, "grad_norm": 0.5573406219482422, "learning_rate": 1.3968e-05, "loss": 0.0503, "step": 4659 }, { "epoch": 11.311057108140949, "grad_norm": 0.5373414158821106, "learning_rate": 1.3971e-05, "loss": 0.0513, "step": 4660 }, { "epoch": 11.313487241798299, "grad_norm": 0.6056873798370361, "learning_rate": 1.3974e-05, "loss": 0.0534, "step": 4661 }, { "epoch": 11.31591737545565, "grad_norm": 0.6744464635848999, "learning_rate": 1.3977e-05, "loss": 0.0534, "step": 4662 }, { "epoch": 11.318347509113002, "grad_norm": 0.7293891310691833, "learning_rate": 1.3980000000000002e-05, "loss": 0.0528, "step": 4663 }, { "epoch": 11.320777642770352, "grad_norm": 0.9008581042289734, "learning_rate": 1.3983000000000001e-05, "loss": 0.0932, "step": 4664 }, { "epoch": 11.323207776427704, "grad_norm": 0.5698143839836121, "learning_rate": 1.3986000000000001e-05, "loss": 0.0462, "step": 4665 }, { "epoch": 11.325637910085055, "grad_norm": 0.7425856590270996, "learning_rate": 1.3989e-05, "loss": 0.0723, "step": 4666 }, { "epoch": 11.328068043742405, "grad_norm": 0.7776923775672913, "learning_rate": 1.3992e-05, "loss": 0.0669, "step": 4667 }, { "epoch": 11.330498177399758, "grad_norm": 0.9670988917350769, "learning_rate": 1.3995e-05, "loss": 0.125, "step": 4668 }, 
{ "epoch": 11.332928311057108, "grad_norm": 0.8543950319290161, "learning_rate": 1.3998e-05, "loss": 0.0788, "step": 4669 }, { "epoch": 11.335358444714458, "grad_norm": 0.7367601990699768, "learning_rate": 1.4001e-05, "loss": 0.1353, "step": 4670 }, { "epoch": 11.33778857837181, "grad_norm": 0.7392695546150208, "learning_rate": 1.4004e-05, "loss": 0.0522, "step": 4671 }, { "epoch": 11.340218712029161, "grad_norm": 0.6669470071792603, "learning_rate": 1.4007e-05, "loss": 0.0565, "step": 4672 }, { "epoch": 11.342648845686513, "grad_norm": 0.7625454664230347, "learning_rate": 1.4010000000000001e-05, "loss": 0.0571, "step": 4673 }, { "epoch": 11.345078979343864, "grad_norm": 1.1563899517059326, "learning_rate": 1.4013000000000001e-05, "loss": 0.0735, "step": 4674 }, { "epoch": 11.347509113001214, "grad_norm": 0.7822791934013367, "learning_rate": 1.4016000000000001e-05, "loss": 0.0532, "step": 4675 }, { "epoch": 11.349939246658566, "grad_norm": 0.7444108128547668, "learning_rate": 1.4019e-05, "loss": 0.0833, "step": 4676 }, { "epoch": 11.352369380315917, "grad_norm": 0.7982631325721741, "learning_rate": 1.4022e-05, "loss": 0.0715, "step": 4677 }, { "epoch": 11.35479951397327, "grad_norm": 1.201768398284912, "learning_rate": 1.4025e-05, "loss": 0.0821, "step": 4678 }, { "epoch": 11.35722964763062, "grad_norm": 0.6844739317893982, "learning_rate": 1.4028e-05, "loss": 0.0466, "step": 4679 }, { "epoch": 11.35965978128797, "grad_norm": 1.8196715116500854, "learning_rate": 1.4031e-05, "loss": 0.1028, "step": 4680 }, { "epoch": 11.362089914945322, "grad_norm": 2.089089870452881, "learning_rate": 1.4034e-05, "loss": 0.1386, "step": 4681 }, { "epoch": 11.364520048602673, "grad_norm": 1.1863865852355957, "learning_rate": 1.4037e-05, "loss": 0.0829, "step": 4682 }, { "epoch": 11.366950182260025, "grad_norm": 2.05863881111145, "learning_rate": 1.4040000000000001e-05, "loss": 0.3839, "step": 4683 }, { "epoch": 11.369380315917375, "grad_norm": 0.7212303876876831, "learning_rate": 
1.4043000000000001e-05, "loss": 0.2224, "step": 4684 }, { "epoch": 11.371810449574726, "grad_norm": 1.062619686126709, "learning_rate": 1.4046e-05, "loss": 0.2185, "step": 4685 }, { "epoch": 11.374240583232078, "grad_norm": 1.4646704196929932, "learning_rate": 1.4049e-05, "loss": 0.214, "step": 4686 }, { "epoch": 11.376670716889429, "grad_norm": 1.0513525009155273, "learning_rate": 1.4052e-05, "loss": 0.1951, "step": 4687 }, { "epoch": 11.37910085054678, "grad_norm": 0.6018432378768921, "learning_rate": 1.4055000000000002e-05, "loss": 0.1674, "step": 4688 }, { "epoch": 11.381530984204131, "grad_norm": 0.8726100921630859, "learning_rate": 1.4058000000000002e-05, "loss": 0.0986, "step": 4689 }, { "epoch": 11.383961117861482, "grad_norm": 0.9386072754859924, "learning_rate": 1.4061e-05, "loss": 0.1105, "step": 4690 }, { "epoch": 11.386391251518834, "grad_norm": 0.8742288947105408, "learning_rate": 1.4064e-05, "loss": 0.1046, "step": 4691 }, { "epoch": 11.388821385176184, "grad_norm": 0.6234801411628723, "learning_rate": 1.4067e-05, "loss": 0.0651, "step": 4692 }, { "epoch": 11.391251518833537, "grad_norm": 0.5067813992500305, "learning_rate": 1.4069999999999999e-05, "loss": 0.0734, "step": 4693 }, { "epoch": 11.393681652490887, "grad_norm": 0.6023301482200623, "learning_rate": 1.4073e-05, "loss": 0.0648, "step": 4694 }, { "epoch": 11.396111786148237, "grad_norm": 0.6221252679824829, "learning_rate": 1.4076e-05, "loss": 0.0683, "step": 4695 }, { "epoch": 11.39854191980559, "grad_norm": 0.5169113874435425, "learning_rate": 1.4079e-05, "loss": 0.0449, "step": 4696 }, { "epoch": 11.40097205346294, "grad_norm": 0.8534227609634399, "learning_rate": 1.4082e-05, "loss": 0.0718, "step": 4697 }, { "epoch": 11.403402187120292, "grad_norm": 1.068853735923767, "learning_rate": 1.4085e-05, "loss": 0.0591, "step": 4698 }, { "epoch": 11.405832320777643, "grad_norm": 0.4997553825378418, "learning_rate": 1.4088000000000001e-05, "loss": 0.0572, "step": 4699 }, { "epoch": 
11.408262454434993, "grad_norm": 1.144753098487854, "learning_rate": 1.4091000000000001e-05, "loss": 0.0758, "step": 4700 }, { "epoch": 11.410692588092346, "grad_norm": 0.6641992330551147, "learning_rate": 1.4094000000000001e-05, "loss": 0.0586, "step": 4701 }, { "epoch": 11.413122721749696, "grad_norm": 0.8796821236610413, "learning_rate": 1.4097e-05, "loss": 0.0995, "step": 4702 }, { "epoch": 11.415552855407048, "grad_norm": 0.9102869629859924, "learning_rate": 1.4099999999999999e-05, "loss": 0.0588, "step": 4703 }, { "epoch": 11.417982989064399, "grad_norm": 0.6533998250961304, "learning_rate": 1.4103e-05, "loss": 0.0468, "step": 4704 }, { "epoch": 11.42041312272175, "grad_norm": 1.4024206399917603, "learning_rate": 1.4106e-05, "loss": 0.0945, "step": 4705 }, { "epoch": 11.422843256379101, "grad_norm": 0.5543346405029297, "learning_rate": 1.4109e-05, "loss": 0.0582, "step": 4706 }, { "epoch": 11.425273390036452, "grad_norm": 0.6645606756210327, "learning_rate": 1.4112e-05, "loss": 0.0515, "step": 4707 }, { "epoch": 11.427703523693804, "grad_norm": 0.6640419363975525, "learning_rate": 1.4115e-05, "loss": 0.0588, "step": 4708 }, { "epoch": 11.430133657351154, "grad_norm": 0.5819470882415771, "learning_rate": 1.4118000000000001e-05, "loss": 0.0464, "step": 4709 }, { "epoch": 11.432563791008505, "grad_norm": 0.9564409852027893, "learning_rate": 1.4121e-05, "loss": 0.0716, "step": 4710 }, { "epoch": 11.434993924665857, "grad_norm": 0.7303016185760498, "learning_rate": 1.4124e-05, "loss": 0.0442, "step": 4711 }, { "epoch": 11.437424058323208, "grad_norm": 0.7967673540115356, "learning_rate": 1.4127e-05, "loss": 0.057, "step": 4712 }, { "epoch": 11.439854191980558, "grad_norm": 1.24154531955719, "learning_rate": 1.413e-05, "loss": 0.0644, "step": 4713 }, { "epoch": 11.44228432563791, "grad_norm": 0.6299071907997131, "learning_rate": 1.4133000000000002e-05, "loss": 0.0555, "step": 4714 }, { "epoch": 11.44471445929526, "grad_norm": 0.5786968469619751, "learning_rate": 
1.4136e-05, "loss": 0.0457, "step": 4715 }, { "epoch": 11.447144592952613, "grad_norm": 0.7598857879638672, "learning_rate": 1.4139e-05, "loss": 0.0535, "step": 4716 }, { "epoch": 11.449574726609963, "grad_norm": 0.6419093012809753, "learning_rate": 1.4142e-05, "loss": 0.0722, "step": 4717 }, { "epoch": 11.452004860267314, "grad_norm": 0.6498236656188965, "learning_rate": 1.4145e-05, "loss": 0.0533, "step": 4718 }, { "epoch": 11.454434993924666, "grad_norm": 0.5418201088905334, "learning_rate": 1.4148e-05, "loss": 0.0407, "step": 4719 }, { "epoch": 11.456865127582017, "grad_norm": 0.8676780462265015, "learning_rate": 1.4151e-05, "loss": 0.0544, "step": 4720 }, { "epoch": 11.459295261239369, "grad_norm": 1.1033474206924438, "learning_rate": 1.4154e-05, "loss": 0.0507, "step": 4721 }, { "epoch": 11.46172539489672, "grad_norm": 0.706311047077179, "learning_rate": 1.4157e-05, "loss": 0.0641, "step": 4722 }, { "epoch": 11.46415552855407, "grad_norm": 0.7653324604034424, "learning_rate": 1.416e-05, "loss": 0.0591, "step": 4723 }, { "epoch": 11.466585662211422, "grad_norm": 0.5635743737220764, "learning_rate": 1.4163000000000001e-05, "loss": 0.0365, "step": 4724 }, { "epoch": 11.469015795868772, "grad_norm": 0.6490671634674072, "learning_rate": 1.4166000000000001e-05, "loss": 0.0567, "step": 4725 }, { "epoch": 11.471445929526125, "grad_norm": 0.9812367558479309, "learning_rate": 1.4169000000000001e-05, "loss": 0.069, "step": 4726 }, { "epoch": 11.473876063183475, "grad_norm": 0.8300378322601318, "learning_rate": 1.4172e-05, "loss": 0.0904, "step": 4727 }, { "epoch": 11.476306196840826, "grad_norm": 1.550307273864746, "learning_rate": 1.4174999999999999e-05, "loss": 0.0728, "step": 4728 }, { "epoch": 11.478736330498178, "grad_norm": 0.9813888669013977, "learning_rate": 1.4178e-05, "loss": 0.1206, "step": 4729 }, { "epoch": 11.481166464155528, "grad_norm": 1.0613576173782349, "learning_rate": 1.4181e-05, "loss": 0.0897, "step": 4730 }, { "epoch": 11.48359659781288, 
"grad_norm": 1.8807601928710938, "learning_rate": 1.4184e-05, "loss": 0.1165, "step": 4731 }, { "epoch": 11.486026731470231, "grad_norm": 1.9102277755737305, "learning_rate": 1.4187e-05, "loss": 0.1405, "step": 4732 }, { "epoch": 11.488456865127581, "grad_norm": 1.4969979524612427, "learning_rate": 1.419e-05, "loss": 0.363, "step": 4733 }, { "epoch": 11.490886998784934, "grad_norm": 1.1257089376449585, "learning_rate": 1.4193000000000001e-05, "loss": 0.2948, "step": 4734 }, { "epoch": 11.493317132442284, "grad_norm": 0.9647147059440613, "learning_rate": 1.4196000000000001e-05, "loss": 0.2201, "step": 4735 }, { "epoch": 11.495747266099636, "grad_norm": 0.9647418856620789, "learning_rate": 1.4199e-05, "loss": 0.2237, "step": 4736 }, { "epoch": 11.498177399756987, "grad_norm": 0.7772798538208008, "learning_rate": 1.4202e-05, "loss": 0.1561, "step": 4737 }, { "epoch": 11.500607533414337, "grad_norm": 1.0248860120773315, "learning_rate": 1.4205e-05, "loss": 0.1572, "step": 4738 }, { "epoch": 11.50303766707169, "grad_norm": 0.8079015612602234, "learning_rate": 1.4208e-05, "loss": 0.1403, "step": 4739 }, { "epoch": 11.50546780072904, "grad_norm": 0.8200850486755371, "learning_rate": 1.4211e-05, "loss": 0.1413, "step": 4740 }, { "epoch": 11.507897934386392, "grad_norm": 0.6313938498497009, "learning_rate": 1.4214e-05, "loss": 0.0876, "step": 4741 }, { "epoch": 11.510328068043743, "grad_norm": 1.0120580196380615, "learning_rate": 1.4217e-05, "loss": 0.0923, "step": 4742 }, { "epoch": 11.512758201701093, "grad_norm": 0.5327444076538086, "learning_rate": 1.422e-05, "loss": 0.0747, "step": 4743 }, { "epoch": 11.515188335358445, "grad_norm": 0.8297441005706787, "learning_rate": 1.4223000000000001e-05, "loss": 0.0928, "step": 4744 }, { "epoch": 11.517618469015796, "grad_norm": 0.6722994446754456, "learning_rate": 1.4226e-05, "loss": 0.062, "step": 4745 }, { "epoch": 11.520048602673146, "grad_norm": 0.6047726273536682, "learning_rate": 1.4229e-05, "loss": 0.0429, "step": 4746 }, 
{ "epoch": 11.522478736330498, "grad_norm": 0.6852402687072754, "learning_rate": 1.4232e-05, "loss": 0.0637, "step": 4747 }, { "epoch": 11.524908869987849, "grad_norm": 0.5786404013633728, "learning_rate": 1.4235e-05, "loss": 0.069, "step": 4748 }, { "epoch": 11.527339003645201, "grad_norm": 0.5235682725906372, "learning_rate": 1.4238000000000002e-05, "loss": 0.0534, "step": 4749 }, { "epoch": 11.529769137302551, "grad_norm": 0.6461473703384399, "learning_rate": 1.4241000000000001e-05, "loss": 0.0749, "step": 4750 }, { "epoch": 11.532199270959904, "grad_norm": 0.5619738698005676, "learning_rate": 1.4244000000000001e-05, "loss": 0.0508, "step": 4751 }, { "epoch": 11.534629404617254, "grad_norm": 0.5524816513061523, "learning_rate": 1.4247e-05, "loss": 0.0535, "step": 4752 }, { "epoch": 11.537059538274605, "grad_norm": 0.4352133870124817, "learning_rate": 1.4249999999999999e-05, "loss": 0.0286, "step": 4753 }, { "epoch": 11.539489671931957, "grad_norm": 0.922767162322998, "learning_rate": 1.4253e-05, "loss": 0.0695, "step": 4754 }, { "epoch": 11.541919805589307, "grad_norm": 0.7740188837051392, "learning_rate": 1.4256e-05, "loss": 0.0443, "step": 4755 }, { "epoch": 11.544349939246658, "grad_norm": 0.5903917551040649, "learning_rate": 1.4259e-05, "loss": 0.0496, "step": 4756 }, { "epoch": 11.54678007290401, "grad_norm": 0.6308184266090393, "learning_rate": 1.4262e-05, "loss": 0.0652, "step": 4757 }, { "epoch": 11.54921020656136, "grad_norm": 0.917357325553894, "learning_rate": 1.4265e-05, "loss": 0.1441, "step": 4758 }, { "epoch": 11.551640340218713, "grad_norm": 0.6408929824829102, "learning_rate": 1.4268000000000001e-05, "loss": 0.0516, "step": 4759 }, { "epoch": 11.554070473876063, "grad_norm": 0.9660205245018005, "learning_rate": 1.4271000000000001e-05, "loss": 0.0587, "step": 4760 }, { "epoch": 11.556500607533414, "grad_norm": 0.9756492972373962, "learning_rate": 1.4274000000000001e-05, "loss": 0.0532, "step": 4761 }, { "epoch": 11.558930741190766, "grad_norm": 
0.7712356448173523, "learning_rate": 1.4277e-05, "loss": 0.0638, "step": 4762 }, { "epoch": 11.561360874848116, "grad_norm": 0.7063807249069214, "learning_rate": 1.428e-05, "loss": 0.0424, "step": 4763 }, { "epoch": 11.563791008505468, "grad_norm": 1.380382776260376, "learning_rate": 1.4283e-05, "loss": 0.067, "step": 4764 }, { "epoch": 11.566221142162819, "grad_norm": 0.593485951423645, "learning_rate": 1.4286e-05, "loss": 0.0428, "step": 4765 }, { "epoch": 11.56865127582017, "grad_norm": 0.692758321762085, "learning_rate": 1.4289e-05, "loss": 0.0538, "step": 4766 }, { "epoch": 11.571081409477522, "grad_norm": 1.8160730600357056, "learning_rate": 1.4292e-05, "loss": 0.1413, "step": 4767 }, { "epoch": 11.573511543134872, "grad_norm": 0.6165350079536438, "learning_rate": 1.4295e-05, "loss": 0.0629, "step": 4768 }, { "epoch": 11.575941676792224, "grad_norm": 0.6647060513496399, "learning_rate": 1.4298000000000001e-05, "loss": 0.0479, "step": 4769 }, { "epoch": 11.578371810449575, "grad_norm": 0.5597238540649414, "learning_rate": 1.4301e-05, "loss": 0.0418, "step": 4770 }, { "epoch": 11.580801944106925, "grad_norm": 1.1014952659606934, "learning_rate": 1.4304e-05, "loss": 0.0749, "step": 4771 }, { "epoch": 11.583232077764277, "grad_norm": 1.061226487159729, "learning_rate": 1.4307e-05, "loss": 0.0696, "step": 4772 }, { "epoch": 11.585662211421628, "grad_norm": 0.6217840313911438, "learning_rate": 1.431e-05, "loss": 0.0632, "step": 4773 }, { "epoch": 11.58809234507898, "grad_norm": 0.6471796631813049, "learning_rate": 1.4313000000000002e-05, "loss": 0.0465, "step": 4774 }, { "epoch": 11.59052247873633, "grad_norm": 0.8374913334846497, "learning_rate": 1.4316000000000002e-05, "loss": 0.0471, "step": 4775 }, { "epoch": 11.592952612393681, "grad_norm": 1.2004560232162476, "learning_rate": 1.4319e-05, "loss": 0.0647, "step": 4776 }, { "epoch": 11.595382746051033, "grad_norm": 1.0786525011062622, "learning_rate": 1.4322e-05, "loss": 0.0872, "step": 4777 }, { "epoch": 
11.597812879708384, "grad_norm": 1.2280547618865967, "learning_rate": 1.4325e-05, "loss": 0.092, "step": 4778 }, { "epoch": 11.600243013365736, "grad_norm": 1.1337743997573853, "learning_rate": 1.4328e-05, "loss": 0.0819, "step": 4779 }, { "epoch": 11.602673147023086, "grad_norm": 0.8870834112167358, "learning_rate": 1.4331e-05, "loss": 0.0653, "step": 4780 }, { "epoch": 11.605103280680437, "grad_norm": 1.5750453472137451, "learning_rate": 1.4334e-05, "loss": 0.1399, "step": 4781 }, { "epoch": 11.607533414337789, "grad_norm": 1.9392077922821045, "learning_rate": 1.4337e-05, "loss": 0.1581, "step": 4782 }, { "epoch": 11.60996354799514, "grad_norm": 2.09698748588562, "learning_rate": 1.434e-05, "loss": 0.4487, "step": 4783 }, { "epoch": 11.612393681652492, "grad_norm": 1.3475940227508545, "learning_rate": 1.4343000000000001e-05, "loss": 0.2822, "step": 4784 }, { "epoch": 11.614823815309842, "grad_norm": 1.346366286277771, "learning_rate": 1.4346000000000001e-05, "loss": 0.3034, "step": 4785 }, { "epoch": 11.617253948967193, "grad_norm": 1.079529047012329, "learning_rate": 1.4349000000000001e-05, "loss": 0.2486, "step": 4786 }, { "epoch": 11.619684082624545, "grad_norm": 1.09681236743927, "learning_rate": 1.4352e-05, "loss": 0.2113, "step": 4787 }, { "epoch": 11.622114216281895, "grad_norm": 1.790079116821289, "learning_rate": 1.4355e-05, "loss": 0.1863, "step": 4788 }, { "epoch": 11.624544349939246, "grad_norm": 0.710716724395752, "learning_rate": 1.4358e-05, "loss": 0.1185, "step": 4789 }, { "epoch": 11.626974483596598, "grad_norm": 0.6715278029441833, "learning_rate": 1.4361e-05, "loss": 0.1327, "step": 4790 }, { "epoch": 11.629404617253948, "grad_norm": 0.5612782835960388, "learning_rate": 1.4364e-05, "loss": 0.0922, "step": 4791 }, { "epoch": 11.6318347509113, "grad_norm": 0.5933533906936646, "learning_rate": 1.4367e-05, "loss": 0.0618, "step": 4792 }, { "epoch": 11.634264884568651, "grad_norm": 0.8178914189338684, "learning_rate": 1.437e-05, "loss": 0.063, 
"step": 4793 }, { "epoch": 11.636695018226002, "grad_norm": 0.6607690453529358, "learning_rate": 1.4373000000000001e-05, "loss": 0.0721, "step": 4794 }, { "epoch": 11.639125151883354, "grad_norm": 0.7802571058273315, "learning_rate": 1.4376000000000001e-05, "loss": 0.0672, "step": 4795 }, { "epoch": 11.641555285540704, "grad_norm": 0.7640334963798523, "learning_rate": 1.4379e-05, "loss": 0.0512, "step": 4796 }, { "epoch": 11.643985419198057, "grad_norm": 0.8391075134277344, "learning_rate": 1.4382e-05, "loss": 0.0606, "step": 4797 }, { "epoch": 11.646415552855407, "grad_norm": 0.553965151309967, "learning_rate": 1.4385e-05, "loss": 0.0581, "step": 4798 }, { "epoch": 11.648845686512757, "grad_norm": 0.7361604571342468, "learning_rate": 1.4388000000000002e-05, "loss": 0.088, "step": 4799 }, { "epoch": 11.65127582017011, "grad_norm": 0.5259209275245667, "learning_rate": 1.4391000000000002e-05, "loss": 0.0577, "step": 4800 }, { "epoch": 11.65370595382746, "grad_norm": 0.6365264654159546, "learning_rate": 1.4394e-05, "loss": 0.0634, "step": 4801 }, { "epoch": 11.656136087484812, "grad_norm": 0.5319544076919556, "learning_rate": 1.4397e-05, "loss": 0.049, "step": 4802 }, { "epoch": 11.658566221142163, "grad_norm": 0.5090621113777161, "learning_rate": 1.44e-05, "loss": 0.0397, "step": 4803 }, { "epoch": 11.660996354799513, "grad_norm": 0.6607389450073242, "learning_rate": 1.4403e-05, "loss": 0.0651, "step": 4804 }, { "epoch": 11.663426488456865, "grad_norm": 0.637231707572937, "learning_rate": 1.4406e-05, "loss": 0.0768, "step": 4805 }, { "epoch": 11.665856622114216, "grad_norm": 0.6801390051841736, "learning_rate": 1.4409e-05, "loss": 0.0616, "step": 4806 }, { "epoch": 11.668286755771568, "grad_norm": 0.7703109383583069, "learning_rate": 1.4412e-05, "loss": 0.0838, "step": 4807 }, { "epoch": 11.670716889428919, "grad_norm": 0.7407732009887695, "learning_rate": 1.4415e-05, "loss": 0.0608, "step": 4808 }, { "epoch": 11.673147023086269, "grad_norm": 0.7353827953338623, 
"learning_rate": 1.4418000000000002e-05, "loss": 0.0608, "step": 4809 }, { "epoch": 11.675577156743621, "grad_norm": 0.7328819632530212, "learning_rate": 1.4421000000000001e-05, "loss": 0.0576, "step": 4810 }, { "epoch": 11.678007290400972, "grad_norm": 0.518953263759613, "learning_rate": 1.4424000000000001e-05, "loss": 0.0429, "step": 4811 }, { "epoch": 11.680437424058324, "grad_norm": 0.5761600732803345, "learning_rate": 1.4427000000000001e-05, "loss": 0.044, "step": 4812 }, { "epoch": 11.682867557715674, "grad_norm": 0.7103362679481506, "learning_rate": 1.4429999999999999e-05, "loss": 0.0679, "step": 4813 }, { "epoch": 11.685297691373025, "grad_norm": 0.7542375922203064, "learning_rate": 1.4433e-05, "loss": 0.0475, "step": 4814 }, { "epoch": 11.687727825030377, "grad_norm": 0.7265920042991638, "learning_rate": 1.4436e-05, "loss": 0.0568, "step": 4815 }, { "epoch": 11.690157958687728, "grad_norm": 0.6135312914848328, "learning_rate": 1.4439e-05, "loss": 0.0821, "step": 4816 }, { "epoch": 11.69258809234508, "grad_norm": 0.7035861611366272, "learning_rate": 1.4442e-05, "loss": 0.0547, "step": 4817 }, { "epoch": 11.69501822600243, "grad_norm": 0.7588819861412048, "learning_rate": 1.4445e-05, "loss": 0.0809, "step": 4818 }, { "epoch": 11.69744835965978, "grad_norm": 0.8352372646331787, "learning_rate": 1.4448e-05, "loss": 0.0606, "step": 4819 }, { "epoch": 11.699878493317133, "grad_norm": 0.482852965593338, "learning_rate": 1.4451000000000001e-05, "loss": 0.0498, "step": 4820 }, { "epoch": 11.702308626974483, "grad_norm": 0.8414066433906555, "learning_rate": 1.4454000000000001e-05, "loss": 0.0763, "step": 4821 }, { "epoch": 11.704738760631834, "grad_norm": 0.7409859895706177, "learning_rate": 1.4457e-05, "loss": 0.0815, "step": 4822 }, { "epoch": 11.707168894289186, "grad_norm": 0.7795265913009644, "learning_rate": 1.446e-05, "loss": 0.044, "step": 4823 }, { "epoch": 11.709599027946537, "grad_norm": 0.913201630115509, "learning_rate": 1.4463e-05, "loss": 0.0671, 
"step": 4824 }, { "epoch": 11.712029161603889, "grad_norm": 0.7113469839096069, "learning_rate": 1.4466e-05, "loss": 0.0756, "step": 4825 }, { "epoch": 11.71445929526124, "grad_norm": 1.342022180557251, "learning_rate": 1.4469e-05, "loss": 0.1107, "step": 4826 }, { "epoch": 11.716889428918591, "grad_norm": 0.9371646642684937, "learning_rate": 1.4472e-05, "loss": 0.0742, "step": 4827 }, { "epoch": 11.719319562575942, "grad_norm": 1.1589994430541992, "learning_rate": 1.4475e-05, "loss": 0.0717, "step": 4828 }, { "epoch": 11.721749696233292, "grad_norm": 1.2708971500396729, "learning_rate": 1.4478e-05, "loss": 0.0933, "step": 4829 }, { "epoch": 11.724179829890645, "grad_norm": 1.3340049982070923, "learning_rate": 1.4481e-05, "loss": 0.1151, "step": 4830 }, { "epoch": 11.726609963547995, "grad_norm": 0.9238860607147217, "learning_rate": 1.4484e-05, "loss": 0.0652, "step": 4831 }, { "epoch": 11.729040097205345, "grad_norm": 1.3358722925186157, "learning_rate": 1.4487e-05, "loss": 0.1181, "step": 4832 }, { "epoch": 11.731470230862698, "grad_norm": 1.7934556007385254, "learning_rate": 1.449e-05, "loss": 0.3572, "step": 4833 }, { "epoch": 11.733900364520048, "grad_norm": 0.8428711295127869, "learning_rate": 1.4493e-05, "loss": 0.2683, "step": 4834 }, { "epoch": 11.7363304981774, "grad_norm": 0.7897201180458069, "learning_rate": 1.4496000000000001e-05, "loss": 0.2221, "step": 4835 }, { "epoch": 11.73876063183475, "grad_norm": 1.5668036937713623, "learning_rate": 1.4499000000000001e-05, "loss": 0.2234, "step": 4836 }, { "epoch": 11.741190765492101, "grad_norm": 0.9213005900382996, "learning_rate": 1.4502000000000001e-05, "loss": 0.1873, "step": 4837 }, { "epoch": 11.743620899149454, "grad_norm": 0.7302655577659607, "learning_rate": 1.4505e-05, "loss": 0.1387, "step": 4838 }, { "epoch": 11.746051032806804, "grad_norm": 0.6889105439186096, "learning_rate": 1.4507999999999999e-05, "loss": 0.1544, "step": 4839 }, { "epoch": 11.748481166464156, "grad_norm": 0.8406433463096619, 
"learning_rate": 1.4511e-05, "loss": 0.1156, "step": 4840 }, { "epoch": 11.750911300121507, "grad_norm": 0.5946420431137085, "learning_rate": 1.4514e-05, "loss": 0.0768, "step": 4841 }, { "epoch": 11.753341433778857, "grad_norm": 0.6702826023101807, "learning_rate": 1.4517e-05, "loss": 0.1351, "step": 4842 }, { "epoch": 11.75577156743621, "grad_norm": 0.769568145275116, "learning_rate": 1.452e-05, "loss": 0.0742, "step": 4843 }, { "epoch": 11.75820170109356, "grad_norm": 0.6082226037979126, "learning_rate": 1.4523e-05, "loss": 0.0644, "step": 4844 }, { "epoch": 11.760631834750912, "grad_norm": 0.7411651611328125, "learning_rate": 1.4526000000000001e-05, "loss": 0.0636, "step": 4845 }, { "epoch": 11.763061968408262, "grad_norm": 0.7471974492073059, "learning_rate": 1.4529000000000001e-05, "loss": 0.0873, "step": 4846 }, { "epoch": 11.765492102065613, "grad_norm": 0.49630454182624817, "learning_rate": 1.4532e-05, "loss": 0.0706, "step": 4847 }, { "epoch": 11.767922235722965, "grad_norm": 0.606709897518158, "learning_rate": 1.4535e-05, "loss": 0.0544, "step": 4848 }, { "epoch": 11.770352369380316, "grad_norm": 0.41516679525375366, "learning_rate": 1.4538e-05, "loss": 0.0544, "step": 4849 }, { "epoch": 11.772782503037668, "grad_norm": 0.609332799911499, "learning_rate": 1.4541e-05, "loss": 0.0532, "step": 4850 }, { "epoch": 11.775212636695018, "grad_norm": 0.4675820469856262, "learning_rate": 1.4544e-05, "loss": 0.0401, "step": 4851 }, { "epoch": 11.777642770352369, "grad_norm": 0.8086404800415039, "learning_rate": 1.4547e-05, "loss": 0.0637, "step": 4852 }, { "epoch": 11.780072904009721, "grad_norm": 1.10532546043396, "learning_rate": 1.455e-05, "loss": 0.0608, "step": 4853 }, { "epoch": 11.782503037667071, "grad_norm": 0.9336029887199402, "learning_rate": 1.4553e-05, "loss": 0.078, "step": 4854 }, { "epoch": 11.784933171324424, "grad_norm": 0.7586062550544739, "learning_rate": 1.4556000000000001e-05, "loss": 0.0518, "step": 4855 }, { "epoch": 11.787363304981774, 
"grad_norm": 0.47757288813591003, "learning_rate": 1.4559e-05, "loss": 0.0724, "step": 4856 }, { "epoch": 11.789793438639125, "grad_norm": 0.5315451622009277, "learning_rate": 1.4562e-05, "loss": 0.0517, "step": 4857 }, { "epoch": 11.792223572296477, "grad_norm": 0.790195643901825, "learning_rate": 1.4565e-05, "loss": 0.0725, "step": 4858 }, { "epoch": 11.794653705953827, "grad_norm": 0.580984354019165, "learning_rate": 1.4568e-05, "loss": 0.0571, "step": 4859 }, { "epoch": 11.79708383961118, "grad_norm": 0.8533209562301636, "learning_rate": 1.4571000000000002e-05, "loss": 0.0789, "step": 4860 }, { "epoch": 11.79951397326853, "grad_norm": 0.8746164441108704, "learning_rate": 1.4574000000000001e-05, "loss": 0.0567, "step": 4861 }, { "epoch": 11.80194410692588, "grad_norm": 0.8227803707122803, "learning_rate": 1.4577e-05, "loss": 0.0793, "step": 4862 }, { "epoch": 11.804374240583233, "grad_norm": 0.9707587957382202, "learning_rate": 1.458e-05, "loss": 0.0612, "step": 4863 }, { "epoch": 11.806804374240583, "grad_norm": 0.9851076602935791, "learning_rate": 1.4582999999999999e-05, "loss": 0.0779, "step": 4864 }, { "epoch": 11.809234507897933, "grad_norm": 0.6777076721191406, "learning_rate": 1.4586e-05, "loss": 0.0533, "step": 4865 }, { "epoch": 11.811664641555286, "grad_norm": 1.0259110927581787, "learning_rate": 1.4589e-05, "loss": 0.0614, "step": 4866 }, { "epoch": 11.814094775212636, "grad_norm": 1.0438477993011475, "learning_rate": 1.4592e-05, "loss": 0.0614, "step": 4867 }, { "epoch": 11.816524908869988, "grad_norm": 0.7773435115814209, "learning_rate": 1.4595e-05, "loss": 0.0676, "step": 4868 }, { "epoch": 11.818955042527339, "grad_norm": 0.7243484258651733, "learning_rate": 1.4598e-05, "loss": 0.0516, "step": 4869 }, { "epoch": 11.821385176184691, "grad_norm": 1.4348797798156738, "learning_rate": 1.4601000000000001e-05, "loss": 0.0771, "step": 4870 }, { "epoch": 11.823815309842042, "grad_norm": 0.6737686991691589, "learning_rate": 1.4604000000000001e-05, "loss": 
0.0455, "step": 4871 }, { "epoch": 11.826245443499392, "grad_norm": 0.9286576509475708, "learning_rate": 1.4607000000000001e-05, "loss": 0.0578, "step": 4872 }, { "epoch": 11.828675577156744, "grad_norm": 0.828507125377655, "learning_rate": 1.461e-05, "loss": 0.0578, "step": 4873 }, { "epoch": 11.831105710814095, "grad_norm": 0.9315369129180908, "learning_rate": 1.4613e-05, "loss": 0.0894, "step": 4874 }, { "epoch": 11.833535844471445, "grad_norm": 0.8294400572776794, "learning_rate": 1.4616e-05, "loss": 0.0592, "step": 4875 }, { "epoch": 11.835965978128797, "grad_norm": 0.9114127159118652, "learning_rate": 1.4619e-05, "loss": 0.0482, "step": 4876 }, { "epoch": 11.838396111786148, "grad_norm": 0.7471025586128235, "learning_rate": 1.4622e-05, "loss": 0.064, "step": 4877 }, { "epoch": 11.8408262454435, "grad_norm": 0.9148945212364197, "learning_rate": 1.4625e-05, "loss": 0.0473, "step": 4878 }, { "epoch": 11.84325637910085, "grad_norm": 1.2576478719711304, "learning_rate": 1.4628e-05, "loss": 0.112, "step": 4879 }, { "epoch": 11.845686512758201, "grad_norm": 1.4123761653900146, "learning_rate": 1.4631000000000001e-05, "loss": 0.1056, "step": 4880 }, { "epoch": 11.848116646415553, "grad_norm": 2.4501709938049316, "learning_rate": 1.4634e-05, "loss": 0.092, "step": 4881 }, { "epoch": 11.850546780072904, "grad_norm": 1.6971111297607422, "learning_rate": 1.4637e-05, "loss": 0.1793, "step": 4882 }, { "epoch": 11.852976913730256, "grad_norm": 1.0755137205123901, "learning_rate": 1.464e-05, "loss": 0.3586, "step": 4883 }, { "epoch": 11.855407047387606, "grad_norm": 1.0672173500061035, "learning_rate": 1.4643e-05, "loss": 0.3069, "step": 4884 }, { "epoch": 11.857837181044957, "grad_norm": 0.7520886659622192, "learning_rate": 1.4646000000000002e-05, "loss": 0.288, "step": 4885 }, { "epoch": 11.860267314702309, "grad_norm": 0.5625882744789124, "learning_rate": 1.4649000000000002e-05, "loss": 0.1825, "step": 4886 }, { "epoch": 11.86269744835966, "grad_norm": 0.8993886709213257, 
"learning_rate": 1.4652e-05, "loss": 0.2797, "step": 4887 }, { "epoch": 11.865127582017012, "grad_norm": 1.251559853553772, "learning_rate": 1.4655e-05, "loss": 0.138, "step": 4888 }, { "epoch": 11.867557715674362, "grad_norm": 0.9972962737083435, "learning_rate": 1.4658e-05, "loss": 0.1628, "step": 4889 }, { "epoch": 11.869987849331713, "grad_norm": 0.6532452702522278, "learning_rate": 1.4661e-05, "loss": 0.1135, "step": 4890 }, { "epoch": 11.872417982989065, "grad_norm": 0.48437726497650146, "learning_rate": 1.4664e-05, "loss": 0.0741, "step": 4891 }, { "epoch": 11.874848116646415, "grad_norm": 0.7332649827003479, "learning_rate": 1.4667e-05, "loss": 0.0808, "step": 4892 }, { "epoch": 11.877278250303767, "grad_norm": 0.5751990675926208, "learning_rate": 1.467e-05, "loss": 0.0917, "step": 4893 }, { "epoch": 11.879708383961118, "grad_norm": 0.6046324372291565, "learning_rate": 1.4673e-05, "loss": 0.0634, "step": 4894 }, { "epoch": 11.882138517618468, "grad_norm": 1.0406391620635986, "learning_rate": 1.4676000000000001e-05, "loss": 0.0914, "step": 4895 }, { "epoch": 11.88456865127582, "grad_norm": 0.4697572886943817, "learning_rate": 1.4679000000000001e-05, "loss": 0.0623, "step": 4896 }, { "epoch": 11.886998784933171, "grad_norm": 0.7197557687759399, "learning_rate": 1.4682000000000001e-05, "loss": 0.0833, "step": 4897 }, { "epoch": 11.889428918590523, "grad_norm": 0.5822066068649292, "learning_rate": 1.4685000000000001e-05, "loss": 0.0739, "step": 4898 }, { "epoch": 11.891859052247874, "grad_norm": 0.507462203502655, "learning_rate": 1.4687999999999999e-05, "loss": 0.0532, "step": 4899 }, { "epoch": 11.894289185905224, "grad_norm": 0.7836447954177856, "learning_rate": 1.4691e-05, "loss": 0.08, "step": 4900 }, { "epoch": 11.896719319562576, "grad_norm": 0.6900799870491028, "learning_rate": 1.4694e-05, "loss": 0.0869, "step": 4901 }, { "epoch": 11.899149453219927, "grad_norm": 0.5654296875, "learning_rate": 1.4697e-05, "loss": 0.0496, "step": 4902 }, { "epoch": 
11.90157958687728, "grad_norm": 0.5787701606750488, "learning_rate": 1.47e-05, "loss": 0.0708, "step": 4903 }, { "epoch": 11.90400972053463, "grad_norm": 0.5805252194404602, "learning_rate": 1.4703e-05, "loss": 0.054, "step": 4904 }, { "epoch": 11.90643985419198, "grad_norm": 1.0496431589126587, "learning_rate": 1.4706000000000001e-05, "loss": 0.0754, "step": 4905 }, { "epoch": 11.908869987849332, "grad_norm": 0.6415497064590454, "learning_rate": 1.4709000000000001e-05, "loss": 0.0945, "step": 4906 }, { "epoch": 11.911300121506683, "grad_norm": 0.555376410484314, "learning_rate": 1.4712e-05, "loss": 0.0553, "step": 4907 }, { "epoch": 11.913730255164033, "grad_norm": 0.7189003825187683, "learning_rate": 1.4715e-05, "loss": 0.0607, "step": 4908 }, { "epoch": 11.916160388821385, "grad_norm": 0.9002354145050049, "learning_rate": 1.4718e-05, "loss": 0.0622, "step": 4909 }, { "epoch": 11.918590522478736, "grad_norm": 0.6308050751686096, "learning_rate": 1.4721000000000002e-05, "loss": 0.0373, "step": 4910 }, { "epoch": 11.921020656136088, "grad_norm": 0.7034825086593628, "learning_rate": 1.4724e-05, "loss": 0.0656, "step": 4911 }, { "epoch": 11.923450789793439, "grad_norm": 0.7149795293807983, "learning_rate": 1.4727e-05, "loss": 0.049, "step": 4912 }, { "epoch": 11.925880923450789, "grad_norm": 0.4584009349346161, "learning_rate": 1.473e-05, "loss": 0.0364, "step": 4913 }, { "epoch": 11.928311057108141, "grad_norm": 0.5719059705734253, "learning_rate": 1.4733e-05, "loss": 0.0755, "step": 4914 }, { "epoch": 11.930741190765492, "grad_norm": 0.5520812273025513, "learning_rate": 1.4736000000000001e-05, "loss": 0.0429, "step": 4915 }, { "epoch": 11.933171324422844, "grad_norm": 0.5248523950576782, "learning_rate": 1.4739e-05, "loss": 0.041, "step": 4916 }, { "epoch": 11.935601458080194, "grad_norm": 0.9035938382148743, "learning_rate": 1.4742e-05, "loss": 0.098, "step": 4917 }, { "epoch": 11.938031591737545, "grad_norm": 1.0294923782348633, "learning_rate": 1.4745e-05, 
"loss": 0.0643, "step": 4918 }, { "epoch": 11.940461725394897, "grad_norm": 0.6768391728401184, "learning_rate": 1.4748e-05, "loss": 0.046, "step": 4919 }, { "epoch": 11.942891859052247, "grad_norm": 0.9189112782478333, "learning_rate": 1.4751000000000002e-05, "loss": 0.0801, "step": 4920 }, { "epoch": 11.9453219927096, "grad_norm": 0.7309419512748718, "learning_rate": 1.4754000000000001e-05, "loss": 0.0447, "step": 4921 }, { "epoch": 11.94775212636695, "grad_norm": 0.6399688124656677, "learning_rate": 1.4757000000000001e-05, "loss": 0.0441, "step": 4922 }, { "epoch": 11.9501822600243, "grad_norm": 1.0930914878845215, "learning_rate": 1.4760000000000001e-05, "loss": 0.0973, "step": 4923 }, { "epoch": 11.952612393681653, "grad_norm": 0.9509169459342957, "learning_rate": 1.4762999999999999e-05, "loss": 0.0571, "step": 4924 }, { "epoch": 11.955042527339003, "grad_norm": 0.9824607372283936, "learning_rate": 1.4766e-05, "loss": 0.0695, "step": 4925 }, { "epoch": 11.957472660996356, "grad_norm": 0.8297840356826782, "learning_rate": 1.4769e-05, "loss": 0.0661, "step": 4926 }, { "epoch": 11.959902794653706, "grad_norm": 0.7817918062210083, "learning_rate": 1.4772e-05, "loss": 0.0528, "step": 4927 }, { "epoch": 11.962332928311056, "grad_norm": 0.8363382816314697, "learning_rate": 1.4775e-05, "loss": 0.0496, "step": 4928 }, { "epoch": 11.964763061968409, "grad_norm": 1.4114468097686768, "learning_rate": 1.4778e-05, "loss": 0.101, "step": 4929 }, { "epoch": 11.96719319562576, "grad_norm": 1.2844984531402588, "learning_rate": 1.4781000000000001e-05, "loss": 0.0899, "step": 4930 }, { "epoch": 11.969623329283111, "grad_norm": 1.438651204109192, "learning_rate": 1.4784000000000001e-05, "loss": 0.0887, "step": 4931 }, { "epoch": 11.972053462940462, "grad_norm": 1.873184084892273, "learning_rate": 1.4787000000000001e-05, "loss": 0.1048, "step": 4932 }, { "epoch": 11.974483596597812, "grad_norm": 1.9577255249023438, "learning_rate": 1.479e-05, "loss": 0.3006, "step": 4933 }, { 
"epoch": 11.976913730255164, "grad_norm": 1.900061011314392, "learning_rate": 1.4793e-05, "loss": 0.1126, "step": 4934 }, { "epoch": 11.979343863912515, "grad_norm": 0.5760700702667236, "learning_rate": 1.4796000000000002e-05, "loss": 0.0734, "step": 4935 }, { "epoch": 11.981773997569867, "grad_norm": 0.8235503435134888, "learning_rate": 1.4799e-05, "loss": 0.077, "step": 4936 }, { "epoch": 11.984204131227218, "grad_norm": 0.6825807690620422, "learning_rate": 1.4802e-05, "loss": 0.0601, "step": 4937 }, { "epoch": 11.986634264884568, "grad_norm": 0.5953783988952637, "learning_rate": 1.4805e-05, "loss": 0.0455, "step": 4938 }, { "epoch": 11.98906439854192, "grad_norm": 0.8666526079177856, "learning_rate": 1.4808e-05, "loss": 0.0761, "step": 4939 }, { "epoch": 11.99149453219927, "grad_norm": 0.6139922738075256, "learning_rate": 1.4811000000000001e-05, "loss": 0.0633, "step": 4940 }, { "epoch": 11.993924665856621, "grad_norm": 1.1200125217437744, "learning_rate": 1.4814e-05, "loss": 0.0743, "step": 4941 }, { "epoch": 11.996354799513973, "grad_norm": 0.591373085975647, "learning_rate": 1.4817e-05, "loss": 0.0446, "step": 4942 }, { "epoch": 11.998784933171324, "grad_norm": 0.9927307367324829, "learning_rate": 1.482e-05, "loss": 0.0912, "step": 4943 }, { "epoch": 12.0, "grad_norm": 1.1209731101989746, "learning_rate": 1.4823e-05, "loss": 0.1046, "step": 4944 }, { "epoch": 12.00243013365735, "grad_norm": 1.543431282043457, "learning_rate": 1.4826e-05, "loss": 0.4016, "step": 4945 }, { "epoch": 12.004860267314703, "grad_norm": 1.0894474983215332, "learning_rate": 1.4829000000000002e-05, "loss": 0.3482, "step": 4946 }, { "epoch": 12.007290400972053, "grad_norm": 0.6959973573684692, "learning_rate": 1.4832000000000001e-05, "loss": 0.196, "step": 4947 }, { "epoch": 12.009720534629405, "grad_norm": 0.9300271272659302, "learning_rate": 1.4835e-05, "loss": 0.2045, "step": 4948 }, { "epoch": 12.012150668286756, "grad_norm": 0.9129675626754761, "learning_rate": 1.4838e-05, "loss": 
0.1845, "step": 4949 }, { "epoch": 12.014580801944106, "grad_norm": 0.8350293040275574, "learning_rate": 1.4840999999999999e-05, "loss": 0.1194, "step": 4950 }, { "epoch": 12.017010935601458, "grad_norm": 0.6923803687095642, "learning_rate": 1.4844e-05, "loss": 0.1107, "step": 4951 }, { "epoch": 12.019441069258809, "grad_norm": 0.7782987952232361, "learning_rate": 1.4847e-05, "loss": 0.0952, "step": 4952 }, { "epoch": 12.021871202916161, "grad_norm": 0.5575248599052429, "learning_rate": 1.485e-05, "loss": 0.0879, "step": 4953 }, { "epoch": 12.024301336573512, "grad_norm": 0.6056073904037476, "learning_rate": 1.4853e-05, "loss": 0.0674, "step": 4954 }, { "epoch": 12.026731470230862, "grad_norm": 0.47598546743392944, "learning_rate": 1.4856e-05, "loss": 0.0611, "step": 4955 }, { "epoch": 12.029161603888214, "grad_norm": 0.5655194520950317, "learning_rate": 1.4859000000000001e-05, "loss": 0.0744, "step": 4956 }, { "epoch": 12.031591737545565, "grad_norm": 0.5257465243339539, "learning_rate": 1.4862000000000001e-05, "loss": 0.0468, "step": 4957 }, { "epoch": 12.034021871202917, "grad_norm": 0.6215379238128662, "learning_rate": 1.4865e-05, "loss": 0.0682, "step": 4958 }, { "epoch": 12.036452004860267, "grad_norm": 0.6754245162010193, "learning_rate": 1.4868e-05, "loss": 0.0852, "step": 4959 }, { "epoch": 12.038882138517618, "grad_norm": 0.6341303586959839, "learning_rate": 1.4871e-05, "loss": 0.0516, "step": 4960 }, { "epoch": 12.04131227217497, "grad_norm": 0.6968199610710144, "learning_rate": 1.4874e-05, "loss": 0.0663, "step": 4961 }, { "epoch": 12.04374240583232, "grad_norm": 0.43863359093666077, "learning_rate": 1.4877e-05, "loss": 0.048, "step": 4962 }, { "epoch": 12.046172539489673, "grad_norm": 0.8409996628761292, "learning_rate": 1.488e-05, "loss": 0.0692, "step": 4963 }, { "epoch": 12.048602673147023, "grad_norm": 0.4345408082008362, "learning_rate": 1.4883e-05, "loss": 0.0312, "step": 4964 }, { "epoch": 12.051032806804374, "grad_norm": 0.5975987911224365, 
"learning_rate": 1.4886e-05, "loss": 0.0615, "step": 4965 }, { "epoch": 12.053462940461726, "grad_norm": 0.5581008791923523, "learning_rate": 1.4889000000000001e-05, "loss": 0.0333, "step": 4966 }, { "epoch": 12.055893074119076, "grad_norm": 0.6535038948059082, "learning_rate": 1.4892e-05, "loss": 0.072, "step": 4967 }, { "epoch": 12.058323207776427, "grad_norm": 0.5142465829849243, "learning_rate": 1.4895e-05, "loss": 0.0633, "step": 4968 }, { "epoch": 12.060753341433779, "grad_norm": 0.37616345286369324, "learning_rate": 1.4898e-05, "loss": 0.0361, "step": 4969 }, { "epoch": 12.06318347509113, "grad_norm": 0.6282736659049988, "learning_rate": 1.4901e-05, "loss": 0.0432, "step": 4970 }, { "epoch": 12.065613608748482, "grad_norm": 0.451423317193985, "learning_rate": 1.4904000000000002e-05, "loss": 0.0316, "step": 4971 }, { "epoch": 12.068043742405832, "grad_norm": 0.6465634107589722, "learning_rate": 1.4907000000000001e-05, "loss": 0.0364, "step": 4972 }, { "epoch": 12.070473876063183, "grad_norm": 1.1244269609451294, "learning_rate": 1.491e-05, "loss": 0.0488, "step": 4973 }, { "epoch": 12.072904009720535, "grad_norm": 0.634102463722229, "learning_rate": 1.4913e-05, "loss": 0.0372, "step": 4974 }, { "epoch": 12.075334143377885, "grad_norm": 0.8259007334709167, "learning_rate": 1.4915999999999999e-05, "loss": 0.049, "step": 4975 }, { "epoch": 12.077764277035238, "grad_norm": 0.6830951571464539, "learning_rate": 1.4919e-05, "loss": 0.0619, "step": 4976 }, { "epoch": 12.080194410692588, "grad_norm": 0.6172556281089783, "learning_rate": 1.4922e-05, "loss": 0.0404, "step": 4977 }, { "epoch": 12.082624544349938, "grad_norm": 0.7089698910713196, "learning_rate": 1.4925e-05, "loss": 0.0468, "step": 4978 }, { "epoch": 12.08505467800729, "grad_norm": 0.8037317991256714, "learning_rate": 1.4928e-05, "loss": 0.0665, "step": 4979 }, { "epoch": 12.087484811664641, "grad_norm": 0.8176368474960327, "learning_rate": 1.4931e-05, "loss": 0.061, "step": 4980 }, { "epoch": 
12.089914945321993, "grad_norm": 0.843813419342041, "learning_rate": 1.4934000000000001e-05, "loss": 0.0479, "step": 4981 }, { "epoch": 12.092345078979344, "grad_norm": 1.021105408668518, "learning_rate": 1.4937000000000001e-05, "loss": 0.0374, "step": 4982 }, { "epoch": 12.094775212636694, "grad_norm": 1.0985982418060303, "learning_rate": 1.4940000000000001e-05, "loss": 0.0547, "step": 4983 }, { "epoch": 12.097205346294047, "grad_norm": 0.6980089545249939, "learning_rate": 1.4943e-05, "loss": 0.063, "step": 4984 }, { "epoch": 12.099635479951397, "grad_norm": 0.870072066783905, "learning_rate": 1.4945999999999999e-05, "loss": 0.064, "step": 4985 }, { "epoch": 12.10206561360875, "grad_norm": 1.1819905042648315, "learning_rate": 1.4949e-05, "loss": 0.0675, "step": 4986 }, { "epoch": 12.1044957472661, "grad_norm": 0.7000578045845032, "learning_rate": 1.4952e-05, "loss": 0.0464, "step": 4987 }, { "epoch": 12.10692588092345, "grad_norm": 0.603432834148407, "learning_rate": 1.4955e-05, "loss": 0.0406, "step": 4988 }, { "epoch": 12.109356014580802, "grad_norm": 0.847925066947937, "learning_rate": 1.4958e-05, "loss": 0.0642, "step": 4989 }, { "epoch": 12.111786148238153, "grad_norm": 0.9937883019447327, "learning_rate": 1.4961e-05, "loss": 0.0526, "step": 4990 }, { "epoch": 12.114216281895505, "grad_norm": 0.8470299243927002, "learning_rate": 1.4964000000000001e-05, "loss": 0.0406, "step": 4991 }, { "epoch": 12.116646415552855, "grad_norm": 0.9417646527290344, "learning_rate": 1.4967000000000001e-05, "loss": 0.0806, "step": 4992 }, { "epoch": 12.119076549210206, "grad_norm": 1.1163133382797241, "learning_rate": 1.497e-05, "loss": 0.0676, "step": 4993 }, { "epoch": 12.121506682867558, "grad_norm": 3.233337640762329, "learning_rate": 1.4973e-05, "loss": 0.2377, "step": 4994 }, { "epoch": 12.123936816524909, "grad_norm": 1.403336524963379, "learning_rate": 1.4976e-05, "loss": 0.338, "step": 4995 }, { "epoch": 12.12636695018226, "grad_norm": 0.8136360049247742, 
"learning_rate": 1.4979000000000002e-05, "loss": 0.2817, "step": 4996 }, { "epoch": 12.128797083839611, "grad_norm": 0.6289603114128113, "learning_rate": 1.4982e-05, "loss": 0.2056, "step": 4997 }, { "epoch": 12.131227217496962, "grad_norm": 1.5405915975570679, "learning_rate": 1.4985e-05, "loss": 0.3237, "step": 4998 }, { "epoch": 12.133657351154314, "grad_norm": 0.6219667196273804, "learning_rate": 1.4988e-05, "loss": 0.1593, "step": 4999 }, { "epoch": 12.136087484811664, "grad_norm": 0.7391387820243835, "learning_rate": 1.4991e-05, "loss": 0.1424, "step": 5000 }, { "epoch": 12.136087484811664, "eval_cer": 0.09581559714079191, "eval_loss": 0.26057168841362, "eval_runtime": 8.2758, "eval_samples_per_second": 12.204, "eval_steps_per_second": 0.483, "eval_wer": 0.2940917107583774, "step": 5000 }, { "epoch": 12.138517618469017, "grad_norm": 0.7071038484573364, "learning_rate": 1.4994e-05, "loss": 0.1132, "step": 5001 }, { "epoch": 12.140947752126367, "grad_norm": 0.6464871168136597, "learning_rate": 1.4997e-05, "loss": 0.1228, "step": 5002 }, { "epoch": 12.143377885783718, "grad_norm": 0.5046664476394653, "learning_rate": 1.5e-05, "loss": 0.0739, "step": 5003 }, { "epoch": 12.14580801944107, "grad_norm": 0.6649789810180664, "learning_rate": 1.5003e-05, "loss": 0.0869, "step": 5004 }, { "epoch": 12.14823815309842, "grad_norm": 0.653095543384552, "learning_rate": 1.5006e-05, "loss": 0.0934, "step": 5005 }, { "epoch": 12.15066828675577, "grad_norm": 0.7363130450248718, "learning_rate": 1.5009e-05, "loss": 0.0696, "step": 5006 }, { "epoch": 12.153098420413123, "grad_norm": 0.6002424955368042, "learning_rate": 1.5012e-05, "loss": 0.0878, "step": 5007 }, { "epoch": 12.155528554070473, "grad_norm": 0.5172797441482544, "learning_rate": 1.5015e-05, "loss": 0.0517, "step": 5008 }, { "epoch": 12.157958687727826, "grad_norm": 0.5059643387794495, "learning_rate": 1.5018000000000001e-05, "loss": 0.0616, "step": 5009 }, { "epoch": 12.160388821385176, "grad_norm": 
0.7071683406829834, "learning_rate": 1.5021e-05, "loss": 0.0563, "step": 5010 }, { "epoch": 12.162818955042527, "grad_norm": 0.3866122364997864, "learning_rate": 1.5024e-05, "loss": 0.0457, "step": 5011 }, { "epoch": 12.165249088699879, "grad_norm": 1.0567407608032227, "learning_rate": 1.5027e-05, "loss": 0.0521, "step": 5012 }, { "epoch": 12.16767922235723, "grad_norm": 0.5918346643447876, "learning_rate": 1.503e-05, "loss": 0.0472, "step": 5013 }, { "epoch": 12.170109356014581, "grad_norm": 0.8230270743370056, "learning_rate": 1.5033e-05, "loss": 0.0603, "step": 5014 }, { "epoch": 12.172539489671932, "grad_norm": 0.6350327730178833, "learning_rate": 1.5036e-05, "loss": 0.0487, "step": 5015 }, { "epoch": 12.174969623329282, "grad_norm": 0.3954455554485321, "learning_rate": 1.5039e-05, "loss": 0.0422, "step": 5016 }, { "epoch": 12.177399756986635, "grad_norm": 0.51186203956604, "learning_rate": 1.5042e-05, "loss": 0.038, "step": 5017 }, { "epoch": 12.179829890643985, "grad_norm": 0.6016738414764404, "learning_rate": 1.5044999999999999e-05, "loss": 0.0539, "step": 5018 }, { "epoch": 12.182260024301337, "grad_norm": 0.8701807260513306, "learning_rate": 1.5048000000000002e-05, "loss": 0.0457, "step": 5019 }, { "epoch": 12.184690157958688, "grad_norm": 0.5650331974029541, "learning_rate": 1.5051000000000002e-05, "loss": 0.0435, "step": 5020 }, { "epoch": 12.187120291616038, "grad_norm": 0.582831859588623, "learning_rate": 1.5054000000000002e-05, "loss": 0.0783, "step": 5021 }, { "epoch": 12.18955042527339, "grad_norm": 0.6515710949897766, "learning_rate": 1.5057e-05, "loss": 0.0555, "step": 5022 }, { "epoch": 12.19198055893074, "grad_norm": 1.9930046796798706, "learning_rate": 1.506e-05, "loss": 0.0335, "step": 5023 }, { "epoch": 12.194410692588093, "grad_norm": 0.4510922431945801, "learning_rate": 1.5063e-05, "loss": 0.0255, "step": 5024 }, { "epoch": 12.196840826245444, "grad_norm": 0.6943355202674866, "learning_rate": 1.5066e-05, "loss": 0.0603, "step": 5025 }, { 
"epoch": 12.199270959902794, "grad_norm": 0.49061650037765503, "learning_rate": 1.5069e-05, "loss": 0.0451, "step": 5026 }, { "epoch": 12.201701093560146, "grad_norm": 0.5568206906318665, "learning_rate": 1.5071999999999999e-05, "loss": 0.0423, "step": 5027 }, { "epoch": 12.204131227217497, "grad_norm": 0.863083004951477, "learning_rate": 1.5074999999999999e-05, "loss": 0.0676, "step": 5028 }, { "epoch": 12.206561360874849, "grad_norm": 0.4450083374977112, "learning_rate": 1.5078000000000002e-05, "loss": 0.028, "step": 5029 }, { "epoch": 12.2089914945322, "grad_norm": 0.6631351709365845, "learning_rate": 1.5081000000000002e-05, "loss": 0.0614, "step": 5030 }, { "epoch": 12.21142162818955, "grad_norm": 0.9415071606636047, "learning_rate": 1.5084000000000002e-05, "loss": 0.0617, "step": 5031 }, { "epoch": 12.213851761846902, "grad_norm": 0.596556544303894, "learning_rate": 1.5087000000000001e-05, "loss": 0.0303, "step": 5032 }, { "epoch": 12.216281895504252, "grad_norm": 1.0191164016723633, "learning_rate": 1.5090000000000001e-05, "loss": 0.0441, "step": 5033 }, { "epoch": 12.218712029161605, "grad_norm": 0.6329861879348755, "learning_rate": 1.5093e-05, "loss": 0.0416, "step": 5034 }, { "epoch": 12.221142162818955, "grad_norm": 0.5205200910568237, "learning_rate": 1.5095999999999999e-05, "loss": 0.0371, "step": 5035 }, { "epoch": 12.223572296476306, "grad_norm": 0.6456330418586731, "learning_rate": 1.5098999999999999e-05, "loss": 0.0453, "step": 5036 }, { "epoch": 12.226002430133658, "grad_norm": 0.6082269549369812, "learning_rate": 1.5101999999999999e-05, "loss": 0.064, "step": 5037 }, { "epoch": 12.228432563791008, "grad_norm": 0.8189157843589783, "learning_rate": 1.5104999999999999e-05, "loss": 0.0492, "step": 5038 }, { "epoch": 12.23086269744836, "grad_norm": 1.0299922227859497, "learning_rate": 1.5108000000000002e-05, "loss": 0.0684, "step": 5039 }, { "epoch": 12.233292831105711, "grad_norm": 1.3476063013076782, "learning_rate": 1.5111000000000002e-05, "loss": 
0.0599, "step": 5040 }, { "epoch": 12.235722964763061, "grad_norm": 1.2419227361679077, "learning_rate": 1.5114000000000001e-05, "loss": 0.0697, "step": 5041 }, { "epoch": 12.238153098420414, "grad_norm": 0.9392030835151672, "learning_rate": 1.5117000000000001e-05, "loss": 0.0726, "step": 5042 }, { "epoch": 12.240583232077764, "grad_norm": 1.011650800704956, "learning_rate": 1.5120000000000001e-05, "loss": 0.0679, "step": 5043 }, { "epoch": 12.243013365735115, "grad_norm": 1.2555201053619385, "learning_rate": 1.5123e-05, "loss": 0.1049, "step": 5044 }, { "epoch": 12.245443499392467, "grad_norm": 1.6031520366668701, "learning_rate": 1.5126e-05, "loss": 0.3081, "step": 5045 }, { "epoch": 12.247873633049817, "grad_norm": 0.9019491076469421, "learning_rate": 1.5129e-05, "loss": 0.2514, "step": 5046 }, { "epoch": 12.25030376670717, "grad_norm": 0.7901356816291809, "learning_rate": 1.5131999999999998e-05, "loss": 0.2383, "step": 5047 }, { "epoch": 12.25273390036452, "grad_norm": 0.8853936195373535, "learning_rate": 1.5134999999999998e-05, "loss": 0.2048, "step": 5048 }, { "epoch": 12.25516403402187, "grad_norm": 1.1853495836257935, "learning_rate": 1.5138000000000001e-05, "loss": 0.1732, "step": 5049 }, { "epoch": 12.257594167679223, "grad_norm": 0.7311804294586182, "learning_rate": 1.5141000000000001e-05, "loss": 0.1341, "step": 5050 }, { "epoch": 12.260024301336573, "grad_norm": 0.7745294570922852, "learning_rate": 1.5144000000000001e-05, "loss": 0.1622, "step": 5051 }, { "epoch": 12.262454434993925, "grad_norm": 0.6221544146537781, "learning_rate": 1.5147e-05, "loss": 0.084, "step": 5052 }, { "epoch": 12.264884568651276, "grad_norm": 0.8479889631271362, "learning_rate": 1.515e-05, "loss": 0.0887, "step": 5053 }, { "epoch": 12.267314702308626, "grad_norm": 0.7964215278625488, "learning_rate": 1.5153e-05, "loss": 0.1008, "step": 5054 }, { "epoch": 12.269744835965978, "grad_norm": 1.2711905241012573, "learning_rate": 1.5156e-05, "loss": 0.08, "step": 5055 }, { "epoch": 
12.272174969623329, "grad_norm": 0.5538597702980042, "learning_rate": 1.5159e-05, "loss": 0.0528, "step": 5056 }, { "epoch": 12.274605103280681, "grad_norm": 0.6001152396202087, "learning_rate": 1.5162e-05, "loss": 0.0497, "step": 5057 }, { "epoch": 12.277035236938032, "grad_norm": 0.6951942443847656, "learning_rate": 1.5165e-05, "loss": 0.0575, "step": 5058 }, { "epoch": 12.279465370595382, "grad_norm": 0.7800722718238831, "learning_rate": 1.5168000000000001e-05, "loss": 0.0805, "step": 5059 }, { "epoch": 12.281895504252734, "grad_norm": 0.5623221397399902, "learning_rate": 1.5171000000000001e-05, "loss": 0.0626, "step": 5060 }, { "epoch": 12.284325637910085, "grad_norm": 0.7491820454597473, "learning_rate": 1.5174e-05, "loss": 0.0606, "step": 5061 }, { "epoch": 12.286755771567437, "grad_norm": 0.7521931529045105, "learning_rate": 1.5177e-05, "loss": 0.0955, "step": 5062 }, { "epoch": 12.289185905224787, "grad_norm": 0.5563541650772095, "learning_rate": 1.518e-05, "loss": 0.0419, "step": 5063 }, { "epoch": 12.291616038882138, "grad_norm": 0.5545581579208374, "learning_rate": 1.5183e-05, "loss": 0.052, "step": 5064 }, { "epoch": 12.29404617253949, "grad_norm": 0.4603431820869446, "learning_rate": 1.5186e-05, "loss": 0.052, "step": 5065 }, { "epoch": 12.29647630619684, "grad_norm": 0.5302017331123352, "learning_rate": 1.5189e-05, "loss": 0.0377, "step": 5066 }, { "epoch": 12.298906439854193, "grad_norm": 0.7364862561225891, "learning_rate": 1.5192e-05, "loss": 0.137, "step": 5067 }, { "epoch": 12.301336573511543, "grad_norm": 0.5346447229385376, "learning_rate": 1.5195e-05, "loss": 0.0353, "step": 5068 }, { "epoch": 12.303766707168894, "grad_norm": 0.6366817951202393, "learning_rate": 1.5198000000000003e-05, "loss": 0.0533, "step": 5069 }, { "epoch": 12.306196840826246, "grad_norm": 0.9484778046607971, "learning_rate": 1.5201000000000002e-05, "loss": 0.1306, "step": 5070 }, { "epoch": 12.308626974483596, "grad_norm": 0.5498313903808594, "learning_rate": 1.5204e-05, 
"loss": 0.0542, "step": 5071 }, { "epoch": 12.311057108140949, "grad_norm": 0.6838496327400208, "learning_rate": 1.5207e-05, "loss": 0.0422, "step": 5072 }, { "epoch": 12.313487241798299, "grad_norm": 0.7737215757369995, "learning_rate": 1.521e-05, "loss": 0.0471, "step": 5073 }, { "epoch": 12.31591737545565, "grad_norm": 0.5320304036140442, "learning_rate": 1.5213e-05, "loss": 0.0373, "step": 5074 }, { "epoch": 12.318347509113002, "grad_norm": 0.655024528503418, "learning_rate": 1.5216e-05, "loss": 0.0398, "step": 5075 }, { "epoch": 12.320777642770352, "grad_norm": 0.7749716639518738, "learning_rate": 1.5219e-05, "loss": 0.0586, "step": 5076 }, { "epoch": 12.323207776427704, "grad_norm": 0.5921481847763062, "learning_rate": 1.5222e-05, "loss": 0.0685, "step": 5077 }, { "epoch": 12.325637910085055, "grad_norm": 0.859259307384491, "learning_rate": 1.5224999999999999e-05, "loss": 0.0812, "step": 5078 }, { "epoch": 12.328068043742405, "grad_norm": 0.6936589479446411, "learning_rate": 1.5228000000000002e-05, "loss": 0.0469, "step": 5079 }, { "epoch": 12.330498177399758, "grad_norm": 0.5815767645835876, "learning_rate": 1.5231000000000002e-05, "loss": 0.0508, "step": 5080 }, { "epoch": 12.332928311057108, "grad_norm": 0.5281894207000732, "learning_rate": 1.5234000000000002e-05, "loss": 0.0428, "step": 5081 }, { "epoch": 12.335358444714458, "grad_norm": 0.4972163438796997, "learning_rate": 1.5237000000000002e-05, "loss": 0.0464, "step": 5082 }, { "epoch": 12.33778857837181, "grad_norm": 0.6086931824684143, "learning_rate": 1.524e-05, "loss": 0.0456, "step": 5083 }, { "epoch": 12.340218712029161, "grad_norm": 0.6809419989585876, "learning_rate": 1.5243e-05, "loss": 0.0662, "step": 5084 }, { "epoch": 12.342648845686513, "grad_norm": 0.8113179206848145, "learning_rate": 1.5246e-05, "loss": 0.0465, "step": 5085 }, { "epoch": 12.345078979343864, "grad_norm": 0.9149553179740906, "learning_rate": 1.5249e-05, "loss": 0.0626, "step": 5086 }, { "epoch": 12.347509113001214, 
"grad_norm": 0.6903405785560608, "learning_rate": 1.5251999999999999e-05, "loss": 0.0627, "step": 5087 }, { "epoch": 12.349939246658566, "grad_norm": 1.3331170082092285, "learning_rate": 1.5254999999999999e-05, "loss": 0.0636, "step": 5088 }, { "epoch": 12.352369380315917, "grad_norm": 1.1663914918899536, "learning_rate": 1.5258000000000002e-05, "loss": 0.0857, "step": 5089 }, { "epoch": 12.35479951397327, "grad_norm": 1.2321454286575317, "learning_rate": 1.5261000000000002e-05, "loss": 0.0791, "step": 5090 }, { "epoch": 12.35722964763062, "grad_norm": 1.4715900421142578, "learning_rate": 1.5264e-05, "loss": 0.0534, "step": 5091 }, { "epoch": 12.35965978128797, "grad_norm": 0.7588229775428772, "learning_rate": 1.5267e-05, "loss": 0.0542, "step": 5092 }, { "epoch": 12.362089914945322, "grad_norm": 1.3329766988754272, "learning_rate": 1.527e-05, "loss": 0.0826, "step": 5093 }, { "epoch": 12.364520048602673, "grad_norm": 1.8678123950958252, "learning_rate": 1.5273e-05, "loss": 0.11, "step": 5094 }, { "epoch": 12.366950182260025, "grad_norm": 3.0776174068450928, "learning_rate": 1.5276e-05, "loss": 0.3936, "step": 5095 }, { "epoch": 12.369380315917375, "grad_norm": 1.2563629150390625, "learning_rate": 1.5279e-05, "loss": 0.2662, "step": 5096 }, { "epoch": 12.371810449574726, "grad_norm": 0.716672420501709, "learning_rate": 1.5282e-05, "loss": 0.1937, "step": 5097 }, { "epoch": 12.374240583232078, "grad_norm": 1.1625020503997803, "learning_rate": 1.5285e-05, "loss": 0.1723, "step": 5098 }, { "epoch": 12.376670716889429, "grad_norm": 1.235754370689392, "learning_rate": 1.5288000000000003e-05, "loss": 0.1596, "step": 5099 }, { "epoch": 12.37910085054678, "grad_norm": 1.35556161403656, "learning_rate": 1.5291000000000003e-05, "loss": 0.1867, "step": 5100 }, { "epoch": 12.381530984204131, "grad_norm": 0.8614840507507324, "learning_rate": 1.5294000000000003e-05, "loss": 0.1407, "step": 5101 }, { "epoch": 12.383961117861482, "grad_norm": 0.7162484526634216, "learning_rate": 
1.5297e-05, "loss": 0.0809, "step": 5102 }, { "epoch": 12.386391251518834, "grad_norm": 0.7060248255729675, "learning_rate": 1.53e-05, "loss": 0.0826, "step": 5103 }, { "epoch": 12.388821385176184, "grad_norm": 0.6158637404441833, "learning_rate": 1.5303e-05, "loss": 0.0738, "step": 5104 }, { "epoch": 12.391251518833537, "grad_norm": 0.6356690526008606, "learning_rate": 1.5306e-05, "loss": 0.0738, "step": 5105 }, { "epoch": 12.393681652490887, "grad_norm": 0.8118489980697632, "learning_rate": 1.5309e-05, "loss": 0.0806, "step": 5106 }, { "epoch": 12.396111786148237, "grad_norm": 0.6323510408401489, "learning_rate": 1.5312e-05, "loss": 0.0819, "step": 5107 }, { "epoch": 12.39854191980559, "grad_norm": 0.619004487991333, "learning_rate": 1.5314999999999998e-05, "loss": 0.0589, "step": 5108 }, { "epoch": 12.40097205346294, "grad_norm": 0.7263326644897461, "learning_rate": 1.5318e-05, "loss": 0.0801, "step": 5109 }, { "epoch": 12.403402187120292, "grad_norm": 0.6711441874504089, "learning_rate": 1.5321e-05, "loss": 0.0477, "step": 5110 }, { "epoch": 12.405832320777643, "grad_norm": 0.45552051067352295, "learning_rate": 1.5324e-05, "loss": 0.0652, "step": 5111 }, { "epoch": 12.408262454434993, "grad_norm": 0.6326191425323486, "learning_rate": 1.5327e-05, "loss": 0.0616, "step": 5112 }, { "epoch": 12.410692588092346, "grad_norm": 0.5338039994239807, "learning_rate": 1.533e-05, "loss": 0.0526, "step": 5113 }, { "epoch": 12.413122721749696, "grad_norm": 0.3963862955570221, "learning_rate": 1.5333e-05, "loss": 0.034, "step": 5114 }, { "epoch": 12.415552855407048, "grad_norm": 0.6119261384010315, "learning_rate": 1.5336e-05, "loss": 0.0499, "step": 5115 }, { "epoch": 12.417982989064399, "grad_norm": 0.5263854265213013, "learning_rate": 1.5339e-05, "loss": 0.0571, "step": 5116 }, { "epoch": 12.42041312272175, "grad_norm": 0.8560634851455688, "learning_rate": 1.5342e-05, "loss": 0.0621, "step": 5117 }, { "epoch": 12.422843256379101, "grad_norm": 0.4148641526699066, 
"learning_rate": 1.5345e-05, "loss": 0.0455, "step": 5118 }, { "epoch": 12.425273390036452, "grad_norm": 0.922590434551239, "learning_rate": 1.5348000000000003e-05, "loss": 0.0874, "step": 5119 }, { "epoch": 12.427703523693804, "grad_norm": 0.6691349148750305, "learning_rate": 1.5351000000000003e-05, "loss": 0.0563, "step": 5120 }, { "epoch": 12.430133657351154, "grad_norm": 0.4702610373497009, "learning_rate": 1.5354000000000002e-05, "loss": 0.0407, "step": 5121 }, { "epoch": 12.432563791008505, "grad_norm": 0.48475784063339233, "learning_rate": 1.5357000000000002e-05, "loss": 0.0597, "step": 5122 }, { "epoch": 12.434993924665857, "grad_norm": 1.061401128768921, "learning_rate": 1.5360000000000002e-05, "loss": 0.063, "step": 5123 }, { "epoch": 12.437424058323208, "grad_norm": 0.6483595371246338, "learning_rate": 1.5363000000000002e-05, "loss": 0.0581, "step": 5124 }, { "epoch": 12.439854191980558, "grad_norm": 0.7068065404891968, "learning_rate": 1.5366e-05, "loss": 0.0522, "step": 5125 }, { "epoch": 12.44228432563791, "grad_norm": 0.5942395329475403, "learning_rate": 1.5368999999999998e-05, "loss": 0.0468, "step": 5126 }, { "epoch": 12.44471445929526, "grad_norm": 0.5835719108581543, "learning_rate": 1.5371999999999998e-05, "loss": 0.0483, "step": 5127 }, { "epoch": 12.447144592952613, "grad_norm": 0.795875608921051, "learning_rate": 1.5374999999999998e-05, "loss": 0.0501, "step": 5128 }, { "epoch": 12.449574726609963, "grad_norm": 0.5087441802024841, "learning_rate": 1.5377999999999997e-05, "loss": 0.0338, "step": 5129 }, { "epoch": 12.452004860267314, "grad_norm": 0.6921018958091736, "learning_rate": 1.5381e-05, "loss": 0.0619, "step": 5130 }, { "epoch": 12.454434993924666, "grad_norm": 0.5982897281646729, "learning_rate": 1.5384e-05, "loss": 0.0426, "step": 5131 }, { "epoch": 12.456865127582017, "grad_norm": 0.9832257032394409, "learning_rate": 1.5387e-05, "loss": 0.0679, "step": 5132 }, { "epoch": 12.459295261239369, "grad_norm": 1.1098870038986206, 
"learning_rate": 1.539e-05, "loss": 0.0846, "step": 5133 }, { "epoch": 12.46172539489672, "grad_norm": 0.5763831734657288, "learning_rate": 1.5393e-05, "loss": 0.0447, "step": 5134 }, { "epoch": 12.46415552855407, "grad_norm": 2.2285397052764893, "learning_rate": 1.5396e-05, "loss": 0.0663, "step": 5135 }, { "epoch": 12.466585662211422, "grad_norm": 1.1163444519042969, "learning_rate": 1.5399e-05, "loss": 0.0544, "step": 5136 }, { "epoch": 12.469015795868772, "grad_norm": 0.6644837856292725, "learning_rate": 1.5402e-05, "loss": 0.0594, "step": 5137 }, { "epoch": 12.471445929526125, "grad_norm": 1.4328092336654663, "learning_rate": 1.5405e-05, "loss": 0.1098, "step": 5138 }, { "epoch": 12.473876063183475, "grad_norm": 0.764091968536377, "learning_rate": 1.5408e-05, "loss": 0.0502, "step": 5139 }, { "epoch": 12.476306196840826, "grad_norm": 1.0011266469955444, "learning_rate": 1.5411000000000002e-05, "loss": 0.0609, "step": 5140 }, { "epoch": 12.478736330498178, "grad_norm": 1.0133390426635742, "learning_rate": 1.5414000000000002e-05, "loss": 0.0844, "step": 5141 }, { "epoch": 12.481166464155528, "grad_norm": 0.7739400863647461, "learning_rate": 1.5417e-05, "loss": 0.0747, "step": 5142 }, { "epoch": 12.48359659781288, "grad_norm": 2.08569073677063, "learning_rate": 1.542e-05, "loss": 0.087, "step": 5143 }, { "epoch": 12.486026731470231, "grad_norm": 1.556098461151123, "learning_rate": 1.5423e-05, "loss": 0.0745, "step": 5144 }, { "epoch": 12.488456865127581, "grad_norm": 2.6545920372009277, "learning_rate": 1.5426e-05, "loss": 0.3878, "step": 5145 }, { "epoch": 12.490886998784934, "grad_norm": 1.031832218170166, "learning_rate": 1.5429e-05, "loss": 0.2606, "step": 5146 }, { "epoch": 12.493317132442284, "grad_norm": 0.9534667730331421, "learning_rate": 1.5432e-05, "loss": 0.251, "step": 5147 }, { "epoch": 12.495747266099636, "grad_norm": 0.9113352298736572, "learning_rate": 1.5435e-05, "loss": 0.2039, "step": 5148 }, { "epoch": 12.498177399756987, "grad_norm": 
0.6244789958000183, "learning_rate": 1.5438e-05, "loss": 0.1542, "step": 5149 }, { "epoch": 12.500607533414337, "grad_norm": 0.6452707648277283, "learning_rate": 1.5441000000000003e-05, "loss": 0.1342, "step": 5150 }, { "epoch": 12.50303766707169, "grad_norm": 0.5520910024642944, "learning_rate": 1.5444e-05, "loss": 0.1243, "step": 5151 }, { "epoch": 12.50546780072904, "grad_norm": 0.7119898200035095, "learning_rate": 1.5447e-05, "loss": 0.1249, "step": 5152 }, { "epoch": 12.507897934386392, "grad_norm": 0.5267981886863708, "learning_rate": 1.545e-05, "loss": 0.0716, "step": 5153 }, { "epoch": 12.510328068043743, "grad_norm": 0.7308489084243774, "learning_rate": 1.5453e-05, "loss": 0.0562, "step": 5154 }, { "epoch": 12.512758201701093, "grad_norm": 0.4779048264026642, "learning_rate": 1.5456e-05, "loss": 0.0573, "step": 5155 }, { "epoch": 12.515188335358445, "grad_norm": 1.3737878799438477, "learning_rate": 1.5459e-05, "loss": 0.0813, "step": 5156 }, { "epoch": 12.517618469015796, "grad_norm": 0.6300175786018372, "learning_rate": 1.5462e-05, "loss": 0.0564, "step": 5157 }, { "epoch": 12.520048602673146, "grad_norm": 0.5980567336082458, "learning_rate": 1.5465e-05, "loss": 0.0513, "step": 5158 }, { "epoch": 12.522478736330498, "grad_norm": 0.7824360728263855, "learning_rate": 1.5467999999999998e-05, "loss": 0.0515, "step": 5159 }, { "epoch": 12.524908869987849, "grad_norm": 0.6528307795524597, "learning_rate": 1.5471e-05, "loss": 0.0467, "step": 5160 }, { "epoch": 12.527339003645201, "grad_norm": 0.6055038571357727, "learning_rate": 1.5474e-05, "loss": 0.0664, "step": 5161 }, { "epoch": 12.529769137302551, "grad_norm": 0.9527074098587036, "learning_rate": 1.5477e-05, "loss": 0.0759, "step": 5162 }, { "epoch": 12.532199270959904, "grad_norm": 0.8491119742393494, "learning_rate": 1.548e-05, "loss": 0.0453, "step": 5163 }, { "epoch": 12.534629404617254, "grad_norm": 0.7462834715843201, "learning_rate": 1.5483e-05, "loss": 0.0457, "step": 5164 }, { "epoch": 
12.537059538274605, "grad_norm": 0.690310537815094, "learning_rate": 1.5486e-05, "loss": 0.0737, "step": 5165 }, { "epoch": 12.539489671931957, "grad_norm": 0.610205352306366, "learning_rate": 1.5489e-05, "loss": 0.0488, "step": 5166 }, { "epoch": 12.541919805589307, "grad_norm": 0.6438760161399841, "learning_rate": 1.5492e-05, "loss": 0.0611, "step": 5167 }, { "epoch": 12.544349939246658, "grad_norm": 0.48339927196502686, "learning_rate": 1.5495e-05, "loss": 0.0448, "step": 5168 }, { "epoch": 12.54678007290401, "grad_norm": 0.5821022391319275, "learning_rate": 1.5498e-05, "loss": 0.0399, "step": 5169 }, { "epoch": 12.54921020656136, "grad_norm": 0.7230927348136902, "learning_rate": 1.5501000000000003e-05, "loss": 0.0547, "step": 5170 }, { "epoch": 12.551640340218713, "grad_norm": 0.632839024066925, "learning_rate": 1.5504000000000003e-05, "loss": 0.0536, "step": 5171 }, { "epoch": 12.554070473876063, "grad_norm": 0.5739094018936157, "learning_rate": 1.5507000000000002e-05, "loss": 0.0535, "step": 5172 }, { "epoch": 12.556500607533414, "grad_norm": 0.46720635890960693, "learning_rate": 1.5510000000000002e-05, "loss": 0.0436, "step": 5173 }, { "epoch": 12.558930741190766, "grad_norm": 0.6125528812408447, "learning_rate": 1.5513000000000002e-05, "loss": 0.0385, "step": 5174 }, { "epoch": 12.561360874848116, "grad_norm": 0.6404817700386047, "learning_rate": 1.5516000000000002e-05, "loss": 0.0459, "step": 5175 }, { "epoch": 12.563791008505468, "grad_norm": 0.6638503670692444, "learning_rate": 1.5518999999999998e-05, "loss": 0.0351, "step": 5176 }, { "epoch": 12.566221142162819, "grad_norm": 0.8065598011016846, "learning_rate": 1.5521999999999998e-05, "loss": 0.0661, "step": 5177 }, { "epoch": 12.56865127582017, "grad_norm": 0.7772315144538879, "learning_rate": 1.5524999999999998e-05, "loss": 0.0577, "step": 5178 }, { "epoch": 12.571081409477522, "grad_norm": 0.8560531735420227, "learning_rate": 1.5527999999999998e-05, "loss": 0.0977, "step": 5179 }, { "epoch": 
12.573511543134872, "grad_norm": 1.6488913297653198, "learning_rate": 1.5531e-05, "loss": 0.1039, "step": 5180 }, { "epoch": 12.575941676792224, "grad_norm": 0.5496894121170044, "learning_rate": 1.5534e-05, "loss": 0.0495, "step": 5181 }, { "epoch": 12.578371810449575, "grad_norm": 0.8854308128356934, "learning_rate": 1.5537e-05, "loss": 0.0698, "step": 5182 }, { "epoch": 12.580801944106925, "grad_norm": 1.2328193187713623, "learning_rate": 1.554e-05, "loss": 0.0632, "step": 5183 }, { "epoch": 12.583232077764277, "grad_norm": 0.8222094178199768, "learning_rate": 1.5543e-05, "loss": 0.0612, "step": 5184 }, { "epoch": 12.585662211421628, "grad_norm": 0.6149528622627258, "learning_rate": 1.5546e-05, "loss": 0.0477, "step": 5185 }, { "epoch": 12.58809234507898, "grad_norm": 0.6092702746391296, "learning_rate": 1.5549e-05, "loss": 0.0472, "step": 5186 }, { "epoch": 12.59052247873633, "grad_norm": 0.5800856947898865, "learning_rate": 1.5552e-05, "loss": 0.0405, "step": 5187 }, { "epoch": 12.592952612393681, "grad_norm": 5.022829055786133, "learning_rate": 1.5555e-05, "loss": 0.0671, "step": 5188 }, { "epoch": 12.595382746051033, "grad_norm": 0.7989799976348877, "learning_rate": 1.5558e-05, "loss": 0.0488, "step": 5189 }, { "epoch": 12.597812879708384, "grad_norm": 1.0740928649902344, "learning_rate": 1.5561000000000002e-05, "loss": 0.0716, "step": 5190 }, { "epoch": 12.600243013365736, "grad_norm": 1.4213274717330933, "learning_rate": 1.5564000000000002e-05, "loss": 0.1008, "step": 5191 }, { "epoch": 12.602673147023086, "grad_norm": 1.2940791845321655, "learning_rate": 1.5567000000000002e-05, "loss": 0.0791, "step": 5192 }, { "epoch": 12.605103280680437, "grad_norm": 1.5863357782363892, "learning_rate": 1.5570000000000002e-05, "loss": 0.1534, "step": 5193 }, { "epoch": 12.607533414337789, "grad_norm": 1.3867956399917603, "learning_rate": 1.5573e-05, "loss": 0.1204, "step": 5194 }, { "epoch": 12.60996354799514, "grad_norm": 1.9861044883728027, "learning_rate": 1.5576e-05, 
"loss": 0.4038, "step": 5195 }, { "epoch": 12.612393681652492, "grad_norm": 1.315995454788208, "learning_rate": 1.5579e-05, "loss": 0.236, "step": 5196 }, { "epoch": 12.614823815309842, "grad_norm": 0.8110657930374146, "learning_rate": 1.5582e-05, "loss": 0.2237, "step": 5197 }, { "epoch": 12.617253948967193, "grad_norm": 0.6846145391464233, "learning_rate": 1.5585e-05, "loss": 0.1767, "step": 5198 }, { "epoch": 12.619684082624545, "grad_norm": 1.0695173740386963, "learning_rate": 1.5588e-05, "loss": 0.184, "step": 5199 }, { "epoch": 12.622114216281895, "grad_norm": 0.8663557171821594, "learning_rate": 1.5591e-05, "loss": 0.1645, "step": 5200 }, { "epoch": 12.624544349939246, "grad_norm": 0.6355769634246826, "learning_rate": 1.5594e-05, "loss": 0.1289, "step": 5201 }, { "epoch": 12.626974483596598, "grad_norm": 0.6699353456497192, "learning_rate": 1.5597e-05, "loss": 0.1158, "step": 5202 }, { "epoch": 12.629404617253948, "grad_norm": 0.6649885177612305, "learning_rate": 1.56e-05, "loss": 0.1037, "step": 5203 }, { "epoch": 12.6318347509113, "grad_norm": 0.5615257024765015, "learning_rate": 1.5603e-05, "loss": 0.0937, "step": 5204 }, { "epoch": 12.634264884568651, "grad_norm": 0.572065532207489, "learning_rate": 1.5606e-05, "loss": 0.0607, "step": 5205 }, { "epoch": 12.636695018226002, "grad_norm": 0.670598566532135, "learning_rate": 1.5609e-05, "loss": 0.0664, "step": 5206 }, { "epoch": 12.639125151883354, "grad_norm": 0.6572765111923218, "learning_rate": 1.5612e-05, "loss": 0.0548, "step": 5207 }, { "epoch": 12.641555285540704, "grad_norm": 0.4599183201789856, "learning_rate": 1.5615e-05, "loss": 0.0527, "step": 5208 }, { "epoch": 12.643985419198057, "grad_norm": 0.5170769095420837, "learning_rate": 1.5618e-05, "loss": 0.0712, "step": 5209 }, { "epoch": 12.646415552855407, "grad_norm": 0.36207127571105957, "learning_rate": 1.5621000000000002e-05, "loss": 0.033, "step": 5210 }, { "epoch": 12.648845686512757, "grad_norm": 0.5799140334129333, "learning_rate": 
1.5624e-05, "loss": 0.0529, "step": 5211 }, { "epoch": 12.65127582017011, "grad_norm": 0.7703135013580322, "learning_rate": 1.5627e-05, "loss": 0.0558, "step": 5212 }, { "epoch": 12.65370595382746, "grad_norm": 0.5704256296157837, "learning_rate": 1.563e-05, "loss": 0.0474, "step": 5213 }, { "epoch": 12.656136087484812, "grad_norm": 0.4671339690685272, "learning_rate": 1.5633e-05, "loss": 0.0472, "step": 5214 }, { "epoch": 12.658566221142163, "grad_norm": 0.5054295063018799, "learning_rate": 1.5636e-05, "loss": 0.0533, "step": 5215 }, { "epoch": 12.660996354799513, "grad_norm": 0.5985282063484192, "learning_rate": 1.5639e-05, "loss": 0.0482, "step": 5216 }, { "epoch": 12.663426488456865, "grad_norm": 0.7214949131011963, "learning_rate": 1.5642e-05, "loss": 0.0476, "step": 5217 }, { "epoch": 12.665856622114216, "grad_norm": 0.40073081851005554, "learning_rate": 1.5645e-05, "loss": 0.0381, "step": 5218 }, { "epoch": 12.668286755771568, "grad_norm": 0.6465504765510559, "learning_rate": 1.5648e-05, "loss": 0.0405, "step": 5219 }, { "epoch": 12.670716889428919, "grad_norm": 0.8125755786895752, "learning_rate": 1.5651000000000003e-05, "loss": 0.0506, "step": 5220 }, { "epoch": 12.673147023086269, "grad_norm": 0.5801917910575867, "learning_rate": 1.5654000000000003e-05, "loss": 0.0466, "step": 5221 }, { "epoch": 12.675577156743621, "grad_norm": 0.6369403600692749, "learning_rate": 1.5657000000000003e-05, "loss": 0.0372, "step": 5222 }, { "epoch": 12.678007290400972, "grad_norm": 0.6793203949928284, "learning_rate": 1.5660000000000003e-05, "loss": 0.045, "step": 5223 }, { "epoch": 12.680437424058324, "grad_norm": 0.6404121518135071, "learning_rate": 1.5663000000000002e-05, "loss": 0.0385, "step": 5224 }, { "epoch": 12.682867557715674, "grad_norm": 0.618468701839447, "learning_rate": 1.5666e-05, "loss": 0.0433, "step": 5225 }, { "epoch": 12.685297691373025, "grad_norm": 0.8415457010269165, "learning_rate": 1.5669e-05, "loss": 0.0531, "step": 5226 }, { "epoch": 
12.687727825030377, "grad_norm": 0.6971750259399414, "learning_rate": 1.5672e-05, "loss": 0.0461, "step": 5227 }, { "epoch": 12.690157958687728, "grad_norm": 0.5146746039390564, "learning_rate": 1.5674999999999998e-05, "loss": 0.0411, "step": 5228 }, { "epoch": 12.69258809234508, "grad_norm": 0.9035937190055847, "learning_rate": 1.5677999999999998e-05, "loss": 0.049, "step": 5229 }, { "epoch": 12.69501822600243, "grad_norm": 0.6232673525810242, "learning_rate": 1.5681e-05, "loss": 0.0572, "step": 5230 }, { "epoch": 12.69744835965978, "grad_norm": 0.8900043368339539, "learning_rate": 1.5684e-05, "loss": 0.0394, "step": 5231 }, { "epoch": 12.699878493317133, "grad_norm": 1.104740858078003, "learning_rate": 1.5687e-05, "loss": 0.1195, "step": 5232 }, { "epoch": 12.702308626974483, "grad_norm": 0.997053861618042, "learning_rate": 1.569e-05, "loss": 0.0532, "step": 5233 }, { "epoch": 12.704738760631834, "grad_norm": 0.9141441583633423, "learning_rate": 1.5693e-05, "loss": 0.0636, "step": 5234 }, { "epoch": 12.707168894289186, "grad_norm": 0.6628722548484802, "learning_rate": 1.5696e-05, "loss": 0.0452, "step": 5235 }, { "epoch": 12.709599027946537, "grad_norm": 0.9610047340393066, "learning_rate": 1.5699e-05, "loss": 0.056, "step": 5236 }, { "epoch": 12.712029161603889, "grad_norm": 0.7786219716072083, "learning_rate": 1.5702e-05, "loss": 0.0647, "step": 5237 }, { "epoch": 12.71445929526124, "grad_norm": 0.8685443997383118, "learning_rate": 1.5705e-05, "loss": 0.0804, "step": 5238 }, { "epoch": 12.716889428918591, "grad_norm": 1.2266117334365845, "learning_rate": 1.5708e-05, "loss": 0.0779, "step": 5239 }, { "epoch": 12.719319562575942, "grad_norm": 1.2535102367401123, "learning_rate": 1.5711000000000003e-05, "loss": 0.0459, "step": 5240 }, { "epoch": 12.721749696233292, "grad_norm": 0.6488484144210815, "learning_rate": 1.5714000000000002e-05, "loss": 0.0686, "step": 5241 }, { "epoch": 12.724179829890645, "grad_norm": 1.1075632572174072, "learning_rate": 
1.5717000000000002e-05, "loss": 0.1018, "step": 5242 }, { "epoch": 12.726609963547995, "grad_norm": 1.0223819017410278, "learning_rate": 1.5720000000000002e-05, "loss": 0.0738, "step": 5243 }, { "epoch": 12.729040097205345, "grad_norm": 1.6172704696655273, "learning_rate": 1.5723000000000002e-05, "loss": 0.1341, "step": 5244 }, { "epoch": 12.731470230862698, "grad_norm": 1.9688892364501953, "learning_rate": 1.5726e-05, "loss": 0.3712, "step": 5245 }, { "epoch": 12.733900364520048, "grad_norm": 0.9226818680763245, "learning_rate": 1.5729e-05, "loss": 0.2862, "step": 5246 }, { "epoch": 12.7363304981774, "grad_norm": 0.8704564571380615, "learning_rate": 1.5732e-05, "loss": 0.2373, "step": 5247 }, { "epoch": 12.73876063183475, "grad_norm": 0.9547804594039917, "learning_rate": 1.5735e-05, "loss": 0.2066, "step": 5248 }, { "epoch": 12.741190765492101, "grad_norm": 0.6900214552879333, "learning_rate": 1.5737999999999997e-05, "loss": 0.1699, "step": 5249 }, { "epoch": 12.743620899149454, "grad_norm": 1.091575026512146, "learning_rate": 1.5741e-05, "loss": 0.177, "step": 5250 }, { "epoch": 12.746051032806804, "grad_norm": 0.9055350422859192, "learning_rate": 1.5744e-05, "loss": 0.1045, "step": 5251 }, { "epoch": 12.748481166464156, "grad_norm": 0.9167103171348572, "learning_rate": 1.5747e-05, "loss": 0.1244, "step": 5252 }, { "epoch": 12.750911300121507, "grad_norm": 0.9386987686157227, "learning_rate": 1.575e-05, "loss": 0.1466, "step": 5253 }, { "epoch": 12.753341433778857, "grad_norm": 0.5084399580955505, "learning_rate": 1.5753e-05, "loss": 0.0781, "step": 5254 }, { "epoch": 12.75577156743621, "grad_norm": 0.6492592692375183, "learning_rate": 1.5756e-05, "loss": 0.0628, "step": 5255 }, { "epoch": 12.75820170109356, "grad_norm": 0.6768681406974792, "learning_rate": 1.5759e-05, "loss": 0.0723, "step": 5256 }, { "epoch": 12.760631834750912, "grad_norm": 0.8740474581718445, "learning_rate": 1.5762e-05, "loss": 0.0577, "step": 5257 }, { "epoch": 12.763061968408262, 
"grad_norm": 0.9615817070007324, "learning_rate": 1.5765e-05, "loss": 0.088, "step": 5258 }, { "epoch": 12.765492102065613, "grad_norm": 0.7349742650985718, "learning_rate": 1.5768e-05, "loss": 0.0654, "step": 5259 }, { "epoch": 12.767922235722965, "grad_norm": 0.45257124304771423, "learning_rate": 1.5771e-05, "loss": 0.0562, "step": 5260 }, { "epoch": 12.770352369380316, "grad_norm": 0.5325304865837097, "learning_rate": 1.5774000000000002e-05, "loss": 0.0477, "step": 5261 }, { "epoch": 12.772782503037668, "grad_norm": 0.886762261390686, "learning_rate": 1.5777e-05, "loss": 0.0808, "step": 5262 }, { "epoch": 12.775212636695018, "grad_norm": 0.5115048289299011, "learning_rate": 1.578e-05, "loss": 0.0341, "step": 5263 }, { "epoch": 12.777642770352369, "grad_norm": 0.9348110556602478, "learning_rate": 1.5783e-05, "loss": 0.0635, "step": 5264 }, { "epoch": 12.780072904009721, "grad_norm": 0.7350127100944519, "learning_rate": 1.5786e-05, "loss": 0.0503, "step": 5265 }, { "epoch": 12.782503037667071, "grad_norm": 0.44686374068260193, "learning_rate": 1.5789e-05, "loss": 0.0442, "step": 5266 }, { "epoch": 12.784933171324424, "grad_norm": 0.5813581943511963, "learning_rate": 1.5792e-05, "loss": 0.0451, "step": 5267 }, { "epoch": 12.787363304981774, "grad_norm": 0.4765399694442749, "learning_rate": 1.5795e-05, "loss": 0.0464, "step": 5268 }, { "epoch": 12.789793438639125, "grad_norm": 0.5655251145362854, "learning_rate": 1.5798e-05, "loss": 0.0487, "step": 5269 }, { "epoch": 12.792223572296477, "grad_norm": 0.6517568230628967, "learning_rate": 1.5801e-05, "loss": 0.0435, "step": 5270 }, { "epoch": 12.794653705953827, "grad_norm": 0.8548974394798279, "learning_rate": 1.5804000000000003e-05, "loss": 0.1199, "step": 5271 }, { "epoch": 12.79708383961118, "grad_norm": 0.5480860471725464, "learning_rate": 1.5807000000000003e-05, "loss": 0.0712, "step": 5272 }, { "epoch": 12.79951397326853, "grad_norm": 0.9876378774642944, "learning_rate": 1.5810000000000003e-05, "loss": 0.0536, 
"step": 5273 }, { "epoch": 12.80194410692588, "grad_norm": 0.7293201684951782, "learning_rate": 1.5813e-05, "loss": 0.0736, "step": 5274 }, { "epoch": 12.804374240583233, "grad_norm": 0.6550408601760864, "learning_rate": 1.5816e-05, "loss": 0.0554, "step": 5275 }, { "epoch": 12.806804374240583, "grad_norm": 0.770728349685669, "learning_rate": 1.5819e-05, "loss": 0.0596, "step": 5276 }, { "epoch": 12.809234507897933, "grad_norm": 0.594364583492279, "learning_rate": 1.5822e-05, "loss": 0.0553, "step": 5277 }, { "epoch": 12.811664641555286, "grad_norm": 0.6740139722824097, "learning_rate": 1.5825e-05, "loss": 0.0508, "step": 5278 }, { "epoch": 12.814094775212636, "grad_norm": 1.1612862348556519, "learning_rate": 1.5827999999999998e-05, "loss": 0.0725, "step": 5279 }, { "epoch": 12.816524908869988, "grad_norm": 0.5975507497787476, "learning_rate": 1.5830999999999998e-05, "loss": 0.0482, "step": 5280 }, { "epoch": 12.818955042527339, "grad_norm": 0.7489326596260071, "learning_rate": 1.5834e-05, "loss": 0.0657, "step": 5281 }, { "epoch": 12.821385176184691, "grad_norm": 0.8287584185600281, "learning_rate": 1.5837e-05, "loss": 0.0499, "step": 5282 }, { "epoch": 12.823815309842042, "grad_norm": 0.9469150304794312, "learning_rate": 1.584e-05, "loss": 0.0801, "step": 5283 }, { "epoch": 12.826245443499392, "grad_norm": 0.5155798196792603, "learning_rate": 1.5843e-05, "loss": 0.0406, "step": 5284 }, { "epoch": 12.828675577156744, "grad_norm": 0.7054024934768677, "learning_rate": 1.5846e-05, "loss": 0.0538, "step": 5285 }, { "epoch": 12.831105710814095, "grad_norm": 0.8598802089691162, "learning_rate": 1.5849e-05, "loss": 0.0532, "step": 5286 }, { "epoch": 12.833535844471445, "grad_norm": 0.5829834342002869, "learning_rate": 1.5852e-05, "loss": 0.0498, "step": 5287 }, { "epoch": 12.835965978128797, "grad_norm": 0.9818636775016785, "learning_rate": 1.5855e-05, "loss": 0.0713, "step": 5288 }, { "epoch": 12.838396111786148, "grad_norm": 0.6918102502822876, "learning_rate": 
1.5858e-05, "loss": 0.0529, "step": 5289 }, { "epoch": 12.8408262454435, "grad_norm": 0.7898814082145691, "learning_rate": 1.5861e-05, "loss": 0.0573, "step": 5290 }, { "epoch": 12.84325637910085, "grad_norm": 0.9464079737663269, "learning_rate": 1.5864000000000003e-05, "loss": 0.0355, "step": 5291 }, { "epoch": 12.845686512758201, "grad_norm": 1.6708277463912964, "learning_rate": 1.5867000000000002e-05, "loss": 0.0945, "step": 5292 }, { "epoch": 12.848116646415553, "grad_norm": 1.928588628768921, "learning_rate": 1.5870000000000002e-05, "loss": 0.1064, "step": 5293 }, { "epoch": 12.850546780072904, "grad_norm": 2.586110830307007, "learning_rate": 1.5873000000000002e-05, "loss": 0.1036, "step": 5294 }, { "epoch": 12.852976913730256, "grad_norm": 1.4058105945587158, "learning_rate": 1.5876000000000002e-05, "loss": 0.3211, "step": 5295 }, { "epoch": 12.855407047387606, "grad_norm": 0.7423413395881653, "learning_rate": 1.5879e-05, "loss": 0.2227, "step": 5296 }, { "epoch": 12.857837181044957, "grad_norm": 0.7338122725486755, "learning_rate": 1.5882e-05, "loss": 0.2082, "step": 5297 }, { "epoch": 12.860267314702309, "grad_norm": 0.8064481616020203, "learning_rate": 1.5884999999999998e-05, "loss": 0.1787, "step": 5298 }, { "epoch": 12.86269744835966, "grad_norm": 0.777604341506958, "learning_rate": 1.5887999999999998e-05, "loss": 0.1556, "step": 5299 }, { "epoch": 12.865127582017012, "grad_norm": 0.7548807859420776, "learning_rate": 1.5890999999999997e-05, "loss": 0.1251, "step": 5300 }, { "epoch": 12.867557715674362, "grad_norm": 0.6431045532226562, "learning_rate": 1.5894e-05, "loss": 0.1149, "step": 5301 }, { "epoch": 12.869987849331713, "grad_norm": 0.7652623653411865, "learning_rate": 1.5897e-05, "loss": 0.1155, "step": 5302 }, { "epoch": 12.872417982989065, "grad_norm": 0.5370259881019592, "learning_rate": 1.59e-05, "loss": 0.0687, "step": 5303 }, { "epoch": 12.874848116646415, "grad_norm": 0.5986829400062561, "learning_rate": 1.5903e-05, "loss": 0.0629, "step": 
5304 }, { "epoch": 12.877278250303767, "grad_norm": 0.7520310282707214, "learning_rate": 1.5906e-05, "loss": 0.087, "step": 5305 }, { "epoch": 12.879708383961118, "grad_norm": 0.5467406511306763, "learning_rate": 1.5909e-05, "loss": 0.0607, "step": 5306 }, { "epoch": 12.882138517618468, "grad_norm": 0.7220439910888672, "learning_rate": 1.5912e-05, "loss": 0.0646, "step": 5307 }, { "epoch": 12.88456865127582, "grad_norm": 0.5510358214378357, "learning_rate": 1.5915e-05, "loss": 0.0322, "step": 5308 }, { "epoch": 12.886998784933171, "grad_norm": 0.6916252374649048, "learning_rate": 1.5918e-05, "loss": 0.0562, "step": 5309 }, { "epoch": 12.889428918590523, "grad_norm": 0.5464776754379272, "learning_rate": 1.5921e-05, "loss": 0.046, "step": 5310 }, { "epoch": 12.891859052247874, "grad_norm": 0.8518863320350647, "learning_rate": 1.5924000000000002e-05, "loss": 0.0707, "step": 5311 }, { "epoch": 12.894289185905224, "grad_norm": 0.5795825123786926, "learning_rate": 1.5927000000000002e-05, "loss": 0.0672, "step": 5312 }, { "epoch": 12.896719319562576, "grad_norm": 0.4795616567134857, "learning_rate": 1.593e-05, "loss": 0.0538, "step": 5313 }, { "epoch": 12.899149453219927, "grad_norm": 0.481871634721756, "learning_rate": 1.5933e-05, "loss": 0.0467, "step": 5314 }, { "epoch": 12.90157958687728, "grad_norm": 0.7670661211013794, "learning_rate": 1.5936e-05, "loss": 0.0799, "step": 5315 }, { "epoch": 12.90400972053463, "grad_norm": 0.5517655611038208, "learning_rate": 1.5939e-05, "loss": 0.0653, "step": 5316 }, { "epoch": 12.90643985419198, "grad_norm": 0.5773467421531677, "learning_rate": 1.5942e-05, "loss": 0.0715, "step": 5317 }, { "epoch": 12.908869987849332, "grad_norm": 0.6403663754463196, "learning_rate": 1.5945e-05, "loss": 0.0639, "step": 5318 }, { "epoch": 12.911300121506683, "grad_norm": 0.5884896516799927, "learning_rate": 1.5948e-05, "loss": 0.0784, "step": 5319 }, { "epoch": 12.913730255164033, "grad_norm": 1.329199194908142, "learning_rate": 1.5951e-05, "loss": 
0.0551, "step": 5320 }, { "epoch": 12.916160388821385, "grad_norm": 0.5812296867370605, "learning_rate": 1.5954000000000003e-05, "loss": 0.0403, "step": 5321 }, { "epoch": 12.918590522478736, "grad_norm": 0.7256731986999512, "learning_rate": 1.5957000000000003e-05, "loss": 0.0629, "step": 5322 }, { "epoch": 12.921020656136088, "grad_norm": 0.541075587272644, "learning_rate": 1.596e-05, "loss": 0.0461, "step": 5323 }, { "epoch": 12.923450789793439, "grad_norm": 0.6946406364440918, "learning_rate": 1.5963e-05, "loss": 0.0552, "step": 5324 }, { "epoch": 12.925880923450789, "grad_norm": 0.7475541234016418, "learning_rate": 1.5966e-05, "loss": 0.0801, "step": 5325 }, { "epoch": 12.928311057108141, "grad_norm": 0.506834864616394, "learning_rate": 1.5969e-05, "loss": 0.0399, "step": 5326 }, { "epoch": 12.930741190765492, "grad_norm": 0.4983169436454773, "learning_rate": 1.5972e-05, "loss": 0.039, "step": 5327 }, { "epoch": 12.933171324422844, "grad_norm": 0.8009323477745056, "learning_rate": 1.5975e-05, "loss": 0.0555, "step": 5328 }, { "epoch": 12.935601458080194, "grad_norm": 0.5273650884628296, "learning_rate": 1.5978e-05, "loss": 0.0461, "step": 5329 }, { "epoch": 12.938031591737545, "grad_norm": 0.813077449798584, "learning_rate": 1.5980999999999998e-05, "loss": 0.0764, "step": 5330 }, { "epoch": 12.940461725394897, "grad_norm": 0.794827401638031, "learning_rate": 1.5984e-05, "loss": 0.1129, "step": 5331 }, { "epoch": 12.942891859052247, "grad_norm": 0.8560178875923157, "learning_rate": 1.5987e-05, "loss": 0.0458, "step": 5332 }, { "epoch": 12.9453219927096, "grad_norm": 1.505643367767334, "learning_rate": 1.599e-05, "loss": 0.064, "step": 5333 }, { "epoch": 12.94775212636695, "grad_norm": 1.3262325525283813, "learning_rate": 1.5993e-05, "loss": 0.0664, "step": 5334 }, { "epoch": 12.9501822600243, "grad_norm": 0.9581536650657654, "learning_rate": 1.5996e-05, "loss": 0.0649, "step": 5335 }, { "epoch": 12.952612393681653, "grad_norm": 0.7778873443603516, 
"learning_rate": 1.5999e-05, "loss": 0.0496, "step": 5336 }, { "epoch": 12.955042527339003, "grad_norm": 0.7242713570594788, "learning_rate": 1.6002e-05, "loss": 0.0507, "step": 5337 }, { "epoch": 12.957472660996356, "grad_norm": 0.6822221279144287, "learning_rate": 1.6005e-05, "loss": 0.0336, "step": 5338 }, { "epoch": 12.959902794653706, "grad_norm": 1.2802282571792603, "learning_rate": 1.6008e-05, "loss": 0.0783, "step": 5339 }, { "epoch": 12.962332928311056, "grad_norm": 0.8412064909934998, "learning_rate": 1.6011e-05, "loss": 0.0637, "step": 5340 }, { "epoch": 12.964763061968409, "grad_norm": 2.6924471855163574, "learning_rate": 1.6014000000000003e-05, "loss": 0.1086, "step": 5341 }, { "epoch": 12.96719319562576, "grad_norm": 1.1232706308364868, "learning_rate": 1.6017000000000003e-05, "loss": 0.0893, "step": 5342 }, { "epoch": 12.969623329283111, "grad_norm": 1.3283416032791138, "learning_rate": 1.6020000000000002e-05, "loss": 0.0959, "step": 5343 }, { "epoch": 12.972053462940462, "grad_norm": 1.8908920288085938, "learning_rate": 1.6023000000000002e-05, "loss": 0.1575, "step": 5344 }, { "epoch": 12.974483596597812, "grad_norm": 1.0472813844680786, "learning_rate": 1.6026000000000002e-05, "loss": 0.2287, "step": 5345 }, { "epoch": 12.976913730255164, "grad_norm": 0.9255588054656982, "learning_rate": 1.6029000000000002e-05, "loss": 0.1623, "step": 5346 }, { "epoch": 12.979343863912515, "grad_norm": 0.5880234837532043, "learning_rate": 1.6032e-05, "loss": 0.0631, "step": 5347 }, { "epoch": 12.981773997569867, "grad_norm": 0.6160930395126343, "learning_rate": 1.6034999999999998e-05, "loss": 0.0502, "step": 5348 }, { "epoch": 12.984204131227218, "grad_norm": 0.6529883146286011, "learning_rate": 1.6037999999999998e-05, "loss": 0.0486, "step": 5349 }, { "epoch": 12.986634264884568, "grad_norm": 0.5300912857055664, "learning_rate": 1.6040999999999998e-05, "loss": 0.0457, "step": 5350 }, { "epoch": 12.98906439854192, "grad_norm": 0.6281200051307678, "learning_rate": 
1.6044e-05, "loss": 0.0497, "step": 5351 }, { "epoch": 12.99149453219927, "grad_norm": 0.564112663269043, "learning_rate": 1.6047e-05, "loss": 0.0406, "step": 5352 }, { "epoch": 12.993924665856621, "grad_norm": 0.6477898359298706, "learning_rate": 1.605e-05, "loss": 0.0682, "step": 5353 }, { "epoch": 12.996354799513973, "grad_norm": 0.8044454455375671, "learning_rate": 1.6053e-05, "loss": 0.0508, "step": 5354 }, { "epoch": 12.998784933171324, "grad_norm": 1.300079584121704, "learning_rate": 1.6056e-05, "loss": 0.085, "step": 5355 }, { "epoch": 13.0, "grad_norm": 0.8249844908714294, "learning_rate": 1.6059e-05, "loss": 0.0406, "step": 5356 }, { "epoch": 13.00243013365735, "grad_norm": 2.8685266971588135, "learning_rate": 1.6062e-05, "loss": 0.3645, "step": 5357 }, { "epoch": 13.004860267314703, "grad_norm": 0.8245605230331421, "learning_rate": 1.6065e-05, "loss": 0.2258, "step": 5358 }, { "epoch": 13.007290400972053, "grad_norm": 0.9627644419670105, "learning_rate": 1.6068e-05, "loss": 0.1855, "step": 5359 }, { "epoch": 13.009720534629405, "grad_norm": 1.138790249824524, "learning_rate": 1.6071e-05, "loss": 0.198, "step": 5360 }, { "epoch": 13.012150668286756, "grad_norm": 1.185203194618225, "learning_rate": 1.6074000000000002e-05, "loss": 0.1851, "step": 5361 }, { "epoch": 13.014580801944106, "grad_norm": 0.9275107979774475, "learning_rate": 1.6077000000000002e-05, "loss": 0.1269, "step": 5362 }, { "epoch": 13.017010935601458, "grad_norm": 0.7931016087532043, "learning_rate": 1.6080000000000002e-05, "loss": 0.1529, "step": 5363 }, { "epoch": 13.019441069258809, "grad_norm": 0.8855624198913574, "learning_rate": 1.6083000000000002e-05, "loss": 0.1078, "step": 5364 }, { "epoch": 13.021871202916161, "grad_norm": 0.6965871453285217, "learning_rate": 1.6086e-05, "loss": 0.0964, "step": 5365 }, { "epoch": 13.024301336573512, "grad_norm": 0.7442613840103149, "learning_rate": 1.6089e-05, "loss": 0.1003, "step": 5366 }, { "epoch": 13.026731470230862, "grad_norm": 
0.6347047686576843, "learning_rate": 1.6092e-05, "loss": 0.0672, "step": 5367 }, { "epoch": 13.029161603888214, "grad_norm": 0.5417145490646362, "learning_rate": 1.6095e-05, "loss": 0.0695, "step": 5368 }, { "epoch": 13.031591737545565, "grad_norm": 0.5960386991500854, "learning_rate": 1.6098e-05, "loss": 0.0717, "step": 5369 }, { "epoch": 13.034021871202917, "grad_norm": 0.6362292766571045, "learning_rate": 1.6101e-05, "loss": 0.0411, "step": 5370 }, { "epoch": 13.036452004860267, "grad_norm": 0.593944787979126, "learning_rate": 1.6104000000000004e-05, "loss": 0.0637, "step": 5371 }, { "epoch": 13.038882138517618, "grad_norm": 1.0347200632095337, "learning_rate": 1.6107e-05, "loss": 0.0528, "step": 5372 }, { "epoch": 13.04131227217497, "grad_norm": 0.5713120102882385, "learning_rate": 1.611e-05, "loss": 0.0438, "step": 5373 }, { "epoch": 13.04374240583232, "grad_norm": 0.548524796962738, "learning_rate": 1.6113e-05, "loss": 0.0449, "step": 5374 }, { "epoch": 13.046172539489673, "grad_norm": 0.6049216985702515, "learning_rate": 1.6116e-05, "loss": 0.0459, "step": 5375 }, { "epoch": 13.048602673147023, "grad_norm": 0.5699776411056519, "learning_rate": 1.6119e-05, "loss": 0.0576, "step": 5376 }, { "epoch": 13.051032806804374, "grad_norm": 0.5092930793762207, "learning_rate": 1.6122e-05, "loss": 0.0418, "step": 5377 }, { "epoch": 13.053462940461726, "grad_norm": 0.4267071485519409, "learning_rate": 1.6125e-05, "loss": 0.0343, "step": 5378 }, { "epoch": 13.055893074119076, "grad_norm": 0.8784050941467285, "learning_rate": 1.6128e-05, "loss": 0.0531, "step": 5379 }, { "epoch": 13.058323207776427, "grad_norm": 0.6865471601486206, "learning_rate": 1.6131e-05, "loss": 0.0325, "step": 5380 }, { "epoch": 13.060753341433779, "grad_norm": 0.35626494884490967, "learning_rate": 1.6134e-05, "loss": 0.0255, "step": 5381 }, { "epoch": 13.06318347509113, "grad_norm": 0.6821785569190979, "learning_rate": 1.6137e-05, "loss": 0.0457, "step": 5382 }, { "epoch": 13.065613608748482, 
"grad_norm": 0.5646979808807373, "learning_rate": 1.614e-05, "loss": 0.0407, "step": 5383 }, { "epoch": 13.068043742405832, "grad_norm": 0.9419880509376526, "learning_rate": 1.6143e-05, "loss": 0.0766, "step": 5384 }, { "epoch": 13.070473876063183, "grad_norm": 0.7199594974517822, "learning_rate": 1.6146e-05, "loss": 0.0422, "step": 5385 }, { "epoch": 13.072904009720535, "grad_norm": 0.4644034206867218, "learning_rate": 1.6149e-05, "loss": 0.0357, "step": 5386 }, { "epoch": 13.075334143377885, "grad_norm": 0.9607357978820801, "learning_rate": 1.6152e-05, "loss": 0.0569, "step": 5387 }, { "epoch": 13.077764277035238, "grad_norm": 0.8359339237213135, "learning_rate": 1.6155e-05, "loss": 0.0445, "step": 5388 }, { "epoch": 13.080194410692588, "grad_norm": 0.4759109616279602, "learning_rate": 1.6158e-05, "loss": 0.034, "step": 5389 }, { "epoch": 13.082624544349938, "grad_norm": 0.5670905709266663, "learning_rate": 1.6161e-05, "loss": 0.0594, "step": 5390 }, { "epoch": 13.08505467800729, "grad_norm": 0.609805166721344, "learning_rate": 1.6164e-05, "loss": 0.0386, "step": 5391 }, { "epoch": 13.087484811664641, "grad_norm": 0.4765615165233612, "learning_rate": 1.6167000000000003e-05, "loss": 0.0403, "step": 5392 }, { "epoch": 13.089914945321993, "grad_norm": 1.0071990489959717, "learning_rate": 1.6170000000000003e-05, "loss": 0.0617, "step": 5393 }, { "epoch": 13.092345078979344, "grad_norm": 1.324240803718567, "learning_rate": 1.6173000000000003e-05, "loss": 0.0373, "step": 5394 }, { "epoch": 13.094775212636694, "grad_norm": 0.6541887521743774, "learning_rate": 1.6176000000000002e-05, "loss": 0.0497, "step": 5395 }, { "epoch": 13.097205346294047, "grad_norm": 0.7364475131034851, "learning_rate": 1.6179000000000002e-05, "loss": 0.0531, "step": 5396 }, { "epoch": 13.099635479951397, "grad_norm": 1.0612307786941528, "learning_rate": 1.6182e-05, "loss": 0.0492, "step": 5397 }, { "epoch": 13.10206561360875, "grad_norm": 0.8428243398666382, "learning_rate": 1.6185e-05, "loss": 
0.0447, "step": 5398 }, { "epoch": 13.1044957472661, "grad_norm": 1.0061317682266235, "learning_rate": 1.6187999999999998e-05, "loss": 0.0391, "step": 5399 }, { "epoch": 13.10692588092345, "grad_norm": 1.548415184020996, "learning_rate": 1.6190999999999998e-05, "loss": 0.0486, "step": 5400 }, { "epoch": 13.109356014580802, "grad_norm": 0.902320921421051, "learning_rate": 1.6193999999999998e-05, "loss": 0.055, "step": 5401 }, { "epoch": 13.111786148238153, "grad_norm": 1.1040817499160767, "learning_rate": 1.6197e-05, "loss": 0.0582, "step": 5402 }, { "epoch": 13.114216281895505, "grad_norm": 1.3520560264587402, "learning_rate": 1.62e-05, "loss": 0.0836, "step": 5403 }, { "epoch": 13.116646415552855, "grad_norm": 1.2683899402618408, "learning_rate": 1.6203e-05, "loss": 0.075, "step": 5404 }, { "epoch": 13.119076549210206, "grad_norm": 1.5453851222991943, "learning_rate": 1.6206e-05, "loss": 0.135, "step": 5405 }, { "epoch": 13.121506682867558, "grad_norm": 2.6513619422912598, "learning_rate": 1.6209e-05, "loss": 0.1837, "step": 5406 }, { "epoch": 13.123936816524909, "grad_norm": 1.145983099937439, "learning_rate": 1.6212e-05, "loss": 0.3159, "step": 5407 }, { "epoch": 13.12636695018226, "grad_norm": 0.7336910963058472, "learning_rate": 1.6215e-05, "loss": 0.2498, "step": 5408 }, { "epoch": 13.128797083839611, "grad_norm": 0.7127447128295898, "learning_rate": 1.6218e-05, "loss": 0.2175, "step": 5409 }, { "epoch": 13.131227217496962, "grad_norm": 0.7098100781440735, "learning_rate": 1.6221e-05, "loss": 0.2138, "step": 5410 }, { "epoch": 13.133657351154314, "grad_norm": 0.6697761416435242, "learning_rate": 1.6224e-05, "loss": 0.1258, "step": 5411 }, { "epoch": 13.136087484811664, "grad_norm": 0.7993456125259399, "learning_rate": 1.6227000000000002e-05, "loss": 0.1246, "step": 5412 }, { "epoch": 13.138517618469017, "grad_norm": 0.5602075457572937, "learning_rate": 1.6230000000000002e-05, "loss": 0.0978, "step": 5413 }, { "epoch": 13.140947752126367, "grad_norm": 
0.9987695217132568, "learning_rate": 1.6233000000000002e-05, "loss": 0.1068, "step": 5414 }, { "epoch": 13.143377885783718, "grad_norm": 0.5665537118911743, "learning_rate": 1.6236000000000002e-05, "loss": 0.0601, "step": 5415 }, { "epoch": 13.14580801944107, "grad_norm": 0.6560064554214478, "learning_rate": 1.6239e-05, "loss": 0.066, "step": 5416 }, { "epoch": 13.14823815309842, "grad_norm": 0.6115995049476624, "learning_rate": 1.6242e-05, "loss": 0.0594, "step": 5417 }, { "epoch": 13.15066828675577, "grad_norm": 0.6506797075271606, "learning_rate": 1.6245e-05, "loss": 0.0467, "step": 5418 }, { "epoch": 13.153098420413123, "grad_norm": 0.9697650671005249, "learning_rate": 1.6248e-05, "loss": 0.0726, "step": 5419 }, { "epoch": 13.155528554070473, "grad_norm": 0.5404383540153503, "learning_rate": 1.6251e-05, "loss": 0.0425, "step": 5420 }, { "epoch": 13.157958687727826, "grad_norm": 0.5247336030006409, "learning_rate": 1.6253999999999997e-05, "loss": 0.0386, "step": 5421 }, { "epoch": 13.160388821385176, "grad_norm": 0.5224378705024719, "learning_rate": 1.6257e-05, "loss": 0.0392, "step": 5422 }, { "epoch": 13.162818955042527, "grad_norm": 0.5229789018630981, "learning_rate": 1.626e-05, "loss": 0.0492, "step": 5423 }, { "epoch": 13.165249088699879, "grad_norm": 0.5495187044143677, "learning_rate": 1.6263e-05, "loss": 0.0477, "step": 5424 }, { "epoch": 13.16767922235723, "grad_norm": 0.6668313145637512, "learning_rate": 1.6266e-05, "loss": 0.0562, "step": 5425 }, { "epoch": 13.170109356014581, "grad_norm": 0.7653876543045044, "learning_rate": 1.6269e-05, "loss": 0.0661, "step": 5426 }, { "epoch": 13.172539489671932, "grad_norm": 0.6121452450752258, "learning_rate": 1.6272e-05, "loss": 0.0487, "step": 5427 }, { "epoch": 13.174969623329282, "grad_norm": 0.6622475981712341, "learning_rate": 1.6275e-05, "loss": 0.1101, "step": 5428 }, { "epoch": 13.177399756986635, "grad_norm": 0.5214522480964661, "learning_rate": 1.6278e-05, "loss": 0.0384, "step": 5429 }, { "epoch": 
13.179829890643985, "grad_norm": 0.603281557559967, "learning_rate": 1.6281e-05, "loss": 0.0615, "step": 5430 }, { "epoch": 13.182260024301337, "grad_norm": 0.6112319827079773, "learning_rate": 1.6284e-05, "loss": 0.0515, "step": 5431 }, { "epoch": 13.184690157958688, "grad_norm": 0.5742805004119873, "learning_rate": 1.6287000000000002e-05, "loss": 0.0645, "step": 5432 }, { "epoch": 13.187120291616038, "grad_norm": 0.5436689257621765, "learning_rate": 1.629e-05, "loss": 0.0395, "step": 5433 }, { "epoch": 13.18955042527339, "grad_norm": 0.5993199348449707, "learning_rate": 1.6293e-05, "loss": 0.0303, "step": 5434 }, { "epoch": 13.19198055893074, "grad_norm": 0.672516942024231, "learning_rate": 1.6296e-05, "loss": 0.0501, "step": 5435 }, { "epoch": 13.194410692588093, "grad_norm": 0.40560269355773926, "learning_rate": 1.6299e-05, "loss": 0.0329, "step": 5436 }, { "epoch": 13.196840826245444, "grad_norm": 0.6971234679222107, "learning_rate": 1.6302e-05, "loss": 0.034, "step": 5437 }, { "epoch": 13.199270959902794, "grad_norm": 0.5087785720825195, "learning_rate": 1.6305e-05, "loss": 0.0469, "step": 5438 }, { "epoch": 13.201701093560146, "grad_norm": 0.34448182582855225, "learning_rate": 1.6308e-05, "loss": 0.0284, "step": 5439 }, { "epoch": 13.204131227217497, "grad_norm": 1.1239583492279053, "learning_rate": 1.6311e-05, "loss": 0.0789, "step": 5440 }, { "epoch": 13.206561360874849, "grad_norm": 0.9970930218696594, "learning_rate": 1.6314e-05, "loss": 0.1002, "step": 5441 }, { "epoch": 13.2089914945322, "grad_norm": 0.5647603273391724, "learning_rate": 1.6317000000000003e-05, "loss": 0.0396, "step": 5442 }, { "epoch": 13.21142162818955, "grad_norm": 0.857897162437439, "learning_rate": 1.6320000000000003e-05, "loss": 0.0485, "step": 5443 }, { "epoch": 13.213851761846902, "grad_norm": 0.6967803239822388, "learning_rate": 1.6323000000000003e-05, "loss": 0.0507, "step": 5444 }, { "epoch": 13.216281895504252, "grad_norm": 0.595470666885376, "learning_rate": 
1.6326000000000003e-05, "loss": 0.0529, "step": 5445 }, { "epoch": 13.218712029161605, "grad_norm": 0.8134706020355225, "learning_rate": 1.6329e-05, "loss": 0.0533, "step": 5446 }, { "epoch": 13.221142162818955, "grad_norm": 0.6919566988945007, "learning_rate": 1.6332e-05, "loss": 0.0586, "step": 5447 }, { "epoch": 13.223572296476306, "grad_norm": 0.4675111174583435, "learning_rate": 1.6335e-05, "loss": 0.042, "step": 5448 }, { "epoch": 13.226002430133658, "grad_norm": 0.4655759334564209, "learning_rate": 1.6338e-05, "loss": 0.0356, "step": 5449 }, { "epoch": 13.228432563791008, "grad_norm": 1.0172127485275269, "learning_rate": 1.6340999999999998e-05, "loss": 0.0529, "step": 5450 }, { "epoch": 13.23086269744836, "grad_norm": 0.563166618347168, "learning_rate": 1.6343999999999998e-05, "loss": 0.0381, "step": 5451 }, { "epoch": 13.233292831105711, "grad_norm": 0.6735266447067261, "learning_rate": 1.6347e-05, "loss": 0.0266, "step": 5452 }, { "epoch": 13.235722964763061, "grad_norm": 1.5521037578582764, "learning_rate": 1.635e-05, "loss": 0.0704, "step": 5453 }, { "epoch": 13.238153098420414, "grad_norm": 1.052100419998169, "learning_rate": 1.6353e-05, "loss": 0.0706, "step": 5454 }, { "epoch": 13.240583232077764, "grad_norm": 1.7089303731918335, "learning_rate": 1.6356e-05, "loss": 0.1433, "step": 5455 }, { "epoch": 13.243013365735115, "grad_norm": 1.2682044506072998, "learning_rate": 1.6359e-05, "loss": 0.1076, "step": 5456 }, { "epoch": 13.245443499392467, "grad_norm": 1.343487024307251, "learning_rate": 1.6362e-05, "loss": 0.347, "step": 5457 }, { "epoch": 13.247873633049817, "grad_norm": 0.7867065072059631, "learning_rate": 1.6365e-05, "loss": 0.258, "step": 5458 }, { "epoch": 13.25030376670717, "grad_norm": 0.7829591631889343, "learning_rate": 1.6368e-05, "loss": 0.1958, "step": 5459 }, { "epoch": 13.25273390036452, "grad_norm": 0.8008043766021729, "learning_rate": 1.6371e-05, "loss": 0.2347, "step": 5460 }, { "epoch": 13.25516403402187, "grad_norm": 
0.7070314884185791, "learning_rate": 1.6374e-05, "loss": 0.1526, "step": 5461 }, { "epoch": 13.257594167679223, "grad_norm": 1.0546960830688477, "learning_rate": 1.6377000000000003e-05, "loss": 0.1643, "step": 5462 }, { "epoch": 13.260024301336573, "grad_norm": 0.8754344582557678, "learning_rate": 1.6380000000000002e-05, "loss": 0.1163, "step": 5463 }, { "epoch": 13.262454434993925, "grad_norm": 0.5651683211326599, "learning_rate": 1.6383000000000002e-05, "loss": 0.1011, "step": 5464 }, { "epoch": 13.264884568651276, "grad_norm": 0.6788341999053955, "learning_rate": 1.6386000000000002e-05, "loss": 0.0893, "step": 5465 }, { "epoch": 13.267314702308626, "grad_norm": 0.8249924182891846, "learning_rate": 1.6389000000000002e-05, "loss": 0.0691, "step": 5466 }, { "epoch": 13.269744835965978, "grad_norm": 0.7505711913108826, "learning_rate": 1.6392e-05, "loss": 0.0833, "step": 5467 }, { "epoch": 13.272174969623329, "grad_norm": 0.7824767827987671, "learning_rate": 1.6395e-05, "loss": 0.07, "step": 5468 }, { "epoch": 13.274605103280681, "grad_norm": 0.5118164420127869, "learning_rate": 1.6398e-05, "loss": 0.0486, "step": 5469 }, { "epoch": 13.277035236938032, "grad_norm": 0.32173216342926025, "learning_rate": 1.6400999999999998e-05, "loss": 0.0373, "step": 5470 }, { "epoch": 13.279465370595382, "grad_norm": 0.475147545337677, "learning_rate": 1.6403999999999997e-05, "loss": 0.0614, "step": 5471 }, { "epoch": 13.281895504252734, "grad_norm": 0.8350782990455627, "learning_rate": 1.6407e-05, "loss": 0.08, "step": 5472 }, { "epoch": 13.284325637910085, "grad_norm": 0.46830374002456665, "learning_rate": 1.641e-05, "loss": 0.0431, "step": 5473 }, { "epoch": 13.286755771567437, "grad_norm": 0.8462149500846863, "learning_rate": 1.6413e-05, "loss": 0.0602, "step": 5474 }, { "epoch": 13.289185905224787, "grad_norm": 0.402767151594162, "learning_rate": 1.6416e-05, "loss": 0.0324, "step": 5475 }, { "epoch": 13.291616038882138, "grad_norm": 0.547050416469574, "learning_rate": 
1.6419e-05, "loss": 0.0404, "step": 5476 }, { "epoch": 13.29404617253949, "grad_norm": 0.5969418287277222, "learning_rate": 1.6422e-05, "loss": 0.0471, "step": 5477 }, { "epoch": 13.29647630619684, "grad_norm": 0.9801397323608398, "learning_rate": 1.6425e-05, "loss": 0.0603, "step": 5478 }, { "epoch": 13.298906439854193, "grad_norm": 0.4606703221797943, "learning_rate": 1.6428e-05, "loss": 0.0358, "step": 5479 }, { "epoch": 13.301336573511543, "grad_norm": 0.42030593752861023, "learning_rate": 1.6431e-05, "loss": 0.0364, "step": 5480 }, { "epoch": 13.303766707168894, "grad_norm": 0.7511512041091919, "learning_rate": 1.6434e-05, "loss": 0.0529, "step": 5481 }, { "epoch": 13.306196840826246, "grad_norm": 0.7120251059532166, "learning_rate": 1.6437000000000002e-05, "loss": 0.1122, "step": 5482 }, { "epoch": 13.308626974483596, "grad_norm": 0.5522324442863464, "learning_rate": 1.6440000000000002e-05, "loss": 0.057, "step": 5483 }, { "epoch": 13.311057108140949, "grad_norm": 0.44110050797462463, "learning_rate": 1.6443e-05, "loss": 0.0438, "step": 5484 }, { "epoch": 13.313487241798299, "grad_norm": 0.4569816291332245, "learning_rate": 1.6446e-05, "loss": 0.0419, "step": 5485 }, { "epoch": 13.31591737545565, "grad_norm": 0.6826367974281311, "learning_rate": 1.6449e-05, "loss": 0.0329, "step": 5486 }, { "epoch": 13.318347509113002, "grad_norm": 0.447550892829895, "learning_rate": 1.6452e-05, "loss": 0.0429, "step": 5487 }, { "epoch": 13.320777642770352, "grad_norm": 0.7075797319412231, "learning_rate": 1.6455e-05, "loss": 0.0438, "step": 5488 }, { "epoch": 13.323207776427704, "grad_norm": 0.5081048607826233, "learning_rate": 1.6458e-05, "loss": 0.0385, "step": 5489 }, { "epoch": 13.325637910085055, "grad_norm": 0.4146362543106079, "learning_rate": 1.6461e-05, "loss": 0.0364, "step": 5490 }, { "epoch": 13.328068043742405, "grad_norm": 1.0108251571655273, "learning_rate": 1.6464e-05, "loss": 0.0971, "step": 5491 }, { "epoch": 13.330498177399758, "grad_norm": 
0.7383033633232117, "learning_rate": 1.6467000000000003e-05, "loss": 0.0865, "step": 5492 }, { "epoch": 13.332928311057108, "grad_norm": 0.5894416570663452, "learning_rate": 1.6470000000000003e-05, "loss": 0.0346, "step": 5493 }, { "epoch": 13.335358444714458, "grad_norm": 0.7508704662322998, "learning_rate": 1.6473000000000003e-05, "loss": 0.0421, "step": 5494 }, { "epoch": 13.33778857837181, "grad_norm": 0.6328942775726318, "learning_rate": 1.6476e-05, "loss": 0.049, "step": 5495 }, { "epoch": 13.340218712029161, "grad_norm": 0.6817615032196045, "learning_rate": 1.6479e-05, "loss": 0.0562, "step": 5496 }, { "epoch": 13.342648845686513, "grad_norm": 0.6140010952949524, "learning_rate": 1.6482e-05, "loss": 0.033, "step": 5497 }, { "epoch": 13.345078979343864, "grad_norm": 0.5919824838638306, "learning_rate": 1.6485e-05, "loss": 0.0419, "step": 5498 }, { "epoch": 13.347509113001214, "grad_norm": 0.6980516910552979, "learning_rate": 1.6488e-05, "loss": 0.0357, "step": 5499 }, { "epoch": 13.349939246658566, "grad_norm": 1.2109086513519287, "learning_rate": 1.6491e-05, "loss": 0.0604, "step": 5500 }, { "epoch": 13.352369380315917, "grad_norm": 1.3724054098129272, "learning_rate": 1.6493999999999998e-05, "loss": 0.0469, "step": 5501 }, { "epoch": 13.35479951397327, "grad_norm": 0.6947828531265259, "learning_rate": 1.6497e-05, "loss": 0.0563, "step": 5502 }, { "epoch": 13.35722964763062, "grad_norm": 1.0576634407043457, "learning_rate": 1.65e-05, "loss": 0.0592, "step": 5503 }, { "epoch": 13.35965978128797, "grad_norm": 1.4751687049865723, "learning_rate": 1.6503e-05, "loss": 0.0508, "step": 5504 }, { "epoch": 13.362089914945322, "grad_norm": 1.4655393362045288, "learning_rate": 1.6506e-05, "loss": 0.0844, "step": 5505 }, { "epoch": 13.364520048602673, "grad_norm": 1.8729538917541504, "learning_rate": 1.6509e-05, "loss": 0.1104, "step": 5506 }, { "epoch": 13.366950182260025, "grad_norm": 2.7040131092071533, "learning_rate": 1.6512e-05, "loss": 0.3934, "step": 5507 }, { 
"epoch": 13.369380315917375, "grad_norm": 1.0193812847137451, "learning_rate": 1.6515e-05, "loss": 0.2517, "step": 5508 }, { "epoch": 13.371810449574726, "grad_norm": 0.6879043579101562, "learning_rate": 1.6518e-05, "loss": 0.1674, "step": 5509 }, { "epoch": 13.374240583232078, "grad_norm": 0.9234668016433716, "learning_rate": 1.6521e-05, "loss": 0.1949, "step": 5510 }, { "epoch": 13.376670716889429, "grad_norm": 1.0042622089385986, "learning_rate": 1.6524e-05, "loss": 0.205, "step": 5511 }, { "epoch": 13.37910085054678, "grad_norm": 0.8955734968185425, "learning_rate": 1.6527e-05, "loss": 0.1202, "step": 5512 }, { "epoch": 13.381530984204131, "grad_norm": 0.7303063273429871, "learning_rate": 1.6530000000000003e-05, "loss": 0.1462, "step": 5513 }, { "epoch": 13.383961117861482, "grad_norm": 1.2447596788406372, "learning_rate": 1.6533000000000002e-05, "loss": 0.0929, "step": 5514 }, { "epoch": 13.386391251518834, "grad_norm": 0.5071356296539307, "learning_rate": 1.6536000000000002e-05, "loss": 0.0552, "step": 5515 }, { "epoch": 13.388821385176184, "grad_norm": 0.5925241112709045, "learning_rate": 1.6539000000000002e-05, "loss": 0.0625, "step": 5516 }, { "epoch": 13.391251518833537, "grad_norm": 1.447898268699646, "learning_rate": 1.6542000000000002e-05, "loss": 0.1018, "step": 5517 }, { "epoch": 13.393681652490887, "grad_norm": 0.5412305593490601, "learning_rate": 1.6545e-05, "loss": 0.0479, "step": 5518 }, { "epoch": 13.396111786148237, "grad_norm": 0.5611773729324341, "learning_rate": 1.6548e-05, "loss": 0.0499, "step": 5519 }, { "epoch": 13.39854191980559, "grad_norm": 0.7196307182312012, "learning_rate": 1.6550999999999998e-05, "loss": 0.0643, "step": 5520 }, { "epoch": 13.40097205346294, "grad_norm": 0.4894862771034241, "learning_rate": 1.6553999999999998e-05, "loss": 0.0444, "step": 5521 }, { "epoch": 13.403402187120292, "grad_norm": 0.508087694644928, "learning_rate": 1.6556999999999998e-05, "loss": 0.0571, "step": 5522 }, { "epoch": 13.405832320777643, 
"grad_norm": 0.7057396769523621, "learning_rate": 1.656e-05, "loss": 0.0621, "step": 5523 }, { "epoch": 13.408262454434993, "grad_norm": 0.5257561802864075, "learning_rate": 1.6563e-05, "loss": 0.0641, "step": 5524 }, { "epoch": 13.410692588092346, "grad_norm": 0.5234394073486328, "learning_rate": 1.6566e-05, "loss": 0.049, "step": 5525 }, { "epoch": 13.413122721749696, "grad_norm": 0.625653088092804, "learning_rate": 1.6569e-05, "loss": 0.0484, "step": 5526 }, { "epoch": 13.415552855407048, "grad_norm": 0.42989978194236755, "learning_rate": 1.6572e-05, "loss": 0.0329, "step": 5527 }, { "epoch": 13.417982989064399, "grad_norm": 0.5795109868049622, "learning_rate": 1.6575e-05, "loss": 0.0454, "step": 5528 }, { "epoch": 13.42041312272175, "grad_norm": 0.5863187909126282, "learning_rate": 1.6578e-05, "loss": 0.0718, "step": 5529 }, { "epoch": 13.422843256379101, "grad_norm": 0.36958757042884827, "learning_rate": 1.6581e-05, "loss": 0.0345, "step": 5530 }, { "epoch": 13.425273390036452, "grad_norm": 0.5193707346916199, "learning_rate": 1.6584e-05, "loss": 0.0422, "step": 5531 }, { "epoch": 13.427703523693804, "grad_norm": 0.537275493144989, "learning_rate": 1.6587e-05, "loss": 0.0423, "step": 5532 }, { "epoch": 13.430133657351154, "grad_norm": 0.5797203779220581, "learning_rate": 1.6590000000000002e-05, "loss": 0.0329, "step": 5533 }, { "epoch": 13.432563791008505, "grad_norm": 0.6235591769218445, "learning_rate": 1.6593000000000002e-05, "loss": 0.0377, "step": 5534 }, { "epoch": 13.434993924665857, "grad_norm": 0.5457510352134705, "learning_rate": 1.6596000000000002e-05, "loss": 0.0435, "step": 5535 }, { "epoch": 13.437424058323208, "grad_norm": 0.6517664194107056, "learning_rate": 1.6599e-05, "loss": 0.0447, "step": 5536 }, { "epoch": 13.439854191980558, "grad_norm": 0.5575335621833801, "learning_rate": 1.6602e-05, "loss": 0.0383, "step": 5537 }, { "epoch": 13.44228432563791, "grad_norm": 0.5616682171821594, "learning_rate": 1.6605e-05, "loss": 0.0351, "step": 5538 
}, { "epoch": 13.44471445929526, "grad_norm": 0.6019074320793152, "learning_rate": 1.6608e-05, "loss": 0.0493, "step": 5539 }, { "epoch": 13.447144592952613, "grad_norm": 0.8517675995826721, "learning_rate": 1.6611e-05, "loss": 0.0431, "step": 5540 }, { "epoch": 13.449574726609963, "grad_norm": 1.7309770584106445, "learning_rate": 1.6614e-05, "loss": 0.0532, "step": 5541 }, { "epoch": 13.452004860267314, "grad_norm": 0.6742914915084839, "learning_rate": 1.6617e-05, "loss": 0.0342, "step": 5542 }, { "epoch": 13.454434993924666, "grad_norm": 1.2514015436172485, "learning_rate": 1.6620000000000004e-05, "loss": 0.035, "step": 5543 }, { "epoch": 13.456865127582017, "grad_norm": 0.8769716024398804, "learning_rate": 1.6623e-05, "loss": 0.0735, "step": 5544 }, { "epoch": 13.459295261239369, "grad_norm": 1.893641471862793, "learning_rate": 1.6626e-05, "loss": 0.0395, "step": 5545 }, { "epoch": 13.46172539489672, "grad_norm": 0.3451139032840729, "learning_rate": 1.6629e-05, "loss": 0.0169, "step": 5546 }, { "epoch": 13.46415552855407, "grad_norm": 1.3966628313064575, "learning_rate": 1.6632e-05, "loss": 0.0554, "step": 5547 }, { "epoch": 13.466585662211422, "grad_norm": 0.7815367579460144, "learning_rate": 1.6635e-05, "loss": 0.0391, "step": 5548 }, { "epoch": 13.469015795868772, "grad_norm": 0.9360536336898804, "learning_rate": 1.6638e-05, "loss": 0.052, "step": 5549 }, { "epoch": 13.471445929526125, "grad_norm": 0.9074949026107788, "learning_rate": 1.6641e-05, "loss": 0.0465, "step": 5550 }, { "epoch": 13.473876063183475, "grad_norm": 0.8692833185195923, "learning_rate": 1.6644e-05, "loss": 0.0458, "step": 5551 }, { "epoch": 13.476306196840826, "grad_norm": 1.990012288093567, "learning_rate": 1.6647e-05, "loss": 0.065, "step": 5552 }, { "epoch": 13.478736330498178, "grad_norm": 1.993389368057251, "learning_rate": 1.665e-05, "loss": 0.0689, "step": 5553 }, { "epoch": 13.481166464155528, "grad_norm": 1.2766170501708984, "learning_rate": 1.6653e-05, "loss": 0.086, "step": 
5554 }, { "epoch": 13.48359659781288, "grad_norm": 1.5583425760269165, "learning_rate": 1.6656e-05, "loss": 0.0672, "step": 5555 }, { "epoch": 13.486026731470231, "grad_norm": 1.9480493068695068, "learning_rate": 1.6659e-05, "loss": 0.1325, "step": 5556 }, { "epoch": 13.488456865127581, "grad_norm": 1.6707656383514404, "learning_rate": 1.6662e-05, "loss": 0.354, "step": 5557 }, { "epoch": 13.490886998784934, "grad_norm": 0.8654770255088806, "learning_rate": 1.6665e-05, "loss": 0.2728, "step": 5558 }, { "epoch": 13.493317132442284, "grad_norm": 0.761056125164032, "learning_rate": 1.6668e-05, "loss": 0.2056, "step": 5559 }, { "epoch": 13.495747266099636, "grad_norm": 1.176239252090454, "learning_rate": 1.6671e-05, "loss": 0.179, "step": 5560 }, { "epoch": 13.498177399756987, "grad_norm": 0.6395474672317505, "learning_rate": 1.6674e-05, "loss": 0.149, "step": 5561 }, { "epoch": 13.500607533414337, "grad_norm": 0.7563707232475281, "learning_rate": 1.6677e-05, "loss": 0.1436, "step": 5562 }, { "epoch": 13.50303766707169, "grad_norm": 0.9689038991928101, "learning_rate": 1.6680000000000003e-05, "loss": 0.1302, "step": 5563 }, { "epoch": 13.50546780072904, "grad_norm": 0.7865955829620361, "learning_rate": 1.6683000000000003e-05, "loss": 0.1196, "step": 5564 }, { "epoch": 13.507897934386392, "grad_norm": 0.7180125713348389, "learning_rate": 1.6686000000000003e-05, "loss": 0.0939, "step": 5565 }, { "epoch": 13.510328068043743, "grad_norm": 0.42638933658599854, "learning_rate": 1.6689000000000002e-05, "loss": 0.0698, "step": 5566 }, { "epoch": 13.512758201701093, "grad_norm": 0.8168793320655823, "learning_rate": 1.6692000000000002e-05, "loss": 0.0719, "step": 5567 }, { "epoch": 13.515188335358445, "grad_norm": 0.526640772819519, "learning_rate": 1.6695000000000002e-05, "loss": 0.0623, "step": 5568 }, { "epoch": 13.517618469015796, "grad_norm": 0.7032027840614319, "learning_rate": 1.6698e-05, "loss": 0.0659, "step": 5569 }, { "epoch": 13.520048602673146, "grad_norm": 
0.5265986323356628, "learning_rate": 1.6700999999999998e-05, "loss": 0.048, "step": 5570 }, { "epoch": 13.522478736330498, "grad_norm": 0.7240424752235413, "learning_rate": 1.6703999999999998e-05, "loss": 0.0609, "step": 5571 }, { "epoch": 13.524908869987849, "grad_norm": 0.4587748646736145, "learning_rate": 1.6706999999999998e-05, "loss": 0.0582, "step": 5572 }, { "epoch": 13.527339003645201, "grad_norm": 0.8163689374923706, "learning_rate": 1.671e-05, "loss": 0.0424, "step": 5573 }, { "epoch": 13.529769137302551, "grad_norm": 0.5553855299949646, "learning_rate": 1.6713e-05, "loss": 0.048, "step": 5574 }, { "epoch": 13.532199270959904, "grad_norm": 0.4990975856781006, "learning_rate": 1.6716e-05, "loss": 0.0425, "step": 5575 }, { "epoch": 13.534629404617254, "grad_norm": 0.4960717260837555, "learning_rate": 1.6719e-05, "loss": 0.0433, "step": 5576 }, { "epoch": 13.537059538274605, "grad_norm": 0.44410598278045654, "learning_rate": 1.6722e-05, "loss": 0.0539, "step": 5577 }, { "epoch": 13.539489671931957, "grad_norm": 0.6994020342826843, "learning_rate": 1.6725e-05, "loss": 0.0676, "step": 5578 }, { "epoch": 13.541919805589307, "grad_norm": 0.7088056802749634, "learning_rate": 1.6728e-05, "loss": 0.0468, "step": 5579 }, { "epoch": 13.544349939246658, "grad_norm": 0.6827281713485718, "learning_rate": 1.6731e-05, "loss": 0.0659, "step": 5580 }, { "epoch": 13.54678007290401, "grad_norm": 0.7738704681396484, "learning_rate": 1.6734e-05, "loss": 0.0653, "step": 5581 }, { "epoch": 13.54921020656136, "grad_norm": 0.42860475182533264, "learning_rate": 1.6737e-05, "loss": 0.0334, "step": 5582 }, { "epoch": 13.551640340218713, "grad_norm": 0.4374883472919464, "learning_rate": 1.6740000000000002e-05, "loss": 0.0272, "step": 5583 }, { "epoch": 13.554070473876063, "grad_norm": 1.0931240320205688, "learning_rate": 1.6743000000000002e-05, "loss": 0.0632, "step": 5584 }, { "epoch": 13.556500607533414, "grad_norm": 0.5386656522750854, "learning_rate": 1.6746000000000002e-05, 
"loss": 0.0349, "step": 5585 }, { "epoch": 13.558930741190766, "grad_norm": 0.500220537185669, "learning_rate": 1.6749000000000002e-05, "loss": 0.0327, "step": 5586 }, { "epoch": 13.561360874848116, "grad_norm": 0.5168236494064331, "learning_rate": 1.6752e-05, "loss": 0.0356, "step": 5587 }, { "epoch": 13.563791008505468, "grad_norm": 0.6268650889396667, "learning_rate": 1.6755e-05, "loss": 0.0542, "step": 5588 }, { "epoch": 13.566221142162819, "grad_norm": 0.6357355117797852, "learning_rate": 1.6758e-05, "loss": 0.061, "step": 5589 }, { "epoch": 13.56865127582017, "grad_norm": 0.5112661719322205, "learning_rate": 1.6761e-05, "loss": 0.0368, "step": 5590 }, { "epoch": 13.571081409477522, "grad_norm": 0.9510625600814819, "learning_rate": 1.6764e-05, "loss": 0.0708, "step": 5591 }, { "epoch": 13.573511543134872, "grad_norm": 0.5186821222305298, "learning_rate": 1.6767e-05, "loss": 0.045, "step": 5592 }, { "epoch": 13.575941676792224, "grad_norm": 0.6980659365653992, "learning_rate": 1.677e-05, "loss": 0.0403, "step": 5593 }, { "epoch": 13.578371810449575, "grad_norm": 1.4462699890136719, "learning_rate": 1.6773e-05, "loss": 0.0614, "step": 5594 }, { "epoch": 13.580801944106925, "grad_norm": 0.8205660581588745, "learning_rate": 1.6776e-05, "loss": 0.0392, "step": 5595 }, { "epoch": 13.583232077764277, "grad_norm": 0.47472620010375977, "learning_rate": 1.6779e-05, "loss": 0.0272, "step": 5596 }, { "epoch": 13.585662211421628, "grad_norm": 1.498043179512024, "learning_rate": 1.6782e-05, "loss": 0.0765, "step": 5597 }, { "epoch": 13.58809234507898, "grad_norm": 1.0610984563827515, "learning_rate": 1.6785e-05, "loss": 0.042, "step": 5598 }, { "epoch": 13.59052247873633, "grad_norm": 1.0032472610473633, "learning_rate": 1.6788e-05, "loss": 0.0568, "step": 5599 }, { "epoch": 13.592952612393681, "grad_norm": 0.86098313331604, "learning_rate": 1.6791e-05, "loss": 0.0578, "step": 5600 }, { "epoch": 13.595382746051033, "grad_norm": 0.9649855494499207, "learning_rate": 
1.6794e-05, "loss": 0.0517, "step": 5601 }, { "epoch": 13.597812879708384, "grad_norm": 0.682956874370575, "learning_rate": 1.6797e-05, "loss": 0.0467, "step": 5602 }, { "epoch": 13.600243013365736, "grad_norm": 1.7451262474060059, "learning_rate": 1.6800000000000002e-05, "loss": 0.0613, "step": 5603 }, { "epoch": 13.602673147023086, "grad_norm": 1.1091077327728271, "learning_rate": 1.6803e-05, "loss": 0.067, "step": 5604 }, { "epoch": 13.605103280680437, "grad_norm": 2.055915117263794, "learning_rate": 1.6806e-05, "loss": 0.0813, "step": 5605 }, { "epoch": 13.607533414337789, "grad_norm": 1.4273878335952759, "learning_rate": 1.6809e-05, "loss": 0.1067, "step": 5606 }, { "epoch": 13.60996354799514, "grad_norm": 1.6914215087890625, "learning_rate": 1.6812e-05, "loss": 0.3222, "step": 5607 }, { "epoch": 13.612393681652492, "grad_norm": 0.9125978350639343, "learning_rate": 1.6815e-05, "loss": 0.2505, "step": 5608 }, { "epoch": 13.614823815309842, "grad_norm": 0.7556240558624268, "learning_rate": 1.6818e-05, "loss": 0.2131, "step": 5609 }, { "epoch": 13.617253948967193, "grad_norm": 0.6228081583976746, "learning_rate": 1.6821e-05, "loss": 0.1761, "step": 5610 }, { "epoch": 13.619684082624545, "grad_norm": 0.8321386575698853, "learning_rate": 1.6824e-05, "loss": 0.143, "step": 5611 }, { "epoch": 13.622114216281895, "grad_norm": 0.8248468041419983, "learning_rate": 1.6827e-05, "loss": 0.1188, "step": 5612 }, { "epoch": 13.624544349939246, "grad_norm": 0.5992690920829773, "learning_rate": 1.6830000000000003e-05, "loss": 0.1008, "step": 5613 }, { "epoch": 13.626974483596598, "grad_norm": 0.5930070281028748, "learning_rate": 1.6833000000000003e-05, "loss": 0.0785, "step": 5614 }, { "epoch": 13.629404617253948, "grad_norm": 0.5372921228408813, "learning_rate": 1.6836000000000003e-05, "loss": 0.0729, "step": 5615 }, { "epoch": 13.6318347509113, "grad_norm": 0.7072445750236511, "learning_rate": 1.6839000000000003e-05, "loss": 0.0752, "step": 5616 }, { "epoch": 
13.634264884568651, "grad_norm": 0.6023057103157043, "learning_rate": 1.6842000000000002e-05, "loss": 0.0596, "step": 5617 }, { "epoch": 13.636695018226002, "grad_norm": 0.6259257793426514, "learning_rate": 1.6845e-05, "loss": 0.075, "step": 5618 }, { "epoch": 13.639125151883354, "grad_norm": 0.7036105990409851, "learning_rate": 1.6848e-05, "loss": 0.0534, "step": 5619 }, { "epoch": 13.641555285540704, "grad_norm": 0.42806902527809143, "learning_rate": 1.6851e-05, "loss": 0.0305, "step": 5620 }, { "epoch": 13.643985419198057, "grad_norm": 0.39449453353881836, "learning_rate": 1.6853999999999998e-05, "loss": 0.0575, "step": 5621 }, { "epoch": 13.646415552855407, "grad_norm": 0.6082115769386292, "learning_rate": 1.6856999999999998e-05, "loss": 0.0439, "step": 5622 }, { "epoch": 13.648845686512757, "grad_norm": 0.6542407274246216, "learning_rate": 1.686e-05, "loss": 0.0626, "step": 5623 }, { "epoch": 13.65127582017011, "grad_norm": 0.5431905388832092, "learning_rate": 1.6863e-05, "loss": 0.1078, "step": 5624 }, { "epoch": 13.65370595382746, "grad_norm": 0.6038932204246521, "learning_rate": 1.6866e-05, "loss": 0.0407, "step": 5625 }, { "epoch": 13.656136087484812, "grad_norm": 0.7135062217712402, "learning_rate": 1.6869e-05, "loss": 0.0611, "step": 5626 }, { "epoch": 13.658566221142163, "grad_norm": 0.4497270882129669, "learning_rate": 1.6872e-05, "loss": 0.0451, "step": 5627 }, { "epoch": 13.660996354799513, "grad_norm": 1.0281257629394531, "learning_rate": 1.6875e-05, "loss": 0.0398, "step": 5628 }, { "epoch": 13.663426488456865, "grad_norm": 0.6832324862480164, "learning_rate": 1.6878e-05, "loss": 0.0813, "step": 5629 }, { "epoch": 13.665856622114216, "grad_norm": 0.35916945338249207, "learning_rate": 1.6881e-05, "loss": 0.0322, "step": 5630 }, { "epoch": 13.668286755771568, "grad_norm": 0.6817032098770142, "learning_rate": 1.6884e-05, "loss": 0.0421, "step": 5631 }, { "epoch": 13.670716889428919, "grad_norm": 0.5841270089149475, "learning_rate": 1.6887e-05, "loss": 
0.041, "step": 5632 }, { "epoch": 13.673147023086269, "grad_norm": 0.5439932942390442, "learning_rate": 1.689e-05, "loss": 0.0369, "step": 5633 }, { "epoch": 13.675577156743621, "grad_norm": 0.5458900332450867, "learning_rate": 1.6893000000000002e-05, "loss": 0.0409, "step": 5634 }, { "epoch": 13.678007290400972, "grad_norm": 0.7030209302902222, "learning_rate": 1.6896000000000002e-05, "loss": 0.0534, "step": 5635 }, { "epoch": 13.680437424058324, "grad_norm": 0.48416775465011597, "learning_rate": 1.6899000000000002e-05, "loss": 0.0304, "step": 5636 }, { "epoch": 13.682867557715674, "grad_norm": 0.6751400828361511, "learning_rate": 1.6902000000000002e-05, "loss": 0.0561, "step": 5637 }, { "epoch": 13.685297691373025, "grad_norm": 0.8511307239532471, "learning_rate": 1.6905e-05, "loss": 0.0471, "step": 5638 }, { "epoch": 13.687727825030377, "grad_norm": 0.4256485104560852, "learning_rate": 1.6908e-05, "loss": 0.0354, "step": 5639 }, { "epoch": 13.690157958687728, "grad_norm": 0.7157992720603943, "learning_rate": 1.6911e-05, "loss": 0.0745, "step": 5640 }, { "epoch": 13.69258809234508, "grad_norm": 0.7904087901115417, "learning_rate": 1.6914e-05, "loss": 0.0438, "step": 5641 }, { "epoch": 13.69501822600243, "grad_norm": 0.7312514781951904, "learning_rate": 1.6916999999999997e-05, "loss": 0.0396, "step": 5642 }, { "epoch": 13.69744835965978, "grad_norm": 0.6479973793029785, "learning_rate": 1.6919999999999997e-05, "loss": 0.0815, "step": 5643 }, { "epoch": 13.699878493317133, "grad_norm": 0.667722761631012, "learning_rate": 1.6923e-05, "loss": 0.0544, "step": 5644 }, { "epoch": 13.702308626974483, "grad_norm": 0.6964704990386963, "learning_rate": 1.6926e-05, "loss": 0.0622, "step": 5645 }, { "epoch": 13.704738760631834, "grad_norm": 0.580859363079071, "learning_rate": 1.6929e-05, "loss": 0.0403, "step": 5646 }, { "epoch": 13.707168894289186, "grad_norm": 0.4771612286567688, "learning_rate": 1.6932e-05, "loss": 0.0481, "step": 5647 }, { "epoch": 13.709599027946537, 
"grad_norm": 0.7265433669090271, "learning_rate": 1.6935e-05, "loss": 0.0322, "step": 5648 }, { "epoch": 13.712029161603889, "grad_norm": 0.6570371389389038, "learning_rate": 1.6938e-05, "loss": 0.0545, "step": 5649 }, { "epoch": 13.71445929526124, "grad_norm": 0.5600059628486633, "learning_rate": 1.6941e-05, "loss": 0.0475, "step": 5650 }, { "epoch": 13.716889428918591, "grad_norm": 1.1436299085617065, "learning_rate": 1.6944e-05, "loss": 0.0772, "step": 5651 }, { "epoch": 13.719319562575942, "grad_norm": 0.742560863494873, "learning_rate": 1.6947e-05, "loss": 0.0557, "step": 5652 }, { "epoch": 13.721749696233292, "grad_norm": 0.9632616639137268, "learning_rate": 1.695e-05, "loss": 0.0623, "step": 5653 }, { "epoch": 13.724179829890645, "grad_norm": 1.543525218963623, "learning_rate": 1.6953000000000002e-05, "loss": 0.0673, "step": 5654 }, { "epoch": 13.726609963547995, "grad_norm": 1.066227674484253, "learning_rate": 1.6956e-05, "loss": 0.0614, "step": 5655 }, { "epoch": 13.729040097205345, "grad_norm": 1.6017955541610718, "learning_rate": 1.6959e-05, "loss": 0.109, "step": 5656 }, { "epoch": 13.731470230862698, "grad_norm": 1.402610421180725, "learning_rate": 1.6962e-05, "loss": 0.3534, "step": 5657 }, { "epoch": 13.733900364520048, "grad_norm": 0.8584165573120117, "learning_rate": 1.6965e-05, "loss": 0.227, "step": 5658 }, { "epoch": 13.7363304981774, "grad_norm": 0.7703742980957031, "learning_rate": 1.6968e-05, "loss": 0.2402, "step": 5659 }, { "epoch": 13.73876063183475, "grad_norm": 1.0491999387741089, "learning_rate": 1.6971e-05, "loss": 0.1891, "step": 5660 }, { "epoch": 13.741190765492101, "grad_norm": 0.7397430539131165, "learning_rate": 1.6974e-05, "loss": 0.2176, "step": 5661 }, { "epoch": 13.743620899149454, "grad_norm": 0.841898500919342, "learning_rate": 1.6977e-05, "loss": 0.1515, "step": 5662 }, { "epoch": 13.746051032806804, "grad_norm": 0.6244447827339172, "learning_rate": 1.698e-05, "loss": 0.1289, "step": 5663 }, { "epoch": 13.748481166464156, 
"grad_norm": 0.6098883748054504, "learning_rate": 1.6983000000000003e-05, "loss": 0.131, "step": 5664 }, { "epoch": 13.750911300121507, "grad_norm": 0.852403461933136, "learning_rate": 1.6986000000000003e-05, "loss": 0.1329, "step": 5665 }, { "epoch": 13.753341433778857, "grad_norm": 0.652797281742096, "learning_rate": 1.6989000000000003e-05, "loss": 0.0913, "step": 5666 }, { "epoch": 13.75577156743621, "grad_norm": 0.6281891465187073, "learning_rate": 1.6992e-05, "loss": 0.047, "step": 5667 }, { "epoch": 13.75820170109356, "grad_norm": 0.5983633995056152, "learning_rate": 1.6995e-05, "loss": 0.0551, "step": 5668 }, { "epoch": 13.760631834750912, "grad_norm": 0.5510176420211792, "learning_rate": 1.6998e-05, "loss": 0.0517, "step": 5669 }, { "epoch": 13.763061968408262, "grad_norm": 0.5778676271438599, "learning_rate": 1.7001e-05, "loss": 0.0754, "step": 5670 }, { "epoch": 13.765492102065613, "grad_norm": 0.4593488872051239, "learning_rate": 1.7004e-05, "loss": 0.0571, "step": 5671 }, { "epoch": 13.767922235722965, "grad_norm": 0.5483900904655457, "learning_rate": 1.7006999999999998e-05, "loss": 0.0477, "step": 5672 }, { "epoch": 13.770352369380316, "grad_norm": 0.47549042105674744, "learning_rate": 1.7009999999999998e-05, "loss": 0.0644, "step": 5673 }, { "epoch": 13.772782503037668, "grad_norm": 1.2091912031173706, "learning_rate": 1.7013e-05, "loss": 0.0637, "step": 5674 }, { "epoch": 13.775212636695018, "grad_norm": 0.4901452362537384, "learning_rate": 1.7016e-05, "loss": 0.038, "step": 5675 }, { "epoch": 13.777642770352369, "grad_norm": 0.6104363203048706, "learning_rate": 1.7019e-05, "loss": 0.0541, "step": 5676 }, { "epoch": 13.780072904009721, "grad_norm": 0.434736043214798, "learning_rate": 1.7022e-05, "loss": 0.0343, "step": 5677 }, { "epoch": 13.782503037667071, "grad_norm": 0.5681013464927673, "learning_rate": 1.7025e-05, "loss": 0.0427, "step": 5678 }, { "epoch": 13.784933171324424, "grad_norm": 0.4403536021709442, "learning_rate": 1.7028e-05, "loss": 
0.0449, "step": 5679 }, { "epoch": 13.787363304981774, "grad_norm": 0.48995810747146606, "learning_rate": 1.7031e-05, "loss": 0.0479, "step": 5680 }, { "epoch": 13.789793438639125, "grad_norm": 0.5223362445831299, "learning_rate": 1.7034e-05, "loss": 0.0468, "step": 5681 }, { "epoch": 13.792223572296477, "grad_norm": 0.5054879188537598, "learning_rate": 1.7037e-05, "loss": 0.038, "step": 5682 }, { "epoch": 13.794653705953827, "grad_norm": 0.6960294842720032, "learning_rate": 1.704e-05, "loss": 0.0883, "step": 5683 }, { "epoch": 13.79708383961118, "grad_norm": 0.6844866275787354, "learning_rate": 1.7043000000000003e-05, "loss": 0.05, "step": 5684 }, { "epoch": 13.79951397326853, "grad_norm": 0.600155234336853, "learning_rate": 1.7046000000000002e-05, "loss": 0.0373, "step": 5685 }, { "epoch": 13.80194410692588, "grad_norm": 0.7491584420204163, "learning_rate": 1.7049000000000002e-05, "loss": 0.0301, "step": 5686 }, { "epoch": 13.804374240583233, "grad_norm": 0.7586638927459717, "learning_rate": 1.7052000000000002e-05, "loss": 0.0782, "step": 5687 }, { "epoch": 13.806804374240583, "grad_norm": 0.4897966682910919, "learning_rate": 1.7055000000000002e-05, "loss": 0.0427, "step": 5688 }, { "epoch": 13.809234507897933, "grad_norm": 0.5205718874931335, "learning_rate": 1.7058e-05, "loss": 0.0341, "step": 5689 }, { "epoch": 13.811664641555286, "grad_norm": 0.8169156908988953, "learning_rate": 1.7061e-05, "loss": 0.0704, "step": 5690 }, { "epoch": 13.814094775212636, "grad_norm": 0.7551054954528809, "learning_rate": 1.7064e-05, "loss": 0.0601, "step": 5691 }, { "epoch": 13.816524908869988, "grad_norm": 0.6607508063316345, "learning_rate": 1.7066999999999998e-05, "loss": 0.0464, "step": 5692 }, { "epoch": 13.818955042527339, "grad_norm": 0.9259284734725952, "learning_rate": 1.7069999999999998e-05, "loss": 0.095, "step": 5693 }, { "epoch": 13.821385176184691, "grad_norm": 0.41915902495384216, "learning_rate": 1.7073e-05, "loss": 0.0244, "step": 5694 }, { "epoch": 
13.823815309842042, "grad_norm": 0.8410823941230774, "learning_rate": 1.7076e-05, "loss": 0.0509, "step": 5695 }, { "epoch": 13.826245443499392, "grad_norm": 0.8165566921234131, "learning_rate": 1.7079e-05, "loss": 0.0732, "step": 5696 }, { "epoch": 13.828675577156744, "grad_norm": 0.6969083547592163, "learning_rate": 1.7082e-05, "loss": 0.0484, "step": 5697 }, { "epoch": 13.831105710814095, "grad_norm": 0.6060789227485657, "learning_rate": 1.7085e-05, "loss": 0.0566, "step": 5698 }, { "epoch": 13.833535844471445, "grad_norm": 0.6229352951049805, "learning_rate": 1.7088e-05, "loss": 0.0464, "step": 5699 }, { "epoch": 13.835965978128797, "grad_norm": 0.6714246273040771, "learning_rate": 1.7091e-05, "loss": 0.0553, "step": 5700 }, { "epoch": 13.838396111786148, "grad_norm": 0.5732796788215637, "learning_rate": 1.7094e-05, "loss": 0.0461, "step": 5701 }, { "epoch": 13.8408262454435, "grad_norm": 1.069448471069336, "learning_rate": 1.7097e-05, "loss": 0.0602, "step": 5702 }, { "epoch": 13.84325637910085, "grad_norm": 1.1785427331924438, "learning_rate": 1.71e-05, "loss": 0.0754, "step": 5703 }, { "epoch": 13.845686512758201, "grad_norm": 1.2308945655822754, "learning_rate": 1.7103000000000002e-05, "loss": 0.0722, "step": 5704 }, { "epoch": 13.848116646415553, "grad_norm": 1.7399353981018066, "learning_rate": 1.7106000000000002e-05, "loss": 0.1135, "step": 5705 }, { "epoch": 13.850546780072904, "grad_norm": 1.2043817043304443, "learning_rate": 1.7109000000000002e-05, "loss": 0.1058, "step": 5706 }, { "epoch": 13.852976913730256, "grad_norm": 1.4787224531173706, "learning_rate": 1.7112e-05, "loss": 0.3376, "step": 5707 }, { "epoch": 13.855407047387606, "grad_norm": 0.800216555595398, "learning_rate": 1.7115e-05, "loss": 0.256, "step": 5708 }, { "epoch": 13.857837181044957, "grad_norm": 1.1450194120407104, "learning_rate": 1.7118e-05, "loss": 0.2263, "step": 5709 }, { "epoch": 13.860267314702309, "grad_norm": 0.9500156044960022, "learning_rate": 1.7121e-05, "loss": 0.172, 
"step": 5710 }, { "epoch": 13.86269744835966, "grad_norm": 1.0729702711105347, "learning_rate": 1.7124e-05, "loss": 0.1864, "step": 5711 }, { "epoch": 13.865127582017012, "grad_norm": 0.7620933651924133, "learning_rate": 1.7127e-05, "loss": 0.1305, "step": 5712 }, { "epoch": 13.867557715674362, "grad_norm": 0.7000483870506287, "learning_rate": 1.713e-05, "loss": 0.1253, "step": 5713 }, { "epoch": 13.869987849331713, "grad_norm": 0.45772677659988403, "learning_rate": 1.7133000000000004e-05, "loss": 0.0585, "step": 5714 }, { "epoch": 13.872417982989065, "grad_norm": 0.5877811908721924, "learning_rate": 1.7136000000000003e-05, "loss": 0.0693, "step": 5715 }, { "epoch": 13.874848116646415, "grad_norm": 0.6870561242103577, "learning_rate": 1.7139e-05, "loss": 0.0717, "step": 5716 }, { "epoch": 13.877278250303767, "grad_norm": 0.49175864458084106, "learning_rate": 1.7142e-05, "loss": 0.0619, "step": 5717 }, { "epoch": 13.879708383961118, "grad_norm": 0.7513668537139893, "learning_rate": 1.7145e-05, "loss": 0.0774, "step": 5718 }, { "epoch": 13.882138517618468, "grad_norm": 0.6086025834083557, "learning_rate": 1.7148e-05, "loss": 0.059, "step": 5719 }, { "epoch": 13.88456865127582, "grad_norm": 0.5361020565032959, "learning_rate": 1.7151e-05, "loss": 0.05, "step": 5720 }, { "epoch": 13.886998784933171, "grad_norm": 0.6700699329376221, "learning_rate": 1.7154e-05, "loss": 0.0539, "step": 5721 }, { "epoch": 13.889428918590523, "grad_norm": 0.5523734092712402, "learning_rate": 1.7157e-05, "loss": 0.0392, "step": 5722 }, { "epoch": 13.891859052247874, "grad_norm": 0.5225424766540527, "learning_rate": 1.716e-05, "loss": 0.0489, "step": 5723 }, { "epoch": 13.894289185905224, "grad_norm": 0.7195396423339844, "learning_rate": 1.7163e-05, "loss": 0.0715, "step": 5724 }, { "epoch": 13.896719319562576, "grad_norm": 0.2853162884712219, "learning_rate": 1.7166e-05, "loss": 0.0173, "step": 5725 }, { "epoch": 13.899149453219927, "grad_norm": 0.5345601439476013, "learning_rate": 
1.7169e-05, "loss": 0.0368, "step": 5726 }, { "epoch": 13.90157958687728, "grad_norm": 0.3346693217754364, "learning_rate": 1.7172e-05, "loss": 0.026, "step": 5727 }, { "epoch": 13.90400972053463, "grad_norm": 0.45805639028549194, "learning_rate": 1.7175e-05, "loss": 0.0423, "step": 5728 }, { "epoch": 13.90643985419198, "grad_norm": 0.6306438446044922, "learning_rate": 1.7178e-05, "loss": 0.0588, "step": 5729 }, { "epoch": 13.908869987849332, "grad_norm": 0.5100935101509094, "learning_rate": 1.7181e-05, "loss": 0.0395, "step": 5730 }, { "epoch": 13.911300121506683, "grad_norm": 0.5406461358070374, "learning_rate": 1.7184e-05, "loss": 0.0502, "step": 5731 }, { "epoch": 13.913730255164033, "grad_norm": 0.6011360883712769, "learning_rate": 1.7187e-05, "loss": 0.0451, "step": 5732 }, { "epoch": 13.916160388821385, "grad_norm": 0.791988730430603, "learning_rate": 1.719e-05, "loss": 0.0593, "step": 5733 }, { "epoch": 13.918590522478736, "grad_norm": 0.7021889686584473, "learning_rate": 1.7193000000000003e-05, "loss": 0.0478, "step": 5734 }, { "epoch": 13.921020656136088, "grad_norm": 0.6758793592453003, "learning_rate": 1.7196000000000003e-05, "loss": 0.037, "step": 5735 }, { "epoch": 13.923450789793439, "grad_norm": 0.4779198169708252, "learning_rate": 1.7199000000000003e-05, "loss": 0.0324, "step": 5736 }, { "epoch": 13.925880923450789, "grad_norm": 0.6964991092681885, "learning_rate": 1.7202000000000002e-05, "loss": 0.0526, "step": 5737 }, { "epoch": 13.928311057108141, "grad_norm": 0.48586976528167725, "learning_rate": 1.7205000000000002e-05, "loss": 0.044, "step": 5738 }, { "epoch": 13.930741190765492, "grad_norm": 0.6723239421844482, "learning_rate": 1.7208000000000002e-05, "loss": 0.0448, "step": 5739 }, { "epoch": 13.933171324422844, "grad_norm": 0.9921326041221619, "learning_rate": 1.7211000000000002e-05, "loss": 0.0363, "step": 5740 }, { "epoch": 13.935601458080194, "grad_norm": 0.5246656537055969, "learning_rate": 1.7213999999999998e-05, "loss": 0.0269, 
"step": 5741 }, { "epoch": 13.938031591737545, "grad_norm": 0.5926381945610046, "learning_rate": 1.7216999999999998e-05, "loss": 0.0468, "step": 5742 }, { "epoch": 13.940461725394897, "grad_norm": 0.9661222696304321, "learning_rate": 1.7219999999999998e-05, "loss": 0.0554, "step": 5743 }, { "epoch": 13.942891859052247, "grad_norm": 0.7208148241043091, "learning_rate": 1.7223e-05, "loss": 0.0432, "step": 5744 }, { "epoch": 13.9453219927096, "grad_norm": 0.6827936768531799, "learning_rate": 1.7226e-05, "loss": 0.0491, "step": 5745 }, { "epoch": 13.94775212636695, "grad_norm": 0.748034656047821, "learning_rate": 1.7229e-05, "loss": 0.0569, "step": 5746 }, { "epoch": 13.9501822600243, "grad_norm": 1.1228196620941162, "learning_rate": 1.7232e-05, "loss": 0.0586, "step": 5747 }, { "epoch": 13.952612393681653, "grad_norm": 0.6284981966018677, "learning_rate": 1.7235e-05, "loss": 0.0487, "step": 5748 }, { "epoch": 13.955042527339003, "grad_norm": 0.9195168018341064, "learning_rate": 1.7238e-05, "loss": 0.0628, "step": 5749 }, { "epoch": 13.957472660996356, "grad_norm": 0.8178842067718506, "learning_rate": 1.7241e-05, "loss": 0.1287, "step": 5750 }, { "epoch": 13.959902794653706, "grad_norm": 0.6927173137664795, "learning_rate": 1.7244e-05, "loss": 0.0526, "step": 5751 }, { "epoch": 13.962332928311056, "grad_norm": 0.8929076194763184, "learning_rate": 1.7247e-05, "loss": 0.0407, "step": 5752 }, { "epoch": 13.964763061968409, "grad_norm": 0.7673362493515015, "learning_rate": 1.725e-05, "loss": 0.0545, "step": 5753 }, { "epoch": 13.96719319562576, "grad_norm": 0.9238954782485962, "learning_rate": 1.7253e-05, "loss": 0.0578, "step": 5754 }, { "epoch": 13.969623329283111, "grad_norm": 1.9202970266342163, "learning_rate": 1.7256000000000002e-05, "loss": 0.0721, "step": 5755 }, { "epoch": 13.972053462940462, "grad_norm": 1.3485608100891113, "learning_rate": 1.7259000000000002e-05, "loss": 0.1002, "step": 5756 }, { "epoch": 13.974483596597812, "grad_norm": 1.3175300359725952, 
"learning_rate": 1.7262000000000002e-05, "loss": 0.2568, "step": 5757 }, { "epoch": 13.976913730255164, "grad_norm": 0.6452950239181519, "learning_rate": 1.7265e-05, "loss": 0.107, "step": 5758 }, { "epoch": 13.979343863912515, "grad_norm": 0.8943352103233337, "learning_rate": 1.7268e-05, "loss": 0.061, "step": 5759 }, { "epoch": 13.981773997569867, "grad_norm": 0.6040223240852356, "learning_rate": 1.7271e-05, "loss": 0.0489, "step": 5760 }, { "epoch": 13.984204131227218, "grad_norm": 0.49145764112472534, "learning_rate": 1.7274e-05, "loss": 0.0573, "step": 5761 }, { "epoch": 13.986634264884568, "grad_norm": 0.6011208891868591, "learning_rate": 1.7277e-05, "loss": 0.0382, "step": 5762 }, { "epoch": 13.98906439854192, "grad_norm": 0.39768150448799133, "learning_rate": 1.728e-05, "loss": 0.0311, "step": 5763 }, { "epoch": 13.99149453219927, "grad_norm": 0.5578417181968689, "learning_rate": 1.7283e-05, "loss": 0.0548, "step": 5764 }, { "epoch": 13.993924665856621, "grad_norm": 0.7605617046356201, "learning_rate": 1.7286e-05, "loss": 0.0762, "step": 5765 }, { "epoch": 13.996354799513973, "grad_norm": 0.6860693693161011, "learning_rate": 1.7289e-05, "loss": 0.0602, "step": 5766 }, { "epoch": 13.998784933171324, "grad_norm": 0.8740776777267456, "learning_rate": 1.7292e-05, "loss": 0.0577, "step": 5767 }, { "epoch": 14.0, "grad_norm": 0.7881490588188171, "learning_rate": 1.7295e-05, "loss": 0.0403, "step": 5768 }, { "epoch": 14.00243013365735, "grad_norm": 1.122729778289795, "learning_rate": 1.7298e-05, "loss": 0.2781, "step": 5769 }, { "epoch": 14.004860267314703, "grad_norm": 0.669036328792572, "learning_rate": 1.7301e-05, "loss": 0.2078, "step": 5770 }, { "epoch": 14.007290400972053, "grad_norm": 0.9252261519432068, "learning_rate": 1.7304e-05, "loss": 0.213, "step": 5771 }, { "epoch": 14.009720534629405, "grad_norm": 0.927193820476532, "learning_rate": 1.7307e-05, "loss": 0.1638, "step": 5772 }, { "epoch": 14.012150668286756, "grad_norm": 0.6790722608566284, 
"learning_rate": 1.731e-05, "loss": 0.14, "step": 5773 }, { "epoch": 14.014580801944106, "grad_norm": 0.5998080968856812, "learning_rate": 1.7313e-05, "loss": 0.1004, "step": 5774 }, { "epoch": 14.017010935601458, "grad_norm": 1.1067349910736084, "learning_rate": 1.7316e-05, "loss": 0.0872, "step": 5775 }, { "epoch": 14.019441069258809, "grad_norm": 0.8404110074043274, "learning_rate": 1.7319e-05, "loss": 0.071, "step": 5776 }, { "epoch": 14.021871202916161, "grad_norm": 0.8552334308624268, "learning_rate": 1.7322e-05, "loss": 0.0941, "step": 5777 }, { "epoch": 14.024301336573512, "grad_norm": 0.406393438577652, "learning_rate": 1.7325e-05, "loss": 0.0543, "step": 5778 }, { "epoch": 14.026731470230862, "grad_norm": 0.46878114342689514, "learning_rate": 1.7328e-05, "loss": 0.0507, "step": 5779 }, { "epoch": 14.029161603888214, "grad_norm": 0.4884035885334015, "learning_rate": 1.7331e-05, "loss": 0.0446, "step": 5780 }, { "epoch": 14.031591737545565, "grad_norm": 0.5140330791473389, "learning_rate": 1.7334e-05, "loss": 0.0562, "step": 5781 }, { "epoch": 14.034021871202917, "grad_norm": 0.39542916417121887, "learning_rate": 1.7337e-05, "loss": 0.0367, "step": 5782 }, { "epoch": 14.036452004860267, "grad_norm": 0.545689046382904, "learning_rate": 1.734e-05, "loss": 0.0653, "step": 5783 }, { "epoch": 14.038882138517618, "grad_norm": 0.45864859223365784, "learning_rate": 1.7343e-05, "loss": 0.0433, "step": 5784 }, { "epoch": 14.04131227217497, "grad_norm": 0.44963160157203674, "learning_rate": 1.7346000000000003e-05, "loss": 0.0342, "step": 5785 }, { "epoch": 14.04374240583232, "grad_norm": 0.45823487639427185, "learning_rate": 1.7349000000000003e-05, "loss": 0.038, "step": 5786 }, { "epoch": 14.046172539489673, "grad_norm": 0.6827455163002014, "learning_rate": 1.7352000000000003e-05, "loss": 0.0368, "step": 5787 }, { "epoch": 14.048602673147023, "grad_norm": 1.1677109003067017, "learning_rate": 1.7355000000000002e-05, "loss": 0.0571, "step": 5788 }, { "epoch": 
14.051032806804374, "grad_norm": 0.5049906969070435, "learning_rate": 1.7358000000000002e-05, "loss": 0.0363, "step": 5789 }, { "epoch": 14.053462940461726, "grad_norm": 0.7404844760894775, "learning_rate": 1.7361e-05, "loss": 0.0427, "step": 5790 }, { "epoch": 14.055893074119076, "grad_norm": 0.7705649733543396, "learning_rate": 1.7364e-05, "loss": 0.1095, "step": 5791 }, { "epoch": 14.058323207776427, "grad_norm": 0.5409405827522278, "learning_rate": 1.7366999999999998e-05, "loss": 0.0555, "step": 5792 }, { "epoch": 14.060753341433779, "grad_norm": 0.6327266693115234, "learning_rate": 1.7369999999999998e-05, "loss": 0.0352, "step": 5793 }, { "epoch": 14.06318347509113, "grad_norm": 0.6430701613426208, "learning_rate": 1.7372999999999998e-05, "loss": 0.0377, "step": 5794 }, { "epoch": 14.065613608748482, "grad_norm": 0.33846211433410645, "learning_rate": 1.7376e-05, "loss": 0.0354, "step": 5795 }, { "epoch": 14.068043742405832, "grad_norm": 0.43777191638946533, "learning_rate": 1.7379e-05, "loss": 0.0324, "step": 5796 }, { "epoch": 14.070473876063183, "grad_norm": 1.106742262840271, "learning_rate": 1.7382e-05, "loss": 0.038, "step": 5797 }, { "epoch": 14.072904009720535, "grad_norm": 0.572219967842102, "learning_rate": 1.7385e-05, "loss": 0.0361, "step": 5798 }, { "epoch": 14.075334143377885, "grad_norm": 0.8043216466903687, "learning_rate": 1.7388e-05, "loss": 0.0568, "step": 5799 }, { "epoch": 14.077764277035238, "grad_norm": 0.38136374950408936, "learning_rate": 1.7391e-05, "loss": 0.0254, "step": 5800 }, { "epoch": 14.080194410692588, "grad_norm": 0.4689059257507324, "learning_rate": 1.7394e-05, "loss": 0.0362, "step": 5801 }, { "epoch": 14.082624544349938, "grad_norm": 0.6938255429267883, "learning_rate": 1.7397e-05, "loss": 0.048, "step": 5802 }, { "epoch": 14.08505467800729, "grad_norm": 0.34544050693511963, "learning_rate": 1.74e-05, "loss": 0.0339, "step": 5803 }, { "epoch": 14.087484811664641, "grad_norm": 1.0029937028884888, "learning_rate": 
1.7403e-05, "loss": 0.0454, "step": 5804 }, { "epoch": 14.089914945321993, "grad_norm": 0.5335415005683899, "learning_rate": 1.7406000000000002e-05, "loss": 0.0405, "step": 5805 }, { "epoch": 14.092345078979344, "grad_norm": 0.6258255839347839, "learning_rate": 1.7409000000000002e-05, "loss": 0.0366, "step": 5806 }, { "epoch": 14.094775212636694, "grad_norm": 0.765789270401001, "learning_rate": 1.7412000000000002e-05, "loss": 0.0376, "step": 5807 }, { "epoch": 14.097205346294047, "grad_norm": 1.882287859916687, "learning_rate": 1.7415000000000002e-05, "loss": 0.0857, "step": 5808 }, { "epoch": 14.099635479951397, "grad_norm": 1.017652153968811, "learning_rate": 1.7418e-05, "loss": 0.0485, "step": 5809 }, { "epoch": 14.10206561360875, "grad_norm": 0.570949912071228, "learning_rate": 1.7421e-05, "loss": 0.0376, "step": 5810 }, { "epoch": 14.1044957472661, "grad_norm": 0.6957190632820129, "learning_rate": 1.7424e-05, "loss": 0.0437, "step": 5811 }, { "epoch": 14.10692588092345, "grad_norm": 0.6340164542198181, "learning_rate": 1.7427e-05, "loss": 0.0422, "step": 5812 }, { "epoch": 14.109356014580802, "grad_norm": 0.6835065484046936, "learning_rate": 1.743e-05, "loss": 0.0347, "step": 5813 }, { "epoch": 14.111786148238153, "grad_norm": 0.8426299095153809, "learning_rate": 1.7432999999999997e-05, "loss": 0.0428, "step": 5814 }, { "epoch": 14.114216281895505, "grad_norm": 0.6471785306930542, "learning_rate": 1.7436e-05, "loss": 0.0551, "step": 5815 }, { "epoch": 14.116646415552855, "grad_norm": 0.872048020362854, "learning_rate": 1.7439e-05, "loss": 0.0351, "step": 5816 }, { "epoch": 14.119076549210206, "grad_norm": 1.174958348274231, "learning_rate": 1.7442e-05, "loss": 0.0516, "step": 5817 }, { "epoch": 14.121506682867558, "grad_norm": 1.4188302755355835, "learning_rate": 1.7445e-05, "loss": 0.0682, "step": 5818 }, { "epoch": 14.123936816524909, "grad_norm": 1.3326233625411987, "learning_rate": 1.7448e-05, "loss": 0.3183, "step": 5819 }, { "epoch": 14.12636695018226, 
"grad_norm": 0.7611290812492371, "learning_rate": 1.7451e-05, "loss": 0.2531, "step": 5820 }, { "epoch": 14.128797083839611, "grad_norm": 0.6527715921401978, "learning_rate": 1.7454e-05, "loss": 0.1663, "step": 5821 }, { "epoch": 14.131227217496962, "grad_norm": 1.3221272230148315, "learning_rate": 1.7457e-05, "loss": 0.1688, "step": 5822 }, { "epoch": 14.133657351154314, "grad_norm": 0.6095055937767029, "learning_rate": 1.746e-05, "loss": 0.1024, "step": 5823 }, { "epoch": 14.136087484811664, "grad_norm": 0.6253858804702759, "learning_rate": 1.7463e-05, "loss": 0.1138, "step": 5824 }, { "epoch": 14.138517618469017, "grad_norm": 0.619983434677124, "learning_rate": 1.7466000000000002e-05, "loss": 0.1016, "step": 5825 }, { "epoch": 14.140947752126367, "grad_norm": 0.7135565876960754, "learning_rate": 1.7469e-05, "loss": 0.0882, "step": 5826 }, { "epoch": 14.143377885783718, "grad_norm": 0.5608729720115662, "learning_rate": 1.7472e-05, "loss": 0.0542, "step": 5827 }, { "epoch": 14.14580801944107, "grad_norm": 0.6044433116912842, "learning_rate": 1.7475e-05, "loss": 0.0548, "step": 5828 }, { "epoch": 14.14823815309842, "grad_norm": 0.5545024275779724, "learning_rate": 1.7478e-05, "loss": 0.0588, "step": 5829 }, { "epoch": 14.15066828675577, "grad_norm": 0.5041187405586243, "learning_rate": 1.7481e-05, "loss": 0.0532, "step": 5830 }, { "epoch": 14.153098420413123, "grad_norm": 0.679315447807312, "learning_rate": 1.7484e-05, "loss": 0.1008, "step": 5831 }, { "epoch": 14.155528554070473, "grad_norm": 0.4548518657684326, "learning_rate": 1.7487e-05, "loss": 0.044, "step": 5832 }, { "epoch": 14.157958687727826, "grad_norm": 1.9997340440750122, "learning_rate": 1.749e-05, "loss": 0.0474, "step": 5833 }, { "epoch": 14.160388821385176, "grad_norm": 0.6456324458122253, "learning_rate": 1.7493e-05, "loss": 0.0606, "step": 5834 }, { "epoch": 14.162818955042527, "grad_norm": 0.7375527024269104, "learning_rate": 1.7496000000000003e-05, "loss": 0.0422, "step": 5835 }, { "epoch": 
14.165249088699879, "grad_norm": 0.6705561876296997, "learning_rate": 1.7499000000000003e-05, "loss": 0.0556, "step": 5836 }, { "epoch": 14.16767922235723, "grad_norm": 0.8001432418823242, "learning_rate": 1.7502000000000003e-05, "loss": 0.0399, "step": 5837 }, { "epoch": 14.170109356014581, "grad_norm": 0.48470214009284973, "learning_rate": 1.7505000000000003e-05, "loss": 0.0594, "step": 5838 }, { "epoch": 14.172539489671932, "grad_norm": 1.08933687210083, "learning_rate": 1.7508e-05, "loss": 0.0432, "step": 5839 }, { "epoch": 14.174969623329282, "grad_norm": 0.5354251861572266, "learning_rate": 1.7511e-05, "loss": 0.043, "step": 5840 }, { "epoch": 14.177399756986635, "grad_norm": 0.5407425165176392, "learning_rate": 1.7514e-05, "loss": 0.0403, "step": 5841 }, { "epoch": 14.179829890643985, "grad_norm": 0.4019291400909424, "learning_rate": 1.7517e-05, "loss": 0.0342, "step": 5842 }, { "epoch": 14.182260024301337, "grad_norm": 0.6001057028770447, "learning_rate": 1.7519999999999998e-05, "loss": 0.0337, "step": 5843 }, { "epoch": 14.184690157958688, "grad_norm": 0.6254070997238159, "learning_rate": 1.7522999999999998e-05, "loss": 0.0411, "step": 5844 }, { "epoch": 14.187120291616038, "grad_norm": 0.8917573690414429, "learning_rate": 1.7526e-05, "loss": 0.0767, "step": 5845 }, { "epoch": 14.18955042527339, "grad_norm": 0.5576424598693848, "learning_rate": 1.7529e-05, "loss": 0.056, "step": 5846 }, { "epoch": 14.19198055893074, "grad_norm": 0.6207370758056641, "learning_rate": 1.7532e-05, "loss": 0.051, "step": 5847 }, { "epoch": 14.194410692588093, "grad_norm": 0.535384476184845, "learning_rate": 1.7535e-05, "loss": 0.0272, "step": 5848 }, { "epoch": 14.196840826245444, "grad_norm": 0.7120511531829834, "learning_rate": 1.7538e-05, "loss": 0.0529, "step": 5849 }, { "epoch": 14.199270959902794, "grad_norm": 0.9115514159202576, "learning_rate": 1.7541e-05, "loss": 0.0413, "step": 5850 }, { "epoch": 14.201701093560146, "grad_norm": 0.8452960252761841, "learning_rate": 
1.7544e-05, "loss": 0.0338, "step": 5851 }, { "epoch": 14.204131227217497, "grad_norm": 0.4970962405204773, "learning_rate": 1.7547e-05, "loss": 0.0405, "step": 5852 }, { "epoch": 14.206561360874849, "grad_norm": 0.6323218941688538, "learning_rate": 1.755e-05, "loss": 0.0377, "step": 5853 }, { "epoch": 14.2089914945322, "grad_norm": 0.7380632162094116, "learning_rate": 1.7553e-05, "loss": 0.0426, "step": 5854 }, { "epoch": 14.21142162818955, "grad_norm": 0.5352709293365479, "learning_rate": 1.7556000000000003e-05, "loss": 0.0411, "step": 5855 }, { "epoch": 14.213851761846902, "grad_norm": 0.6493246555328369, "learning_rate": 1.7559000000000002e-05, "loss": 0.0263, "step": 5856 }, { "epoch": 14.216281895504252, "grad_norm": 0.7454003095626831, "learning_rate": 1.7562000000000002e-05, "loss": 0.0393, "step": 5857 }, { "epoch": 14.218712029161605, "grad_norm": 0.9158544540405273, "learning_rate": 1.7565000000000002e-05, "loss": 0.0474, "step": 5858 }, { "epoch": 14.221142162818955, "grad_norm": 0.6306418180465698, "learning_rate": 1.7568000000000002e-05, "loss": 0.0373, "step": 5859 }, { "epoch": 14.223572296476306, "grad_norm": 0.7441117167472839, "learning_rate": 1.7571e-05, "loss": 0.0522, "step": 5860 }, { "epoch": 14.226002430133658, "grad_norm": 0.7330625653266907, "learning_rate": 1.7574e-05, "loss": 0.0423, "step": 5861 }, { "epoch": 14.228432563791008, "grad_norm": 0.9675202965736389, "learning_rate": 1.7577e-05, "loss": 0.0464, "step": 5862 }, { "epoch": 14.23086269744836, "grad_norm": 0.7728137373924255, "learning_rate": 1.758e-05, "loss": 0.0482, "step": 5863 }, { "epoch": 14.233292831105711, "grad_norm": 0.910757839679718, "learning_rate": 1.7582999999999998e-05, "loss": 0.0471, "step": 5864 }, { "epoch": 14.235722964763061, "grad_norm": 1.5592979192733765, "learning_rate": 1.7586e-05, "loss": 0.0532, "step": 5865 }, { "epoch": 14.238153098420414, "grad_norm": 1.505794882774353, "learning_rate": 1.7589e-05, "loss": 0.0668, "step": 5866 }, { "epoch": 
14.240583232077764, "grad_norm": 1.0095287561416626, "learning_rate": 1.7592e-05, "loss": 0.0637, "step": 5867 }, { "epoch": 14.243013365735115, "grad_norm": 1.5440418720245361, "learning_rate": 1.7595e-05, "loss": 0.1363, "step": 5868 }, { "epoch": 14.245443499392467, "grad_norm": 1.1025199890136719, "learning_rate": 1.7598e-05, "loss": 0.2887, "step": 5869 }, { "epoch": 14.247873633049817, "grad_norm": 0.7599254846572876, "learning_rate": 1.7601e-05, "loss": 0.2759, "step": 5870 }, { "epoch": 14.25030376670717, "grad_norm": 0.6413333415985107, "learning_rate": 1.7604e-05, "loss": 0.1898, "step": 5871 }, { "epoch": 14.25273390036452, "grad_norm": 0.9177147150039673, "learning_rate": 1.7607e-05, "loss": 0.1845, "step": 5872 }, { "epoch": 14.25516403402187, "grad_norm": 0.7976751327514648, "learning_rate": 1.761e-05, "loss": 0.165, "step": 5873 }, { "epoch": 14.257594167679223, "grad_norm": 0.62505042552948, "learning_rate": 1.7613e-05, "loss": 0.1068, "step": 5874 }, { "epoch": 14.260024301336573, "grad_norm": 0.5787855982780457, "learning_rate": 1.7616000000000002e-05, "loss": 0.1207, "step": 5875 }, { "epoch": 14.262454434993925, "grad_norm": 0.8210296630859375, "learning_rate": 1.7619000000000002e-05, "loss": 0.1272, "step": 5876 }, { "epoch": 14.264884568651276, "grad_norm": 0.627981960773468, "learning_rate": 1.7622000000000002e-05, "loss": 0.0556, "step": 5877 }, { "epoch": 14.267314702308626, "grad_norm": 0.4663373827934265, "learning_rate": 1.7625e-05, "loss": 0.0584, "step": 5878 }, { "epoch": 14.269744835965978, "grad_norm": 0.4709450602531433, "learning_rate": 1.7628e-05, "loss": 0.0548, "step": 5879 }, { "epoch": 14.272174969623329, "grad_norm": 0.8409227132797241, "learning_rate": 1.7631e-05, "loss": 0.0659, "step": 5880 }, { "epoch": 14.274605103280681, "grad_norm": 0.3936697840690613, "learning_rate": 1.7634e-05, "loss": 0.0362, "step": 5881 }, { "epoch": 14.277035236938032, "grad_norm": 1.1933391094207764, "learning_rate": 1.7637e-05, "loss": 
0.0481, "step": 5882 }, { "epoch": 14.279465370595382, "grad_norm": 0.6689121127128601, "learning_rate": 1.764e-05, "loss": 0.0574, "step": 5883 }, { "epoch": 14.281895504252734, "grad_norm": 0.5572854280471802, "learning_rate": 1.7643e-05, "loss": 0.0354, "step": 5884 }, { "epoch": 14.284325637910085, "grad_norm": 0.43750062584877014, "learning_rate": 1.7646e-05, "loss": 0.041, "step": 5885 }, { "epoch": 14.286755771567437, "grad_norm": 0.6003758907318115, "learning_rate": 1.7649000000000003e-05, "loss": 0.0629, "step": 5886 }, { "epoch": 14.289185905224787, "grad_norm": 0.49679577350616455, "learning_rate": 1.7652000000000003e-05, "loss": 0.0423, "step": 5887 }, { "epoch": 14.291616038882138, "grad_norm": 1.1057639122009277, "learning_rate": 1.7655e-05, "loss": 0.0457, "step": 5888 }, { "epoch": 14.29404617253949, "grad_norm": 0.589986264705658, "learning_rate": 1.7658e-05, "loss": 0.0449, "step": 5889 }, { "epoch": 14.29647630619684, "grad_norm": 1.0472402572631836, "learning_rate": 1.7661e-05, "loss": 0.0348, "step": 5890 }, { "epoch": 14.298906439854193, "grad_norm": 0.6197836399078369, "learning_rate": 1.7664e-05, "loss": 0.0424, "step": 5891 }, { "epoch": 14.301336573511543, "grad_norm": 0.41178473830223083, "learning_rate": 1.7667e-05, "loss": 0.0439, "step": 5892 }, { "epoch": 14.303766707168894, "grad_norm": 0.5548760294914246, "learning_rate": 1.767e-05, "loss": 0.0344, "step": 5893 }, { "epoch": 14.306196840826246, "grad_norm": 0.5949868559837341, "learning_rate": 1.7673e-05, "loss": 0.0515, "step": 5894 }, { "epoch": 14.308626974483596, "grad_norm": 0.5320510268211365, "learning_rate": 1.7675999999999998e-05, "loss": 0.032, "step": 5895 }, { "epoch": 14.311057108140949, "grad_norm": 1.077077031135559, "learning_rate": 1.7679e-05, "loss": 0.0501, "step": 5896 }, { "epoch": 14.313487241798299, "grad_norm": 0.4522995352745056, "learning_rate": 1.7682e-05, "loss": 0.0234, "step": 5897 }, { "epoch": 14.31591737545565, "grad_norm": 0.38273754715919495, 
"learning_rate": 1.7685e-05, "loss": 0.0263, "step": 5898 }, { "epoch": 14.318347509113002, "grad_norm": 0.6275960803031921, "learning_rate": 1.7688e-05, "loss": 0.0338, "step": 5899 }, { "epoch": 14.320777642770352, "grad_norm": 0.6694764494895935, "learning_rate": 1.7691e-05, "loss": 0.0552, "step": 5900 }, { "epoch": 14.323207776427704, "grad_norm": 0.6636548638343811, "learning_rate": 1.7694e-05, "loss": 0.0594, "step": 5901 }, { "epoch": 14.325637910085055, "grad_norm": 0.59577876329422, "learning_rate": 1.7697e-05, "loss": 0.0374, "step": 5902 }, { "epoch": 14.328068043742405, "grad_norm": 0.7497307062149048, "learning_rate": 1.77e-05, "loss": 0.0486, "step": 5903 }, { "epoch": 14.330498177399758, "grad_norm": 0.36166390776634216, "learning_rate": 1.7703e-05, "loss": 0.0371, "step": 5904 }, { "epoch": 14.332928311057108, "grad_norm": 0.48707666993141174, "learning_rate": 1.7706e-05, "loss": 0.0335, "step": 5905 }, { "epoch": 14.335358444714458, "grad_norm": 0.7203441262245178, "learning_rate": 1.7709000000000003e-05, "loss": 0.0325, "step": 5906 }, { "epoch": 14.33778857837181, "grad_norm": 1.4468177556991577, "learning_rate": 1.7712000000000003e-05, "loss": 0.0581, "step": 5907 }, { "epoch": 14.340218712029161, "grad_norm": 0.8881824612617493, "learning_rate": 1.7715000000000002e-05, "loss": 0.0409, "step": 5908 }, { "epoch": 14.342648845686513, "grad_norm": 0.9057915806770325, "learning_rate": 1.7718000000000002e-05, "loss": 0.0391, "step": 5909 }, { "epoch": 14.345078979343864, "grad_norm": 0.7573592066764832, "learning_rate": 1.7721000000000002e-05, "loss": 0.0527, "step": 5910 }, { "epoch": 14.347509113001214, "grad_norm": 0.5565733909606934, "learning_rate": 1.7724000000000002e-05, "loss": 0.0359, "step": 5911 }, { "epoch": 14.349939246658566, "grad_norm": 0.6314507126808167, "learning_rate": 1.7727e-05, "loss": 0.0424, "step": 5912 }, { "epoch": 14.352369380315917, "grad_norm": 0.6477142572402954, "learning_rate": 1.7729999999999998e-05, "loss": 
0.0434, "step": 5913 }, { "epoch": 14.35479951397327, "grad_norm": 0.7794473171234131, "learning_rate": 1.7732999999999998e-05, "loss": 0.0535, "step": 5914 }, { "epoch": 14.35722964763062, "grad_norm": 0.7560591101646423, "learning_rate": 1.7735999999999998e-05, "loss": 0.0636, "step": 5915 }, { "epoch": 14.35965978128797, "grad_norm": 1.1797608137130737, "learning_rate": 1.7739e-05, "loss": 0.0684, "step": 5916 }, { "epoch": 14.362089914945322, "grad_norm": 1.4413259029388428, "learning_rate": 1.7742e-05, "loss": 0.0679, "step": 5917 }, { "epoch": 14.364520048602673, "grad_norm": 2.1770617961883545, "learning_rate": 1.7745e-05, "loss": 0.1917, "step": 5918 }, { "epoch": 14.366950182260025, "grad_norm": 0.9010595679283142, "learning_rate": 1.7748e-05, "loss": 0.2965, "step": 5919 }, { "epoch": 14.369380315917375, "grad_norm": 0.6586623191833496, "learning_rate": 1.7751e-05, "loss": 0.2115, "step": 5920 }, { "epoch": 14.371810449574726, "grad_norm": 0.7222536206245422, "learning_rate": 1.7754e-05, "loss": 0.1687, "step": 5921 }, { "epoch": 14.374240583232078, "grad_norm": 0.9742243885993958, "learning_rate": 1.7757e-05, "loss": 0.1799, "step": 5922 }, { "epoch": 14.376670716889429, "grad_norm": 1.1171832084655762, "learning_rate": 1.776e-05, "loss": 0.1273, "step": 5923 }, { "epoch": 14.37910085054678, "grad_norm": 0.6506574153900146, "learning_rate": 1.7763e-05, "loss": 0.1351, "step": 5924 }, { "epoch": 14.381530984204131, "grad_norm": 0.7028788924217224, "learning_rate": 1.7766e-05, "loss": 0.1117, "step": 5925 }, { "epoch": 14.383961117861482, "grad_norm": 0.6559932827949524, "learning_rate": 1.7769000000000002e-05, "loss": 0.0952, "step": 5926 }, { "epoch": 14.386391251518834, "grad_norm": 0.5435236096382141, "learning_rate": 1.7772000000000002e-05, "loss": 0.0794, "step": 5927 }, { "epoch": 14.388821385176184, "grad_norm": 0.8493327498435974, "learning_rate": 1.7775000000000002e-05, "loss": 0.115, "step": 5928 }, { "epoch": 14.391251518833537, "grad_norm": 
0.6011657118797302, "learning_rate": 1.7778e-05, "loss": 0.0622, "step": 5929 }, { "epoch": 14.393681652490887, "grad_norm": 0.5949008464813232, "learning_rate": 1.7781e-05, "loss": 0.0677, "step": 5930 }, { "epoch": 14.396111786148237, "grad_norm": 0.43350154161453247, "learning_rate": 1.7784e-05, "loss": 0.0442, "step": 5931 }, { "epoch": 14.39854191980559, "grad_norm": 0.49828851222991943, "learning_rate": 1.7787e-05, "loss": 0.0671, "step": 5932 }, { "epoch": 14.40097205346294, "grad_norm": 0.557921826839447, "learning_rate": 1.779e-05, "loss": 0.0722, "step": 5933 }, { "epoch": 14.403402187120292, "grad_norm": 0.4109615087509155, "learning_rate": 1.7793e-05, "loss": 0.0449, "step": 5934 }, { "epoch": 14.405832320777643, "grad_norm": 0.5036341547966003, "learning_rate": 1.7796e-05, "loss": 0.0689, "step": 5935 }, { "epoch": 14.408262454434993, "grad_norm": 0.443614661693573, "learning_rate": 1.7799000000000004e-05, "loss": 0.0394, "step": 5936 }, { "epoch": 14.410692588092346, "grad_norm": 0.5300524830818176, "learning_rate": 1.7802e-05, "loss": 0.0364, "step": 5937 }, { "epoch": 14.413122721749696, "grad_norm": 1.021604061126709, "learning_rate": 1.7805e-05, "loss": 0.0443, "step": 5938 }, { "epoch": 14.415552855407048, "grad_norm": 0.47849568724632263, "learning_rate": 1.7808e-05, "loss": 0.0504, "step": 5939 }, { "epoch": 14.417982989064399, "grad_norm": 0.5278506875038147, "learning_rate": 1.7811e-05, "loss": 0.0349, "step": 5940 }, { "epoch": 14.42041312272175, "grad_norm": 0.4029710590839386, "learning_rate": 1.7814e-05, "loss": 0.0324, "step": 5941 }, { "epoch": 14.422843256379101, "grad_norm": 0.6401958465576172, "learning_rate": 1.7817e-05, "loss": 0.0323, "step": 5942 }, { "epoch": 14.425273390036452, "grad_norm": 1.085909366607666, "learning_rate": 1.782e-05, "loss": 0.0567, "step": 5943 }, { "epoch": 14.427703523693804, "grad_norm": 0.8198987245559692, "learning_rate": 1.7823e-05, "loss": 0.0531, "step": 5944 }, { "epoch": 14.430133657351154, 
"grad_norm": 0.6187098622322083, "learning_rate": 1.7826e-05, "loss": 0.0272, "step": 5945 }, { "epoch": 14.432563791008505, "grad_norm": 0.44625377655029297, "learning_rate": 1.7829e-05, "loss": 0.0339, "step": 5946 }, { "epoch": 14.434993924665857, "grad_norm": 0.64915931224823, "learning_rate": 1.7832e-05, "loss": 0.0434, "step": 5947 }, { "epoch": 14.437424058323208, "grad_norm": 0.6895845532417297, "learning_rate": 1.7835e-05, "loss": 0.0426, "step": 5948 }, { "epoch": 14.439854191980558, "grad_norm": 1.5712146759033203, "learning_rate": 1.7838e-05, "loss": 0.0381, "step": 5949 }, { "epoch": 14.44228432563791, "grad_norm": 0.4557584524154663, "learning_rate": 1.7841e-05, "loss": 0.0339, "step": 5950 }, { "epoch": 14.44471445929526, "grad_norm": 0.47477391362190247, "learning_rate": 1.7844e-05, "loss": 0.0261, "step": 5951 }, { "epoch": 14.447144592952613, "grad_norm": 0.57861328125, "learning_rate": 1.7847e-05, "loss": 0.0532, "step": 5952 }, { "epoch": 14.449574726609963, "grad_norm": 0.6340107917785645, "learning_rate": 1.785e-05, "loss": 0.0389, "step": 5953 }, { "epoch": 14.452004860267314, "grad_norm": 0.7413413524627686, "learning_rate": 1.7853e-05, "loss": 0.0418, "step": 5954 }, { "epoch": 14.454434993924666, "grad_norm": 0.5624810457229614, "learning_rate": 1.7856e-05, "loss": 0.0384, "step": 5955 }, { "epoch": 14.456865127582017, "grad_norm": 0.8258420825004578, "learning_rate": 1.7859000000000003e-05, "loss": 0.0438, "step": 5956 }, { "epoch": 14.459295261239369, "grad_norm": 0.6420139670372009, "learning_rate": 1.7862000000000003e-05, "loss": 0.0557, "step": 5957 }, { "epoch": 14.46172539489672, "grad_norm": 0.8885139226913452, "learning_rate": 1.7865000000000003e-05, "loss": 0.0662, "step": 5958 }, { "epoch": 14.46415552855407, "grad_norm": 0.5940405130386353, "learning_rate": 1.7868000000000002e-05, "loss": 0.0407, "step": 5959 }, { "epoch": 14.466585662211422, "grad_norm": 0.5662451982498169, "learning_rate": 1.7871000000000002e-05, "loss": 
0.0362, "step": 5960 }, { "epoch": 14.469015795868772, "grad_norm": 0.5374141335487366, "learning_rate": 1.7874000000000002e-05, "loss": 0.0349, "step": 5961 }, { "epoch": 14.471445929526125, "grad_norm": 0.8629875779151917, "learning_rate": 1.7877e-05, "loss": 0.0832, "step": 5962 }, { "epoch": 14.473876063183475, "grad_norm": 0.6122832894325256, "learning_rate": 1.7879999999999998e-05, "loss": 0.0493, "step": 5963 }, { "epoch": 14.476306196840826, "grad_norm": 1.2153408527374268, "learning_rate": 1.7882999999999998e-05, "loss": 0.0647, "step": 5964 }, { "epoch": 14.478736330498178, "grad_norm": 7.152200222015381, "learning_rate": 1.7885999999999998e-05, "loss": 0.0745, "step": 5965 }, { "epoch": 14.481166464155528, "grad_norm": 1.6142889261245728, "learning_rate": 1.7889e-05, "loss": 0.0905, "step": 5966 }, { "epoch": 14.48359659781288, "grad_norm": 1.8517242670059204, "learning_rate": 1.7892e-05, "loss": 0.1001, "step": 5967 }, { "epoch": 14.486026731470231, "grad_norm": 1.561193585395813, "learning_rate": 1.7895e-05, "loss": 0.1164, "step": 5968 }, { "epoch": 14.488456865127581, "grad_norm": 1.1550976037979126, "learning_rate": 1.7898e-05, "loss": 0.2832, "step": 5969 }, { "epoch": 14.490886998784934, "grad_norm": 0.7668511867523193, "learning_rate": 1.7901e-05, "loss": 0.227, "step": 5970 }, { "epoch": 14.493317132442284, "grad_norm": 0.7935046553611755, "learning_rate": 1.7904e-05, "loss": 0.2113, "step": 5971 }, { "epoch": 14.495747266099636, "grad_norm": 0.8009583353996277, "learning_rate": 1.7907e-05, "loss": 0.1272, "step": 5972 }, { "epoch": 14.498177399756987, "grad_norm": 0.9302282333374023, "learning_rate": 1.791e-05, "loss": 0.1421, "step": 5973 }, { "epoch": 14.500607533414337, "grad_norm": 0.7679850459098816, "learning_rate": 1.7913e-05, "loss": 0.1224, "step": 5974 }, { "epoch": 14.50303766707169, "grad_norm": 0.9800891280174255, "learning_rate": 1.7916e-05, "loss": 0.097, "step": 5975 }, { "epoch": 14.50546780072904, "grad_norm": 
0.7220163941383362, "learning_rate": 1.7919000000000002e-05, "loss": 0.0891, "step": 5976 }, { "epoch": 14.507897934386392, "grad_norm": 0.7190508246421814, "learning_rate": 1.7922000000000002e-05, "loss": 0.0795, "step": 5977 }, { "epoch": 14.510328068043743, "grad_norm": 0.6016267538070679, "learning_rate": 1.7925000000000002e-05, "loss": 0.0606, "step": 5978 }, { "epoch": 14.512758201701093, "grad_norm": 0.5871527791023254, "learning_rate": 1.7928000000000002e-05, "loss": 0.0634, "step": 5979 }, { "epoch": 14.515188335358445, "grad_norm": 0.50627201795578, "learning_rate": 1.7931e-05, "loss": 0.0438, "step": 5980 }, { "epoch": 14.517618469015796, "grad_norm": 0.7982969880104065, "learning_rate": 1.7934e-05, "loss": 0.0506, "step": 5981 }, { "epoch": 14.520048602673146, "grad_norm": 0.6422207355499268, "learning_rate": 1.7937e-05, "loss": 0.0516, "step": 5982 }, { "epoch": 14.522478736330498, "grad_norm": 0.6729443669319153, "learning_rate": 1.794e-05, "loss": 0.0628, "step": 5983 }, { "epoch": 14.524908869987849, "grad_norm": 0.5972122550010681, "learning_rate": 1.7943e-05, "loss": 0.0428, "step": 5984 }, { "epoch": 14.527339003645201, "grad_norm": 0.6076911091804504, "learning_rate": 1.7946e-05, "loss": 0.0518, "step": 5985 }, { "epoch": 14.529769137302551, "grad_norm": 0.9565354585647583, "learning_rate": 1.7949e-05, "loss": 0.066, "step": 5986 }, { "epoch": 14.532199270959904, "grad_norm": 0.7125601172447205, "learning_rate": 1.7952e-05, "loss": 0.0593, "step": 5987 }, { "epoch": 14.534629404617254, "grad_norm": 0.4868280589580536, "learning_rate": 1.7955e-05, "loss": 0.0487, "step": 5988 }, { "epoch": 14.537059538274605, "grad_norm": 0.49493834376335144, "learning_rate": 1.7958e-05, "loss": 0.0424, "step": 5989 }, { "epoch": 14.539489671931957, "grad_norm": 0.5466259121894836, "learning_rate": 1.7961e-05, "loss": 0.0415, "step": 5990 }, { "epoch": 14.541919805589307, "grad_norm": 0.7320353388786316, "learning_rate": 1.7964e-05, "loss": 0.0504, "step": 5991 
}, { "epoch": 14.544349939246658, "grad_norm": 0.4703092575073242, "learning_rate": 1.7967e-05, "loss": 0.0454, "step": 5992 }, { "epoch": 14.54678007290401, "grad_norm": 0.6790112853050232, "learning_rate": 1.797e-05, "loss": 0.0377, "step": 5993 }, { "epoch": 14.54921020656136, "grad_norm": 0.6234554648399353, "learning_rate": 1.7973e-05, "loss": 0.0561, "step": 5994 }, { "epoch": 14.551640340218713, "grad_norm": 0.5437823534011841, "learning_rate": 1.7976e-05, "loss": 0.0607, "step": 5995 }, { "epoch": 14.554070473876063, "grad_norm": 0.5518783330917358, "learning_rate": 1.7979000000000002e-05, "loss": 0.0321, "step": 5996 }, { "epoch": 14.556500607533414, "grad_norm": 0.6089041233062744, "learning_rate": 1.7982e-05, "loss": 0.0585, "step": 5997 }, { "epoch": 14.558930741190766, "grad_norm": 0.7021185159683228, "learning_rate": 1.7985e-05, "loss": 0.0375, "step": 5998 }, { "epoch": 14.561360874848116, "grad_norm": 0.772318959236145, "learning_rate": 1.7988e-05, "loss": 0.0351, "step": 5999 }, { "epoch": 14.563791008505468, "grad_norm": 0.7703716158866882, "learning_rate": 1.7991e-05, "loss": 0.0376, "step": 6000 }, { "epoch": 14.563791008505468, "eval_cer": 0.08971166974540197, "eval_loss": 0.2693476676940918, "eval_runtime": 8.1163, "eval_samples_per_second": 12.444, "eval_steps_per_second": 0.493, "eval_wer": 0.2786596119929453, "step": 6000 }, { "epoch": 14.566221142162819, "grad_norm": 0.48438867926597595, "learning_rate": 1.7994e-05, "loss": 0.0387, "step": 6001 }, { "epoch": 14.56865127582017, "grad_norm": 0.698074996471405, "learning_rate": 1.7997e-05, "loss": 0.063, "step": 6002 }, { "epoch": 14.571081409477522, "grad_norm": 0.5454766154289246, "learning_rate": 1.8e-05, "loss": 0.0595, "step": 6003 }, { "epoch": 14.573511543134872, "grad_norm": 0.5557267665863037, "learning_rate": 1.8003e-05, "loss": 0.0563, "step": 6004 }, { "epoch": 14.575941676792224, "grad_norm": 0.7276142835617065, "learning_rate": 1.8006e-05, "loss": 0.0822, "step": 6005 }, { 
"epoch": 14.578371810449575, "grad_norm": 0.5134299993515015, "learning_rate": 1.8009e-05, "loss": 0.029, "step": 6006 }, { "epoch": 14.580801944106925, "grad_norm": 0.7276889085769653, "learning_rate": 1.8012000000000003e-05, "loss": 0.0541, "step": 6007 }, { "epoch": 14.583232077764277, "grad_norm": 0.6846140623092651, "learning_rate": 1.8015000000000003e-05, "loss": 0.0564, "step": 6008 }, { "epoch": 14.585662211421628, "grad_norm": 0.8325905799865723, "learning_rate": 1.8018000000000003e-05, "loss": 0.0573, "step": 6009 }, { "epoch": 14.58809234507898, "grad_norm": 0.6609763503074646, "learning_rate": 1.8021000000000002e-05, "loss": 0.0365, "step": 6010 }, { "epoch": 14.59052247873633, "grad_norm": 1.9809858798980713, "learning_rate": 1.8024e-05, "loss": 0.0728, "step": 6011 }, { "epoch": 14.592952612393681, "grad_norm": 0.6242518424987793, "learning_rate": 1.8027e-05, "loss": 0.0397, "step": 6012 }, { "epoch": 14.595382746051033, "grad_norm": 1.4600776433944702, "learning_rate": 1.803e-05, "loss": 0.0587, "step": 6013 }, { "epoch": 14.597812879708384, "grad_norm": 0.7815288305282593, "learning_rate": 1.8032999999999998e-05, "loss": 0.0573, "step": 6014 }, { "epoch": 14.600243013365736, "grad_norm": 1.2504901885986328, "learning_rate": 1.8035999999999998e-05, "loss": 0.0505, "step": 6015 }, { "epoch": 14.602673147023086, "grad_norm": 1.2018942832946777, "learning_rate": 1.8038999999999998e-05, "loss": 0.0627, "step": 6016 }, { "epoch": 14.605103280680437, "grad_norm": 0.8675606846809387, "learning_rate": 1.8042e-05, "loss": 0.064, "step": 6017 }, { "epoch": 14.607533414337789, "grad_norm": 1.45564603805542, "learning_rate": 1.8045e-05, "loss": 0.0885, "step": 6018 }, { "epoch": 14.60996354799514, "grad_norm": 1.0382329225540161, "learning_rate": 1.8048e-05, "loss": 0.304, "step": 6019 }, { "epoch": 14.612393681652492, "grad_norm": 0.8727678060531616, "learning_rate": 1.8051e-05, "loss": 0.2288, "step": 6020 }, { "epoch": 14.614823815309842, "grad_norm": 
0.7376929521560669, "learning_rate": 1.8054e-05, "loss": 0.1508, "step": 6021 }, { "epoch": 14.617253948967193, "grad_norm": 0.8001503348350525, "learning_rate": 1.8057e-05, "loss": 0.2135, "step": 6022 }, { "epoch": 14.619684082624545, "grad_norm": 1.2075227499008179, "learning_rate": 1.806e-05, "loss": 0.2596, "step": 6023 }, { "epoch": 14.622114216281895, "grad_norm": 0.6390612721443176, "learning_rate": 1.8063e-05, "loss": 0.1364, "step": 6024 }, { "epoch": 14.624544349939246, "grad_norm": 0.8433422446250916, "learning_rate": 1.8066e-05, "loss": 0.1249, "step": 6025 }, { "epoch": 14.626974483596598, "grad_norm": 0.511867105960846, "learning_rate": 1.8069e-05, "loss": 0.0926, "step": 6026 }, { "epoch": 14.629404617253948, "grad_norm": 0.7697824239730835, "learning_rate": 1.8072000000000002e-05, "loss": 0.0641, "step": 6027 }, { "epoch": 14.6318347509113, "grad_norm": 0.6734755039215088, "learning_rate": 1.8075000000000002e-05, "loss": 0.0963, "step": 6028 }, { "epoch": 14.634264884568651, "grad_norm": 0.4476313292980194, "learning_rate": 1.8078000000000002e-05, "loss": 0.0699, "step": 6029 }, { "epoch": 14.636695018226002, "grad_norm": 0.5254029631614685, "learning_rate": 1.8081000000000002e-05, "loss": 0.058, "step": 6030 }, { "epoch": 14.639125151883354, "grad_norm": 0.3881220817565918, "learning_rate": 1.8084e-05, "loss": 0.0435, "step": 6031 }, { "epoch": 14.641555285540704, "grad_norm": 1.054410696029663, "learning_rate": 1.8087e-05, "loss": 0.055, "step": 6032 }, { "epoch": 14.643985419198057, "grad_norm": 0.4940447211265564, "learning_rate": 1.809e-05, "loss": 0.0456, "step": 6033 }, { "epoch": 14.646415552855407, "grad_norm": 0.3962053954601288, "learning_rate": 1.8093e-05, "loss": 0.037, "step": 6034 }, { "epoch": 14.648845686512757, "grad_norm": 0.43394264578819275, "learning_rate": 1.8096e-05, "loss": 0.0358, "step": 6035 }, { "epoch": 14.65127582017011, "grad_norm": 0.7748329639434814, "learning_rate": 1.8098999999999997e-05, "loss": 0.1071, "step": 
6036 }, { "epoch": 14.65370595382746, "grad_norm": 0.3769468367099762, "learning_rate": 1.8102e-05, "loss": 0.0353, "step": 6037 }, { "epoch": 14.656136087484812, "grad_norm": 0.6349499225616455, "learning_rate": 1.8105e-05, "loss": 0.0608, "step": 6038 }, { "epoch": 14.658566221142163, "grad_norm": 0.6687986850738525, "learning_rate": 1.8108e-05, "loss": 0.0324, "step": 6039 }, { "epoch": 14.660996354799513, "grad_norm": 0.5772004127502441, "learning_rate": 1.8111e-05, "loss": 0.0566, "step": 6040 }, { "epoch": 14.663426488456865, "grad_norm": 0.6065280437469482, "learning_rate": 1.8114e-05, "loss": 0.0456, "step": 6041 }, { "epoch": 14.665856622114216, "grad_norm": 0.6387342810630798, "learning_rate": 1.8117e-05, "loss": 0.0412, "step": 6042 }, { "epoch": 14.668286755771568, "grad_norm": 0.7840889096260071, "learning_rate": 1.812e-05, "loss": 0.051, "step": 6043 }, { "epoch": 14.670716889428919, "grad_norm": 0.6212177872657776, "learning_rate": 1.8123e-05, "loss": 0.0376, "step": 6044 }, { "epoch": 14.673147023086269, "grad_norm": 0.425228089094162, "learning_rate": 1.8126e-05, "loss": 0.0407, "step": 6045 }, { "epoch": 14.675577156743621, "grad_norm": 0.6620621681213379, "learning_rate": 1.8129e-05, "loss": 0.0495, "step": 6046 }, { "epoch": 14.678007290400972, "grad_norm": 0.5583348274230957, "learning_rate": 1.8132000000000002e-05, "loss": 0.0324, "step": 6047 }, { "epoch": 14.680437424058324, "grad_norm": 0.41261765360832214, "learning_rate": 1.8135000000000002e-05, "loss": 0.0234, "step": 6048 }, { "epoch": 14.682867557715674, "grad_norm": 0.6499979496002197, "learning_rate": 1.8138e-05, "loss": 0.0459, "step": 6049 }, { "epoch": 14.685297691373025, "grad_norm": 0.6076393127441406, "learning_rate": 1.8141e-05, "loss": 0.0519, "step": 6050 }, { "epoch": 14.687727825030377, "grad_norm": 0.5927857756614685, "learning_rate": 1.8144e-05, "loss": 0.0563, "step": 6051 }, { "epoch": 14.690157958687728, "grad_norm": 0.6292839050292969, "learning_rate": 1.8147e-05, 
"loss": 0.0516, "step": 6052 }, { "epoch": 14.69258809234508, "grad_norm": 0.6774417757987976, "learning_rate": 1.815e-05, "loss": 0.0389, "step": 6053 }, { "epoch": 14.69501822600243, "grad_norm": 0.7787654995918274, "learning_rate": 1.8153e-05, "loss": 0.0918, "step": 6054 }, { "epoch": 14.69744835965978, "grad_norm": 0.9342547655105591, "learning_rate": 1.8156e-05, "loss": 0.0591, "step": 6055 }, { "epoch": 14.699878493317133, "grad_norm": 0.44422465562820435, "learning_rate": 1.8159e-05, "loss": 0.0255, "step": 6056 }, { "epoch": 14.702308626974483, "grad_norm": 0.6817962527275085, "learning_rate": 1.8162000000000003e-05, "loss": 0.0471, "step": 6057 }, { "epoch": 14.704738760631834, "grad_norm": 0.8185964226722717, "learning_rate": 1.8165000000000003e-05, "loss": 0.0547, "step": 6058 }, { "epoch": 14.707168894289186, "grad_norm": 0.919131875038147, "learning_rate": 1.8168000000000003e-05, "loss": 0.0562, "step": 6059 }, { "epoch": 14.709599027946537, "grad_norm": 0.8104518055915833, "learning_rate": 1.8171e-05, "loss": 0.0426, "step": 6060 }, { "epoch": 14.712029161603889, "grad_norm": 0.9527058601379395, "learning_rate": 1.8174e-05, "loss": 0.0503, "step": 6061 }, { "epoch": 14.71445929526124, "grad_norm": 0.7596612572669983, "learning_rate": 1.8177e-05, "loss": 0.0437, "step": 6062 }, { "epoch": 14.716889428918591, "grad_norm": 0.9736880660057068, "learning_rate": 1.818e-05, "loss": 0.064, "step": 6063 }, { "epoch": 14.719319562575942, "grad_norm": 1.2712315320968628, "learning_rate": 1.8183e-05, "loss": 0.0591, "step": 6064 }, { "epoch": 14.721749696233292, "grad_norm": 0.8201224207878113, "learning_rate": 1.8186e-05, "loss": 0.042, "step": 6065 }, { "epoch": 14.724179829890645, "grad_norm": 0.9584680199623108, "learning_rate": 1.8188999999999998e-05, "loss": 0.0785, "step": 6066 }, { "epoch": 14.726609963547995, "grad_norm": 0.9868584871292114, "learning_rate": 1.8192e-05, "loss": 0.0573, "step": 6067 }, { "epoch": 14.729040097205345, "grad_norm": 
2.2694623470306396, "learning_rate": 1.8195e-05, "loss": 0.0838, "step": 6068 }, { "epoch": 14.731470230862698, "grad_norm": 1.0398969650268555, "learning_rate": 1.8198e-05, "loss": 0.341, "step": 6069 }, { "epoch": 14.733900364520048, "grad_norm": 0.8519121408462524, "learning_rate": 1.8201e-05, "loss": 0.2952, "step": 6070 }, { "epoch": 14.7363304981774, "grad_norm": 0.5602055788040161, "learning_rate": 1.8204e-05, "loss": 0.1811, "step": 6071 }, { "epoch": 14.73876063183475, "grad_norm": 0.6130563616752625, "learning_rate": 1.8207e-05, "loss": 0.1578, "step": 6072 }, { "epoch": 14.741190765492101, "grad_norm": 1.3267053365707397, "learning_rate": 1.821e-05, "loss": 0.1553, "step": 6073 }, { "epoch": 14.743620899149454, "grad_norm": 0.8337870836257935, "learning_rate": 1.8213e-05, "loss": 0.1632, "step": 6074 }, { "epoch": 14.746051032806804, "grad_norm": 0.5393021106719971, "learning_rate": 1.8216e-05, "loss": 0.1125, "step": 6075 }, { "epoch": 14.748481166464156, "grad_norm": 0.5365158319473267, "learning_rate": 1.8219e-05, "loss": 0.1189, "step": 6076 }, { "epoch": 14.750911300121507, "grad_norm": 0.7447524666786194, "learning_rate": 1.8222000000000003e-05, "loss": 0.0903, "step": 6077 }, { "epoch": 14.753341433778857, "grad_norm": 0.5605825185775757, "learning_rate": 1.8225000000000003e-05, "loss": 0.0563, "step": 6078 }, { "epoch": 14.75577156743621, "grad_norm": 0.42722174525260925, "learning_rate": 1.8228000000000002e-05, "loss": 0.0544, "step": 6079 }, { "epoch": 14.75820170109356, "grad_norm": 0.6616435647010803, "learning_rate": 1.8231000000000002e-05, "loss": 0.0895, "step": 6080 }, { "epoch": 14.760631834750912, "grad_norm": 0.5810850262641907, "learning_rate": 1.8234000000000002e-05, "loss": 0.0457, "step": 6081 }, { "epoch": 14.763061968408262, "grad_norm": 0.6124719977378845, "learning_rate": 1.8237000000000002e-05, "loss": 0.0285, "step": 6082 }, { "epoch": 14.765492102065613, "grad_norm": 0.64861661195755, "learning_rate": 1.824e-05, "loss": 
0.0651, "step": 6083 }, { "epoch": 14.767922235722965, "grad_norm": 1.2184476852416992, "learning_rate": 1.8243e-05, "loss": 0.0498, "step": 6084 }, { "epoch": 14.770352369380316, "grad_norm": 0.5196377635002136, "learning_rate": 1.8245999999999998e-05, "loss": 0.043, "step": 6085 }, { "epoch": 14.772782503037668, "grad_norm": 0.5499024987220764, "learning_rate": 1.8248999999999998e-05, "loss": 0.0571, "step": 6086 }, { "epoch": 14.775212636695018, "grad_norm": 0.6706680655479431, "learning_rate": 1.8252e-05, "loss": 0.0666, "step": 6087 }, { "epoch": 14.777642770352369, "grad_norm": 0.44167083501815796, "learning_rate": 1.8255e-05, "loss": 0.0358, "step": 6088 }, { "epoch": 14.780072904009721, "grad_norm": 0.5299032926559448, "learning_rate": 1.8258e-05, "loss": 0.0655, "step": 6089 }, { "epoch": 14.782503037667071, "grad_norm": 0.4602087140083313, "learning_rate": 1.8261e-05, "loss": 0.0297, "step": 6090 }, { "epoch": 14.784933171324424, "grad_norm": 0.5103431344032288, "learning_rate": 1.8264e-05, "loss": 0.0406, "step": 6091 }, { "epoch": 14.787363304981774, "grad_norm": 0.7576766014099121, "learning_rate": 1.8267e-05, "loss": 0.0753, "step": 6092 }, { "epoch": 14.789793438639125, "grad_norm": 0.4615596532821655, "learning_rate": 1.827e-05, "loss": 0.0426, "step": 6093 }, { "epoch": 14.792223572296477, "grad_norm": 0.5810841917991638, "learning_rate": 1.8273e-05, "loss": 0.0363, "step": 6094 }, { "epoch": 14.794653705953827, "grad_norm": 0.4755033552646637, "learning_rate": 1.8276e-05, "loss": 0.0433, "step": 6095 }, { "epoch": 14.79708383961118, "grad_norm": 0.48595163226127625, "learning_rate": 1.8279e-05, "loss": 0.039, "step": 6096 }, { "epoch": 14.79951397326853, "grad_norm": 0.8740439414978027, "learning_rate": 1.8282000000000002e-05, "loss": 0.0665, "step": 6097 }, { "epoch": 14.80194410692588, "grad_norm": 0.764750599861145, "learning_rate": 1.8285000000000002e-05, "loss": 0.0429, "step": 6098 }, { "epoch": 14.804374240583233, "grad_norm": 
0.5748232007026672, "learning_rate": 1.8288000000000002e-05, "loss": 0.0436, "step": 6099 }, { "epoch": 14.806804374240583, "grad_norm": 0.4763253927230835, "learning_rate": 1.8291e-05, "loss": 0.04, "step": 6100 }, { "epoch": 14.809234507897933, "grad_norm": 0.5129628777503967, "learning_rate": 1.8294e-05, "loss": 0.0262, "step": 6101 }, { "epoch": 14.811664641555286, "grad_norm": 0.4361320734024048, "learning_rate": 1.8297e-05, "loss": 0.0244, "step": 6102 }, { "epoch": 14.814094775212636, "grad_norm": 0.5968680381774902, "learning_rate": 1.83e-05, "loss": 0.0395, "step": 6103 }, { "epoch": 14.816524908869988, "grad_norm": 0.7440581321716309, "learning_rate": 1.8303e-05, "loss": 0.0586, "step": 6104 }, { "epoch": 14.818955042527339, "grad_norm": 0.9688655138015747, "learning_rate": 1.8306e-05, "loss": 0.0343, "step": 6105 }, { "epoch": 14.821385176184691, "grad_norm": 0.7566888928413391, "learning_rate": 1.8309e-05, "loss": 0.059, "step": 6106 }, { "epoch": 14.823815309842042, "grad_norm": 0.7516209483146667, "learning_rate": 1.8312000000000004e-05, "loss": 0.0408, "step": 6107 }, { "epoch": 14.826245443499392, "grad_norm": 0.6432201266288757, "learning_rate": 1.8315000000000003e-05, "loss": 0.0325, "step": 6108 }, { "epoch": 14.828675577156744, "grad_norm": 1.1688523292541504, "learning_rate": 1.8318e-05, "loss": 0.052, "step": 6109 }, { "epoch": 14.831105710814095, "grad_norm": 0.6554526090621948, "learning_rate": 1.8321e-05, "loss": 0.047, "step": 6110 }, { "epoch": 14.833535844471445, "grad_norm": 0.5354809761047363, "learning_rate": 1.8324e-05, "loss": 0.0351, "step": 6111 }, { "epoch": 14.835965978128797, "grad_norm": 0.7638748288154602, "learning_rate": 1.8327e-05, "loss": 0.0543, "step": 6112 }, { "epoch": 14.838396111786148, "grad_norm": 0.4532495141029358, "learning_rate": 1.833e-05, "loss": 0.0339, "step": 6113 }, { "epoch": 14.8408262454435, "grad_norm": 0.5608651041984558, "learning_rate": 1.8333e-05, "loss": 0.0468, "step": 6114 }, { "epoch": 
14.84325637910085, "grad_norm": 1.2120161056518555, "learning_rate": 1.8336e-05, "loss": 0.1002, "step": 6115 }, { "epoch": 14.845686512758201, "grad_norm": 1.152968168258667, "learning_rate": 1.8339e-05, "loss": 0.0736, "step": 6116 }, { "epoch": 14.848116646415553, "grad_norm": 1.0190799236297607, "learning_rate": 1.8342e-05, "loss": 0.0616, "step": 6117 }, { "epoch": 14.850546780072904, "grad_norm": 1.8860667943954468, "learning_rate": 1.8345e-05, "loss": 0.0982, "step": 6118 }, { "epoch": 14.852976913730256, "grad_norm": 2.0575664043426514, "learning_rate": 1.8348e-05, "loss": 0.386, "step": 6119 }, { "epoch": 14.855407047387606, "grad_norm": 1.1743959188461304, "learning_rate": 1.8351e-05, "loss": 0.2501, "step": 6120 }, { "epoch": 14.857837181044957, "grad_norm": 1.0362063646316528, "learning_rate": 1.8354e-05, "loss": 0.1839, "step": 6121 }, { "epoch": 14.860267314702309, "grad_norm": 0.6583211421966553, "learning_rate": 1.8357e-05, "loss": 0.1674, "step": 6122 }, { "epoch": 14.86269744835966, "grad_norm": 0.8425068259239197, "learning_rate": 1.836e-05, "loss": 0.1325, "step": 6123 }, { "epoch": 14.865127582017012, "grad_norm": 1.1921255588531494, "learning_rate": 1.8363e-05, "loss": 0.161, "step": 6124 }, { "epoch": 14.867557715674362, "grad_norm": 0.9521715641021729, "learning_rate": 1.8366e-05, "loss": 0.1219, "step": 6125 }, { "epoch": 14.869987849331713, "grad_norm": 0.6621464490890503, "learning_rate": 1.8369e-05, "loss": 0.092, "step": 6126 }, { "epoch": 14.872417982989065, "grad_norm": 0.5353065133094788, "learning_rate": 1.8372000000000003e-05, "loss": 0.075, "step": 6127 }, { "epoch": 14.874848116646415, "grad_norm": 0.764825701713562, "learning_rate": 1.8375000000000003e-05, "loss": 0.055, "step": 6128 }, { "epoch": 14.877278250303767, "grad_norm": 0.6202283501625061, "learning_rate": 1.8378000000000003e-05, "loss": 0.0621, "step": 6129 }, { "epoch": 14.879708383961118, "grad_norm": 0.5701768398284912, "learning_rate": 1.8381000000000002e-05, 
"loss": 0.0469, "step": 6130 }, { "epoch": 14.882138517618468, "grad_norm": 0.8062351942062378, "learning_rate": 1.8384000000000002e-05, "loss": 0.0495, "step": 6131 }, { "epoch": 14.88456865127582, "grad_norm": 0.9965530633926392, "learning_rate": 1.8387000000000002e-05, "loss": 0.046, "step": 6132 }, { "epoch": 14.886998784933171, "grad_norm": 0.5871007442474365, "learning_rate": 1.8390000000000002e-05, "loss": 0.0722, "step": 6133 }, { "epoch": 14.889428918590523, "grad_norm": 0.7908463478088379, "learning_rate": 1.8392999999999998e-05, "loss": 0.0508, "step": 6134 }, { "epoch": 14.891859052247874, "grad_norm": 0.5514386892318726, "learning_rate": 1.8395999999999998e-05, "loss": 0.0474, "step": 6135 }, { "epoch": 14.894289185905224, "grad_norm": 0.47649508714675903, "learning_rate": 1.8398999999999998e-05, "loss": 0.0359, "step": 6136 }, { "epoch": 14.896719319562576, "grad_norm": 0.40098318457603455, "learning_rate": 1.8401999999999998e-05, "loss": 0.0389, "step": 6137 }, { "epoch": 14.899149453219927, "grad_norm": 0.6810685992240906, "learning_rate": 1.8405e-05, "loss": 0.0412, "step": 6138 }, { "epoch": 14.90157958687728, "grad_norm": 0.4345964193344116, "learning_rate": 1.8408e-05, "loss": 0.0438, "step": 6139 }, { "epoch": 14.90400972053463, "grad_norm": 0.415131151676178, "learning_rate": 1.8411e-05, "loss": 0.03, "step": 6140 }, { "epoch": 14.90643985419198, "grad_norm": 0.5744482278823853, "learning_rate": 1.8414e-05, "loss": 0.0594, "step": 6141 }, { "epoch": 14.908869987849332, "grad_norm": 0.551821231842041, "learning_rate": 1.8417e-05, "loss": 0.0348, "step": 6142 }, { "epoch": 14.911300121506683, "grad_norm": 0.6408091187477112, "learning_rate": 1.842e-05, "loss": 0.0412, "step": 6143 }, { "epoch": 14.913730255164033, "grad_norm": 0.4173608720302582, "learning_rate": 1.8423e-05, "loss": 0.0407, "step": 6144 }, { "epoch": 14.916160388821385, "grad_norm": 0.4386938512325287, "learning_rate": 1.8426e-05, "loss": 0.0362, "step": 6145 }, { "epoch": 
14.918590522478736, "grad_norm": 0.8777297139167786, "learning_rate": 1.8429e-05, "loss": 0.0551, "step": 6146 }, { "epoch": 14.921020656136088, "grad_norm": 0.7911838293075562, "learning_rate": 1.8432e-05, "loss": 0.0516, "step": 6147 }, { "epoch": 14.923450789793439, "grad_norm": 0.5799395442008972, "learning_rate": 1.8435000000000002e-05, "loss": 0.0381, "step": 6148 }, { "epoch": 14.925880923450789, "grad_norm": 0.5661290884017944, "learning_rate": 1.8438000000000002e-05, "loss": 0.0371, "step": 6149 }, { "epoch": 14.928311057108141, "grad_norm": 0.6661924719810486, "learning_rate": 1.8441000000000002e-05, "loss": 0.0498, "step": 6150 }, { "epoch": 14.930741190765492, "grad_norm": 0.7276904582977295, "learning_rate": 1.8444e-05, "loss": 0.0494, "step": 6151 }, { "epoch": 14.933171324422844, "grad_norm": 0.7617564797401428, "learning_rate": 1.8447e-05, "loss": 0.0533, "step": 6152 }, { "epoch": 14.935601458080194, "grad_norm": 0.8661511540412903, "learning_rate": 1.845e-05, "loss": 0.1169, "step": 6153 }, { "epoch": 14.938031591737545, "grad_norm": 0.4733335077762604, "learning_rate": 1.8453e-05, "loss": 0.0419, "step": 6154 }, { "epoch": 14.940461725394897, "grad_norm": 0.8391436338424683, "learning_rate": 1.8456e-05, "loss": 0.0349, "step": 6155 }, { "epoch": 14.942891859052247, "grad_norm": 0.8286105394363403, "learning_rate": 1.8459e-05, "loss": 0.0903, "step": 6156 }, { "epoch": 14.9453219927096, "grad_norm": 0.6249679923057556, "learning_rate": 1.8462e-05, "loss": 0.0425, "step": 6157 }, { "epoch": 14.94775212636695, "grad_norm": 0.7452312707901001, "learning_rate": 1.8465e-05, "loss": 0.0603, "step": 6158 }, { "epoch": 14.9501822600243, "grad_norm": 0.6180629730224609, "learning_rate": 1.8468e-05, "loss": 0.0428, "step": 6159 }, { "epoch": 14.952612393681653, "grad_norm": 0.58921217918396, "learning_rate": 1.8471e-05, "loss": 0.0359, "step": 6160 }, { "epoch": 14.955042527339003, "grad_norm": 0.7029327154159546, "learning_rate": 1.8474e-05, "loss": 
0.0431, "step": 6161 }, { "epoch": 14.957472660996356, "grad_norm": 1.2568339109420776, "learning_rate": 1.8477e-05, "loss": 0.0598, "step": 6162 }, { "epoch": 14.959902794653706, "grad_norm": 1.2031769752502441, "learning_rate": 1.848e-05, "loss": 0.0452, "step": 6163 }, { "epoch": 14.962332928311056, "grad_norm": 1.4745385646820068, "learning_rate": 1.8483e-05, "loss": 0.0683, "step": 6164 }, { "epoch": 14.964763061968409, "grad_norm": 0.9565783739089966, "learning_rate": 1.8486e-05, "loss": 0.0756, "step": 6165 }, { "epoch": 14.96719319562576, "grad_norm": 0.8918164372444153, "learning_rate": 1.8489e-05, "loss": 0.0464, "step": 6166 }, { "epoch": 14.969623329283111, "grad_norm": 1.4514979124069214, "learning_rate": 1.8492e-05, "loss": 0.1147, "step": 6167 }, { "epoch": 14.972053462940462, "grad_norm": 2.3291544914245605, "learning_rate": 1.8495e-05, "loss": 0.1143, "step": 6168 }, { "epoch": 14.974483596597812, "grad_norm": 1.2720322608947754, "learning_rate": 1.8498e-05, "loss": 0.2625, "step": 6169 }, { "epoch": 14.976913730255164, "grad_norm": 0.8484690189361572, "learning_rate": 1.8501e-05, "loss": 0.2089, "step": 6170 }, { "epoch": 14.979343863912515, "grad_norm": 0.8113705515861511, "learning_rate": 1.8504e-05, "loss": 0.1201, "step": 6171 }, { "epoch": 14.981773997569867, "grad_norm": 0.5034781694412231, "learning_rate": 1.8507e-05, "loss": 0.0532, "step": 6172 }, { "epoch": 14.984204131227218, "grad_norm": 0.5633240342140198, "learning_rate": 1.851e-05, "loss": 0.0319, "step": 6173 }, { "epoch": 14.986634264884568, "grad_norm": 0.7940555214881897, "learning_rate": 1.8513e-05, "loss": 0.0497, "step": 6174 }, { "epoch": 14.98906439854192, "grad_norm": 1.1312717199325562, "learning_rate": 1.8516e-05, "loss": 0.1368, "step": 6175 }, { "epoch": 14.99149453219927, "grad_norm": 0.5801849365234375, "learning_rate": 1.8519e-05, "loss": 0.048, "step": 6176 }, { "epoch": 14.993924665856621, "grad_norm": 1.3394443988800049, "learning_rate": 1.8522e-05, "loss": 
0.0492, "step": 6177 }, { "epoch": 14.996354799513973, "grad_norm": 0.5459449887275696, "learning_rate": 1.8525000000000003e-05, "loss": 0.0364, "step": 6178 }, { "epoch": 14.998784933171324, "grad_norm": 0.8620681166648865, "learning_rate": 1.8528000000000003e-05, "loss": 0.0447, "step": 6179 }, { "epoch": 15.0, "grad_norm": 0.8811067342758179, "learning_rate": 1.8531000000000003e-05, "loss": 0.044, "step": 6180 }, { "epoch": 15.00243013365735, "grad_norm": 2.1677253246307373, "learning_rate": 1.8534000000000002e-05, "loss": 0.3404, "step": 6181 }, { "epoch": 15.004860267314703, "grad_norm": 1.0436500310897827, "learning_rate": 1.8537000000000002e-05, "loss": 0.2495, "step": 6182 }, { "epoch": 15.007290400972053, "grad_norm": 0.9517599940299988, "learning_rate": 1.854e-05, "loss": 0.1759, "step": 6183 }, { "epoch": 15.009720534629405, "grad_norm": 1.120038628578186, "learning_rate": 1.8543e-05, "loss": 0.2161, "step": 6184 }, { "epoch": 15.012150668286756, "grad_norm": 0.8675456047058105, "learning_rate": 1.8545999999999998e-05, "loss": 0.1148, "step": 6185 }, { "epoch": 15.014580801944106, "grad_norm": 0.7482844591140747, "learning_rate": 1.8548999999999998e-05, "loss": 0.1362, "step": 6186 }, { "epoch": 15.017010935601458, "grad_norm": 0.5624939799308777, "learning_rate": 1.8551999999999998e-05, "loss": 0.0518, "step": 6187 }, { "epoch": 15.019441069258809, "grad_norm": 0.7858014106750488, "learning_rate": 1.8555e-05, "loss": 0.0806, "step": 6188 }, { "epoch": 15.021871202916161, "grad_norm": 0.6121029853820801, "learning_rate": 1.8558e-05, "loss": 0.0608, "step": 6189 }, { "epoch": 15.024301336573512, "grad_norm": 0.6564504504203796, "learning_rate": 1.8561e-05, "loss": 0.0659, "step": 6190 }, { "epoch": 15.026731470230862, "grad_norm": 1.3216863870620728, "learning_rate": 1.8564e-05, "loss": 0.0584, "step": 6191 }, { "epoch": 15.029161603888214, "grad_norm": 0.6234644055366516, "learning_rate": 1.8567e-05, "loss": 0.047, "step": 6192 }, { "epoch": 
15.031591737545565, "grad_norm": 0.6375327706336975, "learning_rate": 1.857e-05, "loss": 0.0459, "step": 6193 }, { "epoch": 15.034021871202917, "grad_norm": 0.5573687553405762, "learning_rate": 1.8573e-05, "loss": 0.0405, "step": 6194 }, { "epoch": 15.036452004860267, "grad_norm": 0.5854278206825256, "learning_rate": 1.8576e-05, "loss": 0.053, "step": 6195 }, { "epoch": 15.038882138517618, "grad_norm": 0.520939290523529, "learning_rate": 1.8579e-05, "loss": 0.0457, "step": 6196 }, { "epoch": 15.04131227217497, "grad_norm": 0.459705650806427, "learning_rate": 1.8582e-05, "loss": 0.0481, "step": 6197 }, { "epoch": 15.04374240583232, "grad_norm": 0.6687761545181274, "learning_rate": 1.8585000000000002e-05, "loss": 0.0736, "step": 6198 }, { "epoch": 15.046172539489673, "grad_norm": 0.5087074041366577, "learning_rate": 1.8588000000000002e-05, "loss": 0.0366, "step": 6199 }, { "epoch": 15.048602673147023, "grad_norm": 0.43544885516166687, "learning_rate": 1.8591000000000002e-05, "loss": 0.0273, "step": 6200 }, { "epoch": 15.051032806804374, "grad_norm": 0.42468878626823425, "learning_rate": 1.8594000000000002e-05, "loss": 0.0309, "step": 6201 }, { "epoch": 15.053462940461726, "grad_norm": 0.6629211902618408, "learning_rate": 1.8597e-05, "loss": 0.0576, "step": 6202 }, { "epoch": 15.055893074119076, "grad_norm": 0.8778886198997498, "learning_rate": 1.86e-05, "loss": 0.0397, "step": 6203 }, { "epoch": 15.058323207776427, "grad_norm": 0.7316730618476868, "learning_rate": 1.8603e-05, "loss": 0.0288, "step": 6204 }, { "epoch": 15.060753341433779, "grad_norm": 1.5687165260314941, "learning_rate": 1.8606e-05, "loss": 0.1202, "step": 6205 }, { "epoch": 15.06318347509113, "grad_norm": 0.6316965222358704, "learning_rate": 1.8609e-05, "loss": 0.0249, "step": 6206 }, { "epoch": 15.065613608748482, "grad_norm": 0.4629405736923218, "learning_rate": 1.8612e-05, "loss": 0.038, "step": 6207 }, { "epoch": 15.068043742405832, "grad_norm": 0.40428271889686584, "learning_rate": 1.8615e-05, 
"loss": 0.0292, "step": 6208 }, { "epoch": 15.070473876063183, "grad_norm": 0.6130084991455078, "learning_rate": 1.8618e-05, "loss": 0.0557, "step": 6209 }, { "epoch": 15.072904009720535, "grad_norm": 0.3563232719898224, "learning_rate": 1.8621e-05, "loss": 0.0246, "step": 6210 }, { "epoch": 15.075334143377885, "grad_norm": 0.5901391506195068, "learning_rate": 1.8624e-05, "loss": 0.0371, "step": 6211 }, { "epoch": 15.077764277035238, "grad_norm": 0.4497651159763336, "learning_rate": 1.8627e-05, "loss": 0.0335, "step": 6212 }, { "epoch": 15.080194410692588, "grad_norm": 0.7552116513252258, "learning_rate": 1.863e-05, "loss": 0.0236, "step": 6213 }, { "epoch": 15.082624544349938, "grad_norm": 0.6918234825134277, "learning_rate": 1.8633e-05, "loss": 0.0994, "step": 6214 }, { "epoch": 15.08505467800729, "grad_norm": 0.46071189641952515, "learning_rate": 1.8636e-05, "loss": 0.0299, "step": 6215 }, { "epoch": 15.087484811664641, "grad_norm": 0.5500078201293945, "learning_rate": 1.8639e-05, "loss": 0.0362, "step": 6216 }, { "epoch": 15.089914945321993, "grad_norm": 0.5789892673492432, "learning_rate": 1.8642e-05, "loss": 0.0299, "step": 6217 }, { "epoch": 15.092345078979344, "grad_norm": 1.1744167804718018, "learning_rate": 1.8645000000000002e-05, "loss": 0.052, "step": 6218 }, { "epoch": 15.094775212636694, "grad_norm": 0.4729054272174835, "learning_rate": 1.8648000000000002e-05, "loss": 0.0353, "step": 6219 }, { "epoch": 15.097205346294047, "grad_norm": 0.7095296382904053, "learning_rate": 1.8651e-05, "loss": 0.0337, "step": 6220 }, { "epoch": 15.099635479951397, "grad_norm": 0.710736870765686, "learning_rate": 1.8654e-05, "loss": 0.0328, "step": 6221 }, { "epoch": 15.10206561360875, "grad_norm": 0.40847447514533997, "learning_rate": 1.8657e-05, "loss": 0.0176, "step": 6222 }, { "epoch": 15.1044957472661, "grad_norm": 0.9846264719963074, "learning_rate": 1.866e-05, "loss": 0.0425, "step": 6223 }, { "epoch": 15.10692588092345, "grad_norm": 1.1341654062271118, 
"learning_rate": 1.8663e-05, "loss": 0.0705, "step": 6224 }, { "epoch": 15.109356014580802, "grad_norm": 0.925653338432312, "learning_rate": 1.8666e-05, "loss": 0.0548, "step": 6225 }, { "epoch": 15.111786148238153, "grad_norm": 1.2040585279464722, "learning_rate": 1.8669e-05, "loss": 0.048, "step": 6226 }, { "epoch": 15.114216281895505, "grad_norm": 0.6463131904602051, "learning_rate": 1.8672e-05, "loss": 0.0474, "step": 6227 }, { "epoch": 15.116646415552855, "grad_norm": 6.246738910675049, "learning_rate": 1.8675000000000003e-05, "loss": 0.0526, "step": 6228 }, { "epoch": 15.119076549210206, "grad_norm": 1.474442481994629, "learning_rate": 1.8678000000000003e-05, "loss": 0.0494, "step": 6229 }, { "epoch": 15.121506682867558, "grad_norm": 1.6007267236709595, "learning_rate": 1.8681000000000003e-05, "loss": 0.1042, "step": 6230 }, { "epoch": 15.123936816524909, "grad_norm": 3.059098482131958, "learning_rate": 1.8684000000000003e-05, "loss": 0.4226, "step": 6231 }, { "epoch": 15.12636695018226, "grad_norm": 0.9655137062072754, "learning_rate": 1.8687e-05, "loss": 0.2282, "step": 6232 }, { "epoch": 15.128797083839611, "grad_norm": 0.8658247590065002, "learning_rate": 1.869e-05, "loss": 0.1734, "step": 6233 }, { "epoch": 15.131227217496962, "grad_norm": 0.700975775718689, "learning_rate": 1.8693e-05, "loss": 0.1739, "step": 6234 }, { "epoch": 15.133657351154314, "grad_norm": 0.9085336327552795, "learning_rate": 1.8696e-05, "loss": 0.1749, "step": 6235 }, { "epoch": 15.136087484811664, "grad_norm": 0.9408630728721619, "learning_rate": 1.8699e-05, "loss": 0.1138, "step": 6236 }, { "epoch": 15.138517618469017, "grad_norm": 0.7106732130050659, "learning_rate": 1.8701999999999998e-05, "loss": 0.1279, "step": 6237 }, { "epoch": 15.140947752126367, "grad_norm": 0.5463520288467407, "learning_rate": 1.8705e-05, "loss": 0.071, "step": 6238 }, { "epoch": 15.143377885783718, "grad_norm": 0.7219159603118896, "learning_rate": 1.8708e-05, "loss": 0.1111, "step": 6239 }, { "epoch": 
15.14580801944107, "grad_norm": 0.5002772212028503, "learning_rate": 1.8711e-05, "loss": 0.0648, "step": 6240 }, { "epoch": 15.14823815309842, "grad_norm": 0.5204650163650513, "learning_rate": 1.8714e-05, "loss": 0.0537, "step": 6241 }, { "epoch": 15.15066828675577, "grad_norm": 0.6393994688987732, "learning_rate": 1.8717e-05, "loss": 0.0622, "step": 6242 }, { "epoch": 15.153098420413123, "grad_norm": 0.551814079284668, "learning_rate": 1.872e-05, "loss": 0.0546, "step": 6243 }, { "epoch": 15.155528554070473, "grad_norm": 1.2862753868103027, "learning_rate": 1.8723e-05, "loss": 0.0649, "step": 6244 }, { "epoch": 15.157958687727826, "grad_norm": 0.6165100336074829, "learning_rate": 1.8726e-05, "loss": 0.0431, "step": 6245 }, { "epoch": 15.160388821385176, "grad_norm": 0.3741230070590973, "learning_rate": 1.8729e-05, "loss": 0.0504, "step": 6246 }, { "epoch": 15.162818955042527, "grad_norm": 0.3269560635089874, "learning_rate": 1.8732e-05, "loss": 0.027, "step": 6247 }, { "epoch": 15.165249088699879, "grad_norm": 0.47192883491516113, "learning_rate": 1.8735000000000003e-05, "loss": 0.0636, "step": 6248 }, { "epoch": 15.16767922235723, "grad_norm": 0.8966943621635437, "learning_rate": 1.8738000000000003e-05, "loss": 0.0472, "step": 6249 }, { "epoch": 15.170109356014581, "grad_norm": 0.4716310501098633, "learning_rate": 1.8741000000000002e-05, "loss": 0.0431, "step": 6250 }, { "epoch": 15.172539489671932, "grad_norm": 0.3297073245048523, "learning_rate": 1.8744000000000002e-05, "loss": 0.0274, "step": 6251 }, { "epoch": 15.174969623329282, "grad_norm": 0.36914747953414917, "learning_rate": 1.8747000000000002e-05, "loss": 0.0293, "step": 6252 }, { "epoch": 15.177399756986635, "grad_norm": 0.44955456256866455, "learning_rate": 1.8750000000000002e-05, "loss": 0.0469, "step": 6253 }, { "epoch": 15.179829890643985, "grad_norm": 0.45048367977142334, "learning_rate": 1.8753e-05, "loss": 0.03, "step": 6254 }, { "epoch": 15.182260024301337, "grad_norm": 0.41195327043533325, 
"learning_rate": 1.8756e-05, "loss": 0.0327, "step": 6255 }, { "epoch": 15.184690157958688, "grad_norm": 0.6704474687576294, "learning_rate": 1.8759e-05, "loss": 0.0468, "step": 6256 }, { "epoch": 15.187120291616038, "grad_norm": 0.5898314714431763, "learning_rate": 1.8761999999999998e-05, "loss": 0.045, "step": 6257 }, { "epoch": 15.18955042527339, "grad_norm": 0.31825217604637146, "learning_rate": 1.8764999999999997e-05, "loss": 0.0252, "step": 6258 }, { "epoch": 15.19198055893074, "grad_norm": 0.4262683391571045, "learning_rate": 1.8768e-05, "loss": 0.0296, "step": 6259 }, { "epoch": 15.194410692588093, "grad_norm": 0.4086453914642334, "learning_rate": 1.8771e-05, "loss": 0.0287, "step": 6260 }, { "epoch": 15.196840826245444, "grad_norm": 0.6901947259902954, "learning_rate": 1.8774e-05, "loss": 0.0365, "step": 6261 }, { "epoch": 15.199270959902794, "grad_norm": 0.5325922966003418, "learning_rate": 1.8777e-05, "loss": 0.0289, "step": 6262 }, { "epoch": 15.201701093560146, "grad_norm": 0.7605176568031311, "learning_rate": 1.878e-05, "loss": 0.0437, "step": 6263 }, { "epoch": 15.204131227217497, "grad_norm": 0.6777277588844299, "learning_rate": 1.8783e-05, "loss": 0.0275, "step": 6264 }, { "epoch": 15.206561360874849, "grad_norm": 0.5418494343757629, "learning_rate": 1.8786e-05, "loss": 0.0517, "step": 6265 }, { "epoch": 15.2089914945322, "grad_norm": 0.5883374214172363, "learning_rate": 1.8789e-05, "loss": 0.0539, "step": 6266 }, { "epoch": 15.21142162818955, "grad_norm": 0.5849698185920715, "learning_rate": 1.8792e-05, "loss": 0.0285, "step": 6267 }, { "epoch": 15.213851761846902, "grad_norm": 0.5624638795852661, "learning_rate": 1.8795e-05, "loss": 0.0273, "step": 6268 }, { "epoch": 15.216281895504252, "grad_norm": 0.6490262150764465, "learning_rate": 1.8798000000000002e-05, "loss": 0.0414, "step": 6269 }, { "epoch": 15.218712029161605, "grad_norm": 1.8099215030670166, "learning_rate": 1.8801000000000002e-05, "loss": 0.0673, "step": 6270 }, { "epoch": 
15.221142162818955, "grad_norm": 0.7734848856925964, "learning_rate": 1.8804e-05, "loss": 0.0336, "step": 6271 }, { "epoch": 15.223572296476306, "grad_norm": 1.2021044492721558, "learning_rate": 1.8807e-05, "loss": 0.0503, "step": 6272 }, { "epoch": 15.226002430133658, "grad_norm": 0.644913911819458, "learning_rate": 1.881e-05, "loss": 0.0416, "step": 6273 }, { "epoch": 15.228432563791008, "grad_norm": 1.5344600677490234, "learning_rate": 1.8813e-05, "loss": 0.0963, "step": 6274 }, { "epoch": 15.23086269744836, "grad_norm": 0.834686815738678, "learning_rate": 1.8816e-05, "loss": 0.0524, "step": 6275 }, { "epoch": 15.233292831105711, "grad_norm": 0.5729988813400269, "learning_rate": 1.8819e-05, "loss": 0.0301, "step": 6276 }, { "epoch": 15.235722964763061, "grad_norm": 0.8467680215835571, "learning_rate": 1.8822e-05, "loss": 0.0509, "step": 6277 }, { "epoch": 15.238153098420414, "grad_norm": 1.054994821548462, "learning_rate": 1.8825e-05, "loss": 0.056, "step": 6278 }, { "epoch": 15.240583232077764, "grad_norm": 0.8763142228126526, "learning_rate": 1.8828000000000003e-05, "loss": 0.0709, "step": 6279 }, { "epoch": 15.243013365735115, "grad_norm": 1.5692846775054932, "learning_rate": 1.8831000000000003e-05, "loss": 0.0913, "step": 6280 }, { "epoch": 15.245443499392467, "grad_norm": 0.939686119556427, "learning_rate": 1.8834e-05, "loss": 0.2836, "step": 6281 }, { "epoch": 15.247873633049817, "grad_norm": 0.7776698470115662, "learning_rate": 1.8837e-05, "loss": 0.2107, "step": 6282 }, { "epoch": 15.25030376670717, "grad_norm": 0.7335750460624695, "learning_rate": 1.884e-05, "loss": 0.1763, "step": 6283 }, { "epoch": 15.25273390036452, "grad_norm": 0.6546770334243774, "learning_rate": 1.8843e-05, "loss": 0.1422, "step": 6284 }, { "epoch": 15.25516403402187, "grad_norm": 0.620419442653656, "learning_rate": 1.8846e-05, "loss": 0.1408, "step": 6285 }, { "epoch": 15.257594167679223, "grad_norm": 0.8489781618118286, "learning_rate": 1.8849e-05, "loss": 0.1473, "step": 6286 
}, { "epoch": 15.260024301336573, "grad_norm": 0.7043729424476624, "learning_rate": 1.8852e-05, "loss": 0.1286, "step": 6287 }, { "epoch": 15.262454434993925, "grad_norm": 0.7431401014328003, "learning_rate": 1.8854999999999998e-05, "loss": 0.1001, "step": 6288 }, { "epoch": 15.264884568651276, "grad_norm": 0.6984792351722717, "learning_rate": 1.8858e-05, "loss": 0.0838, "step": 6289 }, { "epoch": 15.267314702308626, "grad_norm": 0.7720186114311218, "learning_rate": 1.8861e-05, "loss": 0.061, "step": 6290 }, { "epoch": 15.269744835965978, "grad_norm": 0.5918534994125366, "learning_rate": 1.8864e-05, "loss": 0.0577, "step": 6291 }, { "epoch": 15.272174969623329, "grad_norm": 0.45905163884162903, "learning_rate": 1.8867e-05, "loss": 0.0366, "step": 6292 }, { "epoch": 15.274605103280681, "grad_norm": 0.493717759847641, "learning_rate": 1.887e-05, "loss": 0.054, "step": 6293 }, { "epoch": 15.277035236938032, "grad_norm": 0.5838837623596191, "learning_rate": 1.8873e-05, "loss": 0.0574, "step": 6294 }, { "epoch": 15.279465370595382, "grad_norm": 0.4891863465309143, "learning_rate": 1.8876e-05, "loss": 0.0521, "step": 6295 }, { "epoch": 15.281895504252734, "grad_norm": 0.4574906826019287, "learning_rate": 1.8879e-05, "loss": 0.0473, "step": 6296 }, { "epoch": 15.284325637910085, "grad_norm": 0.4449053108692169, "learning_rate": 1.8882e-05, "loss": 0.0592, "step": 6297 }, { "epoch": 15.286755771567437, "grad_norm": 0.7176754474639893, "learning_rate": 1.8885e-05, "loss": 0.0557, "step": 6298 }, { "epoch": 15.289185905224787, "grad_norm": 0.40866363048553467, "learning_rate": 1.8888000000000003e-05, "loss": 0.0267, "step": 6299 }, { "epoch": 15.291616038882138, "grad_norm": 0.3086712956428528, "learning_rate": 1.8891000000000003e-05, "loss": 0.0249, "step": 6300 }, { "epoch": 15.29404617253949, "grad_norm": 0.49705442786216736, "learning_rate": 1.8894000000000002e-05, "loss": 0.047, "step": 6301 }, { "epoch": 15.29647630619684, "grad_norm": 0.5933817028999329, 
"learning_rate": 1.8897000000000002e-05, "loss": 0.0465, "step": 6302 }, { "epoch": 15.298906439854193, "grad_norm": 0.6834675073623657, "learning_rate": 1.8900000000000002e-05, "loss": 0.0511, "step": 6303 }, { "epoch": 15.301336573511543, "grad_norm": 0.7356652617454529, "learning_rate": 1.8903000000000002e-05, "loss": 0.0442, "step": 6304 }, { "epoch": 15.303766707168894, "grad_norm": 0.6480945944786072, "learning_rate": 1.8906e-05, "loss": 0.0526, "step": 6305 }, { "epoch": 15.306196840826246, "grad_norm": 0.4660624563694, "learning_rate": 1.8908999999999998e-05, "loss": 0.0323, "step": 6306 }, { "epoch": 15.308626974483596, "grad_norm": 0.5959030389785767, "learning_rate": 1.8911999999999998e-05, "loss": 0.0372, "step": 6307 }, { "epoch": 15.311057108140949, "grad_norm": 0.5779510736465454, "learning_rate": 1.8914999999999998e-05, "loss": 0.0413, "step": 6308 }, { "epoch": 15.313487241798299, "grad_norm": 0.5182461142539978, "learning_rate": 1.8918e-05, "loss": 0.0186, "step": 6309 }, { "epoch": 15.31591737545565, "grad_norm": 0.4570067822933197, "learning_rate": 1.8921e-05, "loss": 0.027, "step": 6310 }, { "epoch": 15.318347509113002, "grad_norm": 0.4442293643951416, "learning_rate": 1.8924e-05, "loss": 0.034, "step": 6311 }, { "epoch": 15.320777642770352, "grad_norm": 0.5937516093254089, "learning_rate": 1.8927e-05, "loss": 0.0273, "step": 6312 }, { "epoch": 15.323207776427704, "grad_norm": 0.48736315965652466, "learning_rate": 1.893e-05, "loss": 0.0425, "step": 6313 }, { "epoch": 15.325637910085055, "grad_norm": 1.3494070768356323, "learning_rate": 1.8933e-05, "loss": 0.053, "step": 6314 }, { "epoch": 15.328068043742405, "grad_norm": 0.5385133028030396, "learning_rate": 1.8936e-05, "loss": 0.0265, "step": 6315 }, { "epoch": 15.330498177399758, "grad_norm": 0.5525935292243958, "learning_rate": 1.8939e-05, "loss": 0.0467, "step": 6316 }, { "epoch": 15.332928311057108, "grad_norm": 0.7597702741622925, "learning_rate": 1.8942e-05, "loss": 0.048, "step": 6317 }, 
{ "epoch": 15.335358444714458, "grad_norm": 0.42888379096984863, "learning_rate": 1.8945e-05, "loss": 0.0198, "step": 6318 }, { "epoch": 15.33778857837181, "grad_norm": 0.8357192873954773, "learning_rate": 1.8948000000000002e-05, "loss": 0.0468, "step": 6319 }, { "epoch": 15.340218712029161, "grad_norm": 1.0709047317504883, "learning_rate": 1.8951000000000002e-05, "loss": 0.0235, "step": 6320 }, { "epoch": 15.342648845686513, "grad_norm": 1.2753785848617554, "learning_rate": 1.8954000000000002e-05, "loss": 0.0397, "step": 6321 }, { "epoch": 15.345078979343864, "grad_norm": 0.9725159406661987, "learning_rate": 1.8957e-05, "loss": 0.0495, "step": 6322 }, { "epoch": 15.347509113001214, "grad_norm": 1.0477943420410156, "learning_rate": 1.896e-05, "loss": 0.0481, "step": 6323 }, { "epoch": 15.349939246658566, "grad_norm": 0.9414762854576111, "learning_rate": 1.8963e-05, "loss": 0.0254, "step": 6324 }, { "epoch": 15.352369380315917, "grad_norm": 1.4519168138504028, "learning_rate": 1.8966e-05, "loss": 0.0559, "step": 6325 }, { "epoch": 15.35479951397327, "grad_norm": 0.7917785048484802, "learning_rate": 1.8969e-05, "loss": 0.0388, "step": 6326 }, { "epoch": 15.35722964763062, "grad_norm": 1.455063819885254, "learning_rate": 1.8972e-05, "loss": 0.0523, "step": 6327 }, { "epoch": 15.35965978128797, "grad_norm": 2.4656460285186768, "learning_rate": 1.8975e-05, "loss": 0.0748, "step": 6328 }, { "epoch": 15.362089914945322, "grad_norm": 1.1589133739471436, "learning_rate": 1.8978000000000004e-05, "loss": 0.1018, "step": 6329 }, { "epoch": 15.364520048602673, "grad_norm": 1.1569859981536865, "learning_rate": 1.8981e-05, "loss": 0.0736, "step": 6330 }, { "epoch": 15.366950182260025, "grad_norm": 1.8255274295806885, "learning_rate": 1.8984e-05, "loss": 0.3718, "step": 6331 }, { "epoch": 15.369380315917375, "grad_norm": 0.8795007467269897, "learning_rate": 1.8987e-05, "loss": 0.2266, "step": 6332 }, { "epoch": 15.371810449574726, "grad_norm": 0.6917574405670166, "learning_rate": 
1.899e-05, "loss": 0.211, "step": 6333 }, { "epoch": 15.374240583232078, "grad_norm": 0.8154267072677612, "learning_rate": 1.8993e-05, "loss": 0.1572, "step": 6334 }, { "epoch": 15.376670716889429, "grad_norm": 0.8749942779541016, "learning_rate": 1.8996e-05, "loss": 0.1378, "step": 6335 }, { "epoch": 15.37910085054678, "grad_norm": 0.8270253539085388, "learning_rate": 1.8999e-05, "loss": 0.1267, "step": 6336 }, { "epoch": 15.381530984204131, "grad_norm": 0.5117214918136597, "learning_rate": 1.9002e-05, "loss": 0.0955, "step": 6337 }, { "epoch": 15.383961117861482, "grad_norm": 0.9645501971244812, "learning_rate": 1.9005e-05, "loss": 0.0753, "step": 6338 }, { "epoch": 15.386391251518834, "grad_norm": 0.484303742647171, "learning_rate": 1.9008e-05, "loss": 0.0685, "step": 6339 }, { "epoch": 15.388821385176184, "grad_norm": 0.6339733004570007, "learning_rate": 1.9011e-05, "loss": 0.0684, "step": 6340 }, { "epoch": 15.391251518833537, "grad_norm": 0.8483863472938538, "learning_rate": 1.9014e-05, "loss": 0.0765, "step": 6341 }, { "epoch": 15.393681652490887, "grad_norm": 0.5375246405601501, "learning_rate": 1.9017e-05, "loss": 0.0438, "step": 6342 }, { "epoch": 15.396111786148237, "grad_norm": 1.2121530771255493, "learning_rate": 1.902e-05, "loss": 0.0701, "step": 6343 }, { "epoch": 15.39854191980559, "grad_norm": 0.506196141242981, "learning_rate": 1.9023e-05, "loss": 0.0392, "step": 6344 }, { "epoch": 15.40097205346294, "grad_norm": 0.3485780954360962, "learning_rate": 1.9026e-05, "loss": 0.0341, "step": 6345 }, { "epoch": 15.403402187120292, "grad_norm": 0.5342236757278442, "learning_rate": 1.9029e-05, "loss": 0.0373, "step": 6346 }, { "epoch": 15.405832320777643, "grad_norm": 0.45016080141067505, "learning_rate": 1.9032e-05, "loss": 0.0355, "step": 6347 }, { "epoch": 15.408262454434993, "grad_norm": 0.9489158987998962, "learning_rate": 1.9035e-05, "loss": 0.0717, "step": 6348 }, { "epoch": 15.410692588092346, "grad_norm": 0.4582458436489105, "learning_rate": 
1.9038000000000003e-05, "loss": 0.039, "step": 6349 }, { "epoch": 15.413122721749696, "grad_norm": 0.8017370700836182, "learning_rate": 1.9041000000000003e-05, "loss": 0.0627, "step": 6350 }, { "epoch": 15.415552855407048, "grad_norm": 0.3992334306240082, "learning_rate": 1.9044000000000003e-05, "loss": 0.0382, "step": 6351 }, { "epoch": 15.417982989064399, "grad_norm": 0.5843932628631592, "learning_rate": 1.9047000000000002e-05, "loss": 0.0445, "step": 6352 }, { "epoch": 15.42041312272175, "grad_norm": 0.4295574426651001, "learning_rate": 1.9050000000000002e-05, "loss": 0.0458, "step": 6353 }, { "epoch": 15.422843256379101, "grad_norm": 0.2956964373588562, "learning_rate": 1.9053000000000002e-05, "loss": 0.0248, "step": 6354 }, { "epoch": 15.425273390036452, "grad_norm": 0.3741416931152344, "learning_rate": 1.9056e-05, "loss": 0.0264, "step": 6355 }, { "epoch": 15.427703523693804, "grad_norm": 0.5542199611663818, "learning_rate": 1.9058999999999998e-05, "loss": 0.0327, "step": 6356 }, { "epoch": 15.430133657351154, "grad_norm": 0.6846186518669128, "learning_rate": 1.9061999999999998e-05, "loss": 0.0289, "step": 6357 }, { "epoch": 15.432563791008505, "grad_norm": 0.47994524240493774, "learning_rate": 1.9064999999999998e-05, "loss": 0.0335, "step": 6358 }, { "epoch": 15.434993924665857, "grad_norm": 0.7510870099067688, "learning_rate": 1.9068e-05, "loss": 0.0359, "step": 6359 }, { "epoch": 15.437424058323208, "grad_norm": 0.7214968204498291, "learning_rate": 1.9071e-05, "loss": 0.0376, "step": 6360 }, { "epoch": 15.439854191980558, "grad_norm": 0.6409575939178467, "learning_rate": 1.9074e-05, "loss": 0.0502, "step": 6361 }, { "epoch": 15.44228432563791, "grad_norm": 0.6383762359619141, "learning_rate": 1.9077e-05, "loss": 0.037, "step": 6362 }, { "epoch": 15.44471445929526, "grad_norm": 0.7095885872840881, "learning_rate": 1.908e-05, "loss": 0.0365, "step": 6363 }, { "epoch": 15.447144592952613, "grad_norm": 0.7763673663139343, "learning_rate": 1.9083e-05, "loss": 
0.0777, "step": 6364 }, { "epoch": 15.449574726609963, "grad_norm": 0.8553168177604675, "learning_rate": 1.9086e-05, "loss": 0.0773, "step": 6365 }, { "epoch": 15.452004860267314, "grad_norm": 0.8523399829864502, "learning_rate": 1.9089e-05, "loss": 0.0404, "step": 6366 }, { "epoch": 15.454434993924666, "grad_norm": 0.5545443296432495, "learning_rate": 1.9092e-05, "loss": 0.0401, "step": 6367 }, { "epoch": 15.456865127582017, "grad_norm": 0.7395119667053223, "learning_rate": 1.9095e-05, "loss": 0.0323, "step": 6368 }, { "epoch": 15.459295261239369, "grad_norm": 0.8506766557693481, "learning_rate": 1.9098000000000002e-05, "loss": 0.032, "step": 6369 }, { "epoch": 15.46172539489672, "grad_norm": 0.7494947910308838, "learning_rate": 1.9101000000000002e-05, "loss": 0.0613, "step": 6370 }, { "epoch": 15.46415552855407, "grad_norm": 0.603352963924408, "learning_rate": 1.9104000000000002e-05, "loss": 0.0418, "step": 6371 }, { "epoch": 15.466585662211422, "grad_norm": 0.6340727806091309, "learning_rate": 1.9107000000000002e-05, "loss": 0.033, "step": 6372 }, { "epoch": 15.469015795868772, "grad_norm": 0.9221788644790649, "learning_rate": 1.911e-05, "loss": 0.0705, "step": 6373 }, { "epoch": 15.471445929526125, "grad_norm": 0.8941628932952881, "learning_rate": 1.9113e-05, "loss": 0.0549, "step": 6374 }, { "epoch": 15.473876063183475, "grad_norm": 0.5783444046974182, "learning_rate": 1.9116e-05, "loss": 0.0284, "step": 6375 }, { "epoch": 15.476306196840826, "grad_norm": 0.886258602142334, "learning_rate": 1.9119e-05, "loss": 0.0488, "step": 6376 }, { "epoch": 15.478736330498178, "grad_norm": 0.8826324343681335, "learning_rate": 1.9122e-05, "loss": 0.0636, "step": 6377 }, { "epoch": 15.481166464155528, "grad_norm": 1.3838104009628296, "learning_rate": 1.9125e-05, "loss": 0.0551, "step": 6378 }, { "epoch": 15.48359659781288, "grad_norm": 1.5437475442886353, "learning_rate": 1.9128e-05, "loss": 0.1202, "step": 6379 }, { "epoch": 15.486026731470231, "grad_norm": 
1.743752121925354, "learning_rate": 1.9131e-05, "loss": 0.0694, "step": 6380 }, { "epoch": 15.488456865127581, "grad_norm": 1.1960874795913696, "learning_rate": 1.9134e-05, "loss": 0.2558, "step": 6381 }, { "epoch": 15.490886998784934, "grad_norm": 0.9469449520111084, "learning_rate": 1.9137e-05, "loss": 0.2181, "step": 6382 }, { "epoch": 15.493317132442284, "grad_norm": 0.5234766602516174, "learning_rate": 1.914e-05, "loss": 0.1534, "step": 6383 }, { "epoch": 15.495747266099636, "grad_norm": 0.5438645482063293, "learning_rate": 1.9143e-05, "loss": 0.177, "step": 6384 }, { "epoch": 15.498177399756987, "grad_norm": 0.5504951477050781, "learning_rate": 1.9146e-05, "loss": 0.1097, "step": 6385 }, { "epoch": 15.500607533414337, "grad_norm": 0.5973840951919556, "learning_rate": 1.9149e-05, "loss": 0.1173, "step": 6386 }, { "epoch": 15.50303766707169, "grad_norm": 0.6687917113304138, "learning_rate": 1.9152e-05, "loss": 0.1189, "step": 6387 }, { "epoch": 15.50546780072904, "grad_norm": 1.1504212617874146, "learning_rate": 1.9155e-05, "loss": 0.1302, "step": 6388 }, { "epoch": 15.507897934386392, "grad_norm": 0.5748867988586426, "learning_rate": 1.9158e-05, "loss": 0.0749, "step": 6389 }, { "epoch": 15.510328068043743, "grad_norm": 0.5510390400886536, "learning_rate": 1.9161000000000002e-05, "loss": 0.066, "step": 6390 }, { "epoch": 15.512758201701093, "grad_norm": 0.5184352397918701, "learning_rate": 1.9164e-05, "loss": 0.0763, "step": 6391 }, { "epoch": 15.515188335358445, "grad_norm": 0.5661241412162781, "learning_rate": 1.9167e-05, "loss": 0.0617, "step": 6392 }, { "epoch": 15.517618469015796, "grad_norm": 0.6519650220870972, "learning_rate": 1.917e-05, "loss": 0.0548, "step": 6393 }, { "epoch": 15.520048602673146, "grad_norm": 0.3485950231552124, "learning_rate": 1.9173e-05, "loss": 0.0444, "step": 6394 }, { "epoch": 15.522478736330498, "grad_norm": 0.5453152656555176, "learning_rate": 1.9176e-05, "loss": 0.0536, "step": 6395 }, { "epoch": 15.524908869987849, 
"grad_norm": 0.6485771536827087, "learning_rate": 1.9179e-05, "loss": 0.0525, "step": 6396 }, { "epoch": 15.527339003645201, "grad_norm": 0.4988516569137573, "learning_rate": 1.9182e-05, "loss": 0.0453, "step": 6397 }, { "epoch": 15.529769137302551, "grad_norm": 0.4299941658973694, "learning_rate": 1.9185e-05, "loss": 0.0402, "step": 6398 }, { "epoch": 15.532199270959904, "grad_norm": 0.44787663221359253, "learning_rate": 1.9188e-05, "loss": 0.0373, "step": 6399 }, { "epoch": 15.534629404617254, "grad_norm": 0.49662235379219055, "learning_rate": 1.9191000000000003e-05, "loss": 0.0288, "step": 6400 }, { "epoch": 15.537059538274605, "grad_norm": 0.8205453157424927, "learning_rate": 1.9194000000000003e-05, "loss": 0.0397, "step": 6401 }, { "epoch": 15.539489671931957, "grad_norm": 0.6963546276092529, "learning_rate": 1.9197000000000003e-05, "loss": 0.0371, "step": 6402 }, { "epoch": 15.541919805589307, "grad_norm": 1.838684320449829, "learning_rate": 1.9200000000000003e-05, "loss": 0.0429, "step": 6403 }, { "epoch": 15.544349939246658, "grad_norm": 0.46974438428878784, "learning_rate": 1.9203e-05, "loss": 0.0317, "step": 6404 }, { "epoch": 15.54678007290401, "grad_norm": 0.4928613305091858, "learning_rate": 1.9206e-05, "loss": 0.0383, "step": 6405 }, { "epoch": 15.54921020656136, "grad_norm": 0.5242393612861633, "learning_rate": 1.9209e-05, "loss": 0.0394, "step": 6406 }, { "epoch": 15.551640340218713, "grad_norm": 0.7268774509429932, "learning_rate": 1.9212e-05, "loss": 0.0362, "step": 6407 }, { "epoch": 15.554070473876063, "grad_norm": 0.9115424752235413, "learning_rate": 1.9214999999999998e-05, "loss": 0.0453, "step": 6408 }, { "epoch": 15.556500607533414, "grad_norm": 0.7921799421310425, "learning_rate": 1.9217999999999998e-05, "loss": 0.0391, "step": 6409 }, { "epoch": 15.558930741190766, "grad_norm": 0.603668212890625, "learning_rate": 1.9221e-05, "loss": 0.0432, "step": 6410 }, { "epoch": 15.561360874848116, "grad_norm": 0.5135540962219238, "learning_rate": 
1.9224e-05, "loss": 0.0334, "step": 6411 }, { "epoch": 15.563791008505468, "grad_norm": 0.536200225353241, "learning_rate": 1.9227e-05, "loss": 0.0428, "step": 6412 }, { "epoch": 15.566221142162819, "grad_norm": 0.7282557487487793, "learning_rate": 1.923e-05, "loss": 0.031, "step": 6413 }, { "epoch": 15.56865127582017, "grad_norm": 0.5947582125663757, "learning_rate": 1.9233e-05, "loss": 0.0472, "step": 6414 }, { "epoch": 15.571081409477522, "grad_norm": 0.5569626688957214, "learning_rate": 1.9236e-05, "loss": 0.0415, "step": 6415 }, { "epoch": 15.573511543134872, "grad_norm": 0.6933316588401794, "learning_rate": 1.9239e-05, "loss": 0.0424, "step": 6416 }, { "epoch": 15.575941676792224, "grad_norm": 1.205612301826477, "learning_rate": 1.9242e-05, "loss": 0.0411, "step": 6417 }, { "epoch": 15.578371810449575, "grad_norm": 1.1803818941116333, "learning_rate": 1.9245e-05, "loss": 0.1073, "step": 6418 }, { "epoch": 15.580801944106925, "grad_norm": 0.5373958945274353, "learning_rate": 1.9248e-05, "loss": 0.0356, "step": 6419 }, { "epoch": 15.583232077764277, "grad_norm": 0.5622637271881104, "learning_rate": 1.9251000000000003e-05, "loss": 0.0469, "step": 6420 }, { "epoch": 15.585662211421628, "grad_norm": 0.3959793746471405, "learning_rate": 1.9254000000000002e-05, "loss": 0.0291, "step": 6421 }, { "epoch": 15.58809234507898, "grad_norm": 0.6136201024055481, "learning_rate": 1.9257000000000002e-05, "loss": 0.0469, "step": 6422 }, { "epoch": 15.59052247873633, "grad_norm": 0.7783249616622925, "learning_rate": 1.9260000000000002e-05, "loss": 0.0294, "step": 6423 }, { "epoch": 15.592952612393681, "grad_norm": 0.6778849959373474, "learning_rate": 1.9263000000000002e-05, "loss": 0.0327, "step": 6424 }, { "epoch": 15.595382746051033, "grad_norm": 0.6407123804092407, "learning_rate": 1.9266e-05, "loss": 0.0393, "step": 6425 }, { "epoch": 15.597812879708384, "grad_norm": 1.2067370414733887, "learning_rate": 1.9269e-05, "loss": 0.0464, "step": 6426 }, { "epoch": 
15.600243013365736, "grad_norm": 1.1530555486679077, "learning_rate": 1.9272e-05, "loss": 0.0512, "step": 6427 }, { "epoch": 15.602673147023086, "grad_norm": 1.49922513961792, "learning_rate": 1.9275e-05, "loss": 0.0694, "step": 6428 }, { "epoch": 15.605103280680437, "grad_norm": 1.3375353813171387, "learning_rate": 1.9277999999999997e-05, "loss": 0.0724, "step": 6429 }, { "epoch": 15.607533414337789, "grad_norm": 4.887784957885742, "learning_rate": 1.9281e-05, "loss": 0.1438, "step": 6430 }, { "epoch": 15.60996354799514, "grad_norm": 1.5435746908187866, "learning_rate": 1.9284e-05, "loss": 0.3292, "step": 6431 }, { "epoch": 15.612393681652492, "grad_norm": 1.0352035760879517, "learning_rate": 1.9287e-05, "loss": 0.2359, "step": 6432 }, { "epoch": 15.614823815309842, "grad_norm": 0.7973591089248657, "learning_rate": 1.929e-05, "loss": 0.1881, "step": 6433 }, { "epoch": 15.617253948967193, "grad_norm": 0.8265044689178467, "learning_rate": 1.9293e-05, "loss": 0.1539, "step": 6434 }, { "epoch": 15.619684082624545, "grad_norm": 1.0089904069900513, "learning_rate": 1.9296e-05, "loss": 0.2104, "step": 6435 }, { "epoch": 15.622114216281895, "grad_norm": 1.0751676559448242, "learning_rate": 1.9299e-05, "loss": 0.133, "step": 6436 }, { "epoch": 15.624544349939246, "grad_norm": 0.9430895447731018, "learning_rate": 1.9302e-05, "loss": 0.1344, "step": 6437 }, { "epoch": 15.626974483596598, "grad_norm": 0.6846063137054443, "learning_rate": 1.9305e-05, "loss": 0.1057, "step": 6438 }, { "epoch": 15.629404617253948, "grad_norm": 0.48819708824157715, "learning_rate": 1.9308e-05, "loss": 0.0838, "step": 6439 }, { "epoch": 15.6318347509113, "grad_norm": 0.9282357096672058, "learning_rate": 1.9311000000000002e-05, "loss": 0.0752, "step": 6440 }, { "epoch": 15.634264884568651, "grad_norm": 0.4671950340270996, "learning_rate": 1.9314000000000002e-05, "loss": 0.0454, "step": 6441 }, { "epoch": 15.636695018226002, "grad_norm": 0.4864813983440399, "learning_rate": 1.9317e-05, "loss": 
0.055, "step": 6442 }, { "epoch": 15.639125151883354, "grad_norm": 0.6352891325950623, "learning_rate": 1.932e-05, "loss": 0.0621, "step": 6443 }, { "epoch": 15.641555285540704, "grad_norm": 0.6981072425842285, "learning_rate": 1.9323e-05, "loss": 0.0635, "step": 6444 }, { "epoch": 15.643985419198057, "grad_norm": 0.7445560693740845, "learning_rate": 1.9326e-05, "loss": 0.0689, "step": 6445 }, { "epoch": 15.646415552855407, "grad_norm": 0.733340322971344, "learning_rate": 1.9329e-05, "loss": 0.0457, "step": 6446 }, { "epoch": 15.648845686512757, "grad_norm": 0.4835315942764282, "learning_rate": 1.9332e-05, "loss": 0.0412, "step": 6447 }, { "epoch": 15.65127582017011, "grad_norm": 0.35078731179237366, "learning_rate": 1.9335e-05, "loss": 0.0346, "step": 6448 }, { "epoch": 15.65370595382746, "grad_norm": 0.7139492034912109, "learning_rate": 1.9338e-05, "loss": 0.0691, "step": 6449 }, { "epoch": 15.656136087484812, "grad_norm": 0.5815795063972473, "learning_rate": 1.9341000000000003e-05, "loss": 0.0492, "step": 6450 }, { "epoch": 15.658566221142163, "grad_norm": 0.5239266753196716, "learning_rate": 1.9344000000000003e-05, "loss": 0.0571, "step": 6451 }, { "epoch": 15.660996354799513, "grad_norm": 0.6847760081291199, "learning_rate": 1.9347000000000003e-05, "loss": 0.0303, "step": 6452 }, { "epoch": 15.663426488456865, "grad_norm": 0.5207905173301697, "learning_rate": 1.935e-05, "loss": 0.032, "step": 6453 }, { "epoch": 15.665856622114216, "grad_norm": 0.3963988125324249, "learning_rate": 1.9353e-05, "loss": 0.028, "step": 6454 }, { "epoch": 15.668286755771568, "grad_norm": 0.8621256351470947, "learning_rate": 1.9356e-05, "loss": 0.0356, "step": 6455 }, { "epoch": 15.670716889428919, "grad_norm": 0.5858620405197144, "learning_rate": 1.9359e-05, "loss": 0.0405, "step": 6456 }, { "epoch": 15.673147023086269, "grad_norm": 0.5066786408424377, "learning_rate": 1.9362e-05, "loss": 0.0343, "step": 6457 }, { "epoch": 15.675577156743621, "grad_norm": 0.5550910830497742, 
"learning_rate": 1.9365e-05, "loss": 0.0413, "step": 6458 }, { "epoch": 15.678007290400972, "grad_norm": 0.5200429558753967, "learning_rate": 1.9367999999999998e-05, "loss": 0.0298, "step": 6459 }, { "epoch": 15.680437424058324, "grad_norm": 0.4716522991657257, "learning_rate": 1.9371e-05, "loss": 0.0247, "step": 6460 }, { "epoch": 15.682867557715674, "grad_norm": 0.8210851550102234, "learning_rate": 1.9374e-05, "loss": 0.0605, "step": 6461 }, { "epoch": 15.685297691373025, "grad_norm": 0.8732902407646179, "learning_rate": 1.9377e-05, "loss": 0.0489, "step": 6462 }, { "epoch": 15.687727825030377, "grad_norm": 0.4347923696041107, "learning_rate": 1.938e-05, "loss": 0.0246, "step": 6463 }, { "epoch": 15.690157958687728, "grad_norm": 1.6235642433166504, "learning_rate": 1.9383e-05, "loss": 0.079, "step": 6464 }, { "epoch": 15.69258809234508, "grad_norm": 0.626548171043396, "learning_rate": 1.9386e-05, "loss": 0.0438, "step": 6465 }, { "epoch": 15.69501822600243, "grad_norm": 0.6551858186721802, "learning_rate": 1.9389e-05, "loss": 0.0326, "step": 6466 }, { "epoch": 15.69744835965978, "grad_norm": 0.7150661945343018, "learning_rate": 1.9392e-05, "loss": 0.0387, "step": 6467 }, { "epoch": 15.699878493317133, "grad_norm": 0.44152140617370605, "learning_rate": 1.9395e-05, "loss": 0.0242, "step": 6468 }, { "epoch": 15.702308626974483, "grad_norm": 0.5180478096008301, "learning_rate": 1.9398e-05, "loss": 0.0274, "step": 6469 }, { "epoch": 15.704738760631834, "grad_norm": 1.3137465715408325, "learning_rate": 1.9401000000000003e-05, "loss": 0.0359, "step": 6470 }, { "epoch": 15.707168894289186, "grad_norm": 1.7552679777145386, "learning_rate": 1.9404000000000003e-05, "loss": 0.0471, "step": 6471 }, { "epoch": 15.709599027946537, "grad_norm": 0.4817345142364502, "learning_rate": 1.9407000000000002e-05, "loss": 0.0381, "step": 6472 }, { "epoch": 15.712029161603889, "grad_norm": 0.6432149410247803, "learning_rate": 1.9410000000000002e-05, "loss": 0.0404, "step": 6473 }, { 
"epoch": 15.71445929526124, "grad_norm": 0.5537720322608948, "learning_rate": 1.9413000000000002e-05, "loss": 0.0278, "step": 6474 }, { "epoch": 15.716889428918591, "grad_norm": 0.7666966915130615, "learning_rate": 1.9416000000000002e-05, "loss": 0.0462, "step": 6475 }, { "epoch": 15.719319562575942, "grad_norm": 0.8534502387046814, "learning_rate": 1.9419e-05, "loss": 0.0422, "step": 6476 }, { "epoch": 15.721749696233292, "grad_norm": 0.6910253167152405, "learning_rate": 1.9422e-05, "loss": 0.026, "step": 6477 }, { "epoch": 15.724179829890645, "grad_norm": 2.1192424297332764, "learning_rate": 1.9424999999999998e-05, "loss": 0.0723, "step": 6478 }, { "epoch": 15.726609963547995, "grad_norm": 1.352705955505371, "learning_rate": 1.9427999999999998e-05, "loss": 0.0467, "step": 6479 }, { "epoch": 15.729040097205345, "grad_norm": 4.686713695526123, "learning_rate": 1.9431e-05, "loss": 0.0923, "step": 6480 }, { "epoch": 15.731470230862698, "grad_norm": 1.020697832107544, "learning_rate": 1.9434e-05, "loss": 0.3256, "step": 6481 }, { "epoch": 15.733900364520048, "grad_norm": 0.8204785585403442, "learning_rate": 1.9437e-05, "loss": 0.2385, "step": 6482 }, { "epoch": 15.7363304981774, "grad_norm": 0.5709099769592285, "learning_rate": 1.944e-05, "loss": 0.1598, "step": 6483 }, { "epoch": 15.73876063183475, "grad_norm": 0.7659446001052856, "learning_rate": 1.9443e-05, "loss": 0.1891, "step": 6484 }, { "epoch": 15.741190765492101, "grad_norm": 0.6988766193389893, "learning_rate": 1.9446e-05, "loss": 0.1432, "step": 6485 }, { "epoch": 15.743620899149454, "grad_norm": 0.6670263409614563, "learning_rate": 1.9449e-05, "loss": 0.1242, "step": 6486 }, { "epoch": 15.746051032806804, "grad_norm": 0.7416924238204956, "learning_rate": 1.9452e-05, "loss": 0.095, "step": 6487 }, { "epoch": 15.748481166464156, "grad_norm": 0.7295534610748291, "learning_rate": 1.9455e-05, "loss": 0.095, "step": 6488 }, { "epoch": 15.750911300121507, "grad_norm": 0.7284835577011108, "learning_rate": 
1.9458e-05, "loss": 0.1063, "step": 6489 }, { "epoch": 15.753341433778857, "grad_norm": 0.7071718573570251, "learning_rate": 1.9461000000000002e-05, "loss": 0.0693, "step": 6490 }, { "epoch": 15.75577156743621, "grad_norm": 0.6361767649650574, "learning_rate": 1.9464000000000002e-05, "loss": 0.0574, "step": 6491 }, { "epoch": 15.75820170109356, "grad_norm": 0.5441663861274719, "learning_rate": 1.9467000000000002e-05, "loss": 0.065, "step": 6492 }, { "epoch": 15.760631834750912, "grad_norm": 0.5525217652320862, "learning_rate": 1.947e-05, "loss": 0.0482, "step": 6493 }, { "epoch": 15.763061968408262, "grad_norm": 0.5577378273010254, "learning_rate": 1.9473e-05, "loss": 0.0558, "step": 6494 }, { "epoch": 15.765492102065613, "grad_norm": 0.7357743382453918, "learning_rate": 1.9476e-05, "loss": 0.0426, "step": 6495 }, { "epoch": 15.767922235722965, "grad_norm": 0.5767481923103333, "learning_rate": 1.9479e-05, "loss": 0.0461, "step": 6496 }, { "epoch": 15.770352369380316, "grad_norm": 0.5532986521720886, "learning_rate": 1.9482e-05, "loss": 0.0455, "step": 6497 }, { "epoch": 15.772782503037668, "grad_norm": 0.6185905933380127, "learning_rate": 1.9485e-05, "loss": 0.0648, "step": 6498 }, { "epoch": 15.775212636695018, "grad_norm": 0.5661841630935669, "learning_rate": 1.9488e-05, "loss": 0.0354, "step": 6499 }, { "epoch": 15.777642770352369, "grad_norm": 0.4704468846321106, "learning_rate": 1.9491000000000004e-05, "loss": 0.0458, "step": 6500 }, { "epoch": 15.780072904009721, "grad_norm": 0.502532422542572, "learning_rate": 1.9494000000000003e-05, "loss": 0.0403, "step": 6501 }, { "epoch": 15.782503037667071, "grad_norm": 0.8224111199378967, "learning_rate": 1.9497e-05, "loss": 0.0385, "step": 6502 }, { "epoch": 15.784933171324424, "grad_norm": 0.47095468640327454, "learning_rate": 1.95e-05, "loss": 0.0317, "step": 6503 }, { "epoch": 15.787363304981774, "grad_norm": 0.4209509491920471, "learning_rate": 1.9503e-05, "loss": 0.0281, "step": 6504 }, { "epoch": 
15.789793438639125, "grad_norm": 0.6750888824462891, "learning_rate": 1.9506e-05, "loss": 0.0477, "step": 6505 }, { "epoch": 15.792223572296477, "grad_norm": 0.8994753360748291, "learning_rate": 1.9509e-05, "loss": 0.0589, "step": 6506 }, { "epoch": 15.794653705953827, "grad_norm": 0.6014294028282166, "learning_rate": 1.9512e-05, "loss": 0.0275, "step": 6507 }, { "epoch": 15.79708383961118, "grad_norm": 0.8271967768669128, "learning_rate": 1.9515e-05, "loss": 0.0869, "step": 6508 }, { "epoch": 15.79951397326853, "grad_norm": 0.7147731184959412, "learning_rate": 1.9518e-05, "loss": 0.0491, "step": 6509 }, { "epoch": 15.80194410692588, "grad_norm": 0.6397818326950073, "learning_rate": 1.9520999999999998e-05, "loss": 0.0412, "step": 6510 }, { "epoch": 15.804374240583233, "grad_norm": 0.751969575881958, "learning_rate": 1.9524e-05, "loss": 0.0451, "step": 6511 }, { "epoch": 15.806804374240583, "grad_norm": 0.6372379064559937, "learning_rate": 1.9527e-05, "loss": 0.0556, "step": 6512 }, { "epoch": 15.809234507897933, "grad_norm": 0.5179954767227173, "learning_rate": 1.953e-05, "loss": 0.0402, "step": 6513 }, { "epoch": 15.811664641555286, "grad_norm": 0.43995943665504456, "learning_rate": 1.9533e-05, "loss": 0.0371, "step": 6514 }, { "epoch": 15.814094775212636, "grad_norm": 0.47490543127059937, "learning_rate": 1.9536e-05, "loss": 0.0424, "step": 6515 }, { "epoch": 15.816524908869988, "grad_norm": 0.36665621399879456, "learning_rate": 1.9539e-05, "loss": 0.0293, "step": 6516 }, { "epoch": 15.818955042527339, "grad_norm": 0.7318155765533447, "learning_rate": 1.9542e-05, "loss": 0.0551, "step": 6517 }, { "epoch": 15.821385176184691, "grad_norm": 0.6032623648643494, "learning_rate": 1.9545e-05, "loss": 0.045, "step": 6518 }, { "epoch": 15.823815309842042, "grad_norm": 0.7502490878105164, "learning_rate": 1.9548e-05, "loss": 0.0493, "step": 6519 }, { "epoch": 15.826245443499392, "grad_norm": 0.5147375464439392, "learning_rate": 1.9551e-05, "loss": 0.0347, "step": 6520 }, { 
"epoch": 15.828675577156744, "grad_norm": 1.4383622407913208, "learning_rate": 1.9554000000000003e-05, "loss": 0.0566, "step": 6521 }, { "epoch": 15.831105710814095, "grad_norm": 0.7408705949783325, "learning_rate": 1.9557000000000003e-05, "loss": 0.0432, "step": 6522 }, { "epoch": 15.833535844471445, "grad_norm": 0.6746755838394165, "learning_rate": 1.9560000000000002e-05, "loss": 0.0456, "step": 6523 }, { "epoch": 15.835965978128797, "grad_norm": 0.5945210456848145, "learning_rate": 1.9563000000000002e-05, "loss": 0.042, "step": 6524 }, { "epoch": 15.838396111786148, "grad_norm": 0.9174257516860962, "learning_rate": 1.9566000000000002e-05, "loss": 0.0495, "step": 6525 }, { "epoch": 15.8408262454435, "grad_norm": 0.810045063495636, "learning_rate": 1.9569000000000002e-05, "loss": 0.0376, "step": 6526 }, { "epoch": 15.84325637910085, "grad_norm": 1.1186801195144653, "learning_rate": 1.9571999999999998e-05, "loss": 0.0451, "step": 6527 }, { "epoch": 15.845686512758201, "grad_norm": 1.7553142309188843, "learning_rate": 1.9574999999999998e-05, "loss": 0.072, "step": 6528 }, { "epoch": 15.848116646415553, "grad_norm": 1.3541827201843262, "learning_rate": 1.9577999999999998e-05, "loss": 0.0541, "step": 6529 }, { "epoch": 15.850546780072904, "grad_norm": 1.1738042831420898, "learning_rate": 1.9580999999999998e-05, "loss": 0.0656, "step": 6530 }, { "epoch": 15.852976913730256, "grad_norm": 1.6882964372634888, "learning_rate": 1.9584e-05, "loss": 0.2986, "step": 6531 }, { "epoch": 15.855407047387606, "grad_norm": 0.8612640500068665, "learning_rate": 1.9587e-05, "loss": 0.265, "step": 6532 }, { "epoch": 15.857837181044957, "grad_norm": 0.882732629776001, "learning_rate": 1.959e-05, "loss": 0.1792, "step": 6533 }, { "epoch": 15.860267314702309, "grad_norm": 1.1689743995666504, "learning_rate": 1.9593e-05, "loss": 0.1737, "step": 6534 }, { "epoch": 15.86269744835966, "grad_norm": 0.8211115002632141, "learning_rate": 1.9596e-05, "loss": 0.1416, "step": 6535 }, { "epoch": 
15.865127582017012, "grad_norm": 0.5989269018173218, "learning_rate": 1.9599e-05, "loss": 0.0974, "step": 6536 }, { "epoch": 15.867557715674362, "grad_norm": 0.7878609895706177, "learning_rate": 1.9602e-05, "loss": 0.1039, "step": 6537 }, { "epoch": 15.869987849331713, "grad_norm": 0.9615452885627747, "learning_rate": 1.9605e-05, "loss": 0.0679, "step": 6538 }, { "epoch": 15.872417982989065, "grad_norm": 0.8021740317344666, "learning_rate": 1.9608e-05, "loss": 0.0553, "step": 6539 }, { "epoch": 15.874848116646415, "grad_norm": 0.5032227635383606, "learning_rate": 1.9611e-05, "loss": 0.0499, "step": 6540 }, { "epoch": 15.877278250303767, "grad_norm": 0.4631168246269226, "learning_rate": 1.9614000000000002e-05, "loss": 0.0561, "step": 6541 }, { "epoch": 15.879708383961118, "grad_norm": 0.43524104356765747, "learning_rate": 1.9617000000000002e-05, "loss": 0.0411, "step": 6542 }, { "epoch": 15.882138517618468, "grad_norm": 0.46117353439331055, "learning_rate": 1.9620000000000002e-05, "loss": 0.0367, "step": 6543 }, { "epoch": 15.88456865127582, "grad_norm": 0.5075899362564087, "learning_rate": 1.9623e-05, "loss": 0.0414, "step": 6544 }, { "epoch": 15.886998784933171, "grad_norm": 0.5995011329650879, "learning_rate": 1.9626e-05, "loss": 0.0546, "step": 6545 }, { "epoch": 15.889428918590523, "grad_norm": 0.7004613876342773, "learning_rate": 1.9629e-05, "loss": 0.0384, "step": 6546 }, { "epoch": 15.891859052247874, "grad_norm": 0.4672735631465912, "learning_rate": 1.9632e-05, "loss": 0.0323, "step": 6547 }, { "epoch": 15.894289185905224, "grad_norm": 0.6617659330368042, "learning_rate": 1.9635e-05, "loss": 0.04, "step": 6548 }, { "epoch": 15.896719319562576, "grad_norm": 0.35379838943481445, "learning_rate": 1.9638e-05, "loss": 0.029, "step": 6549 }, { "epoch": 15.899149453219927, "grad_norm": 0.618165910243988, "learning_rate": 1.9641e-05, "loss": 0.0581, "step": 6550 }, { "epoch": 15.90157958687728, "grad_norm": 0.7119225263595581, "learning_rate": 1.9644e-05, "loss": 
0.032, "step": 6551 }, { "epoch": 15.90400972053463, "grad_norm": 0.43703633546829224, "learning_rate": 1.9647e-05, "loss": 0.032, "step": 6552 }, { "epoch": 15.90643985419198, "grad_norm": 0.7007567286491394, "learning_rate": 1.965e-05, "loss": 0.1017, "step": 6553 }, { "epoch": 15.908869987849332, "grad_norm": 0.5769993662834167, "learning_rate": 1.9653e-05, "loss": 0.0495, "step": 6554 }, { "epoch": 15.911300121506683, "grad_norm": 0.5540100336074829, "learning_rate": 1.9656e-05, "loss": 0.0355, "step": 6555 }, { "epoch": 15.913730255164033, "grad_norm": 0.3679448068141937, "learning_rate": 1.9659e-05, "loss": 0.021, "step": 6556 }, { "epoch": 15.916160388821385, "grad_norm": 0.4809013605117798, "learning_rate": 1.9662e-05, "loss": 0.0234, "step": 6557 }, { "epoch": 15.918590522478736, "grad_norm": 0.7573971748352051, "learning_rate": 1.9665e-05, "loss": 0.0468, "step": 6558 }, { "epoch": 15.921020656136088, "grad_norm": 0.7641445994377136, "learning_rate": 1.9668e-05, "loss": 0.0428, "step": 6559 }, { "epoch": 15.923450789793439, "grad_norm": 0.4742893874645233, "learning_rate": 1.9671e-05, "loss": 0.0274, "step": 6560 }, { "epoch": 15.925880923450789, "grad_norm": 0.5510801076889038, "learning_rate": 1.9674000000000002e-05, "loss": 0.0475, "step": 6561 }, { "epoch": 15.928311057108141, "grad_norm": 1.1271787881851196, "learning_rate": 1.9677e-05, "loss": 0.0254, "step": 6562 }, { "epoch": 15.930741190765492, "grad_norm": 0.6522717475891113, "learning_rate": 1.968e-05, "loss": 0.0419, "step": 6563 }, { "epoch": 15.933171324422844, "grad_norm": 0.4367833435535431, "learning_rate": 1.9683e-05, "loss": 0.0326, "step": 6564 }, { "epoch": 15.935601458080194, "grad_norm": 0.511759877204895, "learning_rate": 1.9686e-05, "loss": 0.042, "step": 6565 }, { "epoch": 15.938031591737545, "grad_norm": 0.3645158112049103, "learning_rate": 1.9689e-05, "loss": 0.0306, "step": 6566 }, { "epoch": 15.940461725394897, "grad_norm": 0.6977981328964233, "learning_rate": 1.9692e-05, 
"loss": 0.031, "step": 6567 }, { "epoch": 15.942891859052247, "grad_norm": 0.653771162033081, "learning_rate": 1.9695e-05, "loss": 0.0422, "step": 6568 }, { "epoch": 15.9453219927096, "grad_norm": 0.7309830188751221, "learning_rate": 1.9698e-05, "loss": 0.0473, "step": 6569 }, { "epoch": 15.94775212636695, "grad_norm": 0.9212996959686279, "learning_rate": 1.9701e-05, "loss": 0.0417, "step": 6570 }, { "epoch": 15.9501822600243, "grad_norm": 0.5411525964736938, "learning_rate": 1.9704000000000003e-05, "loss": 0.0376, "step": 6571 }, { "epoch": 15.952612393681653, "grad_norm": 0.5020602941513062, "learning_rate": 1.9707000000000003e-05, "loss": 0.0238, "step": 6572 }, { "epoch": 15.955042527339003, "grad_norm": 0.7702956795692444, "learning_rate": 1.9710000000000003e-05, "loss": 0.0256, "step": 6573 }, { "epoch": 15.957472660996356, "grad_norm": 0.5485971570014954, "learning_rate": 1.9713000000000003e-05, "loss": 0.0323, "step": 6574 }, { "epoch": 15.959902794653706, "grad_norm": 0.847766101360321, "learning_rate": 1.9716000000000002e-05, "loss": 0.0469, "step": 6575 }, { "epoch": 15.962332928311056, "grad_norm": 1.200136423110962, "learning_rate": 1.9719e-05, "loss": 0.0546, "step": 6576 }, { "epoch": 15.964763061968409, "grad_norm": 0.9952907562255859, "learning_rate": 1.9722e-05, "loss": 0.0628, "step": 6577 }, { "epoch": 15.96719319562576, "grad_norm": 1.4219485521316528, "learning_rate": 1.9725e-05, "loss": 0.07, "step": 6578 }, { "epoch": 15.969623329283111, "grad_norm": 1.0412992238998413, "learning_rate": 1.9727999999999998e-05, "loss": 0.0534, "step": 6579 }, { "epoch": 15.972053462940462, "grad_norm": 1.313302755355835, "learning_rate": 1.9730999999999998e-05, "loss": 0.066, "step": 6580 }, { "epoch": 15.974483596597812, "grad_norm": 1.2279961109161377, "learning_rate": 1.9734e-05, "loss": 0.2169, "step": 6581 }, { "epoch": 15.976913730255164, "grad_norm": 0.6817932724952698, "learning_rate": 1.9737e-05, "loss": 0.0988, "step": 6582 }, { "epoch": 
15.979343863912515, "grad_norm": 0.7316840887069702, "learning_rate": 1.974e-05, "loss": 0.0636, "step": 6583 }, { "epoch": 15.981773997569867, "grad_norm": 0.5839886665344238, "learning_rate": 1.9743e-05, "loss": 0.0296, "step": 6584 }, { "epoch": 15.984204131227218, "grad_norm": 0.4980129599571228, "learning_rate": 1.9746e-05, "loss": 0.0553, "step": 6585 }, { "epoch": 15.986634264884568, "grad_norm": 0.5902254581451416, "learning_rate": 1.9749e-05, "loss": 0.0379, "step": 6586 }, { "epoch": 15.98906439854192, "grad_norm": 0.6273553967475891, "learning_rate": 1.9752e-05, "loss": 0.0467, "step": 6587 }, { "epoch": 15.99149453219927, "grad_norm": 0.7092445492744446, "learning_rate": 1.9755e-05, "loss": 0.0396, "step": 6588 }, { "epoch": 15.993924665856621, "grad_norm": 0.6257092952728271, "learning_rate": 1.9758e-05, "loss": 0.0858, "step": 6589 }, { "epoch": 15.996354799513973, "grad_norm": 0.9578611254692078, "learning_rate": 1.9761e-05, "loss": 0.0562, "step": 6590 }, { "epoch": 15.998784933171324, "grad_norm": 0.9343767166137695, "learning_rate": 1.9764000000000003e-05, "loss": 0.0534, "step": 6591 }, { "epoch": 16.0, "grad_norm": 0.7602003216743469, "learning_rate": 1.9767000000000002e-05, "loss": 0.098, "step": 6592 }, { "epoch": 16.002430133657352, "grad_norm": 1.5079723596572876, "learning_rate": 1.9770000000000002e-05, "loss": 0.2775, "step": 6593 }, { "epoch": 16.0048602673147, "grad_norm": 0.7751253843307495, "learning_rate": 1.9773000000000002e-05, "loss": 0.2328, "step": 6594 }, { "epoch": 16.007290400972053, "grad_norm": 0.8338161110877991, "learning_rate": 1.9776000000000002e-05, "loss": 0.1752, "step": 6595 }, { "epoch": 16.009720534629405, "grad_norm": 0.6880357265472412, "learning_rate": 1.9779e-05, "loss": 0.1548, "step": 6596 }, { "epoch": 16.012150668286754, "grad_norm": 0.8785845637321472, "learning_rate": 1.9782e-05, "loss": 0.1867, "step": 6597 }, { "epoch": 16.014580801944106, "grad_norm": 0.817703902721405, "learning_rate": 1.9785e-05, 
"loss": 0.0955, "step": 6598 }, { "epoch": 16.01701093560146, "grad_norm": 0.9156400561332703, "learning_rate": 1.9788e-05, "loss": 0.1374, "step": 6599 }, { "epoch": 16.01944106925881, "grad_norm": 0.7103717923164368, "learning_rate": 1.9791e-05, "loss": 0.0761, "step": 6600 }, { "epoch": 16.02187120291616, "grad_norm": 0.6918307542800903, "learning_rate": 1.9794e-05, "loss": 0.0806, "step": 6601 }, { "epoch": 16.02430133657351, "grad_norm": 0.42693498730659485, "learning_rate": 1.9797e-05, "loss": 0.051, "step": 6602 }, { "epoch": 16.026731470230864, "grad_norm": 0.34729841351509094, "learning_rate": 1.98e-05, "loss": 0.0344, "step": 6603 }, { "epoch": 16.029161603888213, "grad_norm": 0.5464868545532227, "learning_rate": 1.9803e-05, "loss": 0.0593, "step": 6604 }, { "epoch": 16.031591737545565, "grad_norm": 0.8021151423454285, "learning_rate": 1.9806e-05, "loss": 0.0588, "step": 6605 }, { "epoch": 16.034021871202917, "grad_norm": 0.4131837785243988, "learning_rate": 1.9809e-05, "loss": 0.0351, "step": 6606 }, { "epoch": 16.036452004860266, "grad_norm": 0.514269232749939, "learning_rate": 1.9812e-05, "loss": 0.0385, "step": 6607 }, { "epoch": 16.038882138517618, "grad_norm": 0.4703933596611023, "learning_rate": 1.9815e-05, "loss": 0.0413, "step": 6608 }, { "epoch": 16.04131227217497, "grad_norm": 0.29616889357566833, "learning_rate": 1.9818e-05, "loss": 0.027, "step": 6609 }, { "epoch": 16.043742405832322, "grad_norm": 0.32781246304512024, "learning_rate": 1.9821e-05, "loss": 0.0375, "step": 6610 }, { "epoch": 16.04617253948967, "grad_norm": 0.3699805438518524, "learning_rate": 1.9824000000000002e-05, "loss": 0.0277, "step": 6611 }, { "epoch": 16.048602673147023, "grad_norm": 0.6841250658035278, "learning_rate": 1.9827000000000002e-05, "loss": 0.0449, "step": 6612 }, { "epoch": 16.051032806804375, "grad_norm": 0.3401043117046356, "learning_rate": 1.983e-05, "loss": 0.0314, "step": 6613 }, { "epoch": 16.053462940461724, "grad_norm": 0.48708710074424744, 
"learning_rate": 1.9833e-05, "loss": 0.0266, "step": 6614 }, { "epoch": 16.055893074119076, "grad_norm": 0.7264524102210999, "learning_rate": 1.9836e-05, "loss": 0.1007, "step": 6615 }, { "epoch": 16.05832320777643, "grad_norm": 0.6908791661262512, "learning_rate": 1.9839e-05, "loss": 0.0388, "step": 6616 }, { "epoch": 16.060753341433777, "grad_norm": 0.5278740525245667, "learning_rate": 1.9842e-05, "loss": 0.0308, "step": 6617 }, { "epoch": 16.06318347509113, "grad_norm": 0.6753385066986084, "learning_rate": 1.9845e-05, "loss": 0.0242, "step": 6618 }, { "epoch": 16.06561360874848, "grad_norm": 0.39890256524086, "learning_rate": 1.9848e-05, "loss": 0.0273, "step": 6619 }, { "epoch": 16.068043742405834, "grad_norm": 0.6051116585731506, "learning_rate": 1.9851e-05, "loss": 0.0468, "step": 6620 }, { "epoch": 16.070473876063183, "grad_norm": 0.4919916093349457, "learning_rate": 1.9854000000000003e-05, "loss": 0.035, "step": 6621 }, { "epoch": 16.072904009720535, "grad_norm": 0.5838719606399536, "learning_rate": 1.9857000000000003e-05, "loss": 0.0328, "step": 6622 }, { "epoch": 16.075334143377887, "grad_norm": 0.4803670048713684, "learning_rate": 1.9860000000000003e-05, "loss": 0.0338, "step": 6623 }, { "epoch": 16.077764277035236, "grad_norm": 0.7797152400016785, "learning_rate": 1.9863000000000003e-05, "loss": 0.0482, "step": 6624 }, { "epoch": 16.080194410692588, "grad_norm": 0.5386581420898438, "learning_rate": 1.9866e-05, "loss": 0.0408, "step": 6625 }, { "epoch": 16.08262454434994, "grad_norm": 0.6398144364356995, "learning_rate": 1.9869e-05, "loss": 0.0278, "step": 6626 }, { "epoch": 16.08505467800729, "grad_norm": 1.8339189291000366, "learning_rate": 1.9872e-05, "loss": 0.0627, "step": 6627 }, { "epoch": 16.08748481166464, "grad_norm": 0.8111991882324219, "learning_rate": 1.9875e-05, "loss": 0.0248, "step": 6628 }, { "epoch": 16.089914945321993, "grad_norm": 0.3426704704761505, "learning_rate": 1.9878e-05, "loss": 0.0248, "step": 6629 }, { "epoch": 
16.092345078979346, "grad_norm": 0.7969396114349365, "learning_rate": 1.9880999999999998e-05, "loss": 0.0329, "step": 6630 }, { "epoch": 16.094775212636694, "grad_norm": 0.5434114933013916, "learning_rate": 1.9883999999999998e-05, "loss": 0.0194, "step": 6631 }, { "epoch": 16.097205346294047, "grad_norm": 0.807499349117279, "learning_rate": 1.9887e-05, "loss": 0.0566, "step": 6632 }, { "epoch": 16.0996354799514, "grad_norm": 0.826481819152832, "learning_rate": 1.989e-05, "loss": 0.0464, "step": 6633 }, { "epoch": 16.102065613608747, "grad_norm": 1.0325162410736084, "learning_rate": 1.9893e-05, "loss": 0.0393, "step": 6634 }, { "epoch": 16.1044957472661, "grad_norm": 0.5098801851272583, "learning_rate": 1.9896e-05, "loss": 0.0266, "step": 6635 }, { "epoch": 16.106925880923452, "grad_norm": 0.6945914030075073, "learning_rate": 1.9899e-05, "loss": 0.0285, "step": 6636 }, { "epoch": 16.1093560145808, "grad_norm": 0.700771689414978, "learning_rate": 1.9902e-05, "loss": 0.032, "step": 6637 }, { "epoch": 16.111786148238153, "grad_norm": 0.9585519433021545, "learning_rate": 1.9905e-05, "loss": 0.0598, "step": 6638 }, { "epoch": 16.114216281895505, "grad_norm": 1.0768755674362183, "learning_rate": 1.9908e-05, "loss": 0.0535, "step": 6639 }, { "epoch": 16.116646415552854, "grad_norm": 1.0770118236541748, "learning_rate": 1.9911e-05, "loss": 0.0469, "step": 6640 }, { "epoch": 16.119076549210206, "grad_norm": 0.5471060276031494, "learning_rate": 1.9914e-05, "loss": 0.0308, "step": 6641 }, { "epoch": 16.121506682867558, "grad_norm": 1.2092456817626953, "learning_rate": 1.9917000000000003e-05, "loss": 0.084, "step": 6642 }, { "epoch": 16.12393681652491, "grad_norm": 1.7596677541732788, "learning_rate": 1.9920000000000002e-05, "loss": 0.2704, "step": 6643 }, { "epoch": 16.12636695018226, "grad_norm": 0.7279687523841858, "learning_rate": 1.9923000000000002e-05, "loss": 0.2027, "step": 6644 }, { "epoch": 16.12879708383961, "grad_norm": 0.6632529497146606, "learning_rate": 
1.9926000000000002e-05, "loss": 0.1244, "step": 6645 }, { "epoch": 16.131227217496964, "grad_norm": 0.5287688970565796, "learning_rate": 1.9929000000000002e-05, "loss": 0.116, "step": 6646 }, { "epoch": 16.133657351154312, "grad_norm": 0.7135950922966003, "learning_rate": 1.9932e-05, "loss": 0.1382, "step": 6647 }, { "epoch": 16.136087484811664, "grad_norm": 0.6188684105873108, "learning_rate": 1.9935e-05, "loss": 0.102, "step": 6648 }, { "epoch": 16.138517618469017, "grad_norm": 0.6614843010902405, "learning_rate": 1.9938e-05, "loss": 0.1243, "step": 6649 }, { "epoch": 16.140947752126365, "grad_norm": 0.9849742650985718, "learning_rate": 1.9940999999999998e-05, "loss": 0.0988, "step": 6650 }, { "epoch": 16.143377885783718, "grad_norm": 0.6080458164215088, "learning_rate": 1.9943999999999997e-05, "loss": 0.0597, "step": 6651 }, { "epoch": 16.14580801944107, "grad_norm": 0.4255931079387665, "learning_rate": 1.9947e-05, "loss": 0.0484, "step": 6652 }, { "epoch": 16.148238153098422, "grad_norm": 0.6096090078353882, "learning_rate": 1.995e-05, "loss": 0.0549, "step": 6653 }, { "epoch": 16.15066828675577, "grad_norm": 0.4163444936275482, "learning_rate": 1.9953e-05, "loss": 0.0461, "step": 6654 }, { "epoch": 16.153098420413123, "grad_norm": 0.5702942609786987, "learning_rate": 1.9956e-05, "loss": 0.0334, "step": 6655 }, { "epoch": 16.155528554070475, "grad_norm": 0.5541343092918396, "learning_rate": 1.9959e-05, "loss": 0.0649, "step": 6656 }, { "epoch": 16.157958687727824, "grad_norm": 0.5554463267326355, "learning_rate": 1.9962e-05, "loss": 0.0441, "step": 6657 }, { "epoch": 16.160388821385176, "grad_norm": 0.4772166609764099, "learning_rate": 1.9965e-05, "loss": 0.0351, "step": 6658 }, { "epoch": 16.16281895504253, "grad_norm": 0.47546297311782837, "learning_rate": 1.9968e-05, "loss": 0.0367, "step": 6659 }, { "epoch": 16.165249088699877, "grad_norm": 0.640117347240448, "learning_rate": 1.9971e-05, "loss": 0.0604, "step": 6660 }, { "epoch": 16.16767922235723, 
"grad_norm": 0.4494363069534302, "learning_rate": 1.9974e-05, "loss": 0.0335, "step": 6661 }, { "epoch": 16.17010935601458, "grad_norm": 0.7318449020385742, "learning_rate": 1.9977000000000002e-05, "loss": 0.0544, "step": 6662 }, { "epoch": 16.172539489671934, "grad_norm": 0.35107091069221497, "learning_rate": 1.9980000000000002e-05, "loss": 0.0394, "step": 6663 }, { "epoch": 16.174969623329282, "grad_norm": 0.6311874389648438, "learning_rate": 1.9983e-05, "loss": 0.0307, "step": 6664 }, { "epoch": 16.177399756986635, "grad_norm": 0.9733560681343079, "learning_rate": 1.9986e-05, "loss": 0.0572, "step": 6665 }, { "epoch": 16.179829890643987, "grad_norm": 0.45678478479385376, "learning_rate": 1.9989e-05, "loss": 0.0396, "step": 6666 }, { "epoch": 16.182260024301335, "grad_norm": 0.41147902607917786, "learning_rate": 1.9992e-05, "loss": 0.032, "step": 6667 }, { "epoch": 16.184690157958688, "grad_norm": 1.0588592290878296, "learning_rate": 1.9995e-05, "loss": 0.0258, "step": 6668 }, { "epoch": 16.18712029161604, "grad_norm": 0.5832549333572388, "learning_rate": 1.9998e-05, "loss": 0.0307, "step": 6669 }, { "epoch": 16.18955042527339, "grad_norm": 0.8939762115478516, "learning_rate": 2.0001e-05, "loss": 0.0452, "step": 6670 }, { "epoch": 16.19198055893074, "grad_norm": 0.5296880006790161, "learning_rate": 2.0004e-05, "loss": 0.0381, "step": 6671 }, { "epoch": 16.194410692588093, "grad_norm": 0.4195530116558075, "learning_rate": 2.0007000000000003e-05, "loss": 0.0284, "step": 6672 }, { "epoch": 16.19684082624544, "grad_norm": 0.6159576773643494, "learning_rate": 2.0010000000000003e-05, "loss": 0.0473, "step": 6673 }, { "epoch": 16.199270959902794, "grad_norm": 0.44327056407928467, "learning_rate": 2.0013e-05, "loss": 0.0283, "step": 6674 }, { "epoch": 16.201701093560146, "grad_norm": 0.5594122409820557, "learning_rate": 2.0016e-05, "loss": 0.0393, "step": 6675 }, { "epoch": 16.2041312272175, "grad_norm": 0.499485582113266, "learning_rate": 2.0019e-05, "loss": 0.0255, 
"step": 6676 }, { "epoch": 16.206561360874847, "grad_norm": 0.4113265872001648, "learning_rate": 2.0022e-05, "loss": 0.0263, "step": 6677 }, { "epoch": 16.2089914945322, "grad_norm": 1.5382097959518433, "learning_rate": 2.0025e-05, "loss": 0.041, "step": 6678 }, { "epoch": 16.21142162818955, "grad_norm": 0.9174782633781433, "learning_rate": 2.0028e-05, "loss": 0.0414, "step": 6679 }, { "epoch": 16.2138517618469, "grad_norm": 0.3526415526866913, "learning_rate": 2.0031e-05, "loss": 0.0243, "step": 6680 }, { "epoch": 16.216281895504252, "grad_norm": 0.6484573483467102, "learning_rate": 2.0033999999999998e-05, "loss": 0.0386, "step": 6681 }, { "epoch": 16.218712029161605, "grad_norm": 0.8860385417938232, "learning_rate": 2.0037e-05, "loss": 0.0695, "step": 6682 }, { "epoch": 16.221142162818953, "grad_norm": 0.5176900625228882, "learning_rate": 2.004e-05, "loss": 0.0288, "step": 6683 }, { "epoch": 16.223572296476306, "grad_norm": 0.500455915927887, "learning_rate": 2.0043e-05, "loss": 0.0351, "step": 6684 }, { "epoch": 16.226002430133658, "grad_norm": 0.758141040802002, "learning_rate": 2.0046e-05, "loss": 0.0311, "step": 6685 }, { "epoch": 16.22843256379101, "grad_norm": 0.9962798953056335, "learning_rate": 2.0049e-05, "loss": 0.0536, "step": 6686 }, { "epoch": 16.23086269744836, "grad_norm": 0.6790388226509094, "learning_rate": 2.0052e-05, "loss": 0.0379, "step": 6687 }, { "epoch": 16.23329283110571, "grad_norm": 0.7544812560081482, "learning_rate": 2.0055e-05, "loss": 0.0384, "step": 6688 }, { "epoch": 16.235722964763063, "grad_norm": 0.5066736340522766, "learning_rate": 2.0058e-05, "loss": 0.038, "step": 6689 }, { "epoch": 16.238153098420412, "grad_norm": 1.103952169418335, "learning_rate": 2.0061e-05, "loss": 0.0505, "step": 6690 }, { "epoch": 16.240583232077764, "grad_norm": 0.9495031237602234, "learning_rate": 2.0064e-05, "loss": 0.05, "step": 6691 }, { "epoch": 16.243013365735116, "grad_norm": 2.2433550357818604, "learning_rate": 2.0067000000000003e-05, "loss": 
0.07, "step": 6692 }, { "epoch": 16.245443499392465, "grad_norm": 1.5484912395477295, "learning_rate": 2.0070000000000003e-05, "loss": 0.3076, "step": 6693 }, { "epoch": 16.247873633049817, "grad_norm": 0.6634373068809509, "learning_rate": 2.0073000000000002e-05, "loss": 0.1854, "step": 6694 }, { "epoch": 16.25030376670717, "grad_norm": 0.6103556156158447, "learning_rate": 2.0076000000000002e-05, "loss": 0.1711, "step": 6695 }, { "epoch": 16.25273390036452, "grad_norm": 0.8945972919464111, "learning_rate": 2.0079000000000002e-05, "loss": 0.1595, "step": 6696 }, { "epoch": 16.25516403402187, "grad_norm": 0.9211353659629822, "learning_rate": 2.0082000000000002e-05, "loss": 0.1188, "step": 6697 }, { "epoch": 16.257594167679223, "grad_norm": 1.475879192352295, "learning_rate": 2.0085e-05, "loss": 0.1062, "step": 6698 }, { "epoch": 16.260024301336575, "grad_norm": 0.4845871031284332, "learning_rate": 2.0087999999999998e-05, "loss": 0.0752, "step": 6699 }, { "epoch": 16.262454434993924, "grad_norm": 0.7426387071609497, "learning_rate": 2.0090999999999998e-05, "loss": 0.0916, "step": 6700 }, { "epoch": 16.264884568651276, "grad_norm": 0.6132792234420776, "learning_rate": 2.0093999999999998e-05, "loss": 0.081, "step": 6701 }, { "epoch": 16.267314702308628, "grad_norm": 0.6964823007583618, "learning_rate": 2.0097e-05, "loss": 0.0436, "step": 6702 }, { "epoch": 16.269744835965977, "grad_norm": 0.7590800523757935, "learning_rate": 2.01e-05, "loss": 0.0674, "step": 6703 }, { "epoch": 16.27217496962333, "grad_norm": 0.5078913569450378, "learning_rate": 2.0103e-05, "loss": 0.0551, "step": 6704 }, { "epoch": 16.27460510328068, "grad_norm": 0.49593061208724976, "learning_rate": 2.0106e-05, "loss": 0.0454, "step": 6705 }, { "epoch": 16.277035236938033, "grad_norm": 0.5085136294364929, "learning_rate": 2.0109e-05, "loss": 0.0544, "step": 6706 }, { "epoch": 16.279465370595382, "grad_norm": 0.5542231202125549, "learning_rate": 2.0112e-05, "loss": 0.0426, "step": 6707 }, { "epoch": 
16.281895504252734, "grad_norm": 0.47864705324172974, "learning_rate": 2.0115e-05, "loss": 0.0369, "step": 6708 }, { "epoch": 16.284325637910086, "grad_norm": 0.3986717462539673, "learning_rate": 2.0118e-05, "loss": 0.0592, "step": 6709 }, { "epoch": 16.286755771567435, "grad_norm": 0.4244610667228699, "learning_rate": 2.0121e-05, "loss": 0.0526, "step": 6710 }, { "epoch": 16.289185905224787, "grad_norm": 0.48482367396354675, "learning_rate": 2.0124e-05, "loss": 0.0262, "step": 6711 }, { "epoch": 16.29161603888214, "grad_norm": 0.45995932817459106, "learning_rate": 2.0127000000000002e-05, "loss": 0.0247, "step": 6712 }, { "epoch": 16.29404617253949, "grad_norm": 0.5758072733879089, "learning_rate": 2.0130000000000002e-05, "loss": 0.0445, "step": 6713 }, { "epoch": 16.29647630619684, "grad_norm": 0.5733565092086792, "learning_rate": 2.0133000000000002e-05, "loss": 0.0394, "step": 6714 }, { "epoch": 16.298906439854193, "grad_norm": 0.3739757537841797, "learning_rate": 2.0136e-05, "loss": 0.0399, "step": 6715 }, { "epoch": 16.30133657351154, "grad_norm": 0.42538973689079285, "learning_rate": 2.0139e-05, "loss": 0.0322, "step": 6716 }, { "epoch": 16.303766707168894, "grad_norm": 0.731419563293457, "learning_rate": 2.0142e-05, "loss": 0.0516, "step": 6717 }, { "epoch": 16.306196840826246, "grad_norm": 0.6503987312316895, "learning_rate": 2.0145e-05, "loss": 0.0371, "step": 6718 }, { "epoch": 16.308626974483598, "grad_norm": 0.5302380919456482, "learning_rate": 2.0148e-05, "loss": 0.0276, "step": 6719 }, { "epoch": 16.311057108140947, "grad_norm": 0.7832352519035339, "learning_rate": 2.0151e-05, "loss": 0.0352, "step": 6720 }, { "epoch": 16.3134872417983, "grad_norm": 0.7491165399551392, "learning_rate": 2.0154e-05, "loss": 0.0355, "step": 6721 }, { "epoch": 16.31591737545565, "grad_norm": 0.7251560091972351, "learning_rate": 2.0157000000000004e-05, "loss": 0.0386, "step": 6722 }, { "epoch": 16.318347509113, "grad_norm": 0.515417218208313, "learning_rate": 2.016e-05, 
"loss": 0.0316, "step": 6723 }, { "epoch": 16.320777642770352, "grad_norm": 0.4890521466732025, "learning_rate": 2.0163e-05, "loss": 0.0334, "step": 6724 }, { "epoch": 16.323207776427704, "grad_norm": 0.7507418394088745, "learning_rate": 2.0166e-05, "loss": 0.0346, "step": 6725 }, { "epoch": 16.325637910085053, "grad_norm": 1.212032675743103, "learning_rate": 2.0169e-05, "loss": 0.0649, "step": 6726 }, { "epoch": 16.328068043742405, "grad_norm": 0.47515854239463806, "learning_rate": 2.0172e-05, "loss": 0.0267, "step": 6727 }, { "epoch": 16.330498177399758, "grad_norm": 0.3836577534675598, "learning_rate": 2.0175e-05, "loss": 0.0207, "step": 6728 }, { "epoch": 16.33292831105711, "grad_norm": 0.6486530900001526, "learning_rate": 2.0178e-05, "loss": 0.0405, "step": 6729 }, { "epoch": 16.33535844471446, "grad_norm": 0.44598403573036194, "learning_rate": 2.0181e-05, "loss": 0.0183, "step": 6730 }, { "epoch": 16.33778857837181, "grad_norm": 0.6303940415382385, "learning_rate": 2.0184e-05, "loss": 0.0309, "step": 6731 }, { "epoch": 16.340218712029163, "grad_norm": 0.5369646549224854, "learning_rate": 2.0187000000000002e-05, "loss": 0.0381, "step": 6732 }, { "epoch": 16.34264884568651, "grad_norm": 0.6086966395378113, "learning_rate": 2.019e-05, "loss": 0.0387, "step": 6733 }, { "epoch": 16.345078979343864, "grad_norm": 0.47301673889160156, "learning_rate": 2.0193e-05, "loss": 0.0287, "step": 6734 }, { "epoch": 16.347509113001216, "grad_norm": 0.7718220949172974, "learning_rate": 2.0196e-05, "loss": 0.0543, "step": 6735 }, { "epoch": 16.349939246658565, "grad_norm": 0.9530096054077148, "learning_rate": 2.0199e-05, "loss": 0.0519, "step": 6736 }, { "epoch": 16.352369380315917, "grad_norm": 0.5679015517234802, "learning_rate": 2.0202e-05, "loss": 0.0325, "step": 6737 }, { "epoch": 16.35479951397327, "grad_norm": 0.8931829929351807, "learning_rate": 2.0205e-05, "loss": 0.0401, "step": 6738 }, { "epoch": 16.35722964763062, "grad_norm": 1.0959359407424927, "learning_rate": 
2.0208e-05, "loss": 0.0433, "step": 6739 }, { "epoch": 16.35965978128797, "grad_norm": 0.9553524255752563, "learning_rate": 2.0211e-05, "loss": 0.0489, "step": 6740 }, { "epoch": 16.362089914945322, "grad_norm": 0.5824098587036133, "learning_rate": 2.0214e-05, "loss": 0.0295, "step": 6741 }, { "epoch": 16.364520048602675, "grad_norm": 1.6337692737579346, "learning_rate": 2.0217000000000003e-05, "loss": 0.1422, "step": 6742 }, { "epoch": 16.366950182260023, "grad_norm": 0.9034848213195801, "learning_rate": 2.0220000000000003e-05, "loss": 0.276, "step": 6743 }, { "epoch": 16.369380315917375, "grad_norm": 0.8742133378982544, "learning_rate": 2.0223000000000003e-05, "loss": 0.1984, "step": 6744 }, { "epoch": 16.371810449574728, "grad_norm": 0.5516105890274048, "learning_rate": 2.0226000000000003e-05, "loss": 0.1518, "step": 6745 }, { "epoch": 16.374240583232076, "grad_norm": 0.6881749629974365, "learning_rate": 2.0229000000000002e-05, "loss": 0.1631, "step": 6746 }, { "epoch": 16.37667071688943, "grad_norm": 0.7169990539550781, "learning_rate": 2.0232000000000002e-05, "loss": 0.125, "step": 6747 }, { "epoch": 16.37910085054678, "grad_norm": 0.5609129667282104, "learning_rate": 2.0235e-05, "loss": 0.0973, "step": 6748 }, { "epoch": 16.381530984204133, "grad_norm": 0.7443650364875793, "learning_rate": 2.0238e-05, "loss": 0.0956, "step": 6749 }, { "epoch": 16.38396111786148, "grad_norm": 0.4936468005180359, "learning_rate": 2.0240999999999998e-05, "loss": 0.0708, "step": 6750 }, { "epoch": 16.386391251518834, "grad_norm": 0.7638863325119019, "learning_rate": 2.0243999999999998e-05, "loss": 0.1079, "step": 6751 }, { "epoch": 16.388821385176186, "grad_norm": 0.7753946781158447, "learning_rate": 2.0247e-05, "loss": 0.0851, "step": 6752 }, { "epoch": 16.391251518833535, "grad_norm": 0.5046547651290894, "learning_rate": 2.025e-05, "loss": 0.0581, "step": 6753 }, { "epoch": 16.393681652490887, "grad_norm": 1.030934453010559, "learning_rate": 2.0253e-05, "loss": 0.0468, "step": 
6754 }, { "epoch": 16.39611178614824, "grad_norm": 0.5272870659828186, "learning_rate": 2.0256e-05, "loss": 0.0405, "step": 6755 }, { "epoch": 16.398541919805588, "grad_norm": 0.4221467971801758, "learning_rate": 2.0259e-05, "loss": 0.0573, "step": 6756 }, { "epoch": 16.40097205346294, "grad_norm": 0.43649569153785706, "learning_rate": 2.0262e-05, "loss": 0.0341, "step": 6757 }, { "epoch": 16.403402187120292, "grad_norm": 0.550229549407959, "learning_rate": 2.0265e-05, "loss": 0.0457, "step": 6758 }, { "epoch": 16.40583232077764, "grad_norm": 0.7974717617034912, "learning_rate": 2.0268e-05, "loss": 0.0338, "step": 6759 }, { "epoch": 16.408262454434993, "grad_norm": 0.770125150680542, "learning_rate": 2.0271e-05, "loss": 0.0278, "step": 6760 }, { "epoch": 16.410692588092346, "grad_norm": 0.5958243608474731, "learning_rate": 2.0274e-05, "loss": 0.0771, "step": 6761 }, { "epoch": 16.413122721749698, "grad_norm": 0.629418671131134, "learning_rate": 2.0277e-05, "loss": 0.0313, "step": 6762 }, { "epoch": 16.415552855407046, "grad_norm": 0.5066065788269043, "learning_rate": 2.0280000000000002e-05, "loss": 0.0367, "step": 6763 }, { "epoch": 16.4179829890644, "grad_norm": 0.530767023563385, "learning_rate": 2.0283000000000002e-05, "loss": 0.0285, "step": 6764 }, { "epoch": 16.42041312272175, "grad_norm": 0.5281427502632141, "learning_rate": 2.0286000000000002e-05, "loss": 0.0442, "step": 6765 }, { "epoch": 16.4228432563791, "grad_norm": 0.7065008878707886, "learning_rate": 2.0289000000000002e-05, "loss": 0.0385, "step": 6766 }, { "epoch": 16.425273390036452, "grad_norm": 0.41417932510375977, "learning_rate": 2.0292e-05, "loss": 0.0169, "step": 6767 }, { "epoch": 16.427703523693804, "grad_norm": 0.6118960976600647, "learning_rate": 2.0295e-05, "loss": 0.0334, "step": 6768 }, { "epoch": 16.430133657351153, "grad_norm": 0.649926483631134, "learning_rate": 2.0298e-05, "loss": 0.0342, "step": 6769 }, { "epoch": 16.432563791008505, "grad_norm": 0.5459389090538025, 
"learning_rate": 2.0301e-05, "loss": 0.0298, "step": 6770 }, { "epoch": 16.434993924665857, "grad_norm": 0.525270402431488, "learning_rate": 2.0304e-05, "loss": 0.031, "step": 6771 }, { "epoch": 16.43742405832321, "grad_norm": 0.8506150841712952, "learning_rate": 2.0307e-05, "loss": 0.0274, "step": 6772 }, { "epoch": 16.439854191980558, "grad_norm": 0.7600215673446655, "learning_rate": 2.031e-05, "loss": 0.0382, "step": 6773 }, { "epoch": 16.44228432563791, "grad_norm": 0.7093181610107422, "learning_rate": 2.0313e-05, "loss": 0.0417, "step": 6774 }, { "epoch": 16.444714459295263, "grad_norm": 0.5425426363945007, "learning_rate": 2.0316e-05, "loss": 0.0301, "step": 6775 }, { "epoch": 16.44714459295261, "grad_norm": 0.4186245799064636, "learning_rate": 2.0319e-05, "loss": 0.022, "step": 6776 }, { "epoch": 16.449574726609963, "grad_norm": 0.6163753271102905, "learning_rate": 2.0322e-05, "loss": 0.0442, "step": 6777 }, { "epoch": 16.452004860267316, "grad_norm": 0.9404435157775879, "learning_rate": 2.0325e-05, "loss": 0.082, "step": 6778 }, { "epoch": 16.454434993924664, "grad_norm": 0.47721269726753235, "learning_rate": 2.0328e-05, "loss": 0.042, "step": 6779 }, { "epoch": 16.456865127582017, "grad_norm": 1.0727334022521973, "learning_rate": 2.0331e-05, "loss": 0.0552, "step": 6780 }, { "epoch": 16.45929526123937, "grad_norm": 0.39803841710090637, "learning_rate": 2.0334e-05, "loss": 0.0233, "step": 6781 }, { "epoch": 16.46172539489672, "grad_norm": 0.5317063331604004, "learning_rate": 2.0337e-05, "loss": 0.0338, "step": 6782 }, { "epoch": 16.46415552855407, "grad_norm": 0.40051525831222534, "learning_rate": 2.0340000000000002e-05, "loss": 0.0254, "step": 6783 }, { "epoch": 16.466585662211422, "grad_norm": 1.5195016860961914, "learning_rate": 2.0343e-05, "loss": 0.0384, "step": 6784 }, { "epoch": 16.469015795868774, "grad_norm": 1.2828516960144043, "learning_rate": 2.0346e-05, "loss": 0.0482, "step": 6785 }, { "epoch": 16.471445929526123, "grad_norm": 
0.6770378947257996, "learning_rate": 2.0349e-05, "loss": 0.0274, "step": 6786 }, { "epoch": 16.473876063183475, "grad_norm": 0.8899814486503601, "learning_rate": 2.0352e-05, "loss": 0.0292, "step": 6787 }, { "epoch": 16.476306196840827, "grad_norm": 0.875531792640686, "learning_rate": 2.0355e-05, "loss": 0.0479, "step": 6788 }, { "epoch": 16.478736330498176, "grad_norm": 0.9305071234703064, "learning_rate": 2.0358e-05, "loss": 0.0422, "step": 6789 }, { "epoch": 16.481166464155528, "grad_norm": 1.2039880752563477, "learning_rate": 2.0361e-05, "loss": 0.0699, "step": 6790 }, { "epoch": 16.48359659781288, "grad_norm": 1.6718764305114746, "learning_rate": 2.0364e-05, "loss": 0.0557, "step": 6791 }, { "epoch": 16.48602673147023, "grad_norm": 1.3391249179840088, "learning_rate": 2.0367e-05, "loss": 0.0813, "step": 6792 }, { "epoch": 16.48845686512758, "grad_norm": 1.0659613609313965, "learning_rate": 2.0370000000000003e-05, "loss": 0.2751, "step": 6793 }, { "epoch": 16.490886998784934, "grad_norm": 0.616400957107544, "learning_rate": 2.0373000000000003e-05, "loss": 0.221, "step": 6794 }, { "epoch": 16.493317132442286, "grad_norm": 0.8488689064979553, "learning_rate": 2.0376000000000003e-05, "loss": 0.1796, "step": 6795 }, { "epoch": 16.495747266099634, "grad_norm": 0.7982436418533325, "learning_rate": 2.0379000000000003e-05, "loss": 0.1705, "step": 6796 }, { "epoch": 16.498177399756987, "grad_norm": 0.5513447523117065, "learning_rate": 2.0382e-05, "loss": 0.1327, "step": 6797 }, { "epoch": 16.50060753341434, "grad_norm": 0.646041750907898, "learning_rate": 2.0385e-05, "loss": 0.1049, "step": 6798 }, { "epoch": 16.503037667071688, "grad_norm": 0.5941236019134521, "learning_rate": 2.0388e-05, "loss": 0.079, "step": 6799 }, { "epoch": 16.50546780072904, "grad_norm": 0.4821906089782715, "learning_rate": 2.0391e-05, "loss": 0.0794, "step": 6800 }, { "epoch": 16.507897934386392, "grad_norm": 0.42552560567855835, "learning_rate": 2.0393999999999998e-05, "loss": 0.0642, "step": 
6801 }, { "epoch": 16.51032806804374, "grad_norm": 0.49854183197021484, "learning_rate": 2.0396999999999998e-05, "loss": 0.0553, "step": 6802 }, { "epoch": 16.512758201701093, "grad_norm": 0.410941481590271, "learning_rate": 2.04e-05, "loss": 0.0496, "step": 6803 }, { "epoch": 16.515188335358445, "grad_norm": 0.5746687054634094, "learning_rate": 2.0403e-05, "loss": 0.054, "step": 6804 }, { "epoch": 16.517618469015797, "grad_norm": 0.38404473662376404, "learning_rate": 2.0406e-05, "loss": 0.0381, "step": 6805 }, { "epoch": 16.520048602673146, "grad_norm": 0.6692094802856445, "learning_rate": 2.0409e-05, "loss": 0.0363, "step": 6806 }, { "epoch": 16.5224787363305, "grad_norm": 0.45284610986709595, "learning_rate": 2.0412e-05, "loss": 0.052, "step": 6807 }, { "epoch": 16.52490886998785, "grad_norm": 0.8137634992599487, "learning_rate": 2.0415e-05, "loss": 0.0563, "step": 6808 }, { "epoch": 16.5273390036452, "grad_norm": 0.6095664501190186, "learning_rate": 2.0418e-05, "loss": 0.0691, "step": 6809 }, { "epoch": 16.52976913730255, "grad_norm": 0.44884923100471497, "learning_rate": 2.0421e-05, "loss": 0.0401, "step": 6810 }, { "epoch": 16.532199270959904, "grad_norm": 0.477975070476532, "learning_rate": 2.0424e-05, "loss": 0.0493, "step": 6811 }, { "epoch": 16.534629404617252, "grad_norm": 0.5998480319976807, "learning_rate": 2.0427e-05, "loss": 0.0331, "step": 6812 }, { "epoch": 16.537059538274605, "grad_norm": 0.6386728286743164, "learning_rate": 2.0430000000000003e-05, "loss": 0.0547, "step": 6813 }, { "epoch": 16.539489671931957, "grad_norm": 0.5018784403800964, "learning_rate": 2.0433000000000002e-05, "loss": 0.0345, "step": 6814 }, { "epoch": 16.54191980558931, "grad_norm": 0.46394291520118713, "learning_rate": 2.0436000000000002e-05, "loss": 0.0389, "step": 6815 }, { "epoch": 16.544349939246658, "grad_norm": 0.41681969165802, "learning_rate": 2.0439000000000002e-05, "loss": 0.0396, "step": 6816 }, { "epoch": 16.54678007290401, "grad_norm": 0.843342125415802, 
"learning_rate": 2.0442000000000002e-05, "loss": 0.0338, "step": 6817 }, { "epoch": 16.549210206561362, "grad_norm": 0.6896272301673889, "learning_rate": 2.0445e-05, "loss": 0.0709, "step": 6818 }, { "epoch": 16.55164034021871, "grad_norm": 0.4938311278820038, "learning_rate": 2.0448e-05, "loss": 0.0716, "step": 6819 }, { "epoch": 16.554070473876063, "grad_norm": 0.49915024638175964, "learning_rate": 2.0451e-05, "loss": 0.0448, "step": 6820 }, { "epoch": 16.556500607533415, "grad_norm": 0.36909496784210205, "learning_rate": 2.0454e-05, "loss": 0.0239, "step": 6821 }, { "epoch": 16.558930741190764, "grad_norm": 0.39232802391052246, "learning_rate": 2.0456999999999997e-05, "loss": 0.0295, "step": 6822 }, { "epoch": 16.561360874848116, "grad_norm": 0.4954896569252014, "learning_rate": 2.046e-05, "loss": 0.0299, "step": 6823 }, { "epoch": 16.56379100850547, "grad_norm": 0.6171260476112366, "learning_rate": 2.0463e-05, "loss": 0.0317, "step": 6824 }, { "epoch": 16.566221142162817, "grad_norm": 0.617729902267456, "learning_rate": 2.0466e-05, "loss": 0.0276, "step": 6825 }, { "epoch": 16.56865127582017, "grad_norm": 0.7023196220397949, "learning_rate": 2.0469e-05, "loss": 0.0458, "step": 6826 }, { "epoch": 16.57108140947752, "grad_norm": 0.6003175973892212, "learning_rate": 2.0472e-05, "loss": 0.0357, "step": 6827 }, { "epoch": 16.573511543134874, "grad_norm": 0.7618695497512817, "learning_rate": 2.0475e-05, "loss": 0.0478, "step": 6828 }, { "epoch": 16.575941676792223, "grad_norm": 1.1655241250991821, "learning_rate": 2.0478e-05, "loss": 0.1063, "step": 6829 }, { "epoch": 16.578371810449575, "grad_norm": 1.0249074697494507, "learning_rate": 2.0481e-05, "loss": 0.0899, "step": 6830 }, { "epoch": 16.580801944106927, "grad_norm": 0.4511158764362335, "learning_rate": 2.0484e-05, "loss": 0.0334, "step": 6831 }, { "epoch": 16.583232077764276, "grad_norm": 0.705813467502594, "learning_rate": 2.0487e-05, "loss": 0.0182, "step": 6832 }, { "epoch": 16.585662211421628, "grad_norm": 
0.6266363263130188, "learning_rate": 2.0490000000000002e-05, "loss": 0.033, "step": 6833 }, { "epoch": 16.58809234507898, "grad_norm": 0.7918210029602051, "learning_rate": 2.0493000000000002e-05, "loss": 0.0438, "step": 6834 }, { "epoch": 16.59052247873633, "grad_norm": 0.8345658779144287, "learning_rate": 2.0496e-05, "loss": 0.0464, "step": 6835 }, { "epoch": 16.59295261239368, "grad_norm": 1.0117017030715942, "learning_rate": 2.0499e-05, "loss": 0.0282, "step": 6836 }, { "epoch": 16.595382746051033, "grad_norm": 1.9188570976257324, "learning_rate": 2.0502e-05, "loss": 0.0703, "step": 6837 }, { "epoch": 16.597812879708385, "grad_norm": 0.7138996124267578, "learning_rate": 2.0505e-05, "loss": 0.0304, "step": 6838 }, { "epoch": 16.600243013365734, "grad_norm": 0.7276572585105896, "learning_rate": 2.0508e-05, "loss": 0.0419, "step": 6839 }, { "epoch": 16.602673147023086, "grad_norm": 0.7692210674285889, "learning_rate": 2.0511e-05, "loss": 0.0464, "step": 6840 }, { "epoch": 16.60510328068044, "grad_norm": 1.431616187095642, "learning_rate": 2.0514e-05, "loss": 0.1182, "step": 6841 }, { "epoch": 16.607533414337787, "grad_norm": 0.841350257396698, "learning_rate": 2.0517e-05, "loss": 0.0644, "step": 6842 }, { "epoch": 16.60996354799514, "grad_norm": 1.5931297540664673, "learning_rate": 2.0520000000000003e-05, "loss": 0.2817, "step": 6843 }, { "epoch": 16.61239368165249, "grad_norm": 0.9971089363098145, "learning_rate": 2.0523000000000003e-05, "loss": 0.2132, "step": 6844 }, { "epoch": 16.61482381530984, "grad_norm": 0.7685884833335876, "learning_rate": 2.0526000000000003e-05, "loss": 0.1761, "step": 6845 }, { "epoch": 16.617253948967193, "grad_norm": 1.2650314569473267, "learning_rate": 2.0529e-05, "loss": 0.2243, "step": 6846 }, { "epoch": 16.619684082624545, "grad_norm": 0.7799667716026306, "learning_rate": 2.0532e-05, "loss": 0.1501, "step": 6847 }, { "epoch": 16.622114216281897, "grad_norm": 0.8171242475509644, "learning_rate": 2.0535e-05, "loss": 0.134, "step": 
6848 }, { "epoch": 16.624544349939246, "grad_norm": 0.8664853572845459, "learning_rate": 2.0538e-05, "loss": 0.0999, "step": 6849 }, { "epoch": 16.626974483596598, "grad_norm": 0.817193329334259, "learning_rate": 2.0541e-05, "loss": 0.0714, "step": 6850 }, { "epoch": 16.62940461725395, "grad_norm": 0.819351851940155, "learning_rate": 2.0544e-05, "loss": 0.0712, "step": 6851 }, { "epoch": 16.6318347509113, "grad_norm": 0.6398718953132629, "learning_rate": 2.0546999999999998e-05, "loss": 0.0721, "step": 6852 }, { "epoch": 16.63426488456865, "grad_norm": 0.4038325846195221, "learning_rate": 2.055e-05, "loss": 0.0472, "step": 6853 }, { "epoch": 16.636695018226003, "grad_norm": 0.5636379718780518, "learning_rate": 2.0553e-05, "loss": 0.0527, "step": 6854 }, { "epoch": 16.639125151883352, "grad_norm": 0.5314086079597473, "learning_rate": 2.0556e-05, "loss": 0.0407, "step": 6855 }, { "epoch": 16.641555285540704, "grad_norm": 0.6106927990913391, "learning_rate": 2.0559e-05, "loss": 0.0443, "step": 6856 }, { "epoch": 16.643985419198057, "grad_norm": 0.43025603890419006, "learning_rate": 2.0562e-05, "loss": 0.0377, "step": 6857 }, { "epoch": 16.64641555285541, "grad_norm": 0.5429539680480957, "learning_rate": 2.0565e-05, "loss": 0.0401, "step": 6858 }, { "epoch": 16.648845686512757, "grad_norm": 0.5858376026153564, "learning_rate": 2.0568e-05, "loss": 0.0403, "step": 6859 }, { "epoch": 16.65127582017011, "grad_norm": 0.6358614563941956, "learning_rate": 2.0571e-05, "loss": 0.0409, "step": 6860 }, { "epoch": 16.653705953827462, "grad_norm": 0.7111338376998901, "learning_rate": 2.0574e-05, "loss": 0.0439, "step": 6861 }, { "epoch": 16.65613608748481, "grad_norm": 0.7346673607826233, "learning_rate": 2.0577e-05, "loss": 0.047, "step": 6862 }, { "epoch": 16.658566221142163, "grad_norm": 0.4911295473575592, "learning_rate": 2.0580000000000003e-05, "loss": 0.0341, "step": 6863 }, { "epoch": 16.660996354799515, "grad_norm": 0.36179137229919434, "learning_rate": 
2.0583000000000003e-05, "loss": 0.0273, "step": 6864 }, { "epoch": 16.663426488456864, "grad_norm": 0.4591819643974304, "learning_rate": 2.0586000000000002e-05, "loss": 0.0365, "step": 6865 }, { "epoch": 16.665856622114216, "grad_norm": 0.6088852882385254, "learning_rate": 2.0589000000000002e-05, "loss": 0.0317, "step": 6866 }, { "epoch": 16.668286755771568, "grad_norm": 0.42234888672828674, "learning_rate": 2.0592000000000002e-05, "loss": 0.0269, "step": 6867 }, { "epoch": 16.670716889428917, "grad_norm": 0.7234625220298767, "learning_rate": 2.0595000000000002e-05, "loss": 0.0517, "step": 6868 }, { "epoch": 16.67314702308627, "grad_norm": 0.371623158454895, "learning_rate": 2.0598e-05, "loss": 0.0254, "step": 6869 }, { "epoch": 16.67557715674362, "grad_norm": 0.6317431926727295, "learning_rate": 2.0601e-05, "loss": 0.0484, "step": 6870 }, { "epoch": 16.678007290400974, "grad_norm": 0.932755172252655, "learning_rate": 2.0603999999999998e-05, "loss": 0.034, "step": 6871 }, { "epoch": 16.680437424058322, "grad_norm": 0.5870157480239868, "learning_rate": 2.0606999999999998e-05, "loss": 0.032, "step": 6872 }, { "epoch": 16.682867557715674, "grad_norm": 0.599737823009491, "learning_rate": 2.061e-05, "loss": 0.0474, "step": 6873 }, { "epoch": 16.685297691373027, "grad_norm": 0.3695632517337799, "learning_rate": 2.0613e-05, "loss": 0.0206, "step": 6874 }, { "epoch": 16.687727825030375, "grad_norm": 0.5830012559890747, "learning_rate": 2.0616e-05, "loss": 0.0251, "step": 6875 }, { "epoch": 16.690157958687728, "grad_norm": 0.6620575189590454, "learning_rate": 2.0619e-05, "loss": 0.0387, "step": 6876 }, { "epoch": 16.69258809234508, "grad_norm": 0.5654508471488953, "learning_rate": 2.0622e-05, "loss": 0.0337, "step": 6877 }, { "epoch": 16.69501822600243, "grad_norm": 0.590104341506958, "learning_rate": 2.0625e-05, "loss": 0.0429, "step": 6878 }, { "epoch": 16.69744835965978, "grad_norm": 0.7753174901008606, "learning_rate": 2.0628e-05, "loss": 0.0584, "step": 6879 }, { 
"epoch": 16.699878493317133, "grad_norm": 0.6262745261192322, "learning_rate": 2.0631e-05, "loss": 0.0281, "step": 6880 }, { "epoch": 16.702308626974485, "grad_norm": 0.6598519086837769, "learning_rate": 2.0634e-05, "loss": 0.0438, "step": 6881 }, { "epoch": 16.704738760631834, "grad_norm": 0.7011498808860779, "learning_rate": 2.0637e-05, "loss": 0.0538, "step": 6882 }, { "epoch": 16.707168894289186, "grad_norm": 0.6810026168823242, "learning_rate": 2.064e-05, "loss": 0.0363, "step": 6883 }, { "epoch": 16.70959902794654, "grad_norm": 0.4180868864059448, "learning_rate": 2.0643000000000002e-05, "loss": 0.0323, "step": 6884 }, { "epoch": 16.712029161603887, "grad_norm": 0.49026960134506226, "learning_rate": 2.0646000000000002e-05, "loss": 0.0292, "step": 6885 }, { "epoch": 16.71445929526124, "grad_norm": 0.7375601530075073, "learning_rate": 2.0649e-05, "loss": 0.0411, "step": 6886 }, { "epoch": 16.71688942891859, "grad_norm": 0.5720340609550476, "learning_rate": 2.0652e-05, "loss": 0.0369, "step": 6887 }, { "epoch": 16.71931956257594, "grad_norm": 0.6861438751220703, "learning_rate": 2.0655e-05, "loss": 0.0351, "step": 6888 }, { "epoch": 16.721749696233292, "grad_norm": 1.419772744178772, "learning_rate": 2.0658e-05, "loss": 0.0652, "step": 6889 }, { "epoch": 16.724179829890645, "grad_norm": 0.5464985370635986, "learning_rate": 2.0661e-05, "loss": 0.0358, "step": 6890 }, { "epoch": 16.726609963547997, "grad_norm": 1.2146092653274536, "learning_rate": 2.0664e-05, "loss": 0.072, "step": 6891 }, { "epoch": 16.729040097205345, "grad_norm": 4.839377403259277, "learning_rate": 2.0667e-05, "loss": 0.1329, "step": 6892 }, { "epoch": 16.731470230862698, "grad_norm": 1.2400928735733032, "learning_rate": 2.067e-05, "loss": 0.3056, "step": 6893 }, { "epoch": 16.73390036452005, "grad_norm": 0.6607919931411743, "learning_rate": 2.0673000000000003e-05, "loss": 0.2141, "step": 6894 }, { "epoch": 16.7363304981774, "grad_norm": 0.6894709467887878, "learning_rate": 2.0676e-05, "loss": 
0.1904, "step": 6895 }, { "epoch": 16.73876063183475, "grad_norm": 0.6179702877998352, "learning_rate": 2.0679e-05, "loss": 0.1528, "step": 6896 }, { "epoch": 16.741190765492103, "grad_norm": 0.5801262855529785, "learning_rate": 2.0682e-05, "loss": 0.12, "step": 6897 }, { "epoch": 16.74362089914945, "grad_norm": 0.6725742220878601, "learning_rate": 2.0685e-05, "loss": 0.1195, "step": 6898 }, { "epoch": 16.746051032806804, "grad_norm": 0.8415626883506775, "learning_rate": 2.0688e-05, "loss": 0.1003, "step": 6899 }, { "epoch": 16.748481166464156, "grad_norm": 0.558894693851471, "learning_rate": 2.0691e-05, "loss": 0.0796, "step": 6900 }, { "epoch": 16.75091130012151, "grad_norm": 0.5014123320579529, "learning_rate": 2.0694e-05, "loss": 0.0395, "step": 6901 }, { "epoch": 16.753341433778857, "grad_norm": 0.4852776825428009, "learning_rate": 2.0697e-05, "loss": 0.0476, "step": 6902 }, { "epoch": 16.75577156743621, "grad_norm": 0.5624544024467468, "learning_rate": 2.07e-05, "loss": 0.0533, "step": 6903 }, { "epoch": 16.75820170109356, "grad_norm": 0.451468825340271, "learning_rate": 2.0703e-05, "loss": 0.0575, "step": 6904 }, { "epoch": 16.76063183475091, "grad_norm": 0.3831200897693634, "learning_rate": 2.0706e-05, "loss": 0.0398, "step": 6905 }, { "epoch": 16.763061968408262, "grad_norm": 0.5082759261131287, "learning_rate": 2.0709e-05, "loss": 0.0484, "step": 6906 }, { "epoch": 16.765492102065615, "grad_norm": 0.9571099877357483, "learning_rate": 2.0712e-05, "loss": 0.0305, "step": 6907 }, { "epoch": 16.767922235722963, "grad_norm": 0.5134744644165039, "learning_rate": 2.0715e-05, "loss": 0.0458, "step": 6908 }, { "epoch": 16.770352369380316, "grad_norm": 0.6121795177459717, "learning_rate": 2.0718e-05, "loss": 0.0594, "step": 6909 }, { "epoch": 16.772782503037668, "grad_norm": 0.46093690395355225, "learning_rate": 2.0721e-05, "loss": 0.0408, "step": 6910 }, { "epoch": 16.775212636695016, "grad_norm": 0.506826639175415, "learning_rate": 2.0724e-05, "loss": 0.0276, 
"step": 6911 }, { "epoch": 16.77764277035237, "grad_norm": 0.5099306702613831, "learning_rate": 2.0727e-05, "loss": 0.0354, "step": 6912 }, { "epoch": 16.78007290400972, "grad_norm": 0.49248185753822327, "learning_rate": 2.073e-05, "loss": 0.0281, "step": 6913 }, { "epoch": 16.782503037667073, "grad_norm": 0.7704623937606812, "learning_rate": 2.0733000000000003e-05, "loss": 0.0454, "step": 6914 }, { "epoch": 16.784933171324422, "grad_norm": 0.4940512180328369, "learning_rate": 2.0736000000000003e-05, "loss": 0.0347, "step": 6915 }, { "epoch": 16.787363304981774, "grad_norm": 0.3863089382648468, "learning_rate": 2.0739000000000003e-05, "loss": 0.0251, "step": 6916 }, { "epoch": 16.789793438639126, "grad_norm": 0.640876054763794, "learning_rate": 2.0742000000000002e-05, "loss": 0.0362, "step": 6917 }, { "epoch": 16.792223572296475, "grad_norm": 0.5945448279380798, "learning_rate": 2.0745000000000002e-05, "loss": 0.0399, "step": 6918 }, { "epoch": 16.794653705953827, "grad_norm": 0.4596477150917053, "learning_rate": 2.0748000000000002e-05, "loss": 0.0359, "step": 6919 }, { "epoch": 16.79708383961118, "grad_norm": 0.49706101417541504, "learning_rate": 2.0751e-05, "loss": 0.0362, "step": 6920 }, { "epoch": 16.799513973268528, "grad_norm": 1.0127840042114258, "learning_rate": 2.0753999999999998e-05, "loss": 0.0466, "step": 6921 }, { "epoch": 16.80194410692588, "grad_norm": 0.5175263285636902, "learning_rate": 2.0756999999999998e-05, "loss": 0.0258, "step": 6922 }, { "epoch": 16.804374240583233, "grad_norm": 0.8035303950309753, "learning_rate": 2.0759999999999998e-05, "loss": 0.0346, "step": 6923 }, { "epoch": 16.806804374240585, "grad_norm": 0.5947206020355225, "learning_rate": 2.0763e-05, "loss": 0.0266, "step": 6924 }, { "epoch": 16.809234507897933, "grad_norm": 1.0296026468276978, "learning_rate": 2.0766e-05, "loss": 0.0544, "step": 6925 }, { "epoch": 16.811664641555286, "grad_norm": 0.5986934304237366, "learning_rate": 2.0769e-05, "loss": 0.0557, "step": 6926 }, { 
"epoch": 16.814094775212638, "grad_norm": 0.693501353263855, "learning_rate": 2.0772e-05, "loss": 0.0428, "step": 6927 }, { "epoch": 16.816524908869987, "grad_norm": 0.68556809425354, "learning_rate": 2.0775e-05, "loss": 0.0354, "step": 6928 }, { "epoch": 16.81895504252734, "grad_norm": 0.613426923751831, "learning_rate": 2.0778e-05, "loss": 0.0302, "step": 6929 }, { "epoch": 16.82138517618469, "grad_norm": 0.7847046852111816, "learning_rate": 2.0781e-05, "loss": 0.0348, "step": 6930 }, { "epoch": 16.82381530984204, "grad_norm": 0.5899834632873535, "learning_rate": 2.0784e-05, "loss": 0.0337, "step": 6931 }, { "epoch": 16.826245443499392, "grad_norm": 0.36682072281837463, "learning_rate": 2.0787e-05, "loss": 0.0212, "step": 6932 }, { "epoch": 16.828675577156744, "grad_norm": 0.722087562084198, "learning_rate": 2.079e-05, "loss": 0.036, "step": 6933 }, { "epoch": 16.831105710814096, "grad_norm": 0.779990553855896, "learning_rate": 2.0793000000000002e-05, "loss": 0.0455, "step": 6934 }, { "epoch": 16.833535844471445, "grad_norm": 0.7070257067680359, "learning_rate": 2.0796000000000002e-05, "loss": 0.0322, "step": 6935 }, { "epoch": 16.835965978128797, "grad_norm": 1.623121738433838, "learning_rate": 2.0799000000000002e-05, "loss": 0.1424, "step": 6936 }, { "epoch": 16.83839611178615, "grad_norm": 0.9896919131278992, "learning_rate": 2.0802000000000002e-05, "loss": 0.044, "step": 6937 }, { "epoch": 16.8408262454435, "grad_norm": 1.2558915615081787, "learning_rate": 2.0805e-05, "loss": 0.0624, "step": 6938 }, { "epoch": 16.84325637910085, "grad_norm": 0.8011232614517212, "learning_rate": 2.0808e-05, "loss": 0.0504, "step": 6939 }, { "epoch": 16.845686512758203, "grad_norm": 0.7407183051109314, "learning_rate": 2.0811e-05, "loss": 0.0567, "step": 6940 }, { "epoch": 16.84811664641555, "grad_norm": 0.9052577614784241, "learning_rate": 2.0814e-05, "loss": 0.0386, "step": 6941 }, { "epoch": 16.850546780072904, "grad_norm": 1.8365546464920044, "learning_rate": 2.0817e-05, 
"loss": 0.1183, "step": 6942 }, { "epoch": 16.852976913730256, "grad_norm": 1.5101525783538818, "learning_rate": 2.082e-05, "loss": 0.2989, "step": 6943 }, { "epoch": 16.855407047387608, "grad_norm": 0.6661175489425659, "learning_rate": 2.0823e-05, "loss": 0.2361, "step": 6944 }, { "epoch": 16.857837181044957, "grad_norm": 0.5133122205734253, "learning_rate": 2.0826e-05, "loss": 0.1424, "step": 6945 }, { "epoch": 16.86026731470231, "grad_norm": 0.6817412972450256, "learning_rate": 2.0829e-05, "loss": 0.1678, "step": 6946 }, { "epoch": 16.86269744835966, "grad_norm": 0.5942744612693787, "learning_rate": 2.0832e-05, "loss": 0.1109, "step": 6947 }, { "epoch": 16.86512758201701, "grad_norm": 0.7767903804779053, "learning_rate": 2.0835e-05, "loss": 0.1209, "step": 6948 }, { "epoch": 16.867557715674362, "grad_norm": 0.5683552622795105, "learning_rate": 2.0838e-05, "loss": 0.1096, "step": 6949 }, { "epoch": 16.869987849331714, "grad_norm": 0.4545753598213196, "learning_rate": 2.0841e-05, "loss": 0.0655, "step": 6950 }, { "epoch": 16.872417982989063, "grad_norm": 0.5935865044593811, "learning_rate": 2.0844e-05, "loss": 0.0693, "step": 6951 }, { "epoch": 16.874848116646415, "grad_norm": 0.7463645339012146, "learning_rate": 2.0847e-05, "loss": 0.0812, "step": 6952 }, { "epoch": 16.877278250303767, "grad_norm": 0.5846714377403259, "learning_rate": 2.085e-05, "loss": 0.0533, "step": 6953 }, { "epoch": 16.879708383961116, "grad_norm": 0.803705096244812, "learning_rate": 2.0853000000000002e-05, "loss": 0.046, "step": 6954 }, { "epoch": 16.88213851761847, "grad_norm": 0.6294832825660706, "learning_rate": 2.0856e-05, "loss": 0.0447, "step": 6955 }, { "epoch": 16.88456865127582, "grad_norm": 0.43973854184150696, "learning_rate": 2.0859e-05, "loss": 0.0327, "step": 6956 }, { "epoch": 16.886998784933173, "grad_norm": 0.5270779728889465, "learning_rate": 2.0862e-05, "loss": 0.0505, "step": 6957 }, { "epoch": 16.88942891859052, "grad_norm": 0.4834468960762024, "learning_rate": 
2.0865e-05, "loss": 0.059, "step": 6958 }, { "epoch": 16.891859052247874, "grad_norm": 0.5880892872810364, "learning_rate": 2.0868e-05, "loss": 0.0335, "step": 6959 }, { "epoch": 16.894289185905226, "grad_norm": 0.8898768424987793, "learning_rate": 2.0871e-05, "loss": 0.0214, "step": 6960 }, { "epoch": 16.896719319562575, "grad_norm": 0.8119906783103943, "learning_rate": 2.0874e-05, "loss": 0.035, "step": 6961 }, { "epoch": 16.899149453219927, "grad_norm": 0.49025753140449524, "learning_rate": 2.0877e-05, "loss": 0.0358, "step": 6962 }, { "epoch": 16.90157958687728, "grad_norm": 0.4825421869754791, "learning_rate": 2.088e-05, "loss": 0.0408, "step": 6963 }, { "epoch": 16.904009720534628, "grad_norm": 0.4528196156024933, "learning_rate": 2.0883000000000003e-05, "loss": 0.0336, "step": 6964 }, { "epoch": 16.90643985419198, "grad_norm": 0.3320884108543396, "learning_rate": 2.0886000000000003e-05, "loss": 0.0246, "step": 6965 }, { "epoch": 16.908869987849332, "grad_norm": 0.38834652304649353, "learning_rate": 2.0889000000000003e-05, "loss": 0.0269, "step": 6966 }, { "epoch": 16.911300121506684, "grad_norm": 0.48916932940483093, "learning_rate": 2.0892000000000003e-05, "loss": 0.031, "step": 6967 }, { "epoch": 16.913730255164033, "grad_norm": 0.6917409896850586, "learning_rate": 2.0895000000000002e-05, "loss": 0.0441, "step": 6968 }, { "epoch": 16.916160388821385, "grad_norm": 1.2992733716964722, "learning_rate": 2.0898e-05, "loss": 0.0575, "step": 6969 }, { "epoch": 16.918590522478738, "grad_norm": 0.37051743268966675, "learning_rate": 2.0901e-05, "loss": 0.0325, "step": 6970 }, { "epoch": 16.921020656136086, "grad_norm": 0.5805058479309082, "learning_rate": 2.0904e-05, "loss": 0.0277, "step": 6971 }, { "epoch": 16.92345078979344, "grad_norm": 0.3487201929092407, "learning_rate": 2.0906999999999998e-05, "loss": 0.0253, "step": 6972 }, { "epoch": 16.92588092345079, "grad_norm": 0.9210509061813354, "learning_rate": 2.0909999999999998e-05, "loss": 0.0648, "step": 6973 }, 
{ "epoch": 16.92831105710814, "grad_norm": 1.5118871927261353, "learning_rate": 2.0913e-05, "loss": 0.0498, "step": 6974 }, { "epoch": 16.93074119076549, "grad_norm": 0.5953038334846497, "learning_rate": 2.0916e-05, "loss": 0.036, "step": 6975 }, { "epoch": 16.933171324422844, "grad_norm": 0.8020294904708862, "learning_rate": 2.0919e-05, "loss": 0.0412, "step": 6976 }, { "epoch": 16.935601458080196, "grad_norm": 3.638714551925659, "learning_rate": 2.0922e-05, "loss": 0.0331, "step": 6977 }, { "epoch": 16.938031591737545, "grad_norm": 0.8520907759666443, "learning_rate": 2.0925e-05, "loss": 0.0435, "step": 6978 }, { "epoch": 16.940461725394897, "grad_norm": 0.5470388531684875, "learning_rate": 2.0928e-05, "loss": 0.0456, "step": 6979 }, { "epoch": 16.94289185905225, "grad_norm": 0.5396353006362915, "learning_rate": 2.0931e-05, "loss": 0.0376, "step": 6980 }, { "epoch": 16.945321992709598, "grad_norm": 0.37229788303375244, "learning_rate": 2.0934e-05, "loss": 0.0248, "step": 6981 }, { "epoch": 16.94775212636695, "grad_norm": 0.4745618999004364, "learning_rate": 2.0937e-05, "loss": 0.0446, "step": 6982 }, { "epoch": 16.950182260024302, "grad_norm": 1.0290898084640503, "learning_rate": 2.094e-05, "loss": 0.047, "step": 6983 }, { "epoch": 16.95261239368165, "grad_norm": 0.9205613136291504, "learning_rate": 2.0943000000000003e-05, "loss": 0.0387, "step": 6984 }, { "epoch": 16.955042527339003, "grad_norm": 0.8615684509277344, "learning_rate": 2.0946000000000002e-05, "loss": 0.0357, "step": 6985 }, { "epoch": 16.957472660996356, "grad_norm": 0.5540120005607605, "learning_rate": 2.0949000000000002e-05, "loss": 0.0214, "step": 6986 }, { "epoch": 16.959902794653708, "grad_norm": 1.6261334419250488, "learning_rate": 2.0952000000000002e-05, "loss": 0.0639, "step": 6987 }, { "epoch": 16.962332928311056, "grad_norm": 0.6665809750556946, "learning_rate": 2.0955000000000002e-05, "loss": 0.0314, "step": 6988 }, { "epoch": 16.96476306196841, "grad_norm": 1.0572377443313599, 
"learning_rate": 2.0958e-05, "loss": 0.0458, "step": 6989 }, { "epoch": 16.96719319562576, "grad_norm": 1.171373724937439, "learning_rate": 2.0961e-05, "loss": 0.0623, "step": 6990 }, { "epoch": 16.96962332928311, "grad_norm": 1.3827159404754639, "learning_rate": 2.0964e-05, "loss": 0.0642, "step": 6991 }, { "epoch": 16.972053462940462, "grad_norm": 2.1679272651672363, "learning_rate": 2.0967e-05, "loss": 0.1012, "step": 6992 }, { "epoch": 16.974483596597814, "grad_norm": 0.8387540578842163, "learning_rate": 2.097e-05, "loss": 0.2069, "step": 6993 }, { "epoch": 16.976913730255163, "grad_norm": 0.9459508061408997, "learning_rate": 2.0973e-05, "loss": 0.1376, "step": 6994 }, { "epoch": 16.979343863912515, "grad_norm": 0.4871624708175659, "learning_rate": 2.0976e-05, "loss": 0.0476, "step": 6995 }, { "epoch": 16.981773997569867, "grad_norm": 0.4788341820240021, "learning_rate": 2.0979e-05, "loss": 0.0348, "step": 6996 }, { "epoch": 16.984204131227216, "grad_norm": 0.6725849509239197, "learning_rate": 2.0982e-05, "loss": 0.0469, "step": 6997 }, { "epoch": 16.986634264884568, "grad_norm": 0.736638069152832, "learning_rate": 2.0985e-05, "loss": 0.0387, "step": 6998 }, { "epoch": 16.98906439854192, "grad_norm": 0.516646146774292, "learning_rate": 2.0988e-05, "loss": 0.0332, "step": 6999 }, { "epoch": 16.991494532199273, "grad_norm": 0.6017102003097534, "learning_rate": 2.0991e-05, "loss": 0.0545, "step": 7000 }, { "epoch": 16.991494532199273, "eval_cer": 0.08971166974540197, "eval_loss": 0.2793155610561371, "eval_runtime": 7.9723, "eval_samples_per_second": 12.669, "eval_steps_per_second": 0.502, "eval_wer": 0.2879188712522046, "step": 7000 }, { "epoch": 16.99392466585662, "grad_norm": 0.7245095372200012, "learning_rate": 2.0994e-05, "loss": 0.0973, "step": 7001 }, { "epoch": 16.996354799513973, "grad_norm": 0.6017758250236511, "learning_rate": 2.0997e-05, "loss": 0.0395, "step": 7002 }, { "epoch": 16.998784933171326, "grad_norm": 0.7306492924690247, "learning_rate": 
2.1e-05, "loss": 0.0414, "step": 7003 }, { "epoch": 17.0, "grad_norm": 0.8058084845542908, "learning_rate": 2.1003e-05, "loss": 0.0524, "step": 7004 }, { "epoch": 17.002430133657352, "grad_norm": 1.8303672075271606, "learning_rate": 2.1006000000000002e-05, "loss": 0.2859, "step": 7005 }, { "epoch": 17.0048602673147, "grad_norm": 0.8326247930526733, "learning_rate": 2.1009e-05, "loss": 0.2142, "step": 7006 }, { "epoch": 17.007290400972053, "grad_norm": 0.5999472737312317, "learning_rate": 2.1012e-05, "loss": 0.1579, "step": 7007 }, { "epoch": 17.009720534629405, "grad_norm": 0.5567811131477356, "learning_rate": 2.1015e-05, "loss": 0.1658, "step": 7008 }, { "epoch": 17.012150668286754, "grad_norm": 0.5798822045326233, "learning_rate": 2.1018e-05, "loss": 0.0998, "step": 7009 }, { "epoch": 17.014580801944106, "grad_norm": 0.7309454679489136, "learning_rate": 2.1021e-05, "loss": 0.0982, "step": 7010 }, { "epoch": 17.01701093560146, "grad_norm": 0.5672822594642639, "learning_rate": 2.1024e-05, "loss": 0.0688, "step": 7011 }, { "epoch": 17.01944106925881, "grad_norm": 0.7675696611404419, "learning_rate": 2.1027e-05, "loss": 0.0721, "step": 7012 }, { "epoch": 17.02187120291616, "grad_norm": 0.7766565084457397, "learning_rate": 2.103e-05, "loss": 0.0493, "step": 7013 }, { "epoch": 17.02430133657351, "grad_norm": 0.8217518329620361, "learning_rate": 2.1033e-05, "loss": 0.0613, "step": 7014 }, { "epoch": 17.026731470230864, "grad_norm": 0.5395684242248535, "learning_rate": 2.1036000000000003e-05, "loss": 0.054, "step": 7015 }, { "epoch": 17.029161603888213, "grad_norm": 0.4790891408920288, "learning_rate": 2.1039000000000003e-05, "loss": 0.0342, "step": 7016 }, { "epoch": 17.031591737545565, "grad_norm": 0.7873870134353638, "learning_rate": 2.1042000000000003e-05, "loss": 0.0294, "step": 7017 }, { "epoch": 17.034021871202917, "grad_norm": 0.5293595790863037, "learning_rate": 2.1045e-05, "loss": 0.0478, "step": 7018 }, { "epoch": 17.036452004860266, "grad_norm": 
0.7136867046356201, "learning_rate": 2.1048e-05, "loss": 0.0432, "step": 7019 }, { "epoch": 17.038882138517618, "grad_norm": 0.7173410654067993, "learning_rate": 2.1051e-05, "loss": 0.0344, "step": 7020 }, { "epoch": 17.04131227217497, "grad_norm": 0.664620041847229, "learning_rate": 2.1054e-05, "loss": 0.0266, "step": 7021 }, { "epoch": 17.043742405832322, "grad_norm": 0.41970494389533997, "learning_rate": 2.1057e-05, "loss": 0.0241, "step": 7022 }, { "epoch": 17.04617253948967, "grad_norm": 0.7529674172401428, "learning_rate": 2.1059999999999998e-05, "loss": 0.0358, "step": 7023 }, { "epoch": 17.048602673147023, "grad_norm": 0.5272979140281677, "learning_rate": 2.1062999999999998e-05, "loss": 0.0511, "step": 7024 }, { "epoch": 17.051032806804375, "grad_norm": 0.4634327292442322, "learning_rate": 2.1066e-05, "loss": 0.0318, "step": 7025 }, { "epoch": 17.053462940461724, "grad_norm": 0.5944244265556335, "learning_rate": 2.1069e-05, "loss": 0.0433, "step": 7026 }, { "epoch": 17.055893074119076, "grad_norm": 0.5875101685523987, "learning_rate": 2.1072e-05, "loss": 0.0308, "step": 7027 }, { "epoch": 17.05832320777643, "grad_norm": 0.2957003116607666, "learning_rate": 2.1075e-05, "loss": 0.026, "step": 7028 }, { "epoch": 17.060753341433777, "grad_norm": 0.6754763722419739, "learning_rate": 2.1078e-05, "loss": 0.031, "step": 7029 }, { "epoch": 17.06318347509113, "grad_norm": 0.35352903604507446, "learning_rate": 2.1081e-05, "loss": 0.0295, "step": 7030 }, { "epoch": 17.06561360874848, "grad_norm": 0.3360816538333893, "learning_rate": 2.1084e-05, "loss": 0.0263, "step": 7031 }, { "epoch": 17.068043742405834, "grad_norm": 0.6197881102561951, "learning_rate": 2.1087e-05, "loss": 0.0443, "step": 7032 }, { "epoch": 17.070473876063183, "grad_norm": 0.7238675355911255, "learning_rate": 2.109e-05, "loss": 0.0298, "step": 7033 }, { "epoch": 17.072904009720535, "grad_norm": 0.4975394010543823, "learning_rate": 2.1093e-05, "loss": 0.0288, "step": 7034 }, { "epoch": 
17.075334143377887, "grad_norm": 0.6416003704071045, "learning_rate": 2.1096000000000003e-05, "loss": 0.0381, "step": 7035 }, { "epoch": 17.077764277035236, "grad_norm": 0.39614903926849365, "learning_rate": 2.1099000000000002e-05, "loss": 0.0233, "step": 7036 }, { "epoch": 17.080194410692588, "grad_norm": 0.5705881118774414, "learning_rate": 2.1102000000000002e-05, "loss": 0.0357, "step": 7037 }, { "epoch": 17.08262454434994, "grad_norm": 0.9267988801002502, "learning_rate": 2.1105000000000002e-05, "loss": 0.0405, "step": 7038 }, { "epoch": 17.08505467800729, "grad_norm": 0.5336192846298218, "learning_rate": 2.1108000000000002e-05, "loss": 0.026, "step": 7039 }, { "epoch": 17.08748481166464, "grad_norm": 0.7681728601455688, "learning_rate": 2.1111e-05, "loss": 0.0412, "step": 7040 }, { "epoch": 17.089914945321993, "grad_norm": 0.970116376876831, "learning_rate": 2.1114e-05, "loss": 0.0411, "step": 7041 }, { "epoch": 17.092345078979346, "grad_norm": 0.6654305458068848, "learning_rate": 2.1117e-05, "loss": 0.0431, "step": 7042 }, { "epoch": 17.094775212636694, "grad_norm": 0.30802133679389954, "learning_rate": 2.1119999999999998e-05, "loss": 0.0213, "step": 7043 }, { "epoch": 17.097205346294047, "grad_norm": 0.48515793681144714, "learning_rate": 2.1122999999999997e-05, "loss": 0.0253, "step": 7044 }, { "epoch": 17.0996354799514, "grad_norm": 0.5918442010879517, "learning_rate": 2.1126e-05, "loss": 0.0255, "step": 7045 }, { "epoch": 17.102065613608747, "grad_norm": 0.5984573364257812, "learning_rate": 2.1129e-05, "loss": 0.0275, "step": 7046 }, { "epoch": 17.1044957472661, "grad_norm": 0.6862162947654724, "learning_rate": 2.1132e-05, "loss": 0.0543, "step": 7047 }, { "epoch": 17.106925880923452, "grad_norm": 0.9372538328170776, "learning_rate": 2.1135e-05, "loss": 0.0255, "step": 7048 }, { "epoch": 17.1093560145808, "grad_norm": 0.7212893962860107, "learning_rate": 2.1138e-05, "loss": 0.041, "step": 7049 }, { "epoch": 17.111786148238153, "grad_norm": 
0.7508737444877625, "learning_rate": 2.1141e-05, "loss": 0.0317, "step": 7050 }, { "epoch": 17.114216281895505, "grad_norm": 1.0437462329864502, "learning_rate": 2.1144e-05, "loss": 0.0348, "step": 7051 }, { "epoch": 17.116646415552854, "grad_norm": 0.8411557674407959, "learning_rate": 2.1147e-05, "loss": 0.0353, "step": 7052 }, { "epoch": 17.119076549210206, "grad_norm": 0.7962921261787415, "learning_rate": 2.115e-05, "loss": 0.0469, "step": 7053 }, { "epoch": 17.121506682867558, "grad_norm": 1.5398088693618774, "learning_rate": 2.1153e-05, "loss": 0.1076, "step": 7054 }, { "epoch": 17.12393681652491, "grad_norm": 0.8382475972175598, "learning_rate": 2.1156000000000002e-05, "loss": 0.2502, "step": 7055 }, { "epoch": 17.12636695018226, "grad_norm": 0.625187337398529, "learning_rate": 2.1159000000000002e-05, "loss": 0.1926, "step": 7056 }, { "epoch": 17.12879708383961, "grad_norm": 0.6368475556373596, "learning_rate": 2.1162e-05, "loss": 0.1465, "step": 7057 }, { "epoch": 17.131227217496964, "grad_norm": 0.8098138570785522, "learning_rate": 2.1165e-05, "loss": 0.1609, "step": 7058 }, { "epoch": 17.133657351154312, "grad_norm": 0.8438608050346375, "learning_rate": 2.1168e-05, "loss": 0.1267, "step": 7059 }, { "epoch": 17.136087484811664, "grad_norm": 0.6000354886054993, "learning_rate": 2.1171e-05, "loss": 0.1169, "step": 7060 }, { "epoch": 17.138517618469017, "grad_norm": 0.6089634299278259, "learning_rate": 2.1174e-05, "loss": 0.06, "step": 7061 }, { "epoch": 17.140947752126365, "grad_norm": 1.156995177268982, "learning_rate": 2.1177e-05, "loss": 0.077, "step": 7062 }, { "epoch": 17.143377885783718, "grad_norm": 0.9036250114440918, "learning_rate": 2.118e-05, "loss": 0.0645, "step": 7063 }, { "epoch": 17.14580801944107, "grad_norm": 0.5638657212257385, "learning_rate": 2.1183e-05, "loss": 0.0587, "step": 7064 }, { "epoch": 17.148238153098422, "grad_norm": 0.5781294107437134, "learning_rate": 2.1186000000000003e-05, "loss": 0.0537, "step": 7065 }, { "epoch": 
17.15066828675577, "grad_norm": 0.3916108310222626, "learning_rate": 2.1189000000000003e-05, "loss": 0.0401, "step": 7066 }, { "epoch": 17.153098420413123, "grad_norm": 0.6313323378562927, "learning_rate": 2.1192e-05, "loss": 0.0494, "step": 7067 }, { "epoch": 17.155528554070475, "grad_norm": 0.5307614207267761, "learning_rate": 2.1195e-05, "loss": 0.0272, "step": 7068 }, { "epoch": 17.157958687727824, "grad_norm": 0.5507494807243347, "learning_rate": 2.1198e-05, "loss": 0.0359, "step": 7069 }, { "epoch": 17.160388821385176, "grad_norm": 0.7078911066055298, "learning_rate": 2.1201e-05, "loss": 0.0307, "step": 7070 }, { "epoch": 17.16281895504253, "grad_norm": 0.45792248845100403, "learning_rate": 2.1204e-05, "loss": 0.0226, "step": 7071 }, { "epoch": 17.165249088699877, "grad_norm": 0.470740407705307, "learning_rate": 2.1207e-05, "loss": 0.045, "step": 7072 }, { "epoch": 17.16767922235723, "grad_norm": 0.5532474517822266, "learning_rate": 2.121e-05, "loss": 0.0445, "step": 7073 }, { "epoch": 17.17010935601458, "grad_norm": 0.7348586320877075, "learning_rate": 2.1213e-05, "loss": 0.0384, "step": 7074 }, { "epoch": 17.172539489671934, "grad_norm": 0.46910637617111206, "learning_rate": 2.1216e-05, "loss": 0.0334, "step": 7075 }, { "epoch": 17.174969623329282, "grad_norm": 0.40440019965171814, "learning_rate": 2.1219e-05, "loss": 0.0206, "step": 7076 }, { "epoch": 17.177399756986635, "grad_norm": 0.6844358444213867, "learning_rate": 2.1222e-05, "loss": 0.0806, "step": 7077 }, { "epoch": 17.179829890643987, "grad_norm": 0.427805095911026, "learning_rate": 2.1225e-05, "loss": 0.0209, "step": 7078 }, { "epoch": 17.182260024301335, "grad_norm": 0.6662044525146484, "learning_rate": 2.1228e-05, "loss": 0.0291, "step": 7079 }, { "epoch": 17.184690157958688, "grad_norm": 0.40263819694519043, "learning_rate": 2.1231e-05, "loss": 0.0235, "step": 7080 }, { "epoch": 17.18712029161604, "grad_norm": 0.5301557779312134, "learning_rate": 2.1234e-05, "loss": 0.0376, "step": 7081 }, { 
"epoch": 17.18955042527339, "grad_norm": 0.9183575510978699, "learning_rate": 2.1237e-05, "loss": 0.0305, "step": 7082 }, { "epoch": 17.19198055893074, "grad_norm": 0.5027058124542236, "learning_rate": 2.124e-05, "loss": 0.0284, "step": 7083 }, { "epoch": 17.194410692588093, "grad_norm": 0.6258886456489563, "learning_rate": 2.1243e-05, "loss": 0.0266, "step": 7084 }, { "epoch": 17.19684082624544, "grad_norm": 1.3336007595062256, "learning_rate": 2.1246000000000003e-05, "loss": 0.0479, "step": 7085 }, { "epoch": 17.199270959902794, "grad_norm": 1.001531720161438, "learning_rate": 2.1249000000000003e-05, "loss": 0.0405, "step": 7086 }, { "epoch": 17.201701093560146, "grad_norm": 0.5305118560791016, "learning_rate": 2.1252000000000003e-05, "loss": 0.0358, "step": 7087 }, { "epoch": 17.2041312272175, "grad_norm": 0.21754123270511627, "learning_rate": 2.1255000000000002e-05, "loss": 0.0145, "step": 7088 }, { "epoch": 17.206561360874847, "grad_norm": 0.3929530084133148, "learning_rate": 2.1258000000000002e-05, "loss": 0.031, "step": 7089 }, { "epoch": 17.2089914945322, "grad_norm": 0.5739189982414246, "learning_rate": 2.1261000000000002e-05, "loss": 0.0323, "step": 7090 }, { "epoch": 17.21142162818955, "grad_norm": 0.49884817004203796, "learning_rate": 2.1264000000000002e-05, "loss": 0.0315, "step": 7091 }, { "epoch": 17.2138517618469, "grad_norm": 0.5127723217010498, "learning_rate": 2.1266999999999998e-05, "loss": 0.1003, "step": 7092 }, { "epoch": 17.216281895504252, "grad_norm": 0.7395114898681641, "learning_rate": 2.1269999999999998e-05, "loss": 0.0469, "step": 7093 }, { "epoch": 17.218712029161605, "grad_norm": 0.6724107265472412, "learning_rate": 2.1272999999999998e-05, "loss": 0.0274, "step": 7094 }, { "epoch": 17.221142162818953, "grad_norm": 1.0357909202575684, "learning_rate": 2.1276e-05, "loss": 0.0455, "step": 7095 }, { "epoch": 17.223572296476306, "grad_norm": 0.7841693758964539, "learning_rate": 2.1279e-05, "loss": 0.0375, "step": 7096 }, { "epoch": 
17.226002430133658, "grad_norm": 0.8243414759635925, "learning_rate": 2.1282e-05, "loss": 0.0466, "step": 7097 }, { "epoch": 17.22843256379101, "grad_norm": 0.6592020392417908, "learning_rate": 2.1285e-05, "loss": 0.0719, "step": 7098 }, { "epoch": 17.23086269744836, "grad_norm": 1.0489543676376343, "learning_rate": 2.1288e-05, "loss": 0.0349, "step": 7099 }, { "epoch": 17.23329283110571, "grad_norm": 0.5075063109397888, "learning_rate": 2.1291e-05, "loss": 0.0289, "step": 7100 }, { "epoch": 17.235722964763063, "grad_norm": 0.4568033516407013, "learning_rate": 2.1294e-05, "loss": 0.0288, "step": 7101 }, { "epoch": 17.238153098420412, "grad_norm": 1.163650393486023, "learning_rate": 2.1297e-05, "loss": 0.0623, "step": 7102 }, { "epoch": 17.240583232077764, "grad_norm": 0.8542489409446716, "learning_rate": 2.13e-05, "loss": 0.0499, "step": 7103 }, { "epoch": 17.243013365735116, "grad_norm": 0.8819268941879272, "learning_rate": 2.1303e-05, "loss": 0.0478, "step": 7104 }, { "epoch": 17.245443499392465, "grad_norm": 0.9795697927474976, "learning_rate": 2.1306000000000002e-05, "loss": 0.256, "step": 7105 }, { "epoch": 17.247873633049817, "grad_norm": 1.0780813694000244, "learning_rate": 2.1309000000000002e-05, "loss": 0.1951, "step": 7106 }, { "epoch": 17.25030376670717, "grad_norm": 0.8712723255157471, "learning_rate": 2.1312000000000002e-05, "loss": 0.1437, "step": 7107 }, { "epoch": 17.25273390036452, "grad_norm": 0.7598699927330017, "learning_rate": 2.1315000000000002e-05, "loss": 0.1609, "step": 7108 }, { "epoch": 17.25516403402187, "grad_norm": 0.6816322207450867, "learning_rate": 2.1318e-05, "loss": 0.1056, "step": 7109 }, { "epoch": 17.257594167679223, "grad_norm": 0.6941370368003845, "learning_rate": 2.1321e-05, "loss": 0.1138, "step": 7110 }, { "epoch": 17.260024301336575, "grad_norm": 0.7287557721138, "learning_rate": 2.1324e-05, "loss": 0.0749, "step": 7111 }, { "epoch": 17.262454434993924, "grad_norm": 0.6587695479393005, "learning_rate": 2.1327e-05, "loss": 
0.0836, "step": 7112 }, { "epoch": 17.264884568651276, "grad_norm": 0.43286076188087463, "learning_rate": 2.133e-05, "loss": 0.0414, "step": 7113 }, { "epoch": 17.267314702308628, "grad_norm": 0.41478997468948364, "learning_rate": 2.1333e-05, "loss": 0.0524, "step": 7114 }, { "epoch": 17.269744835965977, "grad_norm": 0.708457887172699, "learning_rate": 2.1336000000000004e-05, "loss": 0.0397, "step": 7115 }, { "epoch": 17.27217496962333, "grad_norm": 0.7809616923332214, "learning_rate": 2.1339e-05, "loss": 0.0492, "step": 7116 }, { "epoch": 17.27460510328068, "grad_norm": 0.5450369119644165, "learning_rate": 2.1342e-05, "loss": 0.0367, "step": 7117 }, { "epoch": 17.277035236938033, "grad_norm": 0.5219610333442688, "learning_rate": 2.1345e-05, "loss": 0.0459, "step": 7118 }, { "epoch": 17.279465370595382, "grad_norm": 0.40121856331825256, "learning_rate": 2.1348e-05, "loss": 0.0297, "step": 7119 }, { "epoch": 17.281895504252734, "grad_norm": 0.4897347092628479, "learning_rate": 2.1351e-05, "loss": 0.04, "step": 7120 }, { "epoch": 17.284325637910086, "grad_norm": 0.46678653359413147, "learning_rate": 2.1354e-05, "loss": 0.0314, "step": 7121 }, { "epoch": 17.286755771567435, "grad_norm": 0.6858256459236145, "learning_rate": 2.1357e-05, "loss": 0.0525, "step": 7122 }, { "epoch": 17.289185905224787, "grad_norm": 0.8006277680397034, "learning_rate": 2.136e-05, "loss": 0.0228, "step": 7123 }, { "epoch": 17.29161603888214, "grad_norm": 0.5224720239639282, "learning_rate": 2.1363e-05, "loss": 0.0406, "step": 7124 }, { "epoch": 17.29404617253949, "grad_norm": 0.6745660901069641, "learning_rate": 2.1366000000000002e-05, "loss": 0.0348, "step": 7125 }, { "epoch": 17.29647630619684, "grad_norm": 0.543603777885437, "learning_rate": 2.1369e-05, "loss": 0.0379, "step": 7126 }, { "epoch": 17.298906439854193, "grad_norm": 0.4947296977043152, "learning_rate": 2.1372e-05, "loss": 0.0281, "step": 7127 }, { "epoch": 17.30133657351154, "grad_norm": 0.45814013481140137, "learning_rate": 
2.1375e-05, "loss": 0.0185, "step": 7128 }, { "epoch": 17.303766707168894, "grad_norm": 0.623613178730011, "learning_rate": 2.1378e-05, "loss": 0.0582, "step": 7129 }, { "epoch": 17.306196840826246, "grad_norm": 0.37609702348709106, "learning_rate": 2.1381e-05, "loss": 0.0293, "step": 7130 }, { "epoch": 17.308626974483598, "grad_norm": 0.6468253135681152, "learning_rate": 2.1384e-05, "loss": 0.032, "step": 7131 }, { "epoch": 17.311057108140947, "grad_norm": 0.44200268387794495, "learning_rate": 2.1387e-05, "loss": 0.0326, "step": 7132 }, { "epoch": 17.3134872417983, "grad_norm": 0.3887631297111511, "learning_rate": 2.139e-05, "loss": 0.0322, "step": 7133 }, { "epoch": 17.31591737545565, "grad_norm": 0.3693605065345764, "learning_rate": 2.1393e-05, "loss": 0.0222, "step": 7134 }, { "epoch": 17.318347509113, "grad_norm": 0.4448549151420593, "learning_rate": 2.1396e-05, "loss": 0.0292, "step": 7135 }, { "epoch": 17.320777642770352, "grad_norm": 0.6535860896110535, "learning_rate": 2.1399000000000003e-05, "loss": 0.0289, "step": 7136 }, { "epoch": 17.323207776427704, "grad_norm": 0.4656779170036316, "learning_rate": 2.1402000000000003e-05, "loss": 0.0278, "step": 7137 }, { "epoch": 17.325637910085053, "grad_norm": 0.9535194039344788, "learning_rate": 2.1405000000000003e-05, "loss": 0.0501, "step": 7138 }, { "epoch": 17.328068043742405, "grad_norm": 0.5544360280036926, "learning_rate": 2.1408000000000002e-05, "loss": 0.04, "step": 7139 }, { "epoch": 17.330498177399758, "grad_norm": 0.5993378758430481, "learning_rate": 2.1411000000000002e-05, "loss": 0.0302, "step": 7140 }, { "epoch": 17.33292831105711, "grad_norm": 1.0252805948257446, "learning_rate": 2.1414e-05, "loss": 0.0314, "step": 7141 }, { "epoch": 17.33535844471446, "grad_norm": 0.6052113771438599, "learning_rate": 2.1417e-05, "loss": 0.026, "step": 7142 }, { "epoch": 17.33778857837181, "grad_norm": 0.7260215282440186, "learning_rate": 2.1419999999999998e-05, "loss": 0.0364, "step": 7143 }, { "epoch": 
17.340218712029163, "grad_norm": 0.6997067332267761, "learning_rate": 2.1422999999999998e-05, "loss": 0.0573, "step": 7144 }, { "epoch": 17.34264884568651, "grad_norm": 0.7522026300430298, "learning_rate": 2.1425999999999998e-05, "loss": 0.0257, "step": 7145 }, { "epoch": 17.345078979343864, "grad_norm": 1.0879892110824585, "learning_rate": 2.1429e-05, "loss": 0.0372, "step": 7146 }, { "epoch": 17.347509113001216, "grad_norm": 0.5508469343185425, "learning_rate": 2.1432e-05, "loss": 0.0261, "step": 7147 }, { "epoch": 17.349939246658565, "grad_norm": 0.5571648478507996, "learning_rate": 2.1435e-05, "loss": 0.0241, "step": 7148 }, { "epoch": 17.352369380315917, "grad_norm": 0.5870122313499451, "learning_rate": 2.1438e-05, "loss": 0.0277, "step": 7149 }, { "epoch": 17.35479951397327, "grad_norm": 1.0835299491882324, "learning_rate": 2.1441e-05, "loss": 0.0477, "step": 7150 }, { "epoch": 17.35722964763062, "grad_norm": 0.9929951429367065, "learning_rate": 2.1444e-05, "loss": 0.0569, "step": 7151 }, { "epoch": 17.35965978128797, "grad_norm": 0.439562052488327, "learning_rate": 2.1447e-05, "loss": 0.0301, "step": 7152 }, { "epoch": 17.362089914945322, "grad_norm": 0.7067778706550598, "learning_rate": 2.145e-05, "loss": 0.034, "step": 7153 }, { "epoch": 17.364520048602675, "grad_norm": 1.7131197452545166, "learning_rate": 2.1453e-05, "loss": 0.0953, "step": 7154 }, { "epoch": 17.366950182260023, "grad_norm": 0.7528690099716187, "learning_rate": 2.1456e-05, "loss": 0.2664, "step": 7155 }, { "epoch": 17.369380315917375, "grad_norm": 0.5563688278198242, "learning_rate": 2.1459000000000002e-05, "loss": 0.178, "step": 7156 }, { "epoch": 17.371810449574728, "grad_norm": 0.5709507465362549, "learning_rate": 2.1462000000000002e-05, "loss": 0.1232, "step": 7157 }, { "epoch": 17.374240583232076, "grad_norm": 0.7300594449043274, "learning_rate": 2.1465000000000002e-05, "loss": 0.1333, "step": 7158 }, { "epoch": 17.37667071688943, "grad_norm": 3.4996743202209473, "learning_rate": 
2.1468000000000002e-05, "loss": 0.1472, "step": 7159 }, { "epoch": 17.37910085054678, "grad_norm": 1.0870391130447388, "learning_rate": 2.1471e-05, "loss": 0.09, "step": 7160 }, { "epoch": 17.381530984204133, "grad_norm": 0.6254615783691406, "learning_rate": 2.1474e-05, "loss": 0.102, "step": 7161 }, { "epoch": 17.38396111786148, "grad_norm": 0.6287403702735901, "learning_rate": 2.1477e-05, "loss": 0.0987, "step": 7162 }, { "epoch": 17.386391251518834, "grad_norm": 0.5448299050331116, "learning_rate": 2.148e-05, "loss": 0.0774, "step": 7163 }, { "epoch": 17.388821385176186, "grad_norm": 0.350394070148468, "learning_rate": 2.1483e-05, "loss": 0.0397, "step": 7164 }, { "epoch": 17.391251518833535, "grad_norm": 0.4196065068244934, "learning_rate": 2.1486e-05, "loss": 0.0485, "step": 7165 }, { "epoch": 17.393681652490887, "grad_norm": 0.7827077507972717, "learning_rate": 2.1489e-05, "loss": 0.0537, "step": 7166 }, { "epoch": 17.39611178614824, "grad_norm": 0.3561340272426605, "learning_rate": 2.1492e-05, "loss": 0.0412, "step": 7167 }, { "epoch": 17.398541919805588, "grad_norm": 0.26198044419288635, "learning_rate": 2.1495e-05, "loss": 0.0236, "step": 7168 }, { "epoch": 17.40097205346294, "grad_norm": 0.41839292645454407, "learning_rate": 2.1498e-05, "loss": 0.0301, "step": 7169 }, { "epoch": 17.403402187120292, "grad_norm": 0.447734534740448, "learning_rate": 2.1501e-05, "loss": 0.052, "step": 7170 }, { "epoch": 17.40583232077764, "grad_norm": 0.5228763222694397, "learning_rate": 2.1504e-05, "loss": 0.0443, "step": 7171 }, { "epoch": 17.408262454434993, "grad_norm": 0.6634384989738464, "learning_rate": 2.1507e-05, "loss": 0.0276, "step": 7172 }, { "epoch": 17.410692588092346, "grad_norm": 0.539994478225708, "learning_rate": 2.151e-05, "loss": 0.0359, "step": 7173 }, { "epoch": 17.413122721749698, "grad_norm": 0.47566407918930054, "learning_rate": 2.1513e-05, "loss": 0.025, "step": 7174 }, { "epoch": 17.415552855407046, "grad_norm": 0.4918951690196991, "learning_rate": 
2.1516e-05, "loss": 0.0314, "step": 7175 }, { "epoch": 17.4179829890644, "grad_norm": 0.6404168605804443, "learning_rate": 2.1519000000000002e-05, "loss": 0.0323, "step": 7176 }, { "epoch": 17.42041312272175, "grad_norm": 0.4398452937602997, "learning_rate": 2.1522e-05, "loss": 0.033, "step": 7177 }, { "epoch": 17.4228432563791, "grad_norm": 0.5313011407852173, "learning_rate": 2.1525e-05, "loss": 0.0303, "step": 7178 }, { "epoch": 17.425273390036452, "grad_norm": 0.7409072518348694, "learning_rate": 2.1528e-05, "loss": 0.0323, "step": 7179 }, { "epoch": 17.427703523693804, "grad_norm": 0.648734986782074, "learning_rate": 2.1531e-05, "loss": 0.0332, "step": 7180 }, { "epoch": 17.430133657351153, "grad_norm": 0.6005830764770508, "learning_rate": 2.1534e-05, "loss": 0.0281, "step": 7181 }, { "epoch": 17.432563791008505, "grad_norm": 1.2305489778518677, "learning_rate": 2.1537e-05, "loss": 0.0427, "step": 7182 }, { "epoch": 17.434993924665857, "grad_norm": 0.4467024505138397, "learning_rate": 2.154e-05, "loss": 0.0182, "step": 7183 }, { "epoch": 17.43742405832321, "grad_norm": 0.4176732003688812, "learning_rate": 2.1543e-05, "loss": 0.0254, "step": 7184 }, { "epoch": 17.439854191980558, "grad_norm": 0.4686618745326996, "learning_rate": 2.1546e-05, "loss": 0.0287, "step": 7185 }, { "epoch": 17.44228432563791, "grad_norm": 0.7144907712936401, "learning_rate": 2.1549000000000003e-05, "loss": 0.035, "step": 7186 }, { "epoch": 17.444714459295263, "grad_norm": 0.619804859161377, "learning_rate": 2.1552000000000003e-05, "loss": 0.0373, "step": 7187 }, { "epoch": 17.44714459295261, "grad_norm": 1.4306362867355347, "learning_rate": 2.1555000000000003e-05, "loss": 0.1117, "step": 7188 }, { "epoch": 17.449574726609963, "grad_norm": 0.4054167866706848, "learning_rate": 2.1558000000000003e-05, "loss": 0.0195, "step": 7189 }, { "epoch": 17.452004860267316, "grad_norm": 0.7767285704612732, "learning_rate": 2.1561e-05, "loss": 0.043, "step": 7190 }, { "epoch": 17.454434993924664, 
"grad_norm": 0.5924113988876343, "learning_rate": 2.1564e-05, "loss": 0.0333, "step": 7191 }, { "epoch": 17.456865127582017, "grad_norm": 0.38191235065460205, "learning_rate": 2.1567e-05, "loss": 0.0192, "step": 7192 }, { "epoch": 17.45929526123937, "grad_norm": 0.48516061902046204, "learning_rate": 2.157e-05, "loss": 0.0323, "step": 7193 }, { "epoch": 17.46172539489672, "grad_norm": 0.38137051463127136, "learning_rate": 2.1572999999999998e-05, "loss": 0.0329, "step": 7194 }, { "epoch": 17.46415552855407, "grad_norm": 1.147491216659546, "learning_rate": 2.1575999999999998e-05, "loss": 0.0302, "step": 7195 }, { "epoch": 17.466585662211422, "grad_norm": 0.7110522389411926, "learning_rate": 2.1579e-05, "loss": 0.038, "step": 7196 }, { "epoch": 17.469015795868774, "grad_norm": 0.2834237813949585, "learning_rate": 2.1582e-05, "loss": 0.0207, "step": 7197 }, { "epoch": 17.471445929526123, "grad_norm": 1.1489514112472534, "learning_rate": 2.1585e-05, "loss": 0.0552, "step": 7198 }, { "epoch": 17.473876063183475, "grad_norm": 0.5218419432640076, "learning_rate": 2.1588e-05, "loss": 0.0399, "step": 7199 }, { "epoch": 17.476306196840827, "grad_norm": 0.8527503609657288, "learning_rate": 2.1591e-05, "loss": 0.0505, "step": 7200 }, { "epoch": 17.478736330498176, "grad_norm": 0.7157760262489319, "learning_rate": 2.1594e-05, "loss": 0.0371, "step": 7201 }, { "epoch": 17.481166464155528, "grad_norm": 0.9246702194213867, "learning_rate": 2.1597e-05, "loss": 0.0288, "step": 7202 }, { "epoch": 17.48359659781288, "grad_norm": 1.7576159238815308, "learning_rate": 2.16e-05, "loss": 0.1341, "step": 7203 }, { "epoch": 17.48602673147023, "grad_norm": 2.3436760902404785, "learning_rate": 2.1603e-05, "loss": 0.0623, "step": 7204 }, { "epoch": 17.48845686512758, "grad_norm": 1.1047630310058594, "learning_rate": 2.1606e-05, "loss": 0.245, "step": 7205 }, { "epoch": 17.490886998784934, "grad_norm": 0.7493822574615479, "learning_rate": 2.1609000000000003e-05, "loss": 0.2087, "step": 7206 }, { 
"epoch": 17.493317132442286, "grad_norm": 0.5450553894042969, "learning_rate": 2.1612000000000002e-05, "loss": 0.1338, "step": 7207 }, { "epoch": 17.495747266099634, "grad_norm": 0.6274949908256531, "learning_rate": 2.1615000000000002e-05, "loss": 0.1436, "step": 7208 }, { "epoch": 17.498177399756987, "grad_norm": 0.7783122062683105, "learning_rate": 2.1618000000000002e-05, "loss": 0.1238, "step": 7209 }, { "epoch": 17.50060753341434, "grad_norm": 0.8079338073730469, "learning_rate": 2.1621000000000002e-05, "loss": 0.1068, "step": 7210 }, { "epoch": 17.503037667071688, "grad_norm": 0.5539601445198059, "learning_rate": 2.1624e-05, "loss": 0.0888, "step": 7211 }, { "epoch": 17.50546780072904, "grad_norm": 0.518061637878418, "learning_rate": 2.1627e-05, "loss": 0.0797, "step": 7212 }, { "epoch": 17.507897934386392, "grad_norm": 0.8252894878387451, "learning_rate": 2.163e-05, "loss": 0.1305, "step": 7213 }, { "epoch": 17.51032806804374, "grad_norm": 0.6491341590881348, "learning_rate": 2.1633e-05, "loss": 0.0803, "step": 7214 }, { "epoch": 17.512758201701093, "grad_norm": 1.3171859979629517, "learning_rate": 2.1635999999999997e-05, "loss": 0.0703, "step": 7215 }, { "epoch": 17.515188335358445, "grad_norm": 0.4615323841571808, "learning_rate": 2.1639e-05, "loss": 0.0332, "step": 7216 }, { "epoch": 17.517618469015797, "grad_norm": 0.4326940178871155, "learning_rate": 2.1642e-05, "loss": 0.0438, "step": 7217 }, { "epoch": 17.520048602673146, "grad_norm": 0.4560588598251343, "learning_rate": 2.1645e-05, "loss": 0.0411, "step": 7218 }, { "epoch": 17.5224787363305, "grad_norm": 0.5002053380012512, "learning_rate": 2.1648e-05, "loss": 0.0421, "step": 7219 }, { "epoch": 17.52490886998785, "grad_norm": 0.45459944009780884, "learning_rate": 2.1651e-05, "loss": 0.049, "step": 7220 }, { "epoch": 17.5273390036452, "grad_norm": 0.6359400153160095, "learning_rate": 2.1654e-05, "loss": 0.0303, "step": 7221 }, { "epoch": 17.52976913730255, "grad_norm": 0.5176395177841187, 
"learning_rate": 2.1657e-05, "loss": 0.0473, "step": 7222 }, { "epoch": 17.532199270959904, "grad_norm": 0.5475077033042908, "learning_rate": 2.166e-05, "loss": 0.05, "step": 7223 }, { "epoch": 17.534629404617252, "grad_norm": 0.36065489053726196, "learning_rate": 2.1663e-05, "loss": 0.024, "step": 7224 }, { "epoch": 17.537059538274605, "grad_norm": 0.5422410368919373, "learning_rate": 2.1666e-05, "loss": 0.0522, "step": 7225 }, { "epoch": 17.539489671931957, "grad_norm": 0.4416055679321289, "learning_rate": 2.1669000000000002e-05, "loss": 0.0199, "step": 7226 }, { "epoch": 17.54191980558931, "grad_norm": 0.4268188774585724, "learning_rate": 2.1672000000000002e-05, "loss": 0.0275, "step": 7227 }, { "epoch": 17.544349939246658, "grad_norm": 0.5781423449516296, "learning_rate": 2.1675e-05, "loss": 0.04, "step": 7228 }, { "epoch": 17.54678007290401, "grad_norm": 0.5559223890304565, "learning_rate": 2.1678e-05, "loss": 0.0284, "step": 7229 }, { "epoch": 17.549210206561362, "grad_norm": 0.39147916436195374, "learning_rate": 2.1681e-05, "loss": 0.0294, "step": 7230 }, { "epoch": 17.55164034021871, "grad_norm": 0.556408703327179, "learning_rate": 2.1684e-05, "loss": 0.0261, "step": 7231 }, { "epoch": 17.554070473876063, "grad_norm": 0.4616871476173401, "learning_rate": 2.1687e-05, "loss": 0.0321, "step": 7232 }, { "epoch": 17.556500607533415, "grad_norm": 0.6032828092575073, "learning_rate": 2.169e-05, "loss": 0.0389, "step": 7233 }, { "epoch": 17.558930741190764, "grad_norm": 0.4446212649345398, "learning_rate": 2.1693e-05, "loss": 0.021, "step": 7234 }, { "epoch": 17.561360874848116, "grad_norm": 0.5156100392341614, "learning_rate": 2.1696e-05, "loss": 0.0251, "step": 7235 }, { "epoch": 17.56379100850547, "grad_norm": 0.7030059695243835, "learning_rate": 2.1699000000000003e-05, "loss": 0.0339, "step": 7236 }, { "epoch": 17.566221142162817, "grad_norm": 0.4418361783027649, "learning_rate": 2.1702000000000003e-05, "loss": 0.034, "step": 7237 }, { "epoch": 
17.56865127582017, "grad_norm": 0.5201946496963501, "learning_rate": 2.1705000000000003e-05, "loss": 0.0285, "step": 7238 }, { "epoch": 17.57108140947752, "grad_norm": 0.7185019254684448, "learning_rate": 2.1708e-05, "loss": 0.0444, "step": 7239 }, { "epoch": 17.573511543134874, "grad_norm": 0.47196879982948303, "learning_rate": 2.1711e-05, "loss": 0.0231, "step": 7240 }, { "epoch": 17.575941676792223, "grad_norm": 0.5375658273696899, "learning_rate": 2.1714e-05, "loss": 0.0391, "step": 7241 }, { "epoch": 17.578371810449575, "grad_norm": 0.9855847358703613, "learning_rate": 2.1717e-05, "loss": 0.0355, "step": 7242 }, { "epoch": 17.580801944106927, "grad_norm": 0.577311635017395, "learning_rate": 2.172e-05, "loss": 0.0266, "step": 7243 }, { "epoch": 17.583232077764276, "grad_norm": 1.614475965499878, "learning_rate": 2.1723e-05, "loss": 0.0619, "step": 7244 }, { "epoch": 17.585662211421628, "grad_norm": 0.6637312769889832, "learning_rate": 2.1726e-05, "loss": 0.0378, "step": 7245 }, { "epoch": 17.58809234507898, "grad_norm": 0.7455545663833618, "learning_rate": 2.1729e-05, "loss": 0.0282, "step": 7246 }, { "epoch": 17.59052247873633, "grad_norm": 0.46546441316604614, "learning_rate": 2.1732e-05, "loss": 0.0296, "step": 7247 }, { "epoch": 17.59295261239368, "grad_norm": 1.3809385299682617, "learning_rate": 2.1735e-05, "loss": 0.0405, "step": 7248 }, { "epoch": 17.595382746051033, "grad_norm": 0.5700558423995972, "learning_rate": 2.1738e-05, "loss": 0.0366, "step": 7249 }, { "epoch": 17.597812879708385, "grad_norm": 0.5671756267547607, "learning_rate": 2.1741e-05, "loss": 0.0287, "step": 7250 }, { "epoch": 17.600243013365734, "grad_norm": 1.248019814491272, "learning_rate": 2.1744e-05, "loss": 0.0296, "step": 7251 }, { "epoch": 17.602673147023086, "grad_norm": 0.8477751612663269, "learning_rate": 2.1747e-05, "loss": 0.0514, "step": 7252 }, { "epoch": 17.60510328068044, "grad_norm": 1.4922266006469727, "learning_rate": 2.175e-05, "loss": 0.0515, "step": 7253 }, { 
"epoch": 17.607533414337787, "grad_norm": 2.1408605575561523, "learning_rate": 2.1753e-05, "loss": 0.0819, "step": 7254 }, { "epoch": 17.60996354799514, "grad_norm": 1.3710750341415405, "learning_rate": 2.1756e-05, "loss": 0.2688, "step": 7255 }, { "epoch": 17.61239368165249, "grad_norm": 1.0551910400390625, "learning_rate": 2.1759e-05, "loss": 0.1959, "step": 7256 }, { "epoch": 17.61482381530984, "grad_norm": 0.7421474456787109, "learning_rate": 2.1762000000000003e-05, "loss": 0.1701, "step": 7257 }, { "epoch": 17.617253948967193, "grad_norm": 0.5208773612976074, "learning_rate": 2.1765000000000003e-05, "loss": 0.1195, "step": 7258 }, { "epoch": 17.619684082624545, "grad_norm": 0.6217235326766968, "learning_rate": 2.1768000000000002e-05, "loss": 0.1206, "step": 7259 }, { "epoch": 17.622114216281897, "grad_norm": 0.8263227343559265, "learning_rate": 2.1771000000000002e-05, "loss": 0.1357, "step": 7260 }, { "epoch": 17.624544349939246, "grad_norm": 0.6323230862617493, "learning_rate": 2.1774000000000002e-05, "loss": 0.1004, "step": 7261 }, { "epoch": 17.626974483596598, "grad_norm": 0.40134871006011963, "learning_rate": 2.1777000000000002e-05, "loss": 0.0572, "step": 7262 }, { "epoch": 17.62940461725395, "grad_norm": 0.5578452944755554, "learning_rate": 2.178e-05, "loss": 0.0514, "step": 7263 }, { "epoch": 17.6318347509113, "grad_norm": 0.5153611302375793, "learning_rate": 2.1782999999999998e-05, "loss": 0.0456, "step": 7264 }, { "epoch": 17.63426488456865, "grad_norm": 0.5834744572639465, "learning_rate": 2.1785999999999998e-05, "loss": 0.0744, "step": 7265 }, { "epoch": 17.636695018226003, "grad_norm": 0.6074334979057312, "learning_rate": 2.1788999999999998e-05, "loss": 0.0671, "step": 7266 }, { "epoch": 17.639125151883352, "grad_norm": 0.5278176665306091, "learning_rate": 2.1792e-05, "loss": 0.0309, "step": 7267 }, { "epoch": 17.641555285540704, "grad_norm": 0.5052857995033264, "learning_rate": 2.1795e-05, "loss": 0.0482, "step": 7268 }, { "epoch": 
17.643985419198057, "grad_norm": 0.5198523998260498, "learning_rate": 2.1798e-05, "loss": 0.0385, "step": 7269 }, { "epoch": 17.64641555285541, "grad_norm": 0.41169556975364685, "learning_rate": 2.1801e-05, "loss": 0.0426, "step": 7270 }, { "epoch": 17.648845686512757, "grad_norm": 0.6613333821296692, "learning_rate": 2.1804e-05, "loss": 0.0444, "step": 7271 }, { "epoch": 17.65127582017011, "grad_norm": 0.4951570928096771, "learning_rate": 2.1807e-05, "loss": 0.0471, "step": 7272 }, { "epoch": 17.653705953827462, "grad_norm": 0.35300755500793457, "learning_rate": 2.181e-05, "loss": 0.0216, "step": 7273 }, { "epoch": 17.65613608748481, "grad_norm": 0.47739872336387634, "learning_rate": 2.1813e-05, "loss": 0.0426, "step": 7274 }, { "epoch": 17.658566221142163, "grad_norm": 0.5197376608848572, "learning_rate": 2.1816e-05, "loss": 0.0295, "step": 7275 }, { "epoch": 17.660996354799515, "grad_norm": 1.1161011457443237, "learning_rate": 2.1819e-05, "loss": 0.0433, "step": 7276 }, { "epoch": 17.663426488456864, "grad_norm": 0.7107424736022949, "learning_rate": 2.1822000000000002e-05, "loss": 0.0468, "step": 7277 }, { "epoch": 17.665856622114216, "grad_norm": 0.32309335470199585, "learning_rate": 2.1825000000000002e-05, "loss": 0.0187, "step": 7278 }, { "epoch": 17.668286755771568, "grad_norm": 1.0096654891967773, "learning_rate": 2.1828000000000002e-05, "loss": 0.0444, "step": 7279 }, { "epoch": 17.670716889428917, "grad_norm": 0.649726152420044, "learning_rate": 2.1831e-05, "loss": 0.0776, "step": 7280 }, { "epoch": 17.67314702308627, "grad_norm": 0.5251705050468445, "learning_rate": 2.1834e-05, "loss": 0.0279, "step": 7281 }, { "epoch": 17.67557715674362, "grad_norm": 0.7543448209762573, "learning_rate": 2.1837e-05, "loss": 0.0394, "step": 7282 }, { "epoch": 17.678007290400974, "grad_norm": 0.41344311833381653, "learning_rate": 2.184e-05, "loss": 0.0296, "step": 7283 }, { "epoch": 17.680437424058322, "grad_norm": 0.9614694118499756, "learning_rate": 2.1843e-05, "loss": 
0.0369, "step": 7284 }, { "epoch": 17.682867557715674, "grad_norm": 0.4257751405239105, "learning_rate": 2.1846e-05, "loss": 0.0303, "step": 7285 }, { "epoch": 17.685297691373027, "grad_norm": 0.366573691368103, "learning_rate": 2.1849e-05, "loss": 0.0248, "step": 7286 }, { "epoch": 17.687727825030375, "grad_norm": 0.6451033353805542, "learning_rate": 2.1852000000000004e-05, "loss": 0.0409, "step": 7287 }, { "epoch": 17.690157958687728, "grad_norm": 0.7473973631858826, "learning_rate": 2.1855e-05, "loss": 0.0369, "step": 7288 }, { "epoch": 17.69258809234508, "grad_norm": 3.343665361404419, "learning_rate": 2.1858e-05, "loss": 0.0394, "step": 7289 }, { "epoch": 17.69501822600243, "grad_norm": 0.700553834438324, "learning_rate": 2.1861e-05, "loss": 0.0475, "step": 7290 }, { "epoch": 17.69744835965978, "grad_norm": 1.3466922044754028, "learning_rate": 2.1864e-05, "loss": 0.0369, "step": 7291 }, { "epoch": 17.699878493317133, "grad_norm": 0.5693539977073669, "learning_rate": 2.1867e-05, "loss": 0.0277, "step": 7292 }, { "epoch": 17.702308626974485, "grad_norm": 0.8019313216209412, "learning_rate": 2.187e-05, "loss": 0.0423, "step": 7293 }, { "epoch": 17.704738760631834, "grad_norm": 1.4904931783676147, "learning_rate": 2.1873e-05, "loss": 0.0472, "step": 7294 }, { "epoch": 17.707168894289186, "grad_norm": 0.5169356465339661, "learning_rate": 2.1876e-05, "loss": 0.0296, "step": 7295 }, { "epoch": 17.70959902794654, "grad_norm": 0.8369303941726685, "learning_rate": 2.1879e-05, "loss": 0.0431, "step": 7296 }, { "epoch": 17.712029161603887, "grad_norm": 0.4786182641983032, "learning_rate": 2.1882e-05, "loss": 0.0308, "step": 7297 }, { "epoch": 17.71445929526124, "grad_norm": 0.6294431090354919, "learning_rate": 2.1885e-05, "loss": 0.0266, "step": 7298 }, { "epoch": 17.71688942891859, "grad_norm": 0.45072367787361145, "learning_rate": 2.1888e-05, "loss": 0.027, "step": 7299 }, { "epoch": 17.71931956257594, "grad_norm": 0.7054076194763184, "learning_rate": 2.1891e-05, 
"loss": 0.0295, "step": 7300 }, { "epoch": 17.721749696233292, "grad_norm": 0.9287655353546143, "learning_rate": 2.1894e-05, "loss": 0.0603, "step": 7301 }, { "epoch": 17.724179829890645, "grad_norm": 0.8972072601318359, "learning_rate": 2.1897e-05, "loss": 0.0469, "step": 7302 }, { "epoch": 17.726609963547997, "grad_norm": 0.979400098323822, "learning_rate": 2.19e-05, "loss": 0.0471, "step": 7303 }, { "epoch": 17.729040097205345, "grad_norm": 1.0660254955291748, "learning_rate": 2.1903e-05, "loss": 0.0559, "step": 7304 }, { "epoch": 17.731470230862698, "grad_norm": 1.4192157983779907, "learning_rate": 2.1906e-05, "loss": 0.2689, "step": 7305 }, { "epoch": 17.73390036452005, "grad_norm": 0.7092727422714233, "learning_rate": 2.1909e-05, "loss": 0.2271, "step": 7306 }, { "epoch": 17.7363304981774, "grad_norm": 0.7942292094230652, "learning_rate": 2.1912000000000003e-05, "loss": 0.1774, "step": 7307 }, { "epoch": 17.73876063183475, "grad_norm": 0.8896129131317139, "learning_rate": 2.1915000000000003e-05, "loss": 0.1447, "step": 7308 }, { "epoch": 17.741190765492103, "grad_norm": 0.8643721342086792, "learning_rate": 2.1918000000000003e-05, "loss": 0.1646, "step": 7309 }, { "epoch": 17.74362089914945, "grad_norm": 0.6335258483886719, "learning_rate": 2.1921000000000002e-05, "loss": 0.095, "step": 7310 }, { "epoch": 17.746051032806804, "grad_norm": 0.8081222772598267, "learning_rate": 2.1924000000000002e-05, "loss": 0.1126, "step": 7311 }, { "epoch": 17.748481166464156, "grad_norm": 0.7703576683998108, "learning_rate": 2.1927000000000002e-05, "loss": 0.0972, "step": 7312 }, { "epoch": 17.75091130012151, "grad_norm": 0.5635064244270325, "learning_rate": 2.193e-05, "loss": 0.0699, "step": 7313 }, { "epoch": 17.753341433778857, "grad_norm": 0.4709608852863312, "learning_rate": 2.1932999999999998e-05, "loss": 0.0584, "step": 7314 }, { "epoch": 17.75577156743621, "grad_norm": 0.4259437322616577, "learning_rate": 2.1935999999999998e-05, "loss": 0.0512, "step": 7315 }, { 
"epoch": 17.75820170109356, "grad_norm": 0.4814186990261078, "learning_rate": 2.1938999999999998e-05, "loss": 0.0443, "step": 7316 }, { "epoch": 17.76063183475091, "grad_norm": 0.6562609672546387, "learning_rate": 2.1942e-05, "loss": 0.0556, "step": 7317 }, { "epoch": 17.763061968408262, "grad_norm": 0.46185389161109924, "learning_rate": 2.1945e-05, "loss": 0.0428, "step": 7318 }, { "epoch": 17.765492102065615, "grad_norm": 0.6201338171958923, "learning_rate": 2.1948e-05, "loss": 0.0486, "step": 7319 }, { "epoch": 17.767922235722963, "grad_norm": 0.4453549087047577, "learning_rate": 2.1951e-05, "loss": 0.0362, "step": 7320 }, { "epoch": 17.770352369380316, "grad_norm": 0.5933368802070618, "learning_rate": 2.1954e-05, "loss": 0.0653, "step": 7321 }, { "epoch": 17.772782503037668, "grad_norm": 0.7601743936538696, "learning_rate": 2.1957e-05, "loss": 0.032, "step": 7322 }, { "epoch": 17.775212636695016, "grad_norm": 0.4874424934387207, "learning_rate": 2.196e-05, "loss": 0.0382, "step": 7323 }, { "epoch": 17.77764277035237, "grad_norm": 0.693382978439331, "learning_rate": 2.1963e-05, "loss": 0.0522, "step": 7324 }, { "epoch": 17.78007290400972, "grad_norm": 0.4631690979003906, "learning_rate": 2.1966e-05, "loss": 0.0334, "step": 7325 }, { "epoch": 17.782503037667073, "grad_norm": 0.5232514142990112, "learning_rate": 2.1969e-05, "loss": 0.0417, "step": 7326 }, { "epoch": 17.784933171324422, "grad_norm": 0.5247769951820374, "learning_rate": 2.1972000000000002e-05, "loss": 0.0537, "step": 7327 }, { "epoch": 17.787363304981774, "grad_norm": 1.391687035560608, "learning_rate": 2.1975000000000002e-05, "loss": 0.027, "step": 7328 }, { "epoch": 17.789793438639126, "grad_norm": 0.3018486499786377, "learning_rate": 2.1978000000000002e-05, "loss": 0.0169, "step": 7329 }, { "epoch": 17.792223572296475, "grad_norm": 0.3941092789173126, "learning_rate": 2.1981000000000002e-05, "loss": 0.026, "step": 7330 }, { "epoch": 17.794653705953827, "grad_norm": 0.512395441532135, 
"learning_rate": 2.1984e-05, "loss": 0.0327, "step": 7331 }, { "epoch": 17.79708383961118, "grad_norm": 0.6149512529373169, "learning_rate": 2.1987e-05, "loss": 0.0314, "step": 7332 }, { "epoch": 17.799513973268528, "grad_norm": 0.5918566584587097, "learning_rate": 2.199e-05, "loss": 0.0278, "step": 7333 }, { "epoch": 17.80194410692588, "grad_norm": 0.4121077358722687, "learning_rate": 2.1993e-05, "loss": 0.0269, "step": 7334 }, { "epoch": 17.804374240583233, "grad_norm": 0.37337803840637207, "learning_rate": 2.1996e-05, "loss": 0.0213, "step": 7335 }, { "epoch": 17.806804374240585, "grad_norm": 0.46516934037208557, "learning_rate": 2.1999e-05, "loss": 0.0403, "step": 7336 }, { "epoch": 17.809234507897933, "grad_norm": 0.3655526340007782, "learning_rate": 2.2002e-05, "loss": 0.0311, "step": 7337 }, { "epoch": 17.811664641555286, "grad_norm": 0.416155606508255, "learning_rate": 2.2005e-05, "loss": 0.0191, "step": 7338 }, { "epoch": 17.814094775212638, "grad_norm": 0.45566895604133606, "learning_rate": 2.2008e-05, "loss": 0.0302, "step": 7339 }, { "epoch": 17.816524908869987, "grad_norm": 0.5861093997955322, "learning_rate": 2.2011e-05, "loss": 0.0355, "step": 7340 }, { "epoch": 17.81895504252734, "grad_norm": 0.5568758845329285, "learning_rate": 2.2014e-05, "loss": 0.033, "step": 7341 }, { "epoch": 17.82138517618469, "grad_norm": 0.5866572260856628, "learning_rate": 2.2017e-05, "loss": 0.0199, "step": 7342 }, { "epoch": 17.82381530984204, "grad_norm": 0.6344202160835266, "learning_rate": 2.202e-05, "loss": 0.047, "step": 7343 }, { "epoch": 17.826245443499392, "grad_norm": 1.0434026718139648, "learning_rate": 2.2023e-05, "loss": 0.039, "step": 7344 }, { "epoch": 17.828675577156744, "grad_norm": 0.9556554555892944, "learning_rate": 2.2026e-05, "loss": 0.0401, "step": 7345 }, { "epoch": 17.831105710814096, "grad_norm": 0.7115185260772705, "learning_rate": 2.2029e-05, "loss": 0.0378, "step": 7346 }, { "epoch": 17.833535844471445, "grad_norm": 2.9165565967559814, 
"learning_rate": 2.2032000000000002e-05, "loss": 0.0348, "step": 7347 }, { "epoch": 17.835965978128797, "grad_norm": 0.6556969285011292, "learning_rate": 2.2035e-05, "loss": 0.0427, "step": 7348 }, { "epoch": 17.83839611178615, "grad_norm": 1.17570161819458, "learning_rate": 2.2038e-05, "loss": 0.047, "step": 7349 }, { "epoch": 17.8408262454435, "grad_norm": 0.8298782706260681, "learning_rate": 2.2041e-05, "loss": 0.0375, "step": 7350 }, { "epoch": 17.84325637910085, "grad_norm": 0.7157828211784363, "learning_rate": 2.2044e-05, "loss": 0.0373, "step": 7351 }, { "epoch": 17.845686512758203, "grad_norm": 1.5828678607940674, "learning_rate": 2.2047e-05, "loss": 0.0459, "step": 7352 }, { "epoch": 17.84811664641555, "grad_norm": 0.8428768515586853, "learning_rate": 2.205e-05, "loss": 0.0584, "step": 7353 }, { "epoch": 17.850546780072904, "grad_norm": 2.4368205070495605, "learning_rate": 2.2053e-05, "loss": 0.0924, "step": 7354 }, { "epoch": 17.852976913730256, "grad_norm": 1.0926275253295898, "learning_rate": 2.2056e-05, "loss": 0.2462, "step": 7355 }, { "epoch": 17.855407047387608, "grad_norm": 0.661887526512146, "learning_rate": 2.2059e-05, "loss": 0.1901, "step": 7356 }, { "epoch": 17.857837181044957, "grad_norm": 0.5799657702445984, "learning_rate": 2.2062000000000003e-05, "loss": 0.1509, "step": 7357 }, { "epoch": 17.86026731470231, "grad_norm": 0.5554527044296265, "learning_rate": 2.2065000000000003e-05, "loss": 0.1447, "step": 7358 }, { "epoch": 17.86269744835966, "grad_norm": 1.7850841283798218, "learning_rate": 2.2068000000000003e-05, "loss": 0.1665, "step": 7359 }, { "epoch": 17.86512758201701, "grad_norm": 0.712638258934021, "learning_rate": 2.2071000000000003e-05, "loss": 0.1132, "step": 7360 }, { "epoch": 17.867557715674362, "grad_norm": 0.6551244854927063, "learning_rate": 2.2074000000000002e-05, "loss": 0.0943, "step": 7361 }, { "epoch": 17.869987849331714, "grad_norm": 0.5771767497062683, "learning_rate": 2.2077e-05, "loss": 0.0639, "step": 7362 }, { 
"epoch": 17.872417982989063, "grad_norm": 0.5315906405448914, "learning_rate": 2.208e-05, "loss": 0.0505, "step": 7363 }, { "epoch": 17.874848116646415, "grad_norm": 0.5189736485481262, "learning_rate": 2.2083e-05, "loss": 0.0542, "step": 7364 }, { "epoch": 17.877278250303767, "grad_norm": 0.4937689006328583, "learning_rate": 2.2085999999999998e-05, "loss": 0.0582, "step": 7365 }, { "epoch": 17.879708383961116, "grad_norm": 0.5724221467971802, "learning_rate": 2.2088999999999998e-05, "loss": 0.0348, "step": 7366 }, { "epoch": 17.88213851761847, "grad_norm": 0.7216218113899231, "learning_rate": 2.2092e-05, "loss": 0.0616, "step": 7367 }, { "epoch": 17.88456865127582, "grad_norm": 0.6966433525085449, "learning_rate": 2.2095e-05, "loss": 0.0511, "step": 7368 }, { "epoch": 17.886998784933173, "grad_norm": 0.4528290927410126, "learning_rate": 2.2098e-05, "loss": 0.0296, "step": 7369 }, { "epoch": 17.88942891859052, "grad_norm": 0.7261857986450195, "learning_rate": 2.2101e-05, "loss": 0.0407, "step": 7370 }, { "epoch": 17.891859052247874, "grad_norm": 0.7444187998771667, "learning_rate": 2.2104e-05, "loss": 0.0513, "step": 7371 }, { "epoch": 17.894289185905226, "grad_norm": 0.39775070548057556, "learning_rate": 2.2107e-05, "loss": 0.0422, "step": 7372 }, { "epoch": 17.896719319562575, "grad_norm": 0.5868496894836426, "learning_rate": 2.211e-05, "loss": 0.0316, "step": 7373 }, { "epoch": 17.899149453219927, "grad_norm": 0.4382152557373047, "learning_rate": 2.2113e-05, "loss": 0.0395, "step": 7374 }, { "epoch": 17.90157958687728, "grad_norm": 0.565409779548645, "learning_rate": 2.2116e-05, "loss": 0.0378, "step": 7375 }, { "epoch": 17.904009720534628, "grad_norm": 0.41061165928840637, "learning_rate": 2.2119e-05, "loss": 0.0266, "step": 7376 }, { "epoch": 17.90643985419198, "grad_norm": 0.9179980158805847, "learning_rate": 2.2122000000000003e-05, "loss": 0.0516, "step": 7377 }, { "epoch": 17.908869987849332, "grad_norm": 0.4623428285121918, "learning_rate": 
2.2125000000000002e-05, "loss": 0.0371, "step": 7378 }, { "epoch": 17.911300121506684, "grad_norm": 0.7103080749511719, "learning_rate": 2.2128000000000002e-05, "loss": 0.0373, "step": 7379 }, { "epoch": 17.913730255164033, "grad_norm": 0.40592437982559204, "learning_rate": 2.2131000000000002e-05, "loss": 0.0242, "step": 7380 }, { "epoch": 17.916160388821385, "grad_norm": 0.4345076382160187, "learning_rate": 2.2134000000000002e-05, "loss": 0.034, "step": 7381 }, { "epoch": 17.918590522478738, "grad_norm": 0.6162415742874146, "learning_rate": 2.2137e-05, "loss": 0.0411, "step": 7382 }, { "epoch": 17.921020656136086, "grad_norm": 1.302797555923462, "learning_rate": 2.214e-05, "loss": 0.0347, "step": 7383 }, { "epoch": 17.92345078979344, "grad_norm": 0.5689896941184998, "learning_rate": 2.2143e-05, "loss": 0.0354, "step": 7384 }, { "epoch": 17.92588092345079, "grad_norm": 0.538171648979187, "learning_rate": 2.2146e-05, "loss": 0.0328, "step": 7385 }, { "epoch": 17.92831105710814, "grad_norm": 0.31988251209259033, "learning_rate": 2.2149e-05, "loss": 0.0185, "step": 7386 }, { "epoch": 17.93074119076549, "grad_norm": 0.6139757037162781, "learning_rate": 2.2151999999999997e-05, "loss": 0.037, "step": 7387 }, { "epoch": 17.933171324422844, "grad_norm": 0.7157076597213745, "learning_rate": 2.2155e-05, "loss": 0.0375, "step": 7388 }, { "epoch": 17.935601458080196, "grad_norm": 0.5592228174209595, "learning_rate": 2.2158e-05, "loss": 0.0427, "step": 7389 }, { "epoch": 17.938031591737545, "grad_norm": 0.6267107725143433, "learning_rate": 2.2161e-05, "loss": 0.0415, "step": 7390 }, { "epoch": 17.940461725394897, "grad_norm": 0.5096871852874756, "learning_rate": 2.2164e-05, "loss": 0.028, "step": 7391 }, { "epoch": 17.94289185905225, "grad_norm": 0.29782283306121826, "learning_rate": 2.2167e-05, "loss": 0.0111, "step": 7392 }, { "epoch": 17.945321992709598, "grad_norm": 0.6952956318855286, "learning_rate": 2.217e-05, "loss": 0.0454, "step": 7393 }, { "epoch": 17.94775212636695, 
"grad_norm": 0.778166651725769, "learning_rate": 2.2173e-05, "loss": 0.0258, "step": 7394 }, { "epoch": 17.950182260024302, "grad_norm": 0.8766018152236938, "learning_rate": 2.2176e-05, "loss": 0.0336, "step": 7395 }, { "epoch": 17.95261239368165, "grad_norm": 0.5600637793540955, "learning_rate": 2.2179e-05, "loss": 0.0252, "step": 7396 }, { "epoch": 17.955042527339003, "grad_norm": 0.6789195537567139, "learning_rate": 2.2182e-05, "loss": 0.043, "step": 7397 }, { "epoch": 17.957472660996356, "grad_norm": 0.701492428779602, "learning_rate": 2.2185000000000002e-05, "loss": 0.0618, "step": 7398 }, { "epoch": 17.959902794653708, "grad_norm": 0.7434013485908508, "learning_rate": 2.2188e-05, "loss": 0.0394, "step": 7399 }, { "epoch": 17.962332928311056, "grad_norm": 1.467481255531311, "learning_rate": 2.2191e-05, "loss": 0.0463, "step": 7400 }, { "epoch": 17.96476306196841, "grad_norm": 0.8006847500801086, "learning_rate": 2.2194e-05, "loss": 0.039, "step": 7401 }, { "epoch": 17.96719319562576, "grad_norm": 1.1847296953201294, "learning_rate": 2.2197e-05, "loss": 0.0452, "step": 7402 }, { "epoch": 17.96962332928311, "grad_norm": 0.9839162826538086, "learning_rate": 2.22e-05, "loss": 0.0469, "step": 7403 }, { "epoch": 17.972053462940462, "grad_norm": 1.194332480430603, "learning_rate": 2.2203e-05, "loss": 0.1315, "step": 7404 }, { "epoch": 17.974483596597814, "grad_norm": 0.795265257358551, "learning_rate": 2.2206e-05, "loss": 0.1988, "step": 7405 }, { "epoch": 17.976913730255163, "grad_norm": 0.6734186410903931, "learning_rate": 2.2209e-05, "loss": 0.097, "step": 7406 }, { "epoch": 17.979343863912515, "grad_norm": 0.3871018886566162, "learning_rate": 2.2212e-05, "loss": 0.0412, "step": 7407 }, { "epoch": 17.981773997569867, "grad_norm": 0.5652511119842529, "learning_rate": 2.2215000000000003e-05, "loss": 0.0435, "step": 7408 }, { "epoch": 17.984204131227216, "grad_norm": 0.6858938336372375, "learning_rate": 2.2218000000000003e-05, "loss": 0.0309, "step": 7409 }, { 
"epoch": 17.986634264884568, "grad_norm": 0.6965352296829224, "learning_rate": 2.2221000000000003e-05, "loss": 0.0633, "step": 7410 }, { "epoch": 17.98906439854192, "grad_norm": 0.9578492641448975, "learning_rate": 2.2224e-05, "loss": 0.0519, "step": 7411 }, { "epoch": 17.991494532199273, "grad_norm": 0.8435024619102478, "learning_rate": 2.2227e-05, "loss": 0.0907, "step": 7412 }, { "epoch": 17.99392466585662, "grad_norm": 0.5741799473762512, "learning_rate": 2.223e-05, "loss": 0.0223, "step": 7413 }, { "epoch": 17.996354799513973, "grad_norm": 0.5926425457000732, "learning_rate": 2.2233e-05, "loss": 0.0413, "step": 7414 }, { "epoch": 17.998784933171326, "grad_norm": 0.5064903497695923, "learning_rate": 2.2236e-05, "loss": 0.0267, "step": 7415 }, { "epoch": 18.0, "grad_norm": 1.379431962966919, "learning_rate": 2.2239e-05, "loss": 0.0371, "step": 7416 }, { "epoch": 18.002430133657352, "grad_norm": 1.1577485799789429, "learning_rate": 2.2241999999999998e-05, "loss": 0.2742, "step": 7417 }, { "epoch": 18.0048602673147, "grad_norm": 0.5596855878829956, "learning_rate": 2.2245e-05, "loss": 0.1759, "step": 7418 }, { "epoch": 18.007290400972053, "grad_norm": 0.6378374099731445, "learning_rate": 2.2248e-05, "loss": 0.1419, "step": 7419 }, { "epoch": 18.009720534629405, "grad_norm": 0.6362681984901428, "learning_rate": 2.2251e-05, "loss": 0.1373, "step": 7420 }, { "epoch": 18.012150668286754, "grad_norm": 0.6437216401100159, "learning_rate": 2.2254e-05, "loss": 0.0974, "step": 7421 }, { "epoch": 18.014580801944106, "grad_norm": 0.5016639232635498, "learning_rate": 2.2257e-05, "loss": 0.0831, "step": 7422 }, { "epoch": 18.01701093560146, "grad_norm": 0.5539716482162476, "learning_rate": 2.226e-05, "loss": 0.0902, "step": 7423 }, { "epoch": 18.01944106925881, "grad_norm": 0.48344385623931885, "learning_rate": 2.2263e-05, "loss": 0.0637, "step": 7424 }, { "epoch": 18.02187120291616, "grad_norm": 0.513991117477417, "learning_rate": 2.2266e-05, "loss": 0.0473, "step": 7425 }, { 
"epoch": 18.02430133657351, "grad_norm": 0.3099709749221802, "learning_rate": 2.2269e-05, "loss": 0.044, "step": 7426 }, { "epoch": 18.026731470230864, "grad_norm": 0.6368842124938965, "learning_rate": 2.2272e-05, "loss": 0.0514, "step": 7427 }, { "epoch": 18.029161603888213, "grad_norm": 0.3117256462574005, "learning_rate": 2.2275000000000003e-05, "loss": 0.027, "step": 7428 }, { "epoch": 18.031591737545565, "grad_norm": 0.5466824173927307, "learning_rate": 2.2278000000000003e-05, "loss": 0.0383, "step": 7429 }, { "epoch": 18.034021871202917, "grad_norm": 0.7635456323623657, "learning_rate": 2.2281000000000002e-05, "loss": 0.0289, "step": 7430 }, { "epoch": 18.036452004860266, "grad_norm": 0.46111106872558594, "learning_rate": 2.2284000000000002e-05, "loss": 0.024, "step": 7431 }, { "epoch": 18.038882138517618, "grad_norm": 0.49223071336746216, "learning_rate": 2.2287000000000002e-05, "loss": 0.0262, "step": 7432 }, { "epoch": 18.04131227217497, "grad_norm": 0.7493529915809631, "learning_rate": 2.2290000000000002e-05, "loss": 0.0368, "step": 7433 }, { "epoch": 18.043742405832322, "grad_norm": 0.503672182559967, "learning_rate": 2.2293e-05, "loss": 0.0481, "step": 7434 }, { "epoch": 18.04617253948967, "grad_norm": 0.38349345326423645, "learning_rate": 2.2296e-05, "loss": 0.024, "step": 7435 }, { "epoch": 18.048602673147023, "grad_norm": 0.5060370564460754, "learning_rate": 2.2298999999999998e-05, "loss": 0.0282, "step": 7436 }, { "epoch": 18.051032806804375, "grad_norm": 0.4985533058643341, "learning_rate": 2.2301999999999998e-05, "loss": 0.0287, "step": 7437 }, { "epoch": 18.053462940461724, "grad_norm": 0.33208322525024414, "learning_rate": 2.2305e-05, "loss": 0.0179, "step": 7438 }, { "epoch": 18.055893074119076, "grad_norm": 0.7465507984161377, "learning_rate": 2.2308e-05, "loss": 0.0875, "step": 7439 }, { "epoch": 18.05832320777643, "grad_norm": 0.4336089491844177, "learning_rate": 2.2311e-05, "loss": 0.0256, "step": 7440 }, { "epoch": 18.060753341433777, 
"grad_norm": 0.8931414484977722, "learning_rate": 2.2314e-05, "loss": 0.038, "step": 7441 }, { "epoch": 18.06318347509113, "grad_norm": 0.5343766808509827, "learning_rate": 2.2317e-05, "loss": 0.0366, "step": 7442 }, { "epoch": 18.06561360874848, "grad_norm": 1.038008451461792, "learning_rate": 2.232e-05, "loss": 0.0472, "step": 7443 }, { "epoch": 18.068043742405834, "grad_norm": 0.6063787341117859, "learning_rate": 2.2323e-05, "loss": 0.0324, "step": 7444 }, { "epoch": 18.070473876063183, "grad_norm": 0.4829173982143402, "learning_rate": 2.2326e-05, "loss": 0.0237, "step": 7445 }, { "epoch": 18.072904009720535, "grad_norm": 0.4685438871383667, "learning_rate": 2.2329e-05, "loss": 0.0281, "step": 7446 }, { "epoch": 18.075334143377887, "grad_norm": 0.6285792589187622, "learning_rate": 2.2332e-05, "loss": 0.0534, "step": 7447 }, { "epoch": 18.077764277035236, "grad_norm": 0.3830280601978302, "learning_rate": 2.2335000000000002e-05, "loss": 0.0205, "step": 7448 }, { "epoch": 18.080194410692588, "grad_norm": 0.4714210629463196, "learning_rate": 2.2338000000000002e-05, "loss": 0.0292, "step": 7449 }, { "epoch": 18.08262454434994, "grad_norm": 0.5405285358428955, "learning_rate": 2.2341000000000002e-05, "loss": 0.0251, "step": 7450 }, { "epoch": 18.08505467800729, "grad_norm": 0.5074334144592285, "learning_rate": 2.2344e-05, "loss": 0.0296, "step": 7451 }, { "epoch": 18.08748481166464, "grad_norm": 0.43033432960510254, "learning_rate": 2.2347e-05, "loss": 0.0282, "step": 7452 }, { "epoch": 18.089914945321993, "grad_norm": 0.6744089126586914, "learning_rate": 2.235e-05, "loss": 0.0248, "step": 7453 }, { "epoch": 18.092345078979346, "grad_norm": 0.5307458639144897, "learning_rate": 2.2353e-05, "loss": 0.0229, "step": 7454 }, { "epoch": 18.094775212636694, "grad_norm": 0.3580546975135803, "learning_rate": 2.2356e-05, "loss": 0.0147, "step": 7455 }, { "epoch": 18.097205346294047, "grad_norm": 0.9027302265167236, "learning_rate": 2.2359e-05, "loss": 0.0162, "step": 7456 }, { 
"epoch": 18.0996354799514, "grad_norm": 0.453426331281662, "learning_rate": 2.2362e-05, "loss": 0.0244, "step": 7457 }, { "epoch": 18.102065613608747, "grad_norm": 0.5626187324523926, "learning_rate": 2.2365000000000004e-05, "loss": 0.0321, "step": 7458 }, { "epoch": 18.1044957472661, "grad_norm": 0.5044991970062256, "learning_rate": 2.2368000000000003e-05, "loss": 0.0286, "step": 7459 }, { "epoch": 18.106925880923452, "grad_norm": 0.9095771908760071, "learning_rate": 2.2371e-05, "loss": 0.0402, "step": 7460 }, { "epoch": 18.1093560145808, "grad_norm": 0.6495378613471985, "learning_rate": 2.2374e-05, "loss": 0.0276, "step": 7461 }, { "epoch": 18.111786148238153, "grad_norm": 3.1154308319091797, "learning_rate": 2.2377e-05, "loss": 0.0502, "step": 7462 }, { "epoch": 18.114216281895505, "grad_norm": 2.0636167526245117, "learning_rate": 2.238e-05, "loss": 0.0351, "step": 7463 }, { "epoch": 18.116646415552854, "grad_norm": 2.7404510974884033, "learning_rate": 2.2383e-05, "loss": 0.0768, "step": 7464 }, { "epoch": 18.119076549210206, "grad_norm": 0.6191658973693848, "learning_rate": 2.2386e-05, "loss": 0.0273, "step": 7465 }, { "epoch": 18.121506682867558, "grad_norm": 2.2790098190307617, "learning_rate": 2.2389e-05, "loss": 0.1008, "step": 7466 }, { "epoch": 18.12393681652491, "grad_norm": 0.9697814583778381, "learning_rate": 2.2392e-05, "loss": 0.2352, "step": 7467 }, { "epoch": 18.12636695018226, "grad_norm": 0.6575456857681274, "learning_rate": 2.2395e-05, "loss": 0.1917, "step": 7468 }, { "epoch": 18.12879708383961, "grad_norm": 0.5588580369949341, "learning_rate": 2.2398e-05, "loss": 0.1278, "step": 7469 }, { "epoch": 18.131227217496964, "grad_norm": 1.0105944871902466, "learning_rate": 2.2401e-05, "loss": 0.1411, "step": 7470 }, { "epoch": 18.133657351154312, "grad_norm": 0.683885395526886, "learning_rate": 2.2404e-05, "loss": 0.1116, "step": 7471 }, { "epoch": 18.136087484811664, "grad_norm": 0.5159409642219543, "learning_rate": 2.2407e-05, "loss": 0.0853, 
"step": 7472 }, { "epoch": 18.138517618469017, "grad_norm": 0.46445491909980774, "learning_rate": 2.241e-05, "loss": 0.0721, "step": 7473 }, { "epoch": 18.140947752126365, "grad_norm": 0.6535950899124146, "learning_rate": 2.2413e-05, "loss": 0.0633, "step": 7474 }, { "epoch": 18.143377885783718, "grad_norm": 0.5518104434013367, "learning_rate": 2.2416e-05, "loss": 0.068, "step": 7475 }, { "epoch": 18.14580801944107, "grad_norm": 0.46979352831840515, "learning_rate": 2.2419e-05, "loss": 0.0476, "step": 7476 }, { "epoch": 18.148238153098422, "grad_norm": 0.4146970510482788, "learning_rate": 2.2422e-05, "loss": 0.0546, "step": 7477 }, { "epoch": 18.15066828675577, "grad_norm": 0.6501644253730774, "learning_rate": 2.2425000000000003e-05, "loss": 0.0497, "step": 7478 }, { "epoch": 18.153098420413123, "grad_norm": 0.6317958235740662, "learning_rate": 2.2428000000000003e-05, "loss": 0.0422, "step": 7479 }, { "epoch": 18.155528554070475, "grad_norm": 0.4349968135356903, "learning_rate": 2.2431000000000003e-05, "loss": 0.0379, "step": 7480 }, { "epoch": 18.157958687727824, "grad_norm": 0.3930842876434326, "learning_rate": 2.2434000000000002e-05, "loss": 0.0334, "step": 7481 }, { "epoch": 18.160388821385176, "grad_norm": 0.5394790172576904, "learning_rate": 2.2437000000000002e-05, "loss": 0.0459, "step": 7482 }, { "epoch": 18.16281895504253, "grad_norm": 0.5901286602020264, "learning_rate": 2.2440000000000002e-05, "loss": 0.0446, "step": 7483 }, { "epoch": 18.165249088699877, "grad_norm": 0.5837100744247437, "learning_rate": 2.2443000000000002e-05, "loss": 0.0471, "step": 7484 }, { "epoch": 18.16767922235723, "grad_norm": 0.3990725576877594, "learning_rate": 2.2445999999999998e-05, "loss": 0.0279, "step": 7485 }, { "epoch": 18.17010935601458, "grad_norm": 0.8541878461837769, "learning_rate": 2.2448999999999998e-05, "loss": 0.0405, "step": 7486 }, { "epoch": 18.172539489671934, "grad_norm": 0.4377598762512207, "learning_rate": 2.2451999999999998e-05, "loss": 0.0241, "step": 
7487 }, { "epoch": 18.174969623329282, "grad_norm": 0.5253065824508667, "learning_rate": 2.2455e-05, "loss": 0.0249, "step": 7488 }, { "epoch": 18.177399756986635, "grad_norm": 0.7725985646247864, "learning_rate": 2.2458e-05, "loss": 0.0412, "step": 7489 }, { "epoch": 18.179829890643987, "grad_norm": 0.6801501512527466, "learning_rate": 2.2461e-05, "loss": 0.028, "step": 7490 }, { "epoch": 18.182260024301335, "grad_norm": 0.47625666856765747, "learning_rate": 2.2464e-05, "loss": 0.0297, "step": 7491 }, { "epoch": 18.184690157958688, "grad_norm": 0.8288225531578064, "learning_rate": 2.2467e-05, "loss": 0.0337, "step": 7492 }, { "epoch": 18.18712029161604, "grad_norm": 0.48918992280960083, "learning_rate": 2.247e-05, "loss": 0.025, "step": 7493 }, { "epoch": 18.18955042527339, "grad_norm": 1.050094485282898, "learning_rate": 2.2473e-05, "loss": 0.02, "step": 7494 }, { "epoch": 18.19198055893074, "grad_norm": 0.4649692475795746, "learning_rate": 2.2476e-05, "loss": 0.0324, "step": 7495 }, { "epoch": 18.194410692588093, "grad_norm": 0.3279310464859009, "learning_rate": 2.2479e-05, "loss": 0.0204, "step": 7496 }, { "epoch": 18.19684082624544, "grad_norm": 0.7143903970718384, "learning_rate": 2.2482e-05, "loss": 0.0301, "step": 7497 }, { "epoch": 18.199270959902794, "grad_norm": 0.45906975865364075, "learning_rate": 2.2485000000000002e-05, "loss": 0.0219, "step": 7498 }, { "epoch": 18.201701093560146, "grad_norm": 0.42317068576812744, "learning_rate": 2.2488000000000002e-05, "loss": 0.0222, "step": 7499 }, { "epoch": 18.2041312272175, "grad_norm": 0.4839160442352295, "learning_rate": 2.2491000000000002e-05, "loss": 0.0234, "step": 7500 }, { "epoch": 18.206561360874847, "grad_norm": 0.42483416199684143, "learning_rate": 2.2494000000000002e-05, "loss": 0.0385, "step": 7501 }, { "epoch": 18.2089914945322, "grad_norm": 0.7028674483299255, "learning_rate": 2.2497e-05, "loss": 0.0396, "step": 7502 }, { "epoch": 18.21142162818955, "grad_norm": 0.5510019063949585, 
"learning_rate": 2.25e-05, "loss": 0.0312, "step": 7503 }, { "epoch": 18.2138517618469, "grad_norm": 0.41719210147857666, "learning_rate": 2.2503e-05, "loss": 0.031, "step": 7504 }, { "epoch": 18.216281895504252, "grad_norm": 1.7568016052246094, "learning_rate": 2.2506e-05, "loss": 0.0279, "step": 7505 }, { "epoch": 18.218712029161605, "grad_norm": 0.5131407380104065, "learning_rate": 2.2509e-05, "loss": 0.0176, "step": 7506 }, { "epoch": 18.221142162818953, "grad_norm": 0.6600714325904846, "learning_rate": 2.2512e-05, "loss": 0.0317, "step": 7507 }, { "epoch": 18.223572296476306, "grad_norm": 0.6948134303092957, "learning_rate": 2.2515e-05, "loss": 0.0263, "step": 7508 }, { "epoch": 18.226002430133658, "grad_norm": 0.4798775613307953, "learning_rate": 2.2518e-05, "loss": 0.0188, "step": 7509 }, { "epoch": 18.22843256379101, "grad_norm": 0.6306053996086121, "learning_rate": 2.2521e-05, "loss": 0.0404, "step": 7510 }, { "epoch": 18.23086269744836, "grad_norm": 0.5082557797431946, "learning_rate": 2.2524e-05, "loss": 0.023, "step": 7511 }, { "epoch": 18.23329283110571, "grad_norm": 0.8760145902633667, "learning_rate": 2.2527e-05, "loss": 0.0403, "step": 7512 }, { "epoch": 18.235722964763063, "grad_norm": 1.3235218524932861, "learning_rate": 2.253e-05, "loss": 0.0526, "step": 7513 }, { "epoch": 18.238153098420412, "grad_norm": 2.0605199337005615, "learning_rate": 2.2533e-05, "loss": 0.046, "step": 7514 }, { "epoch": 18.240583232077764, "grad_norm": 0.5413383841514587, "learning_rate": 2.2536e-05, "loss": 0.0267, "step": 7515 }, { "epoch": 18.243013365735116, "grad_norm": 1.4597543478012085, "learning_rate": 2.2539e-05, "loss": 0.0757, "step": 7516 }, { "epoch": 18.245443499392465, "grad_norm": 1.1398922204971313, "learning_rate": 2.2542e-05, "loss": 0.2296, "step": 7517 }, { "epoch": 18.247873633049817, "grad_norm": 0.7601566314697266, "learning_rate": 2.2545e-05, "loss": 0.1937, "step": 7518 }, { "epoch": 18.25030376670717, "grad_norm": 0.5652275085449219, 
"learning_rate": 2.2548e-05, "loss": 0.1497, "step": 7519 }, { "epoch": 18.25273390036452, "grad_norm": 0.6641935110092163, "learning_rate": 2.2551e-05, "loss": 0.1337, "step": 7520 }, { "epoch": 18.25516403402187, "grad_norm": 0.6532748937606812, "learning_rate": 2.2554e-05, "loss": 0.1337, "step": 7521 }, { "epoch": 18.257594167679223, "grad_norm": 0.6541625261306763, "learning_rate": 2.2557e-05, "loss": 0.106, "step": 7522 }, { "epoch": 18.260024301336575, "grad_norm": 0.6541167497634888, "learning_rate": 2.256e-05, "loss": 0.107, "step": 7523 }, { "epoch": 18.262454434993924, "grad_norm": 0.4350110590457916, "learning_rate": 2.2563e-05, "loss": 0.0645, "step": 7524 }, { "epoch": 18.264884568651276, "grad_norm": 0.41264912486076355, "learning_rate": 2.2566e-05, "loss": 0.052, "step": 7525 }, { "epoch": 18.267314702308628, "grad_norm": 0.47518616914749146, "learning_rate": 2.2569e-05, "loss": 0.0544, "step": 7526 }, { "epoch": 18.269744835965977, "grad_norm": 0.293878436088562, "learning_rate": 2.2572e-05, "loss": 0.0273, "step": 7527 }, { "epoch": 18.27217496962333, "grad_norm": 0.37242433428764343, "learning_rate": 2.2575e-05, "loss": 0.0349, "step": 7528 }, { "epoch": 18.27460510328068, "grad_norm": 0.3271694481372833, "learning_rate": 2.2578000000000003e-05, "loss": 0.0311, "step": 7529 }, { "epoch": 18.277035236938033, "grad_norm": 0.6260360479354858, "learning_rate": 2.2581000000000003e-05, "loss": 0.0324, "step": 7530 }, { "epoch": 18.279465370595382, "grad_norm": 0.4644349217414856, "learning_rate": 2.2584000000000003e-05, "loss": 0.0369, "step": 7531 }, { "epoch": 18.281895504252734, "grad_norm": 0.5570610165596008, "learning_rate": 2.2587000000000002e-05, "loss": 0.046, "step": 7532 }, { "epoch": 18.284325637910086, "grad_norm": 0.3941459655761719, "learning_rate": 2.2590000000000002e-05, "loss": 0.0562, "step": 7533 }, { "epoch": 18.286755771567435, "grad_norm": 0.5924492478370667, "learning_rate": 2.2593e-05, "loss": 0.0397, "step": 7534 }, { "epoch": 
18.289185905224787, "grad_norm": 0.4478943645954132, "learning_rate": 2.2596e-05, "loss": 0.0329, "step": 7535 }, { "epoch": 18.29161603888214, "grad_norm": 0.3705723285675049, "learning_rate": 2.2598999999999998e-05, "loss": 0.0284, "step": 7536 }, { "epoch": 18.29404617253949, "grad_norm": 0.7770824432373047, "learning_rate": 2.2601999999999998e-05, "loss": 0.0427, "step": 7537 }, { "epoch": 18.29647630619684, "grad_norm": 0.41790875792503357, "learning_rate": 2.2604999999999998e-05, "loss": 0.0304, "step": 7538 }, { "epoch": 18.298906439854193, "grad_norm": 0.46444129943847656, "learning_rate": 2.2608e-05, "loss": 0.0168, "step": 7539 }, { "epoch": 18.30133657351154, "grad_norm": 0.4515412747859955, "learning_rate": 2.2611e-05, "loss": 0.0421, "step": 7540 }, { "epoch": 18.303766707168894, "grad_norm": 0.5598306059837341, "learning_rate": 2.2614e-05, "loss": 0.0279, "step": 7541 }, { "epoch": 18.306196840826246, "grad_norm": 0.6526433825492859, "learning_rate": 2.2617e-05, "loss": 0.039, "step": 7542 }, { "epoch": 18.308626974483598, "grad_norm": 0.5526354312896729, "learning_rate": 2.262e-05, "loss": 0.0231, "step": 7543 }, { "epoch": 18.311057108140947, "grad_norm": 0.7386375069618225, "learning_rate": 2.2623e-05, "loss": 0.0369, "step": 7544 }, { "epoch": 18.3134872417983, "grad_norm": 0.7980743646621704, "learning_rate": 2.2626e-05, "loss": 0.0361, "step": 7545 }, { "epoch": 18.31591737545565, "grad_norm": 0.5577043294906616, "learning_rate": 2.2629e-05, "loss": 0.0268, "step": 7546 }, { "epoch": 18.318347509113, "grad_norm": 0.4531900882720947, "learning_rate": 2.2632e-05, "loss": 0.0247, "step": 7547 }, { "epoch": 18.320777642770352, "grad_norm": 0.8630191683769226, "learning_rate": 2.2635e-05, "loss": 0.0388, "step": 7548 }, { "epoch": 18.323207776427704, "grad_norm": 0.42801859974861145, "learning_rate": 2.2638000000000002e-05, "loss": 0.0279, "step": 7549 }, { "epoch": 18.325637910085053, "grad_norm": 1.4393064975738525, "learning_rate": 
2.2641000000000002e-05, "loss": 0.0331, "step": 7550 }, { "epoch": 18.328068043742405, "grad_norm": 1.0805989503860474, "learning_rate": 2.2644000000000002e-05, "loss": 0.0272, "step": 7551 }, { "epoch": 18.330498177399758, "grad_norm": 0.458448588848114, "learning_rate": 2.2647000000000002e-05, "loss": 0.0273, "step": 7552 }, { "epoch": 18.33292831105711, "grad_norm": 0.4359380304813385, "learning_rate": 2.265e-05, "loss": 0.021, "step": 7553 }, { "epoch": 18.33535844471446, "grad_norm": 0.6613588929176331, "learning_rate": 2.2653e-05, "loss": 0.0256, "step": 7554 }, { "epoch": 18.33778857837181, "grad_norm": 0.9222180843353271, "learning_rate": 2.2656e-05, "loss": 0.0399, "step": 7555 }, { "epoch": 18.340218712029163, "grad_norm": 1.1636240482330322, "learning_rate": 2.2659e-05, "loss": 0.0808, "step": 7556 }, { "epoch": 18.34264884568651, "grad_norm": 0.4623214304447174, "learning_rate": 2.2662e-05, "loss": 0.0195, "step": 7557 }, { "epoch": 18.345078979343864, "grad_norm": 0.5138825178146362, "learning_rate": 2.2665e-05, "loss": 0.0256, "step": 7558 }, { "epoch": 18.347509113001216, "grad_norm": 0.4594307839870453, "learning_rate": 2.2668e-05, "loss": 0.0282, "step": 7559 }, { "epoch": 18.349939246658565, "grad_norm": 0.4832783639431, "learning_rate": 2.2671e-05, "loss": 0.0271, "step": 7560 }, { "epoch": 18.352369380315917, "grad_norm": 0.6288027167320251, "learning_rate": 2.2674e-05, "loss": 0.0555, "step": 7561 }, { "epoch": 18.35479951397327, "grad_norm": 0.7558152675628662, "learning_rate": 2.2677e-05, "loss": 0.0422, "step": 7562 }, { "epoch": 18.35722964763062, "grad_norm": 0.6760542392730713, "learning_rate": 2.268e-05, "loss": 0.0328, "step": 7563 }, { "epoch": 18.35965978128797, "grad_norm": 1.0364720821380615, "learning_rate": 2.2683e-05, "loss": 0.0333, "step": 7564 }, { "epoch": 18.362089914945322, "grad_norm": 1.827061414718628, "learning_rate": 2.2686e-05, "loss": 0.0911, "step": 7565 }, { "epoch": 18.364520048602675, "grad_norm": 
2.00958251953125, "learning_rate": 2.2689e-05, "loss": 0.1786, "step": 7566 }, { "epoch": 18.366950182260023, "grad_norm": 1.5147435665130615, "learning_rate": 2.2692e-05, "loss": 0.2923, "step": 7567 }, { "epoch": 18.369380315917375, "grad_norm": 0.9043169617652893, "learning_rate": 2.2695e-05, "loss": 0.226, "step": 7568 }, { "epoch": 18.371810449574728, "grad_norm": 1.0358906984329224, "learning_rate": 2.2698000000000002e-05, "loss": 0.164, "step": 7569 }, { "epoch": 18.374240583232076, "grad_norm": 0.9149872660636902, "learning_rate": 2.2701000000000002e-05, "loss": 0.1254, "step": 7570 }, { "epoch": 18.37667071688943, "grad_norm": 0.6171049475669861, "learning_rate": 2.2704e-05, "loss": 0.1272, "step": 7571 }, { "epoch": 18.37910085054678, "grad_norm": 0.6334242224693298, "learning_rate": 2.2707e-05, "loss": 0.0937, "step": 7572 }, { "epoch": 18.381530984204133, "grad_norm": 0.6312787532806396, "learning_rate": 2.271e-05, "loss": 0.0895, "step": 7573 }, { "epoch": 18.38396111786148, "grad_norm": 0.7538720369338989, "learning_rate": 2.2713e-05, "loss": 0.0808, "step": 7574 }, { "epoch": 18.386391251518834, "grad_norm": 0.5544162392616272, "learning_rate": 2.2716e-05, "loss": 0.0583, "step": 7575 }, { "epoch": 18.388821385176186, "grad_norm": 0.6033633351325989, "learning_rate": 2.2719e-05, "loss": 0.0666, "step": 7576 }, { "epoch": 18.391251518833535, "grad_norm": 0.5287725329399109, "learning_rate": 2.2722e-05, "loss": 0.0466, "step": 7577 }, { "epoch": 18.393681652490887, "grad_norm": 0.6041070818901062, "learning_rate": 2.2725e-05, "loss": 0.0479, "step": 7578 }, { "epoch": 18.39611178614824, "grad_norm": 0.5453423857688904, "learning_rate": 2.2728000000000003e-05, "loss": 0.0379, "step": 7579 }, { "epoch": 18.398541919805588, "grad_norm": 0.8199343681335449, "learning_rate": 2.2731000000000003e-05, "loss": 0.045, "step": 7580 }, { "epoch": 18.40097205346294, "grad_norm": 0.5150067806243896, "learning_rate": 2.2734000000000003e-05, "loss": 0.0431, "step": 
7581 }, { "epoch": 18.403402187120292, "grad_norm": 0.5923002362251282, "learning_rate": 2.2737000000000003e-05, "loss": 0.028, "step": 7582 }, { "epoch": 18.40583232077764, "grad_norm": 0.657433271408081, "learning_rate": 2.274e-05, "loss": 0.0385, "step": 7583 }, { "epoch": 18.408262454434993, "grad_norm": 0.35442453622817993, "learning_rate": 2.2743e-05, "loss": 0.0179, "step": 7584 }, { "epoch": 18.410692588092346, "grad_norm": 0.40389323234558105, "learning_rate": 2.2746e-05, "loss": 0.0276, "step": 7585 }, { "epoch": 18.413122721749698, "grad_norm": 0.5353848934173584, "learning_rate": 2.2749e-05, "loss": 0.0341, "step": 7586 }, { "epoch": 18.415552855407046, "grad_norm": 0.8128246068954468, "learning_rate": 2.2752e-05, "loss": 0.0379, "step": 7587 }, { "epoch": 18.4179829890644, "grad_norm": 0.2886504828929901, "learning_rate": 2.2754999999999998e-05, "loss": 0.0194, "step": 7588 }, { "epoch": 18.42041312272175, "grad_norm": 0.5742344856262207, "learning_rate": 2.2758e-05, "loss": 0.0407, "step": 7589 }, { "epoch": 18.4228432563791, "grad_norm": 0.7974448800086975, "learning_rate": 2.2761e-05, "loss": 0.0367, "step": 7590 }, { "epoch": 18.425273390036452, "grad_norm": 0.32924023270606995, "learning_rate": 2.2764e-05, "loss": 0.0187, "step": 7591 }, { "epoch": 18.427703523693804, "grad_norm": 1.0273832082748413, "learning_rate": 2.2767e-05, "loss": 0.0185, "step": 7592 }, { "epoch": 18.430133657351153, "grad_norm": 0.31424441933631897, "learning_rate": 2.277e-05, "loss": 0.0281, "step": 7593 }, { "epoch": 18.432563791008505, "grad_norm": 0.7479321956634521, "learning_rate": 2.2773e-05, "loss": 0.0347, "step": 7594 }, { "epoch": 18.434993924665857, "grad_norm": 0.5397182106971741, "learning_rate": 2.2776e-05, "loss": 0.0285, "step": 7595 }, { "epoch": 18.43742405832321, "grad_norm": 0.4494345486164093, "learning_rate": 2.2779e-05, "loss": 0.0284, "step": 7596 }, { "epoch": 18.439854191980558, "grad_norm": 0.7739879488945007, "learning_rate": 2.2782e-05, 
"loss": 0.0351, "step": 7597 }, { "epoch": 18.44228432563791, "grad_norm": 0.4885505139827728, "learning_rate": 2.2785e-05, "loss": 0.0293, "step": 7598 }, { "epoch": 18.444714459295263, "grad_norm": 0.30844950675964355, "learning_rate": 2.2788000000000003e-05, "loss": 0.0145, "step": 7599 }, { "epoch": 18.44714459295261, "grad_norm": 0.5053753852844238, "learning_rate": 2.2791000000000003e-05, "loss": 0.0173, "step": 7600 }, { "epoch": 18.449574726609963, "grad_norm": 0.44642481207847595, "learning_rate": 2.2794000000000002e-05, "loss": 0.0271, "step": 7601 }, { "epoch": 18.452004860267316, "grad_norm": 1.252860188484192, "learning_rate": 2.2797000000000002e-05, "loss": 0.0254, "step": 7602 }, { "epoch": 18.454434993924664, "grad_norm": 0.7119371891021729, "learning_rate": 2.2800000000000002e-05, "loss": 0.0321, "step": 7603 }, { "epoch": 18.456865127582017, "grad_norm": 0.9182800054550171, "learning_rate": 2.2803000000000002e-05, "loss": 0.0571, "step": 7604 }, { "epoch": 18.45929526123937, "grad_norm": 0.592244029045105, "learning_rate": 2.2806e-05, "loss": 0.0319, "step": 7605 }, { "epoch": 18.46172539489672, "grad_norm": 0.8623756170272827, "learning_rate": 2.2809e-05, "loss": 0.0355, "step": 7606 }, { "epoch": 18.46415552855407, "grad_norm": 0.42134565114974976, "learning_rate": 2.2812e-05, "loss": 0.0328, "step": 7607 }, { "epoch": 18.466585662211422, "grad_norm": 0.38319239020347595, "learning_rate": 2.2814999999999998e-05, "loss": 0.0228, "step": 7608 }, { "epoch": 18.469015795868774, "grad_norm": 0.6376749277114868, "learning_rate": 2.2818e-05, "loss": 0.0293, "step": 7609 }, { "epoch": 18.471445929526123, "grad_norm": 0.8210089206695557, "learning_rate": 2.2821e-05, "loss": 0.0329, "step": 7610 }, { "epoch": 18.473876063183475, "grad_norm": 0.6715679168701172, "learning_rate": 2.2824e-05, "loss": 0.0364, "step": 7611 }, { "epoch": 18.476306196840827, "grad_norm": 0.9568561911582947, "learning_rate": 2.2827e-05, "loss": 0.031, "step": 7612 }, { "epoch": 
18.478736330498176, "grad_norm": 0.7730439901351929, "learning_rate": 2.283e-05, "loss": 0.0402, "step": 7613 }, { "epoch": 18.481166464155528, "grad_norm": 2.715646505355835, "learning_rate": 2.2833e-05, "loss": 0.0651, "step": 7614 }, { "epoch": 18.48359659781288, "grad_norm": 1.5509908199310303, "learning_rate": 2.2836e-05, "loss": 0.0409, "step": 7615 }, { "epoch": 18.48602673147023, "grad_norm": 1.2651182413101196, "learning_rate": 2.2839e-05, "loss": 0.0501, "step": 7616 }, { "epoch": 18.48845686512758, "grad_norm": 1.2375692129135132, "learning_rate": 2.2842e-05, "loss": 0.3078, "step": 7617 }, { "epoch": 18.490886998784934, "grad_norm": 0.8063665628433228, "learning_rate": 2.2845e-05, "loss": 0.2239, "step": 7618 }, { "epoch": 18.493317132442286, "grad_norm": 0.6820582151412964, "learning_rate": 2.2848000000000002e-05, "loss": 0.1705, "step": 7619 }, { "epoch": 18.495747266099634, "grad_norm": 0.618701159954071, "learning_rate": 2.2851000000000002e-05, "loss": 0.1377, "step": 7620 }, { "epoch": 18.498177399756987, "grad_norm": 0.4625548720359802, "learning_rate": 2.2854000000000002e-05, "loss": 0.0946, "step": 7621 }, { "epoch": 18.50060753341434, "grad_norm": 0.6503806114196777, "learning_rate": 2.2857e-05, "loss": 0.1002, "step": 7622 }, { "epoch": 18.503037667071688, "grad_norm": 0.543444812297821, "learning_rate": 2.286e-05, "loss": 0.0796, "step": 7623 }, { "epoch": 18.50546780072904, "grad_norm": 0.783464252948761, "learning_rate": 2.2863e-05, "loss": 0.0698, "step": 7624 }, { "epoch": 18.507897934386392, "grad_norm": 0.6342799663543701, "learning_rate": 2.2866e-05, "loss": 0.0671, "step": 7625 }, { "epoch": 18.51032806804374, "grad_norm": 0.6654233336448669, "learning_rate": 2.2869e-05, "loss": 0.0764, "step": 7626 }, { "epoch": 18.512758201701093, "grad_norm": 0.5897406935691833, "learning_rate": 2.2872e-05, "loss": 0.0465, "step": 7627 }, { "epoch": 18.515188335358445, "grad_norm": 0.7288907170295715, "learning_rate": 2.2875e-05, "loss": 0.0569, 
"step": 7628 }, { "epoch": 18.517618469015797, "grad_norm": 0.3849013149738312, "learning_rate": 2.2878e-05, "loss": 0.0278, "step": 7629 }, { "epoch": 18.520048602673146, "grad_norm": 1.06241774559021, "learning_rate": 2.2881000000000003e-05, "loss": 0.0445, "step": 7630 }, { "epoch": 18.5224787363305, "grad_norm": 0.5609515905380249, "learning_rate": 2.2884000000000003e-05, "loss": 0.0392, "step": 7631 }, { "epoch": 18.52490886998785, "grad_norm": 0.4920559823513031, "learning_rate": 2.2887e-05, "loss": 0.0219, "step": 7632 }, { "epoch": 18.5273390036452, "grad_norm": 0.4962436854839325, "learning_rate": 2.289e-05, "loss": 0.033, "step": 7633 }, { "epoch": 18.52976913730255, "grad_norm": 0.42572569847106934, "learning_rate": 2.2893e-05, "loss": 0.0283, "step": 7634 }, { "epoch": 18.532199270959904, "grad_norm": 0.5978862643241882, "learning_rate": 2.2896e-05, "loss": 0.0459, "step": 7635 }, { "epoch": 18.534629404617252, "grad_norm": 0.4478876292705536, "learning_rate": 2.2899e-05, "loss": 0.0325, "step": 7636 }, { "epoch": 18.537059538274605, "grad_norm": 0.47037753462791443, "learning_rate": 2.2902e-05, "loss": 0.0327, "step": 7637 }, { "epoch": 18.539489671931957, "grad_norm": 0.37499451637268066, "learning_rate": 2.2905e-05, "loss": 0.0242, "step": 7638 }, { "epoch": 18.54191980558931, "grad_norm": 0.3581997752189636, "learning_rate": 2.2907999999999998e-05, "loss": 0.025, "step": 7639 }, { "epoch": 18.544349939246658, "grad_norm": 0.42767101526260376, "learning_rate": 2.2911e-05, "loss": 0.0254, "step": 7640 }, { "epoch": 18.54678007290401, "grad_norm": 1.451916217803955, "learning_rate": 2.2914e-05, "loss": 0.0217, "step": 7641 }, { "epoch": 18.549210206561362, "grad_norm": 0.6845611333847046, "learning_rate": 2.2917e-05, "loss": 0.0722, "step": 7642 }, { "epoch": 18.55164034021871, "grad_norm": 0.6092773675918579, "learning_rate": 2.292e-05, "loss": 0.0363, "step": 7643 }, { "epoch": 18.554070473876063, "grad_norm": 0.44812971353530884, "learning_rate": 
2.2923e-05, "loss": 0.0234, "step": 7644 }, { "epoch": 18.556500607533415, "grad_norm": 0.5912677049636841, "learning_rate": 2.2926e-05, "loss": 0.0413, "step": 7645 }, { "epoch": 18.558930741190764, "grad_norm": 0.5398919582366943, "learning_rate": 2.2929e-05, "loss": 0.0228, "step": 7646 }, { "epoch": 18.561360874848116, "grad_norm": 0.4045504033565521, "learning_rate": 2.2932e-05, "loss": 0.0243, "step": 7647 }, { "epoch": 18.56379100850547, "grad_norm": 0.5462066531181335, "learning_rate": 2.2935e-05, "loss": 0.0257, "step": 7648 }, { "epoch": 18.566221142162817, "grad_norm": 0.6660487651824951, "learning_rate": 2.2938e-05, "loss": 0.034, "step": 7649 }, { "epoch": 18.56865127582017, "grad_norm": 0.4649648070335388, "learning_rate": 2.2941000000000003e-05, "loss": 0.024, "step": 7650 }, { "epoch": 18.57108140947752, "grad_norm": 0.9038561582565308, "learning_rate": 2.2944000000000003e-05, "loss": 0.0858, "step": 7651 }, { "epoch": 18.573511543134874, "grad_norm": 0.6844737529754639, "learning_rate": 2.2947000000000002e-05, "loss": 0.0436, "step": 7652 }, { "epoch": 18.575941676792223, "grad_norm": 0.7218800187110901, "learning_rate": 2.2950000000000002e-05, "loss": 0.0269, "step": 7653 }, { "epoch": 18.578371810449575, "grad_norm": 1.600374460220337, "learning_rate": 2.2953000000000002e-05, "loss": 0.1419, "step": 7654 }, { "epoch": 18.580801944106927, "grad_norm": 0.4867270588874817, "learning_rate": 2.2956000000000002e-05, "loss": 0.0384, "step": 7655 }, { "epoch": 18.583232077764276, "grad_norm": 0.5701076984405518, "learning_rate": 2.2959e-05, "loss": 0.0388, "step": 7656 }, { "epoch": 18.585662211421628, "grad_norm": 0.6107194423675537, "learning_rate": 2.2961999999999998e-05, "loss": 0.0341, "step": 7657 }, { "epoch": 18.58809234507898, "grad_norm": 0.5038135051727295, "learning_rate": 2.2964999999999998e-05, "loss": 0.0247, "step": 7658 }, { "epoch": 18.59052247873633, "grad_norm": 0.4643338620662689, "learning_rate": 2.2967999999999998e-05, "loss": 
0.0255, "step": 7659 }, { "epoch": 18.59295261239368, "grad_norm": 0.6451895236968994, "learning_rate": 2.2971e-05, "loss": 0.043, "step": 7660 }, { "epoch": 18.595382746051033, "grad_norm": 0.7132598161697388, "learning_rate": 2.2974e-05, "loss": 0.0234, "step": 7661 }, { "epoch": 18.597812879708385, "grad_norm": 0.4977562725543976, "learning_rate": 2.2977e-05, "loss": 0.0273, "step": 7662 }, { "epoch": 18.600243013365734, "grad_norm": 0.6831465363502502, "learning_rate": 2.298e-05, "loss": 0.0295, "step": 7663 }, { "epoch": 18.602673147023086, "grad_norm": 0.7749857306480408, "learning_rate": 2.2983e-05, "loss": 0.0241, "step": 7664 }, { "epoch": 18.60510328068044, "grad_norm": 0.5767897963523865, "learning_rate": 2.2986e-05, "loss": 0.0247, "step": 7665 }, { "epoch": 18.607533414337787, "grad_norm": 2.15256929397583, "learning_rate": 2.2989e-05, "loss": 0.0742, "step": 7666 }, { "epoch": 18.60996354799514, "grad_norm": 0.9082555770874023, "learning_rate": 2.2992e-05, "loss": 0.2435, "step": 7667 }, { "epoch": 18.61239368165249, "grad_norm": 0.6800010204315186, "learning_rate": 2.2995e-05, "loss": 0.2011, "step": 7668 }, { "epoch": 18.61482381530984, "grad_norm": 0.8739922642707825, "learning_rate": 2.2998e-05, "loss": 0.1302, "step": 7669 }, { "epoch": 18.617253948967193, "grad_norm": 1.027755856513977, "learning_rate": 2.3001000000000002e-05, "loss": 0.1641, "step": 7670 }, { "epoch": 18.619684082624545, "grad_norm": 0.8359779119491577, "learning_rate": 2.3004000000000002e-05, "loss": 0.1321, "step": 7671 }, { "epoch": 18.622114216281897, "grad_norm": 0.7732165455818176, "learning_rate": 2.3007000000000002e-05, "loss": 0.0948, "step": 7672 }, { "epoch": 18.624544349939246, "grad_norm": 0.9816145896911621, "learning_rate": 2.301e-05, "loss": 0.0953, "step": 7673 }, { "epoch": 18.626974483596598, "grad_norm": 0.7063319683074951, "learning_rate": 2.3013e-05, "loss": 0.0778, "step": 7674 }, { "epoch": 18.62940461725395, "grad_norm": 1.012515664100647, 
"learning_rate": 2.3016e-05, "loss": 0.1344, "step": 7675 }, { "epoch": 18.6318347509113, "grad_norm": 0.6304982900619507, "learning_rate": 2.3019e-05, "loss": 0.0474, "step": 7676 }, { "epoch": 18.63426488456865, "grad_norm": 0.6101714968681335, "learning_rate": 2.3022e-05, "loss": 0.0487, "step": 7677 }, { "epoch": 18.636695018226003, "grad_norm": 0.6419079303741455, "learning_rate": 2.3025e-05, "loss": 0.0518, "step": 7678 }, { "epoch": 18.639125151883352, "grad_norm": 0.46602070331573486, "learning_rate": 2.3028e-05, "loss": 0.0365, "step": 7679 }, { "epoch": 18.641555285540704, "grad_norm": 0.6186034083366394, "learning_rate": 2.3031000000000004e-05, "loss": 0.0571, "step": 7680 }, { "epoch": 18.643985419198057, "grad_norm": 0.3506230413913727, "learning_rate": 2.3034e-05, "loss": 0.0224, "step": 7681 }, { "epoch": 18.64641555285541, "grad_norm": 0.45812684297561646, "learning_rate": 2.3037e-05, "loss": 0.0557, "step": 7682 }, { "epoch": 18.648845686512757, "grad_norm": 0.530586302280426, "learning_rate": 2.304e-05, "loss": 0.034, "step": 7683 }, { "epoch": 18.65127582017011, "grad_norm": 0.4490065574645996, "learning_rate": 2.3043e-05, "loss": 0.0413, "step": 7684 }, { "epoch": 18.653705953827462, "grad_norm": 0.8351462483406067, "learning_rate": 2.3046e-05, "loss": 0.0367, "step": 7685 }, { "epoch": 18.65613608748481, "grad_norm": 0.32267239689826965, "learning_rate": 2.3049e-05, "loss": 0.0205, "step": 7686 }, { "epoch": 18.658566221142163, "grad_norm": 0.5285741090774536, "learning_rate": 2.3052e-05, "loss": 0.0277, "step": 7687 }, { "epoch": 18.660996354799515, "grad_norm": 0.5641232132911682, "learning_rate": 2.3055e-05, "loss": 0.0412, "step": 7688 }, { "epoch": 18.663426488456864, "grad_norm": 0.46076542139053345, "learning_rate": 2.3058e-05, "loss": 0.0267, "step": 7689 }, { "epoch": 18.665856622114216, "grad_norm": 0.3815561830997467, "learning_rate": 2.3061e-05, "loss": 0.0216, "step": 7690 }, { "epoch": 18.668286755771568, "grad_norm": 
0.5906295776367188, "learning_rate": 2.3064e-05, "loss": 0.0233, "step": 7691 }, { "epoch": 18.670716889428917, "grad_norm": 0.4479934275150299, "learning_rate": 2.3067e-05, "loss": 0.0372, "step": 7692 }, { "epoch": 18.67314702308627, "grad_norm": 0.3695438504219055, "learning_rate": 2.307e-05, "loss": 0.0267, "step": 7693 }, { "epoch": 18.67557715674362, "grad_norm": 0.4800214469432831, "learning_rate": 2.3073e-05, "loss": 0.0287, "step": 7694 }, { "epoch": 18.678007290400974, "grad_norm": 0.5178050398826599, "learning_rate": 2.3076e-05, "loss": 0.0335, "step": 7695 }, { "epoch": 18.680437424058322, "grad_norm": 0.46404388546943665, "learning_rate": 2.3079e-05, "loss": 0.0244, "step": 7696 }, { "epoch": 18.682867557715674, "grad_norm": 0.8194983601570129, "learning_rate": 2.3082e-05, "loss": 0.0357, "step": 7697 }, { "epoch": 18.685297691373027, "grad_norm": 0.40886086225509644, "learning_rate": 2.3085e-05, "loss": 0.0213, "step": 7698 }, { "epoch": 18.687727825030375, "grad_norm": 0.4822536110877991, "learning_rate": 2.3088e-05, "loss": 0.0225, "step": 7699 }, { "epoch": 18.690157958687728, "grad_norm": 0.6667514443397522, "learning_rate": 2.3091000000000003e-05, "loss": 0.0356, "step": 7700 }, { "epoch": 18.69258809234508, "grad_norm": 0.7149680852890015, "learning_rate": 2.3094000000000003e-05, "loss": 0.0326, "step": 7701 }, { "epoch": 18.69501822600243, "grad_norm": 0.4847433865070343, "learning_rate": 2.3097000000000003e-05, "loss": 0.0227, "step": 7702 }, { "epoch": 18.69744835965978, "grad_norm": 0.3960364758968353, "learning_rate": 2.3100000000000002e-05, "loss": 0.0218, "step": 7703 }, { "epoch": 18.699878493317133, "grad_norm": 0.9255478978157043, "learning_rate": 2.3103000000000002e-05, "loss": 0.0311, "step": 7704 }, { "epoch": 18.702308626974485, "grad_norm": 0.35300448536872864, "learning_rate": 2.3106000000000002e-05, "loss": 0.019, "step": 7705 }, { "epoch": 18.704738760631834, "grad_norm": 0.8180844187736511, "learning_rate": 2.3109e-05, "loss": 
0.0294, "step": 7706 }, { "epoch": 18.707168894289186, "grad_norm": 0.5964922308921814, "learning_rate": 2.3111999999999998e-05, "loss": 0.0332, "step": 7707 }, { "epoch": 18.70959902794654, "grad_norm": 0.6380705237388611, "learning_rate": 2.3114999999999998e-05, "loss": 0.0422, "step": 7708 }, { "epoch": 18.712029161603887, "grad_norm": 0.5610523223876953, "learning_rate": 2.3117999999999998e-05, "loss": 0.0401, "step": 7709 }, { "epoch": 18.71445929526124, "grad_norm": 0.6582736372947693, "learning_rate": 2.3121e-05, "loss": 0.0279, "step": 7710 }, { "epoch": 18.71688942891859, "grad_norm": 0.8831318020820618, "learning_rate": 2.3124e-05, "loss": 0.0551, "step": 7711 }, { "epoch": 18.71931956257594, "grad_norm": 0.9381386637687683, "learning_rate": 2.3127e-05, "loss": 0.0373, "step": 7712 }, { "epoch": 18.721749696233292, "grad_norm": 1.0676182508468628, "learning_rate": 2.313e-05, "loss": 0.0419, "step": 7713 }, { "epoch": 18.724179829890645, "grad_norm": 1.1494052410125732, "learning_rate": 2.3133e-05, "loss": 0.0668, "step": 7714 }, { "epoch": 18.726609963547997, "grad_norm": 0.9838677048683167, "learning_rate": 2.3136e-05, "loss": 0.0389, "step": 7715 }, { "epoch": 18.729040097205345, "grad_norm": 0.8610432744026184, "learning_rate": 2.3139e-05, "loss": 0.0336, "step": 7716 }, { "epoch": 18.731470230862698, "grad_norm": 0.8777901530265808, "learning_rate": 2.3142e-05, "loss": 0.2381, "step": 7717 }, { "epoch": 18.73390036452005, "grad_norm": 0.5484206676483154, "learning_rate": 2.3145e-05, "loss": 0.1856, "step": 7718 }, { "epoch": 18.7363304981774, "grad_norm": 0.6103980541229248, "learning_rate": 2.3148e-05, "loss": 0.1368, "step": 7719 }, { "epoch": 18.73876063183475, "grad_norm": 0.8173736333847046, "learning_rate": 2.3151000000000002e-05, "loss": 0.1887, "step": 7720 }, { "epoch": 18.741190765492103, "grad_norm": 0.8299824595451355, "learning_rate": 2.3154000000000002e-05, "loss": 0.1623, "step": 7721 }, { "epoch": 18.74362089914945, "grad_norm": 
0.5551202893257141, "learning_rate": 2.3157000000000002e-05, "loss": 0.1119, "step": 7722 }, { "epoch": 18.746051032806804, "grad_norm": 0.6608569025993347, "learning_rate": 2.3160000000000002e-05, "loss": 0.116, "step": 7723 }, { "epoch": 18.748481166464156, "grad_norm": 0.5049854516983032, "learning_rate": 2.3163e-05, "loss": 0.0745, "step": 7724 }, { "epoch": 18.75091130012151, "grad_norm": 0.5868502855300903, "learning_rate": 2.3166e-05, "loss": 0.0599, "step": 7725 }, { "epoch": 18.753341433778857, "grad_norm": 0.7515851259231567, "learning_rate": 2.3169e-05, "loss": 0.0441, "step": 7726 }, { "epoch": 18.75577156743621, "grad_norm": 0.5234010815620422, "learning_rate": 2.3172e-05, "loss": 0.0561, "step": 7727 }, { "epoch": 18.75820170109356, "grad_norm": 0.299321711063385, "learning_rate": 2.3175e-05, "loss": 0.0283, "step": 7728 }, { "epoch": 18.76063183475091, "grad_norm": 0.5777825713157654, "learning_rate": 2.3178e-05, "loss": 0.0506, "step": 7729 }, { "epoch": 18.763061968408262, "grad_norm": 0.46326157450675964, "learning_rate": 2.3181000000000004e-05, "loss": 0.0332, "step": 7730 }, { "epoch": 18.765492102065615, "grad_norm": 0.4479855000972748, "learning_rate": 2.3184e-05, "loss": 0.0314, "step": 7731 }, { "epoch": 18.767922235722963, "grad_norm": 0.9443984031677246, "learning_rate": 2.3187e-05, "loss": 0.035, "step": 7732 }, { "epoch": 18.770352369380316, "grad_norm": 0.4278523623943329, "learning_rate": 2.319e-05, "loss": 0.0352, "step": 7733 }, { "epoch": 18.772782503037668, "grad_norm": 0.5750224590301514, "learning_rate": 2.3193e-05, "loss": 0.0279, "step": 7734 }, { "epoch": 18.775212636695016, "grad_norm": 0.36380353569984436, "learning_rate": 2.3196e-05, "loss": 0.0211, "step": 7735 }, { "epoch": 18.77764277035237, "grad_norm": 0.459911584854126, "learning_rate": 2.3199e-05, "loss": 0.0257, "step": 7736 }, { "epoch": 18.78007290400972, "grad_norm": 0.811336874961853, "learning_rate": 2.3202e-05, "loss": 0.0237, "step": 7737 }, { "epoch": 
18.782503037667073, "grad_norm": 0.9252901077270508, "learning_rate": 2.3205e-05, "loss": 0.0337, "step": 7738 }, { "epoch": 18.784933171324422, "grad_norm": 0.6873453855514526, "learning_rate": 2.3208e-05, "loss": 0.0472, "step": 7739 }, { "epoch": 18.787363304981774, "grad_norm": 0.7128635048866272, "learning_rate": 2.3211000000000002e-05, "loss": 0.0625, "step": 7740 }, { "epoch": 18.789793438639126, "grad_norm": 0.5098733901977539, "learning_rate": 2.3214000000000002e-05, "loss": 0.0263, "step": 7741 }, { "epoch": 18.792223572296475, "grad_norm": 0.5933287739753723, "learning_rate": 2.3217e-05, "loss": 0.0338, "step": 7742 }, { "epoch": 18.794653705953827, "grad_norm": 0.60586017370224, "learning_rate": 2.322e-05, "loss": 0.0324, "step": 7743 }, { "epoch": 18.79708383961118, "grad_norm": 1.0042670965194702, "learning_rate": 2.3223e-05, "loss": 0.0394, "step": 7744 }, { "epoch": 18.799513973268528, "grad_norm": 0.41623255610466003, "learning_rate": 2.3226e-05, "loss": 0.0284, "step": 7745 }, { "epoch": 18.80194410692588, "grad_norm": 0.5237480998039246, "learning_rate": 2.3229e-05, "loss": 0.0252, "step": 7746 }, { "epoch": 18.804374240583233, "grad_norm": 0.5844178795814514, "learning_rate": 2.3232e-05, "loss": 0.0391, "step": 7747 }, { "epoch": 18.806804374240585, "grad_norm": 0.5696040391921997, "learning_rate": 2.3235e-05, "loss": 0.0395, "step": 7748 }, { "epoch": 18.809234507897933, "grad_norm": 0.7127975821495056, "learning_rate": 2.3238e-05, "loss": 0.028, "step": 7749 }, { "epoch": 18.811664641555286, "grad_norm": 0.5923980474472046, "learning_rate": 2.3241000000000003e-05, "loss": 0.0429, "step": 7750 }, { "epoch": 18.814094775212638, "grad_norm": 0.7640862464904785, "learning_rate": 2.3244000000000003e-05, "loss": 0.0403, "step": 7751 }, { "epoch": 18.816524908869987, "grad_norm": 0.9776748418807983, "learning_rate": 2.3247000000000003e-05, "loss": 0.0337, "step": 7752 }, { "epoch": 18.81895504252734, "grad_norm": 0.8581508994102478, "learning_rate": 
2.3250000000000003e-05, "loss": 0.0269, "step": 7753 }, { "epoch": 18.82138517618469, "grad_norm": 0.609271764755249, "learning_rate": 2.3253000000000003e-05, "loss": 0.0315, "step": 7754 }, { "epoch": 18.82381530984204, "grad_norm": 0.6587698459625244, "learning_rate": 2.3256e-05, "loss": 0.0349, "step": 7755 }, { "epoch": 18.826245443499392, "grad_norm": 0.673517644405365, "learning_rate": 2.3259e-05, "loss": 0.028, "step": 7756 }, { "epoch": 18.828675577156744, "grad_norm": 0.43941444158554077, "learning_rate": 2.3262e-05, "loss": 0.0178, "step": 7757 }, { "epoch": 18.831105710814096, "grad_norm": 0.6148292422294617, "learning_rate": 2.3265e-05, "loss": 0.0312, "step": 7758 }, { "epoch": 18.833535844471445, "grad_norm": 0.5762404799461365, "learning_rate": 2.3267999999999998e-05, "loss": 0.0259, "step": 7759 }, { "epoch": 18.835965978128797, "grad_norm": 0.6700893640518188, "learning_rate": 2.3270999999999998e-05, "loss": 0.0305, "step": 7760 }, { "epoch": 18.83839611178615, "grad_norm": 0.4012485146522522, "learning_rate": 2.3274e-05, "loss": 0.0196, "step": 7761 }, { "epoch": 18.8408262454435, "grad_norm": 1.2300376892089844, "learning_rate": 2.3277e-05, "loss": 0.02, "step": 7762 }, { "epoch": 18.84325637910085, "grad_norm": 0.6490622162818909, "learning_rate": 2.328e-05, "loss": 0.0379, "step": 7763 }, { "epoch": 18.845686512758203, "grad_norm": 1.8147752285003662, "learning_rate": 2.3283e-05, "loss": 0.05, "step": 7764 }, { "epoch": 18.84811664641555, "grad_norm": 0.9580399990081787, "learning_rate": 2.3286e-05, "loss": 0.0348, "step": 7765 }, { "epoch": 18.850546780072904, "grad_norm": 1.5257840156555176, "learning_rate": 2.3289e-05, "loss": 0.0713, "step": 7766 }, { "epoch": 18.852976913730256, "grad_norm": 1.602967619895935, "learning_rate": 2.3292e-05, "loss": 0.2607, "step": 7767 }, { "epoch": 18.855407047387608, "grad_norm": 0.6812103986740112, "learning_rate": 2.3295e-05, "loss": 0.1604, "step": 7768 }, { "epoch": 18.857837181044957, "grad_norm": 
1.3138790130615234, "learning_rate": 2.3298e-05, "loss": 0.1415, "step": 7769 }, { "epoch": 18.86026731470231, "grad_norm": 0.5167971849441528, "learning_rate": 2.3301e-05, "loss": 0.1109, "step": 7770 }, { "epoch": 18.86269744835966, "grad_norm": 0.6426122784614563, "learning_rate": 2.3304000000000003e-05, "loss": 0.1239, "step": 7771 }, { "epoch": 18.86512758201701, "grad_norm": 0.6194524168968201, "learning_rate": 2.3307000000000002e-05, "loss": 0.1077, "step": 7772 }, { "epoch": 18.867557715674362, "grad_norm": 0.4824952483177185, "learning_rate": 2.3310000000000002e-05, "loss": 0.0842, "step": 7773 }, { "epoch": 18.869987849331714, "grad_norm": 0.7419784665107727, "learning_rate": 2.3313000000000002e-05, "loss": 0.0751, "step": 7774 }, { "epoch": 18.872417982989063, "grad_norm": 0.7428793907165527, "learning_rate": 2.3316000000000002e-05, "loss": 0.0512, "step": 7775 }, { "epoch": 18.874848116646415, "grad_norm": 0.6265090703964233, "learning_rate": 2.3319e-05, "loss": 0.0392, "step": 7776 }, { "epoch": 18.877278250303767, "grad_norm": 0.635208010673523, "learning_rate": 2.3322e-05, "loss": 0.0405, "step": 7777 }, { "epoch": 18.879708383961116, "grad_norm": 0.5662554502487183, "learning_rate": 2.3325e-05, "loss": 0.0463, "step": 7778 }, { "epoch": 18.88213851761847, "grad_norm": 0.6518709659576416, "learning_rate": 2.3328e-05, "loss": 0.0451, "step": 7779 }, { "epoch": 18.88456865127582, "grad_norm": 0.5717907547950745, "learning_rate": 2.3330999999999997e-05, "loss": 0.0587, "step": 7780 }, { "epoch": 18.886998784933173, "grad_norm": 0.5165591835975647, "learning_rate": 2.3334e-05, "loss": 0.0492, "step": 7781 }, { "epoch": 18.88942891859052, "grad_norm": 0.48440489172935486, "learning_rate": 2.3337e-05, "loss": 0.0292, "step": 7782 }, { "epoch": 18.891859052247874, "grad_norm": 0.7705854773521423, "learning_rate": 2.334e-05, "loss": 0.0294, "step": 7783 }, { "epoch": 18.894289185905226, "grad_norm": 0.46422314643859863, "learning_rate": 2.3343e-05, "loss": 
0.0578, "step": 7784 }, { "epoch": 18.896719319562575, "grad_norm": 0.5734525322914124, "learning_rate": 2.3346e-05, "loss": 0.0289, "step": 7785 }, { "epoch": 18.899149453219927, "grad_norm": 0.4810989499092102, "learning_rate": 2.3349e-05, "loss": 0.024, "step": 7786 }, { "epoch": 18.90157958687728, "grad_norm": 0.44941964745521545, "learning_rate": 2.3352e-05, "loss": 0.0364, "step": 7787 }, { "epoch": 18.904009720534628, "grad_norm": 0.564091682434082, "learning_rate": 2.3355e-05, "loss": 0.0464, "step": 7788 }, { "epoch": 18.90643985419198, "grad_norm": 0.8303185105323792, "learning_rate": 2.3358e-05, "loss": 0.0472, "step": 7789 }, { "epoch": 18.908869987849332, "grad_norm": 0.4814099371433258, "learning_rate": 2.3361e-05, "loss": 0.0276, "step": 7790 }, { "epoch": 18.911300121506684, "grad_norm": 0.46383658051490784, "learning_rate": 2.3364000000000002e-05, "loss": 0.0338, "step": 7791 }, { "epoch": 18.913730255164033, "grad_norm": 0.727450966835022, "learning_rate": 2.3367000000000002e-05, "loss": 0.0379, "step": 7792 }, { "epoch": 18.916160388821385, "grad_norm": 0.36423930525779724, "learning_rate": 2.337e-05, "loss": 0.0239, "step": 7793 }, { "epoch": 18.918590522478738, "grad_norm": 0.4812810719013214, "learning_rate": 2.3373e-05, "loss": 0.0181, "step": 7794 }, { "epoch": 18.921020656136086, "grad_norm": 0.8449602723121643, "learning_rate": 2.3376e-05, "loss": 0.0272, "step": 7795 }, { "epoch": 18.92345078979344, "grad_norm": 0.36333248019218445, "learning_rate": 2.3379e-05, "loss": 0.0143, "step": 7796 }, { "epoch": 18.92588092345079, "grad_norm": 0.6382935047149658, "learning_rate": 2.3382e-05, "loss": 0.0329, "step": 7797 }, { "epoch": 18.92831105710814, "grad_norm": 0.9079365730285645, "learning_rate": 2.3385e-05, "loss": 0.0259, "step": 7798 }, { "epoch": 18.93074119076549, "grad_norm": 0.6040779948234558, "learning_rate": 2.3388e-05, "loss": 0.0323, "step": 7799 }, { "epoch": 18.933171324422844, "grad_norm": 0.546301007270813, "learning_rate": 
2.3391e-05, "loss": 0.0336, "step": 7800 }, { "epoch": 18.935601458080196, "grad_norm": 0.3283948600292206, "learning_rate": 2.3394000000000003e-05, "loss": 0.0241, "step": 7801 }, { "epoch": 18.938031591737545, "grad_norm": 0.7342092394828796, "learning_rate": 2.3397000000000003e-05, "loss": 0.0841, "step": 7802 }, { "epoch": 18.940461725394897, "grad_norm": 0.5451323390007019, "learning_rate": 2.3400000000000003e-05, "loss": 0.0332, "step": 7803 }, { "epoch": 18.94289185905225, "grad_norm": 0.9373253583908081, "learning_rate": 2.3403e-05, "loss": 0.032, "step": 7804 }, { "epoch": 18.945321992709598, "grad_norm": 0.43214061856269836, "learning_rate": 2.3406e-05, "loss": 0.0303, "step": 7805 }, { "epoch": 18.94775212636695, "grad_norm": 0.6258218288421631, "learning_rate": 2.3409e-05, "loss": 0.0442, "step": 7806 }, { "epoch": 18.950182260024302, "grad_norm": 0.9384692907333374, "learning_rate": 2.3412e-05, "loss": 0.0333, "step": 7807 }, { "epoch": 18.95261239368165, "grad_norm": 0.932498574256897, "learning_rate": 2.3415e-05, "loss": 0.0389, "step": 7808 }, { "epoch": 18.955042527339003, "grad_norm": 0.4456796944141388, "learning_rate": 2.3418e-05, "loss": 0.0245, "step": 7809 }, { "epoch": 18.957472660996356, "grad_norm": 1.260308027267456, "learning_rate": 2.3420999999999998e-05, "loss": 0.1133, "step": 7810 }, { "epoch": 18.959902794653708, "grad_norm": 0.768929123878479, "learning_rate": 2.3424e-05, "loss": 0.0374, "step": 7811 }, { "epoch": 18.962332928311056, "grad_norm": 0.36786893010139465, "learning_rate": 2.3427e-05, "loss": 0.0236, "step": 7812 }, { "epoch": 18.96476306196841, "grad_norm": 0.933405876159668, "learning_rate": 2.343e-05, "loss": 0.0459, "step": 7813 }, { "epoch": 18.96719319562576, "grad_norm": 0.8050283193588257, "learning_rate": 2.3433e-05, "loss": 0.0407, "step": 7814 }, { "epoch": 18.96962332928311, "grad_norm": 1.45622980594635, "learning_rate": 2.3436e-05, "loss": 0.0624, "step": 7815 }, { "epoch": 18.972053462940462, "grad_norm": 
1.789483904838562, "learning_rate": 2.3439e-05, "loss": 0.1063, "step": 7816 }, { "epoch": 18.974483596597814, "grad_norm": 0.7677150368690491, "learning_rate": 2.3442e-05, "loss": 0.1748, "step": 7817 }, { "epoch": 18.976913730255163, "grad_norm": 0.6752752065658569, "learning_rate": 2.3445e-05, "loss": 0.0989, "step": 7818 }, { "epoch": 18.979343863912515, "grad_norm": 0.5095506310462952, "learning_rate": 2.3448e-05, "loss": 0.0482, "step": 7819 }, { "epoch": 18.981773997569867, "grad_norm": 0.5700127482414246, "learning_rate": 2.3451e-05, "loss": 0.0409, "step": 7820 }, { "epoch": 18.984204131227216, "grad_norm": 0.6878297924995422, "learning_rate": 2.3454000000000003e-05, "loss": 0.0561, "step": 7821 }, { "epoch": 18.986634264884568, "grad_norm": 0.4809623956680298, "learning_rate": 2.3457000000000003e-05, "loss": 0.0281, "step": 7822 }, { "epoch": 18.98906439854192, "grad_norm": 0.6763648390769958, "learning_rate": 2.3460000000000002e-05, "loss": 0.0372, "step": 7823 }, { "epoch": 18.991494532199273, "grad_norm": 0.60286545753479, "learning_rate": 2.3463000000000002e-05, "loss": 0.0441, "step": 7824 }, { "epoch": 18.99392466585662, "grad_norm": 0.572637677192688, "learning_rate": 2.3466000000000002e-05, "loss": 0.0284, "step": 7825 }, { "epoch": 18.996354799513973, "grad_norm": 0.412005215883255, "learning_rate": 2.3469000000000002e-05, "loss": 0.0303, "step": 7826 }, { "epoch": 18.998784933171326, "grad_norm": 0.6534282565116882, "learning_rate": 2.3472e-05, "loss": 0.0406, "step": 7827 }, { "epoch": 19.0, "grad_norm": 1.0105122327804565, "learning_rate": 2.3475e-05, "loss": 0.0371, "step": 7828 }, { "epoch": 19.002430133657352, "grad_norm": 0.7232773303985596, "learning_rate": 2.3477999999999998e-05, "loss": 0.2228, "step": 7829 }, { "epoch": 19.0048602673147, "grad_norm": 0.5500338077545166, "learning_rate": 2.3480999999999998e-05, "loss": 0.1872, "step": 7830 }, { "epoch": 19.007290400972053, "grad_norm": 0.45356541872024536, "learning_rate": 2.3484e-05, 
"loss": 0.1328, "step": 7831 }, { "epoch": 19.009720534629405, "grad_norm": 0.7175462245941162, "learning_rate": 2.3487e-05, "loss": 0.1212, "step": 7832 }, { "epoch": 19.012150668286754, "grad_norm": 0.598888635635376, "learning_rate": 2.349e-05, "loss": 0.0951, "step": 7833 }, { "epoch": 19.014580801944106, "grad_norm": 0.6102678179740906, "learning_rate": 2.3493e-05, "loss": 0.1087, "step": 7834 }, { "epoch": 19.01701093560146, "grad_norm": 0.430698424577713, "learning_rate": 2.3496e-05, "loss": 0.084, "step": 7835 }, { "epoch": 19.01944106925881, "grad_norm": 0.6433333158493042, "learning_rate": 2.3499e-05, "loss": 0.0633, "step": 7836 }, { "epoch": 19.02187120291616, "grad_norm": 0.63907790184021, "learning_rate": 2.3502e-05, "loss": 0.0502, "step": 7837 }, { "epoch": 19.02430133657351, "grad_norm": 0.492673397064209, "learning_rate": 2.3505e-05, "loss": 0.0416, "step": 7838 }, { "epoch": 19.026731470230864, "grad_norm": 0.48422130942344666, "learning_rate": 2.3508e-05, "loss": 0.047, "step": 7839 }, { "epoch": 19.029161603888213, "grad_norm": 0.45157700777053833, "learning_rate": 2.3511e-05, "loss": 0.0479, "step": 7840 }, { "epoch": 19.031591737545565, "grad_norm": 0.4071183502674103, "learning_rate": 2.3514000000000002e-05, "loss": 0.0349, "step": 7841 }, { "epoch": 19.034021871202917, "grad_norm": 0.6438446044921875, "learning_rate": 2.3517000000000002e-05, "loss": 0.0459, "step": 7842 }, { "epoch": 19.036452004860266, "grad_norm": 0.30542677640914917, "learning_rate": 2.3520000000000002e-05, "loss": 0.0333, "step": 7843 }, { "epoch": 19.038882138517618, "grad_norm": 0.48930516839027405, "learning_rate": 2.3523e-05, "loss": 0.0486, "step": 7844 }, { "epoch": 19.04131227217497, "grad_norm": 0.40464287996292114, "learning_rate": 2.3526e-05, "loss": 0.0262, "step": 7845 }, { "epoch": 19.043742405832322, "grad_norm": 0.47933101654052734, "learning_rate": 2.3529e-05, "loss": 0.0306, "step": 7846 }, { "epoch": 19.04617253948967, "grad_norm": 0.28499001264572144, 
"learning_rate": 2.3532e-05, "loss": 0.029, "step": 7847 }, { "epoch": 19.048602673147023, "grad_norm": 0.47859472036361694, "learning_rate": 2.3535e-05, "loss": 0.0172, "step": 7848 }, { "epoch": 19.051032806804375, "grad_norm": 0.41737502813339233, "learning_rate": 2.3538e-05, "loss": 0.0192, "step": 7849 }, { "epoch": 19.053462940461724, "grad_norm": 0.5455688834190369, "learning_rate": 2.3541e-05, "loss": 0.027, "step": 7850 }, { "epoch": 19.055893074119076, "grad_norm": 0.4986167252063751, "learning_rate": 2.3544000000000004e-05, "loss": 0.0305, "step": 7851 }, { "epoch": 19.05832320777643, "grad_norm": 0.7195044755935669, "learning_rate": 2.3547000000000003e-05, "loss": 0.0299, "step": 7852 }, { "epoch": 19.060753341433777, "grad_norm": 0.3555028438568115, "learning_rate": 2.3550000000000003e-05, "loss": 0.0176, "step": 7853 }, { "epoch": 19.06318347509113, "grad_norm": 0.3339501917362213, "learning_rate": 2.3553e-05, "loss": 0.0138, "step": 7854 }, { "epoch": 19.06561360874848, "grad_norm": 0.2872607111930847, "learning_rate": 2.3556e-05, "loss": 0.0185, "step": 7855 }, { "epoch": 19.068043742405834, "grad_norm": 0.7486904859542847, "learning_rate": 2.3559e-05, "loss": 0.033, "step": 7856 }, { "epoch": 19.070473876063183, "grad_norm": 0.34925025701522827, "learning_rate": 2.3562e-05, "loss": 0.0242, "step": 7857 }, { "epoch": 19.072904009720535, "grad_norm": 0.46136799454689026, "learning_rate": 2.3565e-05, "loss": 0.0226, "step": 7858 }, { "epoch": 19.075334143377887, "grad_norm": 0.3924514055252075, "learning_rate": 2.3568e-05, "loss": 0.0185, "step": 7859 }, { "epoch": 19.077764277035236, "grad_norm": 0.69559645652771, "learning_rate": 2.3571e-05, "loss": 0.0208, "step": 7860 }, { "epoch": 19.080194410692588, "grad_norm": 0.42633506655693054, "learning_rate": 2.3574e-05, "loss": 0.0277, "step": 7861 }, { "epoch": 19.08262454434994, "grad_norm": 0.3398628532886505, "learning_rate": 2.3577e-05, "loss": 0.0173, "step": 7862 }, { "epoch": 19.08505467800729, 
"grad_norm": 0.8001116514205933, "learning_rate": 2.358e-05, "loss": 0.0218, "step": 7863 }, { "epoch": 19.08748481166464, "grad_norm": 0.6674374938011169, "learning_rate": 2.3583e-05, "loss": 0.0661, "step": 7864 }, { "epoch": 19.089914945321993, "grad_norm": 0.7483930587768555, "learning_rate": 2.3586e-05, "loss": 0.0245, "step": 7865 }, { "epoch": 19.092345078979346, "grad_norm": 0.3453618884086609, "learning_rate": 2.3589e-05, "loss": 0.0186, "step": 7866 }, { "epoch": 19.094775212636694, "grad_norm": 0.41562241315841675, "learning_rate": 2.3592e-05, "loss": 0.0185, "step": 7867 }, { "epoch": 19.097205346294047, "grad_norm": 0.47368136048316956, "learning_rate": 2.3595e-05, "loss": 0.0238, "step": 7868 }, { "epoch": 19.0996354799514, "grad_norm": 1.1538163423538208, "learning_rate": 2.3598e-05, "loss": 0.0332, "step": 7869 }, { "epoch": 19.102065613608747, "grad_norm": 0.44723206758499146, "learning_rate": 2.3601e-05, "loss": 0.0235, "step": 7870 }, { "epoch": 19.1044957472661, "grad_norm": 0.6088936924934387, "learning_rate": 2.3604000000000003e-05, "loss": 0.0436, "step": 7871 }, { "epoch": 19.106925880923452, "grad_norm": 0.5429292321205139, "learning_rate": 2.3607000000000003e-05, "loss": 0.0261, "step": 7872 }, { "epoch": 19.1093560145808, "grad_norm": 3.8559741973876953, "learning_rate": 2.3610000000000003e-05, "loss": 0.0321, "step": 7873 }, { "epoch": 19.111786148238153, "grad_norm": 1.250312089920044, "learning_rate": 2.3613000000000002e-05, "loss": 0.0359, "step": 7874 }, { "epoch": 19.114216281895505, "grad_norm": 0.8488923907279968, "learning_rate": 2.3616000000000002e-05, "loss": 0.0327, "step": 7875 }, { "epoch": 19.116646415552854, "grad_norm": 0.5854378342628479, "learning_rate": 2.3619000000000002e-05, "loss": 0.0381, "step": 7876 }, { "epoch": 19.119076549210206, "grad_norm": 1.3449429273605347, "learning_rate": 2.3622000000000002e-05, "loss": 0.0357, "step": 7877 }, { "epoch": 19.121506682867558, "grad_norm": 2.9295899868011475, 
"learning_rate": 2.3624999999999998e-05, "loss": 0.0502, "step": 7878 }, { "epoch": 19.12393681652491, "grad_norm": 1.0929068326950073, "learning_rate": 2.3627999999999998e-05, "loss": 0.2576, "step": 7879 }, { "epoch": 19.12636695018226, "grad_norm": 0.7109350562095642, "learning_rate": 2.3630999999999998e-05, "loss": 0.205, "step": 7880 }, { "epoch": 19.12879708383961, "grad_norm": 0.5540733337402344, "learning_rate": 2.3633999999999998e-05, "loss": 0.1649, "step": 7881 }, { "epoch": 19.131227217496964, "grad_norm": 0.6159129738807678, "learning_rate": 2.3637e-05, "loss": 0.1389, "step": 7882 }, { "epoch": 19.133657351154312, "grad_norm": 0.6473926305770874, "learning_rate": 2.364e-05, "loss": 0.1117, "step": 7883 }, { "epoch": 19.136087484811664, "grad_norm": 0.6062822341918945, "learning_rate": 2.3643e-05, "loss": 0.0696, "step": 7884 }, { "epoch": 19.138517618469017, "grad_norm": 0.5679354071617126, "learning_rate": 2.3646e-05, "loss": 0.0876, "step": 7885 }, { "epoch": 19.140947752126365, "grad_norm": 0.480791300535202, "learning_rate": 2.3649e-05, "loss": 0.0757, "step": 7886 }, { "epoch": 19.143377885783718, "grad_norm": 0.8741275072097778, "learning_rate": 2.3652e-05, "loss": 0.0874, "step": 7887 }, { "epoch": 19.14580801944107, "grad_norm": 0.6496526002883911, "learning_rate": 2.3655e-05, "loss": 0.0411, "step": 7888 }, { "epoch": 19.148238153098422, "grad_norm": 0.9315991401672363, "learning_rate": 2.3658e-05, "loss": 0.0369, "step": 7889 }, { "epoch": 19.15066828675577, "grad_norm": 0.5796331167221069, "learning_rate": 2.3661e-05, "loss": 0.0409, "step": 7890 }, { "epoch": 19.153098420413123, "grad_norm": 0.8559188842773438, "learning_rate": 2.3664e-05, "loss": 0.0397, "step": 7891 }, { "epoch": 19.155528554070475, "grad_norm": 0.5365552306175232, "learning_rate": 2.3667000000000002e-05, "loss": 0.0297, "step": 7892 }, { "epoch": 19.157958687727824, "grad_norm": 0.585371196269989, "learning_rate": 2.3670000000000002e-05, "loss": 0.0481, "step": 7893 }, 
{ "epoch": 19.160388821385176, "grad_norm": 0.5583192110061646, "learning_rate": 2.3673000000000002e-05, "loss": 0.0404, "step": 7894 }, { "epoch": 19.16281895504253, "grad_norm": 0.5185644626617432, "learning_rate": 2.3676e-05, "loss": 0.0285, "step": 7895 }, { "epoch": 19.165249088699877, "grad_norm": 0.579918384552002, "learning_rate": 2.3679e-05, "loss": 0.037, "step": 7896 }, { "epoch": 19.16767922235723, "grad_norm": 0.3951779007911682, "learning_rate": 2.3682e-05, "loss": 0.0243, "step": 7897 }, { "epoch": 19.17010935601458, "grad_norm": 0.3470633924007416, "learning_rate": 2.3685e-05, "loss": 0.0168, "step": 7898 }, { "epoch": 19.172539489671934, "grad_norm": 0.6880810856819153, "learning_rate": 2.3688e-05, "loss": 0.0397, "step": 7899 }, { "epoch": 19.174969623329282, "grad_norm": 0.3936748504638672, "learning_rate": 2.3691e-05, "loss": 0.0167, "step": 7900 }, { "epoch": 19.177399756986635, "grad_norm": 0.34716734290122986, "learning_rate": 2.3694e-05, "loss": 0.0244, "step": 7901 }, { "epoch": 19.179829890643987, "grad_norm": 0.7177839279174805, "learning_rate": 2.3697000000000004e-05, "loss": 0.0394, "step": 7902 }, { "epoch": 19.182260024301335, "grad_norm": 0.3802579939365387, "learning_rate": 2.37e-05, "loss": 0.0221, "step": 7903 }, { "epoch": 19.184690157958688, "grad_norm": 0.5825529098510742, "learning_rate": 2.3703e-05, "loss": 0.0345, "step": 7904 }, { "epoch": 19.18712029161604, "grad_norm": 0.504553496837616, "learning_rate": 2.3706e-05, "loss": 0.0301, "step": 7905 }, { "epoch": 19.18955042527339, "grad_norm": 0.3809839189052582, "learning_rate": 2.3709e-05, "loss": 0.0189, "step": 7906 }, { "epoch": 19.19198055893074, "grad_norm": 0.5075424313545227, "learning_rate": 2.3712e-05, "loss": 0.0269, "step": 7907 }, { "epoch": 19.194410692588093, "grad_norm": 0.3460897207260132, "learning_rate": 2.3715e-05, "loss": 0.0213, "step": 7908 }, { "epoch": 19.19684082624544, "grad_norm": 0.3342066705226898, "learning_rate": 2.3718e-05, "loss": 0.0186, 
"step": 7909 }, { "epoch": 19.199270959902794, "grad_norm": 0.6228055357933044, "learning_rate": 2.3721e-05, "loss": 0.0243, "step": 7910 }, { "epoch": 19.201701093560146, "grad_norm": 0.5238274931907654, "learning_rate": 2.3724e-05, "loss": 0.0215, "step": 7911 }, { "epoch": 19.2041312272175, "grad_norm": 0.40595677495002747, "learning_rate": 2.3727000000000002e-05, "loss": 0.0212, "step": 7912 }, { "epoch": 19.206561360874847, "grad_norm": 0.5801669359207153, "learning_rate": 2.373e-05, "loss": 0.0322, "step": 7913 }, { "epoch": 19.2089914945322, "grad_norm": 0.604559600353241, "learning_rate": 2.3733e-05, "loss": 0.024, "step": 7914 }, { "epoch": 19.21142162818955, "grad_norm": 0.5588735938072205, "learning_rate": 2.3736e-05, "loss": 0.0302, "step": 7915 }, { "epoch": 19.2138517618469, "grad_norm": 0.47848761081695557, "learning_rate": 2.3739e-05, "loss": 0.0259, "step": 7916 }, { "epoch": 19.216281895504252, "grad_norm": 0.5059358477592468, "learning_rate": 2.3742e-05, "loss": 0.026, "step": 7917 }, { "epoch": 19.218712029161605, "grad_norm": 1.8885221481323242, "learning_rate": 2.3745e-05, "loss": 0.0222, "step": 7918 }, { "epoch": 19.221142162818953, "grad_norm": 0.6074569821357727, "learning_rate": 2.3748e-05, "loss": 0.0253, "step": 7919 }, { "epoch": 19.223572296476306, "grad_norm": 0.7867709398269653, "learning_rate": 2.3751e-05, "loss": 0.0377, "step": 7920 }, { "epoch": 19.226002430133658, "grad_norm": 0.6876304745674133, "learning_rate": 2.3754e-05, "loss": 0.0797, "step": 7921 }, { "epoch": 19.22843256379101, "grad_norm": 1.2990922927856445, "learning_rate": 2.3757000000000003e-05, "loss": 0.0289, "step": 7922 }, { "epoch": 19.23086269744836, "grad_norm": 0.7802664637565613, "learning_rate": 2.3760000000000003e-05, "loss": 0.03, "step": 7923 }, { "epoch": 19.23329283110571, "grad_norm": 0.2783946990966797, "learning_rate": 2.3763000000000003e-05, "loss": 0.0143, "step": 7924 }, { "epoch": 19.235722964763063, "grad_norm": 0.6604165434837341, 
"learning_rate": 2.3766000000000003e-05, "loss": 0.0278, "step": 7925 }, { "epoch": 19.238153098420412, "grad_norm": 1.1488009691238403, "learning_rate": 2.3769000000000002e-05, "loss": 0.0393, "step": 7926 }, { "epoch": 19.240583232077764, "grad_norm": 1.14255690574646, "learning_rate": 2.3772e-05, "loss": 0.0459, "step": 7927 }, { "epoch": 19.243013365735116, "grad_norm": 0.9763277769088745, "learning_rate": 2.3775e-05, "loss": 0.0704, "step": 7928 }, { "epoch": 19.245443499392465, "grad_norm": 2.3848609924316406, "learning_rate": 2.3778e-05, "loss": 0.3044, "step": 7929 }, { "epoch": 19.247873633049817, "grad_norm": 0.7607913613319397, "learning_rate": 2.3780999999999998e-05, "loss": 0.1854, "step": 7930 }, { "epoch": 19.25030376670717, "grad_norm": 0.7334006428718567, "learning_rate": 2.3783999999999998e-05, "loss": 0.1592, "step": 7931 }, { "epoch": 19.25273390036452, "grad_norm": 1.0047117471694946, "learning_rate": 2.3787e-05, "loss": 0.1087, "step": 7932 }, { "epoch": 19.25516403402187, "grad_norm": 0.7777300477027893, "learning_rate": 2.379e-05, "loss": 0.0946, "step": 7933 }, { "epoch": 19.257594167679223, "grad_norm": 0.4747650921344757, "learning_rate": 2.3793e-05, "loss": 0.0709, "step": 7934 }, { "epoch": 19.260024301336575, "grad_norm": 0.4748832583427429, "learning_rate": 2.3796e-05, "loss": 0.067, "step": 7935 }, { "epoch": 19.262454434993924, "grad_norm": 0.7158834934234619, "learning_rate": 2.3799e-05, "loss": 0.055, "step": 7936 }, { "epoch": 19.264884568651276, "grad_norm": 0.5783267021179199, "learning_rate": 2.3802e-05, "loss": 0.0398, "step": 7937 }, { "epoch": 19.267314702308628, "grad_norm": 0.5321074724197388, "learning_rate": 2.3805e-05, "loss": 0.0562, "step": 7938 }, { "epoch": 19.269744835965977, "grad_norm": 0.5336418747901917, "learning_rate": 2.3808e-05, "loss": 0.0466, "step": 7939 }, { "epoch": 19.27217496962333, "grad_norm": 0.3763405680656433, "learning_rate": 2.3811e-05, "loss": 0.0325, "step": 7940 }, { "epoch": 
19.27460510328068, "grad_norm": 0.48757123947143555, "learning_rate": 2.3814e-05, "loss": 0.0298, "step": 7941 }, { "epoch": 19.277035236938033, "grad_norm": 0.482339084148407, "learning_rate": 2.3817000000000003e-05, "loss": 0.0302, "step": 7942 }, { "epoch": 19.279465370595382, "grad_norm": 0.4907706379890442, "learning_rate": 2.3820000000000002e-05, "loss": 0.0314, "step": 7943 }, { "epoch": 19.281895504252734, "grad_norm": 0.6270248293876648, "learning_rate": 2.3823000000000002e-05, "loss": 0.0546, "step": 7944 }, { "epoch": 19.284325637910086, "grad_norm": 0.5734360218048096, "learning_rate": 2.3826000000000002e-05, "loss": 0.0401, "step": 7945 }, { "epoch": 19.286755771567435, "grad_norm": 0.4433996081352234, "learning_rate": 2.3829000000000002e-05, "loss": 0.0312, "step": 7946 }, { "epoch": 19.289185905224787, "grad_norm": 0.5505190491676331, "learning_rate": 2.3832e-05, "loss": 0.0209, "step": 7947 }, { "epoch": 19.29161603888214, "grad_norm": 0.4766157567501068, "learning_rate": 2.3835e-05, "loss": 0.0205, "step": 7948 }, { "epoch": 19.29404617253949, "grad_norm": 0.4225587546825409, "learning_rate": 2.3838e-05, "loss": 0.0215, "step": 7949 }, { "epoch": 19.29647630619684, "grad_norm": 0.4825456738471985, "learning_rate": 2.3841e-05, "loss": 0.0184, "step": 7950 }, { "epoch": 19.298906439854193, "grad_norm": 0.4242784380912781, "learning_rate": 2.3844e-05, "loss": 0.0228, "step": 7951 }, { "epoch": 19.30133657351154, "grad_norm": 0.38786086440086365, "learning_rate": 2.3847e-05, "loss": 0.0168, "step": 7952 }, { "epoch": 19.303766707168894, "grad_norm": 0.3145756125450134, "learning_rate": 2.385e-05, "loss": 0.0214, "step": 7953 }, { "epoch": 19.306196840826246, "grad_norm": 0.39792850613594055, "learning_rate": 2.3853e-05, "loss": 0.019, "step": 7954 }, { "epoch": 19.308626974483598, "grad_norm": 0.6135281324386597, "learning_rate": 2.3856e-05, "loss": 0.03, "step": 7955 }, { "epoch": 19.311057108140947, "grad_norm": 0.5354217886924744, "learning_rate": 
2.3859e-05, "loss": 0.0384, "step": 7956 }, { "epoch": 19.3134872417983, "grad_norm": 0.4430893361568451, "learning_rate": 2.3862e-05, "loss": 0.0207, "step": 7957 }, { "epoch": 19.31591737545565, "grad_norm": 0.4396216571331024, "learning_rate": 2.3865e-05, "loss": 0.014, "step": 7958 }, { "epoch": 19.318347509113, "grad_norm": 0.74220871925354, "learning_rate": 2.3868e-05, "loss": 0.0238, "step": 7959 }, { "epoch": 19.320777642770352, "grad_norm": 0.41000503301620483, "learning_rate": 2.3871e-05, "loss": 0.0194, "step": 7960 }, { "epoch": 19.323207776427704, "grad_norm": 0.441211074590683, "learning_rate": 2.3874e-05, "loss": 0.0152, "step": 7961 }, { "epoch": 19.325637910085053, "grad_norm": 1.2854894399642944, "learning_rate": 2.3877000000000002e-05, "loss": 0.0364, "step": 7962 }, { "epoch": 19.328068043742405, "grad_norm": 0.40795907378196716, "learning_rate": 2.3880000000000002e-05, "loss": 0.0171, "step": 7963 }, { "epoch": 19.330498177399758, "grad_norm": 0.9584217667579651, "learning_rate": 2.3883e-05, "loss": 0.0415, "step": 7964 }, { "epoch": 19.33292831105711, "grad_norm": 0.6993303894996643, "learning_rate": 2.3886e-05, "loss": 0.0278, "step": 7965 }, { "epoch": 19.33535844471446, "grad_norm": 0.9918439388275146, "learning_rate": 2.3889e-05, "loss": 0.0352, "step": 7966 }, { "epoch": 19.33778857837181, "grad_norm": 0.4788212180137634, "learning_rate": 2.3892e-05, "loss": 0.0328, "step": 7967 }, { "epoch": 19.340218712029163, "grad_norm": 0.47790825366973877, "learning_rate": 2.3895e-05, "loss": 0.0299, "step": 7968 }, { "epoch": 19.34264884568651, "grad_norm": 0.3316161632537842, "learning_rate": 2.3898e-05, "loss": 0.0238, "step": 7969 }, { "epoch": 19.345078979343864, "grad_norm": 0.3425515294075012, "learning_rate": 2.3901e-05, "loss": 0.0264, "step": 7970 }, { "epoch": 19.347509113001216, "grad_norm": 0.5905998349189758, "learning_rate": 2.3904e-05, "loss": 0.0305, "step": 7971 }, { "epoch": 19.349939246658565, "grad_norm": 1.0004340410232544, 
"learning_rate": 2.3907000000000003e-05, "loss": 0.0397, "step": 7972 }, { "epoch": 19.352369380315917, "grad_norm": 0.7441449165344238, "learning_rate": 2.3910000000000003e-05, "loss": 0.03, "step": 7973 }, { "epoch": 19.35479951397327, "grad_norm": 0.6144006848335266, "learning_rate": 2.3913000000000003e-05, "loss": 0.0435, "step": 7974 }, { "epoch": 19.35722964763062, "grad_norm": 0.8736698627471924, "learning_rate": 2.3916000000000003e-05, "loss": 0.0341, "step": 7975 }, { "epoch": 19.35965978128797, "grad_norm": 0.6169910430908203, "learning_rate": 2.3919e-05, "loss": 0.03, "step": 7976 }, { "epoch": 19.362089914945322, "grad_norm": 0.7721880078315735, "learning_rate": 2.3922e-05, "loss": 0.0781, "step": 7977 }, { "epoch": 19.364520048602675, "grad_norm": 0.9374182224273682, "learning_rate": 2.3925e-05, "loss": 0.0715, "step": 7978 }, { "epoch": 19.366950182260023, "grad_norm": 1.1241719722747803, "learning_rate": 2.3928e-05, "loss": 0.2207, "step": 7979 }, { "epoch": 19.369380315917375, "grad_norm": 0.8866764903068542, "learning_rate": 2.3931e-05, "loss": 0.1605, "step": 7980 }, { "epoch": 19.371810449574728, "grad_norm": 0.562360942363739, "learning_rate": 2.3933999999999998e-05, "loss": 0.1383, "step": 7981 }, { "epoch": 19.374240583232076, "grad_norm": 0.631637692451477, "learning_rate": 2.3937e-05, "loss": 0.1171, "step": 7982 }, { "epoch": 19.37667071688943, "grad_norm": 0.7468942999839783, "learning_rate": 2.394e-05, "loss": 0.1282, "step": 7983 }, { "epoch": 19.37910085054678, "grad_norm": 0.5062358379364014, "learning_rate": 2.3943e-05, "loss": 0.0843, "step": 7984 }, { "epoch": 19.381530984204133, "grad_norm": 0.836040735244751, "learning_rate": 2.3946e-05, "loss": 0.0705, "step": 7985 }, { "epoch": 19.38396111786148, "grad_norm": 0.6771988868713379, "learning_rate": 2.3949e-05, "loss": 0.0743, "step": 7986 }, { "epoch": 19.386391251518834, "grad_norm": 0.6494112610816956, "learning_rate": 2.3952e-05, "loss": 0.0466, "step": 7987 }, { "epoch": 
19.388821385176186, "grad_norm": 0.4837433397769928, "learning_rate": 2.3955e-05, "loss": 0.064, "step": 7988 }, { "epoch": 19.391251518833535, "grad_norm": 0.5956375598907471, "learning_rate": 2.3958e-05, "loss": 0.0481, "step": 7989 }, { "epoch": 19.393681652490887, "grad_norm": 0.42369386553764343, "learning_rate": 2.3961e-05, "loss": 0.0288, "step": 7990 }, { "epoch": 19.39611178614824, "grad_norm": 0.4694395959377289, "learning_rate": 2.3964e-05, "loss": 0.0429, "step": 7991 }, { "epoch": 19.398541919805588, "grad_norm": 0.5125194787979126, "learning_rate": 2.3967000000000003e-05, "loss": 0.0504, "step": 7992 }, { "epoch": 19.40097205346294, "grad_norm": 0.7654478549957275, "learning_rate": 2.3970000000000003e-05, "loss": 0.0278, "step": 7993 }, { "epoch": 19.403402187120292, "grad_norm": 0.6338098645210266, "learning_rate": 2.3973000000000002e-05, "loss": 0.0431, "step": 7994 }, { "epoch": 19.40583232077764, "grad_norm": 0.68684321641922, "learning_rate": 2.3976000000000002e-05, "loss": 0.041, "step": 7995 }, { "epoch": 19.408262454434993, "grad_norm": 0.4730212092399597, "learning_rate": 2.3979000000000002e-05, "loss": 0.0323, "step": 7996 }, { "epoch": 19.410692588092346, "grad_norm": 0.41927197575569153, "learning_rate": 2.3982000000000002e-05, "loss": 0.0358, "step": 7997 }, { "epoch": 19.413122721749698, "grad_norm": 0.2868942320346832, "learning_rate": 2.3985e-05, "loss": 0.0219, "step": 7998 }, { "epoch": 19.415552855407046, "grad_norm": 0.3950389623641968, "learning_rate": 2.3988e-05, "loss": 0.03, "step": 7999 }, { "epoch": 19.4179829890644, "grad_norm": 0.6956366300582886, "learning_rate": 2.3991e-05, "loss": 0.0309, "step": 8000 }, { "epoch": 19.4179829890644, "eval_cer": 0.092121114769898, "eval_loss": 0.3189784586429596, "eval_runtime": 8.2519, "eval_samples_per_second": 12.24, "eval_steps_per_second": 0.485, "eval_wer": 0.2839506172839506, "step": 8000 }, { "epoch": 19.42041312272175, "grad_norm": 0.356009304523468, "learning_rate": 
2.3993999999999998e-05, "loss": 0.0222, "step": 8001 }, { "epoch": 19.4228432563791, "grad_norm": 0.7498154640197754, "learning_rate": 2.3997e-05, "loss": 0.0343, "step": 8002 }, { "epoch": 19.425273390036452, "grad_norm": 0.34289756417274475, "learning_rate": 2.4e-05, "loss": 0.0253, "step": 8003 }, { "epoch": 19.427703523693804, "grad_norm": 1.0382225513458252, "learning_rate": 2.4003e-05, "loss": 0.0337, "step": 8004 }, { "epoch": 19.430133657351153, "grad_norm": 0.4181426167488098, "learning_rate": 2.4006e-05, "loss": 0.0412, "step": 8005 }, { "epoch": 19.432563791008505, "grad_norm": 0.44907262921333313, "learning_rate": 2.4009e-05, "loss": 0.0434, "step": 8006 }, { "epoch": 19.434993924665857, "grad_norm": 0.2904737889766693, "learning_rate": 2.4012e-05, "loss": 0.0168, "step": 8007 }, { "epoch": 19.43742405832321, "grad_norm": 0.5300465226173401, "learning_rate": 2.4015e-05, "loss": 0.0374, "step": 8008 }, { "epoch": 19.439854191980558, "grad_norm": 0.35908061265945435, "learning_rate": 2.4018e-05, "loss": 0.0213, "step": 8009 }, { "epoch": 19.44228432563791, "grad_norm": 0.4528176784515381, "learning_rate": 2.4021e-05, "loss": 0.0329, "step": 8010 }, { "epoch": 19.444714459295263, "grad_norm": 0.4941556751728058, "learning_rate": 2.4024e-05, "loss": 0.0286, "step": 8011 }, { "epoch": 19.44714459295261, "grad_norm": 0.4519619047641754, "learning_rate": 2.4027e-05, "loss": 0.0254, "step": 8012 }, { "epoch": 19.449574726609963, "grad_norm": 0.7815436124801636, "learning_rate": 2.4030000000000002e-05, "loss": 0.0541, "step": 8013 }, { "epoch": 19.452004860267316, "grad_norm": 0.4674033522605896, "learning_rate": 2.4033000000000002e-05, "loss": 0.0302, "step": 8014 }, { "epoch": 19.454434993924664, "grad_norm": 0.5275654196739197, "learning_rate": 2.4036e-05, "loss": 0.0266, "step": 8015 }, { "epoch": 19.456865127582017, "grad_norm": 0.42229941487312317, "learning_rate": 2.4039e-05, "loss": 0.0249, "step": 8016 }, { "epoch": 19.45929526123937, "grad_norm": 
0.3647831976413727, "learning_rate": 2.4042e-05, "loss": 0.0211, "step": 8017 }, { "epoch": 19.46172539489672, "grad_norm": 1.033728003501892, "learning_rate": 2.4045e-05, "loss": 0.0272, "step": 8018 }, { "epoch": 19.46415552855407, "grad_norm": 0.8040756583213806, "learning_rate": 2.4048e-05, "loss": 0.0319, "step": 8019 }, { "epoch": 19.466585662211422, "grad_norm": 0.9721460342407227, "learning_rate": 2.4051e-05, "loss": 0.0339, "step": 8020 }, { "epoch": 19.469015795868774, "grad_norm": 0.41143324971199036, "learning_rate": 2.4054e-05, "loss": 0.0244, "step": 8021 }, { "epoch": 19.471445929526123, "grad_norm": 0.9764626026153564, "learning_rate": 2.4057e-05, "loss": 0.0349, "step": 8022 }, { "epoch": 19.473876063183475, "grad_norm": 0.911342203617096, "learning_rate": 2.4060000000000003e-05, "loss": 0.0495, "step": 8023 }, { "epoch": 19.476306196840827, "grad_norm": 1.5233162641525269, "learning_rate": 2.4063000000000003e-05, "loss": 0.0162, "step": 8024 }, { "epoch": 19.478736330498176, "grad_norm": 0.896831214427948, "learning_rate": 2.4066000000000003e-05, "loss": 0.0523, "step": 8025 }, { "epoch": 19.481166464155528, "grad_norm": 0.7681997418403625, "learning_rate": 2.4069e-05, "loss": 0.0406, "step": 8026 }, { "epoch": 19.48359659781288, "grad_norm": 0.7561794519424438, "learning_rate": 2.4072e-05, "loss": 0.0257, "step": 8027 }, { "epoch": 19.48602673147023, "grad_norm": 1.3969993591308594, "learning_rate": 2.4075e-05, "loss": 0.0559, "step": 8028 }, { "epoch": 19.48845686512758, "grad_norm": 1.216966986656189, "learning_rate": 2.4078e-05, "loss": 0.2497, "step": 8029 }, { "epoch": 19.490886998784934, "grad_norm": 0.940864622592926, "learning_rate": 2.4081e-05, "loss": 0.1656, "step": 8030 }, { "epoch": 19.493317132442286, "grad_norm": 0.5216062068939209, "learning_rate": 2.4084e-05, "loss": 0.1242, "step": 8031 }, { "epoch": 19.495747266099634, "grad_norm": 0.9804427027702332, "learning_rate": 2.4086999999999998e-05, "loss": 0.1591, "step": 8032 }, { 
"epoch": 19.498177399756987, "grad_norm": 0.7262668013572693, "learning_rate": 2.409e-05, "loss": 0.1509, "step": 8033 }, { "epoch": 19.50060753341434, "grad_norm": 0.6651502847671509, "learning_rate": 2.4093e-05, "loss": 0.1015, "step": 8034 }, { "epoch": 19.503037667071688, "grad_norm": 0.8466780185699463, "learning_rate": 2.4096e-05, "loss": 0.0841, "step": 8035 }, { "epoch": 19.50546780072904, "grad_norm": 0.601593017578125, "learning_rate": 2.4099e-05, "loss": 0.0519, "step": 8036 }, { "epoch": 19.507897934386392, "grad_norm": 0.6372039914131165, "learning_rate": 2.4102e-05, "loss": 0.0985, "step": 8037 }, { "epoch": 19.51032806804374, "grad_norm": 0.6822032332420349, "learning_rate": 2.4105e-05, "loss": 0.0427, "step": 8038 }, { "epoch": 19.512758201701093, "grad_norm": 0.5533204078674316, "learning_rate": 2.4108e-05, "loss": 0.0734, "step": 8039 }, { "epoch": 19.515188335358445, "grad_norm": 0.4361056089401245, "learning_rate": 2.4111e-05, "loss": 0.0347, "step": 8040 }, { "epoch": 19.517618469015797, "grad_norm": 0.4270112216472626, "learning_rate": 2.4114e-05, "loss": 0.0353, "step": 8041 }, { "epoch": 19.520048602673146, "grad_norm": 0.4188229739665985, "learning_rate": 2.4117e-05, "loss": 0.0296, "step": 8042 }, { "epoch": 19.5224787363305, "grad_norm": 0.7316055297851562, "learning_rate": 2.4120000000000003e-05, "loss": 0.0489, "step": 8043 }, { "epoch": 19.52490886998785, "grad_norm": 1.4957376718521118, "learning_rate": 2.4123000000000003e-05, "loss": 0.0373, "step": 8044 }, { "epoch": 19.5273390036452, "grad_norm": 0.7134762406349182, "learning_rate": 2.4126000000000002e-05, "loss": 0.0527, "step": 8045 }, { "epoch": 19.52976913730255, "grad_norm": 0.4332769215106964, "learning_rate": 2.4129000000000002e-05, "loss": 0.0458, "step": 8046 }, { "epoch": 19.532199270959904, "grad_norm": 0.5516603589057922, "learning_rate": 2.4132000000000002e-05, "loss": 0.0303, "step": 8047 }, { "epoch": 19.534629404617252, "grad_norm": 0.39417174458503723, 
"learning_rate": 2.4135000000000002e-05, "loss": 0.0236, "step": 8048 }, { "epoch": 19.537059538274605, "grad_norm": 0.5755891799926758, "learning_rate": 2.4138e-05, "loss": 0.0488, "step": 8049 }, { "epoch": 19.539489671931957, "grad_norm": 0.5690779685974121, "learning_rate": 2.4140999999999998e-05, "loss": 0.0368, "step": 8050 }, { "epoch": 19.54191980558931, "grad_norm": 0.513604998588562, "learning_rate": 2.4143999999999998e-05, "loss": 0.0701, "step": 8051 }, { "epoch": 19.544349939246658, "grad_norm": 0.40637439489364624, "learning_rate": 2.4146999999999998e-05, "loss": 0.0431, "step": 8052 }, { "epoch": 19.54678007290401, "grad_norm": 0.5486868619918823, "learning_rate": 2.415e-05, "loss": 0.0337, "step": 8053 }, { "epoch": 19.549210206561362, "grad_norm": 0.4265301823616028, "learning_rate": 2.4153e-05, "loss": 0.0259, "step": 8054 }, { "epoch": 19.55164034021871, "grad_norm": 0.5222687721252441, "learning_rate": 2.4156e-05, "loss": 0.0348, "step": 8055 }, { "epoch": 19.554070473876063, "grad_norm": 0.3487361967563629, "learning_rate": 2.4159e-05, "loss": 0.0186, "step": 8056 }, { "epoch": 19.556500607533415, "grad_norm": 0.2436019480228424, "learning_rate": 2.4162e-05, "loss": 0.0159, "step": 8057 }, { "epoch": 19.558930741190764, "grad_norm": 0.5243362784385681, "learning_rate": 2.4165e-05, "loss": 0.0285, "step": 8058 }, { "epoch": 19.561360874848116, "grad_norm": 0.7542765736579895, "learning_rate": 2.4168e-05, "loss": 0.0505, "step": 8059 }, { "epoch": 19.56379100850547, "grad_norm": 0.9045454263687134, "learning_rate": 2.4171e-05, "loss": 0.0391, "step": 8060 }, { "epoch": 19.566221142162817, "grad_norm": 0.9236581921577454, "learning_rate": 2.4174e-05, "loss": 0.0463, "step": 8061 }, { "epoch": 19.56865127582017, "grad_norm": 0.5394813418388367, "learning_rate": 2.4177e-05, "loss": 0.0303, "step": 8062 }, { "epoch": 19.57108140947752, "grad_norm": 0.46233853697776794, "learning_rate": 2.4180000000000002e-05, "loss": 0.0234, "step": 8063 }, { 
"epoch": 19.573511543134874, "grad_norm": 0.5528681874275208, "learning_rate": 2.4183000000000002e-05, "loss": 0.0444, "step": 8064 }, { "epoch": 19.575941676792223, "grad_norm": 0.838260293006897, "learning_rate": 2.4186000000000002e-05, "loss": 0.081, "step": 8065 }, { "epoch": 19.578371810449575, "grad_norm": 0.6065621972084045, "learning_rate": 2.4189e-05, "loss": 0.0213, "step": 8066 }, { "epoch": 19.580801944106927, "grad_norm": 0.4786209166049957, "learning_rate": 2.4192e-05, "loss": 0.0241, "step": 8067 }, { "epoch": 19.583232077764276, "grad_norm": 1.0318870544433594, "learning_rate": 2.4195e-05, "loss": 0.0375, "step": 8068 }, { "epoch": 19.585662211421628, "grad_norm": 0.7250677347183228, "learning_rate": 2.4198e-05, "loss": 0.0297, "step": 8069 }, { "epoch": 19.58809234507898, "grad_norm": 0.5047459006309509, "learning_rate": 2.4201e-05, "loss": 0.0203, "step": 8070 }, { "epoch": 19.59052247873633, "grad_norm": 0.6615172624588013, "learning_rate": 2.4204e-05, "loss": 0.0368, "step": 8071 }, { "epoch": 19.59295261239368, "grad_norm": 0.5675061345100403, "learning_rate": 2.4207e-05, "loss": 0.03, "step": 8072 }, { "epoch": 19.595382746051033, "grad_norm": 0.4404408931732178, "learning_rate": 2.4210000000000004e-05, "loss": 0.0193, "step": 8073 }, { "epoch": 19.597812879708385, "grad_norm": 0.5586727857589722, "learning_rate": 2.4213000000000003e-05, "loss": 0.0325, "step": 8074 }, { "epoch": 19.600243013365734, "grad_norm": 1.0504169464111328, "learning_rate": 2.4216e-05, "loss": 0.0382, "step": 8075 }, { "epoch": 19.602673147023086, "grad_norm": 0.5722959637641907, "learning_rate": 2.4219e-05, "loss": 0.0224, "step": 8076 }, { "epoch": 19.60510328068044, "grad_norm": 1.323117971420288, "learning_rate": 2.4222e-05, "loss": 0.0641, "step": 8077 }, { "epoch": 19.607533414337787, "grad_norm": 1.2847115993499756, "learning_rate": 2.4225e-05, "loss": 0.0435, "step": 8078 }, { "epoch": 19.60996354799514, "grad_norm": 1.3119125366210938, "learning_rate": 
2.4228e-05, "loss": 0.3174, "step": 8079 }, { "epoch": 19.61239368165249, "grad_norm": 0.7820058465003967, "learning_rate": 2.4231e-05, "loss": 0.1659, "step": 8080 }, { "epoch": 19.61482381530984, "grad_norm": 0.7617273330688477, "learning_rate": 2.4234e-05, "loss": 0.1321, "step": 8081 }, { "epoch": 19.617253948967193, "grad_norm": 0.5660605430603027, "learning_rate": 2.4237e-05, "loss": 0.1225, "step": 8082 }, { "epoch": 19.619684082624545, "grad_norm": 1.1299222707748413, "learning_rate": 2.4240000000000002e-05, "loss": 0.1286, "step": 8083 }, { "epoch": 19.622114216281897, "grad_norm": 0.5705452561378479, "learning_rate": 2.4243e-05, "loss": 0.102, "step": 8084 }, { "epoch": 19.624544349939246, "grad_norm": 0.376936137676239, "learning_rate": 2.4246e-05, "loss": 0.0606, "step": 8085 }, { "epoch": 19.626974483596598, "grad_norm": 0.4471406638622284, "learning_rate": 2.4249e-05, "loss": 0.0757, "step": 8086 }, { "epoch": 19.62940461725395, "grad_norm": 0.5347881317138672, "learning_rate": 2.4252e-05, "loss": 0.0792, "step": 8087 }, { "epoch": 19.6318347509113, "grad_norm": 0.7188692688941956, "learning_rate": 2.4255e-05, "loss": 0.0631, "step": 8088 }, { "epoch": 19.63426488456865, "grad_norm": 0.7386629581451416, "learning_rate": 2.4258e-05, "loss": 0.0502, "step": 8089 }, { "epoch": 19.636695018226003, "grad_norm": 0.5289586782455444, "learning_rate": 2.4261e-05, "loss": 0.0388, "step": 8090 }, { "epoch": 19.639125151883352, "grad_norm": 0.422914981842041, "learning_rate": 2.4264e-05, "loss": 0.0282, "step": 8091 }, { "epoch": 19.641555285540704, "grad_norm": 0.5569627285003662, "learning_rate": 2.4267e-05, "loss": 0.0388, "step": 8092 }, { "epoch": 19.643985419198057, "grad_norm": 0.3386964201927185, "learning_rate": 2.4270000000000003e-05, "loss": 0.0319, "step": 8093 }, { "epoch": 19.64641555285541, "grad_norm": 0.8684976696968079, "learning_rate": 2.4273000000000003e-05, "loss": 0.0171, "step": 8094 }, { "epoch": 19.648845686512757, "grad_norm": 
0.4569146931171417, "learning_rate": 2.4276000000000003e-05, "loss": 0.032, "step": 8095 }, { "epoch": 19.65127582017011, "grad_norm": 0.5042330622673035, "learning_rate": 2.4279000000000003e-05, "loss": 0.0359, "step": 8096 }, { "epoch": 19.653705953827462, "grad_norm": 0.4455787241458893, "learning_rate": 2.4282000000000002e-05, "loss": 0.0315, "step": 8097 }, { "epoch": 19.65613608748481, "grad_norm": 0.3544621169567108, "learning_rate": 2.4285000000000002e-05, "loss": 0.0213, "step": 8098 }, { "epoch": 19.658566221142163, "grad_norm": 0.6895053386688232, "learning_rate": 2.4288e-05, "loss": 0.0312, "step": 8099 }, { "epoch": 19.660996354799515, "grad_norm": 0.8675581812858582, "learning_rate": 2.4291e-05, "loss": 0.0419, "step": 8100 }, { "epoch": 19.663426488456864, "grad_norm": 0.42293810844421387, "learning_rate": 2.4293999999999998e-05, "loss": 0.0347, "step": 8101 }, { "epoch": 19.665856622114216, "grad_norm": 0.42086169123649597, "learning_rate": 2.4296999999999998e-05, "loss": 0.0271, "step": 8102 }, { "epoch": 19.668286755771568, "grad_norm": 0.3962019085884094, "learning_rate": 2.43e-05, "loss": 0.0162, "step": 8103 }, { "epoch": 19.670716889428917, "grad_norm": 0.47539520263671875, "learning_rate": 2.4303e-05, "loss": 0.0201, "step": 8104 }, { "epoch": 19.67314702308627, "grad_norm": 0.7049728035926819, "learning_rate": 2.4306e-05, "loss": 0.0451, "step": 8105 }, { "epoch": 19.67557715674362, "grad_norm": 0.5847794413566589, "learning_rate": 2.4309e-05, "loss": 0.0267, "step": 8106 }, { "epoch": 19.678007290400974, "grad_norm": 0.5411628484725952, "learning_rate": 2.4312e-05, "loss": 0.0197, "step": 8107 }, { "epoch": 19.680437424058322, "grad_norm": 0.24209125339984894, "learning_rate": 2.4315e-05, "loss": 0.012, "step": 8108 }, { "epoch": 19.682867557715674, "grad_norm": 0.6453546285629272, "learning_rate": 2.4318e-05, "loss": 0.0349, "step": 8109 }, { "epoch": 19.685297691373027, "grad_norm": 0.8542968034744263, "learning_rate": 2.4321e-05, "loss": 
0.029, "step": 8110 }, { "epoch": 19.687727825030375, "grad_norm": 0.27263563871383667, "learning_rate": 2.4324e-05, "loss": 0.0184, "step": 8111 }, { "epoch": 19.690157958687728, "grad_norm": 1.2210566997528076, "learning_rate": 2.4327e-05, "loss": 0.0415, "step": 8112 }, { "epoch": 19.69258809234508, "grad_norm": 0.40261077880859375, "learning_rate": 2.4330000000000003e-05, "loss": 0.0171, "step": 8113 }, { "epoch": 19.69501822600243, "grad_norm": 0.4797526001930237, "learning_rate": 2.4333000000000002e-05, "loss": 0.029, "step": 8114 }, { "epoch": 19.69744835965978, "grad_norm": 0.5946927666664124, "learning_rate": 2.4336000000000002e-05, "loss": 0.0347, "step": 8115 }, { "epoch": 19.699878493317133, "grad_norm": 0.31781476736068726, "learning_rate": 2.4339000000000002e-05, "loss": 0.0168, "step": 8116 }, { "epoch": 19.702308626974485, "grad_norm": 0.48698386549949646, "learning_rate": 2.4342000000000002e-05, "loss": 0.0353, "step": 8117 }, { "epoch": 19.704738760631834, "grad_norm": 0.7042684555053711, "learning_rate": 2.4345e-05, "loss": 0.046, "step": 8118 }, { "epoch": 19.707168894289186, "grad_norm": 1.0418827533721924, "learning_rate": 2.4348e-05, "loss": 0.0282, "step": 8119 }, { "epoch": 19.70959902794654, "grad_norm": 0.6925160884857178, "learning_rate": 2.4351e-05, "loss": 0.0371, "step": 8120 }, { "epoch": 19.712029161603887, "grad_norm": 0.45703113079071045, "learning_rate": 2.4354e-05, "loss": 0.0213, "step": 8121 }, { "epoch": 19.71445929526124, "grad_norm": 0.962118923664093, "learning_rate": 2.4357e-05, "loss": 0.0552, "step": 8122 }, { "epoch": 19.71688942891859, "grad_norm": 1.394492268562317, "learning_rate": 2.4360000000000004e-05, "loss": 0.0315, "step": 8123 }, { "epoch": 19.71931956257594, "grad_norm": 0.7552441358566284, "learning_rate": 2.4363e-05, "loss": 0.0337, "step": 8124 }, { "epoch": 19.721749696233292, "grad_norm": 0.7402313351631165, "learning_rate": 2.4366e-05, "loss": 0.031, "step": 8125 }, { "epoch": 19.724179829890645, 
"grad_norm": 0.5855035185813904, "learning_rate": 2.4369e-05, "loss": 0.0256, "step": 8126 }, { "epoch": 19.726609963547997, "grad_norm": 1.833070993423462, "learning_rate": 2.4372e-05, "loss": 0.0446, "step": 8127 }, { "epoch": 19.729040097205345, "grad_norm": 2.309082508087158, "learning_rate": 2.4375e-05, "loss": 0.1113, "step": 8128 }, { "epoch": 19.731470230862698, "grad_norm": 2.82265043258667, "learning_rate": 2.4378e-05, "loss": 0.321, "step": 8129 }, { "epoch": 19.73390036452005, "grad_norm": 1.170350193977356, "learning_rate": 2.4381e-05, "loss": 0.2032, "step": 8130 }, { "epoch": 19.7363304981774, "grad_norm": 0.5832990407943726, "learning_rate": 2.4384e-05, "loss": 0.1194, "step": 8131 }, { "epoch": 19.73876063183475, "grad_norm": 0.7411494255065918, "learning_rate": 2.4387e-05, "loss": 0.1464, "step": 8132 }, { "epoch": 19.741190765492103, "grad_norm": 0.6890280246734619, "learning_rate": 2.439e-05, "loss": 0.1314, "step": 8133 }, { "epoch": 19.74362089914945, "grad_norm": 0.6915946006774902, "learning_rate": 2.4393000000000002e-05, "loss": 0.0853, "step": 8134 }, { "epoch": 19.746051032806804, "grad_norm": 0.7811502814292908, "learning_rate": 2.4396e-05, "loss": 0.1066, "step": 8135 }, { "epoch": 19.748481166464156, "grad_norm": 0.5810379981994629, "learning_rate": 2.4399e-05, "loss": 0.0666, "step": 8136 }, { "epoch": 19.75091130012151, "grad_norm": 0.5079833269119263, "learning_rate": 2.4402e-05, "loss": 0.0497, "step": 8137 }, { "epoch": 19.753341433778857, "grad_norm": 0.6375793218612671, "learning_rate": 2.4405e-05, "loss": 0.0409, "step": 8138 }, { "epoch": 19.75577156743621, "grad_norm": 0.7357649207115173, "learning_rate": 2.4408e-05, "loss": 0.074, "step": 8139 }, { "epoch": 19.75820170109356, "grad_norm": 0.4396105408668518, "learning_rate": 2.4411e-05, "loss": 0.0392, "step": 8140 }, { "epoch": 19.76063183475091, "grad_norm": 0.6101546287536621, "learning_rate": 2.4414e-05, "loss": 0.0565, "step": 8141 }, { "epoch": 19.763061968408262, 
"grad_norm": 0.46386584639549255, "learning_rate": 2.4417e-05, "loss": 0.0414, "step": 8142 }, { "epoch": 19.765492102065615, "grad_norm": 0.4756642282009125, "learning_rate": 2.442e-05, "loss": 0.0236, "step": 8143 }, { "epoch": 19.767922235722963, "grad_norm": 0.43887028098106384, "learning_rate": 2.4423000000000003e-05, "loss": 0.0224, "step": 8144 }, { "epoch": 19.770352369380316, "grad_norm": 0.4426807463169098, "learning_rate": 2.4426000000000003e-05, "loss": 0.0432, "step": 8145 }, { "epoch": 19.772782503037668, "grad_norm": 0.49037590622901917, "learning_rate": 2.4429000000000003e-05, "loss": 0.0452, "step": 8146 }, { "epoch": 19.775212636695016, "grad_norm": 0.3709362745285034, "learning_rate": 2.4432000000000003e-05, "loss": 0.0201, "step": 8147 }, { "epoch": 19.77764277035237, "grad_norm": 0.5894861221313477, "learning_rate": 2.4435e-05, "loss": 0.0394, "step": 8148 }, { "epoch": 19.78007290400972, "grad_norm": 0.564674437046051, "learning_rate": 2.4438e-05, "loss": 0.0262, "step": 8149 }, { "epoch": 19.782503037667073, "grad_norm": 0.40388187766075134, "learning_rate": 2.4441e-05, "loss": 0.0182, "step": 8150 }, { "epoch": 19.784933171324422, "grad_norm": 0.6784467697143555, "learning_rate": 2.4444e-05, "loss": 0.0301, "step": 8151 }, { "epoch": 19.787363304981774, "grad_norm": 0.41375669836997986, "learning_rate": 2.4446999999999998e-05, "loss": 0.0209, "step": 8152 }, { "epoch": 19.789793438639126, "grad_norm": 0.7455136179924011, "learning_rate": 2.4449999999999998e-05, "loss": 0.0459, "step": 8153 }, { "epoch": 19.792223572296475, "grad_norm": 0.36788034439086914, "learning_rate": 2.4453e-05, "loss": 0.0241, "step": 8154 }, { "epoch": 19.794653705953827, "grad_norm": 0.3500017821788788, "learning_rate": 2.4456e-05, "loss": 0.0197, "step": 8155 }, { "epoch": 19.79708383961118, "grad_norm": 0.43832001090049744, "learning_rate": 2.4459e-05, "loss": 0.029, "step": 8156 }, { "epoch": 19.799513973268528, "grad_norm": 0.6239696145057678, "learning_rate": 
2.4462e-05, "loss": 0.0345, "step": 8157 }, { "epoch": 19.80194410692588, "grad_norm": 0.859821617603302, "learning_rate": 2.4465e-05, "loss": 0.0461, "step": 8158 }, { "epoch": 19.804374240583233, "grad_norm": 0.6797425150871277, "learning_rate": 2.4468e-05, "loss": 0.027, "step": 8159 }, { "epoch": 19.806804374240585, "grad_norm": 0.6489253044128418, "learning_rate": 2.4471e-05, "loss": 0.0431, "step": 8160 }, { "epoch": 19.809234507897933, "grad_norm": 0.932307243347168, "learning_rate": 2.4474e-05, "loss": 0.0289, "step": 8161 }, { "epoch": 19.811664641555286, "grad_norm": 0.3601224422454834, "learning_rate": 2.4477e-05, "loss": 0.0174, "step": 8162 }, { "epoch": 19.814094775212638, "grad_norm": 0.6656429171562195, "learning_rate": 2.448e-05, "loss": 0.036, "step": 8163 }, { "epoch": 19.816524908869987, "grad_norm": 0.6716451048851013, "learning_rate": 2.4483000000000003e-05, "loss": 0.0336, "step": 8164 }, { "epoch": 19.81895504252734, "grad_norm": 0.4174908399581909, "learning_rate": 2.4486000000000002e-05, "loss": 0.0273, "step": 8165 }, { "epoch": 19.82138517618469, "grad_norm": 0.5211536288261414, "learning_rate": 2.4489000000000002e-05, "loss": 0.0205, "step": 8166 }, { "epoch": 19.82381530984204, "grad_norm": 0.4743040204048157, "learning_rate": 2.4492000000000002e-05, "loss": 0.021, "step": 8167 }, { "epoch": 19.826245443499392, "grad_norm": 1.237298846244812, "learning_rate": 2.4495000000000002e-05, "loss": 0.0454, "step": 8168 }, { "epoch": 19.828675577156744, "grad_norm": 0.5533947944641113, "learning_rate": 2.4498e-05, "loss": 0.0217, "step": 8169 }, { "epoch": 19.831105710814096, "grad_norm": 0.5672993659973145, "learning_rate": 2.4501e-05, "loss": 0.0183, "step": 8170 }, { "epoch": 19.833535844471445, "grad_norm": 0.5388439893722534, "learning_rate": 2.4504e-05, "loss": 0.025, "step": 8171 }, { "epoch": 19.835965978128797, "grad_norm": 0.5168247818946838, "learning_rate": 2.4507e-05, "loss": 0.0287, "step": 8172 }, { "epoch": 19.83839611178615, 
"grad_norm": 0.6210883855819702, "learning_rate": 2.4509999999999997e-05, "loss": 0.0168, "step": 8173 }, { "epoch": 19.8408262454435, "grad_norm": 0.3450884521007538, "learning_rate": 2.4513e-05, "loss": 0.0167, "step": 8174 }, { "epoch": 19.84325637910085, "grad_norm": 1.1574093103408813, "learning_rate": 2.4516e-05, "loss": 0.0319, "step": 8175 }, { "epoch": 19.845686512758203, "grad_norm": 0.7433009147644043, "learning_rate": 2.4519e-05, "loss": 0.0324, "step": 8176 }, { "epoch": 19.84811664641555, "grad_norm": 1.536948323249817, "learning_rate": 2.4522e-05, "loss": 0.0294, "step": 8177 }, { "epoch": 19.850546780072904, "grad_norm": 1.9803701639175415, "learning_rate": 2.4525e-05, "loss": 0.0871, "step": 8178 }, { "epoch": 19.852976913730256, "grad_norm": 0.9615151286125183, "learning_rate": 2.4528e-05, "loss": 0.2239, "step": 8179 }, { "epoch": 19.855407047387608, "grad_norm": 0.689712643623352, "learning_rate": 2.4531e-05, "loss": 0.1998, "step": 8180 }, { "epoch": 19.857837181044957, "grad_norm": 0.6110420227050781, "learning_rate": 2.4534e-05, "loss": 0.1314, "step": 8181 }, { "epoch": 19.86026731470231, "grad_norm": 0.7215285301208496, "learning_rate": 2.4537e-05, "loss": 0.1636, "step": 8182 }, { "epoch": 19.86269744835966, "grad_norm": 0.5692952275276184, "learning_rate": 2.454e-05, "loss": 0.1318, "step": 8183 }, { "epoch": 19.86512758201701, "grad_norm": 0.6294167637825012, "learning_rate": 2.4543000000000002e-05, "loss": 0.0976, "step": 8184 }, { "epoch": 19.867557715674362, "grad_norm": 0.6422566771507263, "learning_rate": 2.4546000000000002e-05, "loss": 0.0819, "step": 8185 }, { "epoch": 19.869987849331714, "grad_norm": 0.5914015173912048, "learning_rate": 2.4549e-05, "loss": 0.065, "step": 8186 }, { "epoch": 19.872417982989063, "grad_norm": 0.46972060203552246, "learning_rate": 2.4552e-05, "loss": 0.0364, "step": 8187 }, { "epoch": 19.874848116646415, "grad_norm": 0.4610329568386078, "learning_rate": 2.4555e-05, "loss": 0.0526, "step": 8188 }, { 
"epoch": 19.877278250303767, "grad_norm": 0.5535914301872253, "learning_rate": 2.4558e-05, "loss": 0.0383, "step": 8189 }, { "epoch": 19.879708383961116, "grad_norm": 0.44716930389404297, "learning_rate": 2.4561e-05, "loss": 0.0394, "step": 8190 }, { "epoch": 19.88213851761847, "grad_norm": 0.3247095048427582, "learning_rate": 2.4564e-05, "loss": 0.0347, "step": 8191 }, { "epoch": 19.88456865127582, "grad_norm": 0.5465469360351562, "learning_rate": 2.4567e-05, "loss": 0.0334, "step": 8192 }, { "epoch": 19.886998784933173, "grad_norm": 0.33346301317214966, "learning_rate": 2.457e-05, "loss": 0.0266, "step": 8193 }, { "epoch": 19.88942891859052, "grad_norm": 0.6390310525894165, "learning_rate": 2.4573000000000003e-05, "loss": 0.0422, "step": 8194 }, { "epoch": 19.891859052247874, "grad_norm": 0.7068217396736145, "learning_rate": 2.4576000000000003e-05, "loss": 0.034, "step": 8195 }, { "epoch": 19.894289185905226, "grad_norm": 0.5734817385673523, "learning_rate": 2.4579000000000003e-05, "loss": 0.0388, "step": 8196 }, { "epoch": 19.896719319562575, "grad_norm": 0.5740038156509399, "learning_rate": 2.4582000000000003e-05, "loss": 0.0522, "step": 8197 }, { "epoch": 19.899149453219927, "grad_norm": 0.5038398504257202, "learning_rate": 2.4585e-05, "loss": 0.0385, "step": 8198 }, { "epoch": 19.90157958687728, "grad_norm": 0.33629855513572693, "learning_rate": 2.4588e-05, "loss": 0.0233, "step": 8199 }, { "epoch": 19.904009720534628, "grad_norm": 0.6040955185890198, "learning_rate": 2.4591e-05, "loss": 0.033, "step": 8200 }, { "epoch": 19.90643985419198, "grad_norm": 0.5424689054489136, "learning_rate": 2.4594e-05, "loss": 0.029, "step": 8201 }, { "epoch": 19.908869987849332, "grad_norm": 0.3759949207305908, "learning_rate": 2.4597e-05, "loss": 0.0193, "step": 8202 }, { "epoch": 19.911300121506684, "grad_norm": 0.340000182390213, "learning_rate": 2.4599999999999998e-05, "loss": 0.028, "step": 8203 }, { "epoch": 19.913730255164033, "grad_norm": 0.5198975801467896, 
"learning_rate": 2.4603e-05, "loss": 0.0242, "step": 8204 }, { "epoch": 19.916160388821385, "grad_norm": 0.5030783414840698, "learning_rate": 2.4606e-05, "loss": 0.0233, "step": 8205 }, { "epoch": 19.918590522478738, "grad_norm": 0.4556903541088104, "learning_rate": 2.4609e-05, "loss": 0.0329, "step": 8206 }, { "epoch": 19.921020656136086, "grad_norm": 2.6136646270751953, "learning_rate": 2.4612e-05, "loss": 0.0478, "step": 8207 }, { "epoch": 19.92345078979344, "grad_norm": 0.41651731729507446, "learning_rate": 2.4615e-05, "loss": 0.0175, "step": 8208 }, { "epoch": 19.92588092345079, "grad_norm": 0.6981807947158813, "learning_rate": 2.4618e-05, "loss": 0.0447, "step": 8209 }, { "epoch": 19.92831105710814, "grad_norm": 0.4666254222393036, "learning_rate": 2.4621e-05, "loss": 0.0238, "step": 8210 }, { "epoch": 19.93074119076549, "grad_norm": 0.6198545694351196, "learning_rate": 2.4624e-05, "loss": 0.0347, "step": 8211 }, { "epoch": 19.933171324422844, "grad_norm": 0.9669198393821716, "learning_rate": 2.4627e-05, "loss": 0.0254, "step": 8212 }, { "epoch": 19.935601458080196, "grad_norm": 0.4286288321018219, "learning_rate": 2.463e-05, "loss": 0.027, "step": 8213 }, { "epoch": 19.938031591737545, "grad_norm": 0.5548579096794128, "learning_rate": 2.4633000000000003e-05, "loss": 0.0272, "step": 8214 }, { "epoch": 19.940461725394897, "grad_norm": 0.4994180202484131, "learning_rate": 2.4636000000000003e-05, "loss": 0.0226, "step": 8215 }, { "epoch": 19.94289185905225, "grad_norm": 0.6392078399658203, "learning_rate": 2.4639000000000002e-05, "loss": 0.0692, "step": 8216 }, { "epoch": 19.945321992709598, "grad_norm": 0.657737672328949, "learning_rate": 2.4642000000000002e-05, "loss": 0.032, "step": 8217 }, { "epoch": 19.94775212636695, "grad_norm": 1.0097966194152832, "learning_rate": 2.4645000000000002e-05, "loss": 0.062, "step": 8218 }, { "epoch": 19.950182260024302, "grad_norm": 2.0084774494171143, "learning_rate": 2.4648000000000002e-05, "loss": 0.025, "step": 8219 }, { 
"epoch": 19.95261239368165, "grad_norm": 0.813247561454773, "learning_rate": 2.4651e-05, "loss": 0.0407, "step": 8220 }, { "epoch": 19.955042527339003, "grad_norm": 0.2817789614200592, "learning_rate": 2.4654e-05, "loss": 0.0159, "step": 8221 }, { "epoch": 19.957472660996356, "grad_norm": 1.6779502630233765, "learning_rate": 2.4656999999999998e-05, "loss": 0.057, "step": 8222 }, { "epoch": 19.959902794653708, "grad_norm": 0.5060879588127136, "learning_rate": 2.4659999999999998e-05, "loss": 0.0165, "step": 8223 }, { "epoch": 19.962332928311056, "grad_norm": 0.41408124566078186, "learning_rate": 2.4663e-05, "loss": 0.0204, "step": 8224 }, { "epoch": 19.96476306196841, "grad_norm": 0.7109601497650146, "learning_rate": 2.4666e-05, "loss": 0.0373, "step": 8225 }, { "epoch": 19.96719319562576, "grad_norm": 1.7042899131774902, "learning_rate": 2.4669e-05, "loss": 0.0574, "step": 8226 }, { "epoch": 19.96962332928311, "grad_norm": 0.8695281744003296, "learning_rate": 2.4672e-05, "loss": 0.0419, "step": 8227 }, { "epoch": 19.972053462940462, "grad_norm": 1.2016122341156006, "learning_rate": 2.4675e-05, "loss": 0.0831, "step": 8228 }, { "epoch": 19.974483596597814, "grad_norm": 1.322593331336975, "learning_rate": 2.4678e-05, "loss": 0.2175, "step": 8229 }, { "epoch": 19.976913730255163, "grad_norm": 0.649483323097229, "learning_rate": 2.4681e-05, "loss": 0.094, "step": 8230 }, { "epoch": 19.979343863912515, "grad_norm": 0.615744411945343, "learning_rate": 2.4684e-05, "loss": 0.0543, "step": 8231 }, { "epoch": 19.981773997569867, "grad_norm": 0.4087603688240051, "learning_rate": 2.4687e-05, "loss": 0.0308, "step": 8232 }, { "epoch": 19.984204131227216, "grad_norm": 0.3040385842323303, "learning_rate": 2.469e-05, "loss": 0.0196, "step": 8233 }, { "epoch": 19.986634264884568, "grad_norm": 0.6220316290855408, "learning_rate": 2.4693000000000002e-05, "loss": 0.0715, "step": 8234 }, { "epoch": 19.98906439854192, "grad_norm": 0.4828532040119171, "learning_rate": 
2.4696000000000002e-05, "loss": 0.0232, "step": 8235 }, { "epoch": 19.991494532199273, "grad_norm": 0.886050283908844, "learning_rate": 2.4699000000000002e-05, "loss": 0.091, "step": 8236 }, { "epoch": 19.99392466585662, "grad_norm": 0.5424454212188721, "learning_rate": 2.4702e-05, "loss": 0.028, "step": 8237 }, { "epoch": 19.996354799513973, "grad_norm": 0.7563092112541199, "learning_rate": 2.4705e-05, "loss": 0.0256, "step": 8238 }, { "epoch": 19.998784933171326, "grad_norm": 1.151082992553711, "learning_rate": 2.4708e-05, "loss": 0.0601, "step": 8239 }, { "epoch": 20.0, "grad_norm": 0.5392994284629822, "learning_rate": 2.4711e-05, "loss": 0.0137, "step": 8240 }, { "epoch": 20.002430133657352, "grad_norm": 0.8676579594612122, "learning_rate": 2.4714e-05, "loss": 0.2421, "step": 8241 }, { "epoch": 20.0048602673147, "grad_norm": 0.5850691199302673, "learning_rate": 2.4717e-05, "loss": 0.1767, "step": 8242 }, { "epoch": 20.007290400972053, "grad_norm": 0.47723546624183655, "learning_rate": 2.472e-05, "loss": 0.1079, "step": 8243 }, { "epoch": 20.009720534629405, "grad_norm": 0.676979124546051, "learning_rate": 2.4723000000000004e-05, "loss": 0.1315, "step": 8244 }, { "epoch": 20.012150668286754, "grad_norm": 0.5302150249481201, "learning_rate": 2.4726000000000003e-05, "loss": 0.0993, "step": 8245 }, { "epoch": 20.014580801944106, "grad_norm": 0.4875968396663666, "learning_rate": 2.4729000000000003e-05, "loss": 0.0753, "step": 8246 }, { "epoch": 20.01701093560146, "grad_norm": 1.045644760131836, "learning_rate": 2.4732e-05, "loss": 0.0984, "step": 8247 }, { "epoch": 20.01944106925881, "grad_norm": 0.3510526120662689, "learning_rate": 2.4735e-05, "loss": 0.0324, "step": 8248 }, { "epoch": 20.02187120291616, "grad_norm": 0.5212877988815308, "learning_rate": 2.4738e-05, "loss": 0.0633, "step": 8249 }, { "epoch": 20.02430133657351, "grad_norm": 0.43342310190200806, "learning_rate": 2.4741e-05, "loss": 0.0334, "step": 8250 }, { "epoch": 20.026731470230864, "grad_norm": 
0.5564873218536377, "learning_rate": 2.4744e-05, "loss": 0.0324, "step": 8251 }, { "epoch": 20.029161603888213, "grad_norm": 0.36277034878730774, "learning_rate": 2.4747e-05, "loss": 0.0305, "step": 8252 }, { "epoch": 20.031591737545565, "grad_norm": 0.3215257525444031, "learning_rate": 2.475e-05, "loss": 0.0298, "step": 8253 }, { "epoch": 20.034021871202917, "grad_norm": 0.5054019093513489, "learning_rate": 2.4753e-05, "loss": 0.0279, "step": 8254 }, { "epoch": 20.036452004860266, "grad_norm": 0.5419918894767761, "learning_rate": 2.4756e-05, "loss": 0.0363, "step": 8255 }, { "epoch": 20.038882138517618, "grad_norm": 0.34989774227142334, "learning_rate": 2.4759e-05, "loss": 0.0204, "step": 8256 }, { "epoch": 20.04131227217497, "grad_norm": 0.3546406626701355, "learning_rate": 2.4762e-05, "loss": 0.0217, "step": 8257 }, { "epoch": 20.043742405832322, "grad_norm": 0.6502357721328735, "learning_rate": 2.4765e-05, "loss": 0.0549, "step": 8258 }, { "epoch": 20.04617253948967, "grad_norm": 0.2975497245788574, "learning_rate": 2.4768e-05, "loss": 0.0152, "step": 8259 }, { "epoch": 20.048602673147023, "grad_norm": 0.6269820928573608, "learning_rate": 2.4771e-05, "loss": 0.0332, "step": 8260 }, { "epoch": 20.051032806804375, "grad_norm": 1.4500834941864014, "learning_rate": 2.4774e-05, "loss": 0.0261, "step": 8261 }, { "epoch": 20.053462940461724, "grad_norm": 0.3663661777973175, "learning_rate": 2.4777e-05, "loss": 0.0206, "step": 8262 }, { "epoch": 20.055893074119076, "grad_norm": 0.5897666811943054, "learning_rate": 2.478e-05, "loss": 0.0143, "step": 8263 }, { "epoch": 20.05832320777643, "grad_norm": 0.49559828639030457, "learning_rate": 2.4783e-05, "loss": 0.0355, "step": 8264 }, { "epoch": 20.060753341433777, "grad_norm": 0.4391125738620758, "learning_rate": 2.4786000000000003e-05, "loss": 0.0217, "step": 8265 }, { "epoch": 20.06318347509113, "grad_norm": 0.32725057005882263, "learning_rate": 2.4789000000000003e-05, "loss": 0.0261, "step": 8266 }, { "epoch": 
20.06561360874848, "grad_norm": 0.7564564943313599, "learning_rate": 2.4792000000000003e-05, "loss": 0.0367, "step": 8267 }, { "epoch": 20.068043742405834, "grad_norm": 0.3985237777233124, "learning_rate": 2.4795000000000002e-05, "loss": 0.0194, "step": 8268 }, { "epoch": 20.070473876063183, "grad_norm": 0.8094469308853149, "learning_rate": 2.4798000000000002e-05, "loss": 0.019, "step": 8269 }, { "epoch": 20.072904009720535, "grad_norm": 0.7748976349830627, "learning_rate": 2.4801000000000002e-05, "loss": 0.0288, "step": 8270 }, { "epoch": 20.075334143377887, "grad_norm": 0.7562494874000549, "learning_rate": 2.4804e-05, "loss": 0.0377, "step": 8271 }, { "epoch": 20.077764277035236, "grad_norm": 0.2830162048339844, "learning_rate": 2.4806999999999998e-05, "loss": 0.0163, "step": 8272 }, { "epoch": 20.080194410692588, "grad_norm": 0.44026097655296326, "learning_rate": 2.4809999999999998e-05, "loss": 0.0259, "step": 8273 }, { "epoch": 20.08262454434994, "grad_norm": 0.42488622665405273, "learning_rate": 2.4812999999999998e-05, "loss": 0.0245, "step": 8274 }, { "epoch": 20.08505467800729, "grad_norm": 0.6072728037834167, "learning_rate": 2.4816e-05, "loss": 0.0308, "step": 8275 }, { "epoch": 20.08748481166464, "grad_norm": 1.0366663932800293, "learning_rate": 2.4819e-05, "loss": 0.0317, "step": 8276 }, { "epoch": 20.089914945321993, "grad_norm": 0.43274572491645813, "learning_rate": 2.4822e-05, "loss": 0.0196, "step": 8277 }, { "epoch": 20.092345078979346, "grad_norm": 0.6579515933990479, "learning_rate": 2.4825e-05, "loss": 0.0191, "step": 8278 }, { "epoch": 20.094775212636694, "grad_norm": 1.7816190719604492, "learning_rate": 2.4828e-05, "loss": 0.0402, "step": 8279 }, { "epoch": 20.097205346294047, "grad_norm": 0.4550241231918335, "learning_rate": 2.4831e-05, "loss": 0.0233, "step": 8280 }, { "epoch": 20.0996354799514, "grad_norm": 0.5453656911849976, "learning_rate": 2.4834e-05, "loss": 0.0201, "step": 8281 }, { "epoch": 20.102065613608747, "grad_norm": 
0.7275123596191406, "learning_rate": 2.4837e-05, "loss": 0.031, "step": 8282 }, { "epoch": 20.1044957472661, "grad_norm": 0.8888028264045715, "learning_rate": 2.484e-05, "loss": 0.0357, "step": 8283 }, { "epoch": 20.106925880923452, "grad_norm": 0.7762283682823181, "learning_rate": 2.4843e-05, "loss": 0.0323, "step": 8284 }, { "epoch": 20.1093560145808, "grad_norm": 1.1174248456954956, "learning_rate": 2.4846000000000002e-05, "loss": 0.0359, "step": 8285 }, { "epoch": 20.111786148238153, "grad_norm": 0.9910802245140076, "learning_rate": 2.4849000000000002e-05, "loss": 0.0388, "step": 8286 }, { "epoch": 20.114216281895505, "grad_norm": 0.8875089883804321, "learning_rate": 2.4852000000000002e-05, "loss": 0.0236, "step": 8287 }, { "epoch": 20.116646415552854, "grad_norm": 0.926372230052948, "learning_rate": 2.4855000000000002e-05, "loss": 0.0563, "step": 8288 }, { "epoch": 20.119076549210206, "grad_norm": 1.3443316221237183, "learning_rate": 2.4858e-05, "loss": 0.0315, "step": 8289 }, { "epoch": 20.121506682867558, "grad_norm": 1.5875802040100098, "learning_rate": 2.4861e-05, "loss": 0.0744, "step": 8290 }, { "epoch": 20.12393681652491, "grad_norm": 1.1365268230438232, "learning_rate": 2.4864e-05, "loss": 0.2237, "step": 8291 }, { "epoch": 20.12636695018226, "grad_norm": 0.7793760895729065, "learning_rate": 2.4867e-05, "loss": 0.1801, "step": 8292 }, { "epoch": 20.12879708383961, "grad_norm": 0.5440195202827454, "learning_rate": 2.487e-05, "loss": 0.1246, "step": 8293 }, { "epoch": 20.131227217496964, "grad_norm": 2.2234644889831543, "learning_rate": 2.4873e-05, "loss": 0.135, "step": 8294 }, { "epoch": 20.133657351154312, "grad_norm": 0.6994100213050842, "learning_rate": 2.4876000000000004e-05, "loss": 0.1104, "step": 8295 }, { "epoch": 20.136087484811664, "grad_norm": 0.6519515514373779, "learning_rate": 2.4879e-05, "loss": 0.0846, "step": 8296 }, { "epoch": 20.138517618469017, "grad_norm": 0.586824357509613, "learning_rate": 2.4882e-05, "loss": 0.0665, "step": 8297 
}, { "epoch": 20.140947752126365, "grad_norm": 0.5389854311943054, "learning_rate": 2.4885e-05, "loss": 0.0358, "step": 8298 }, { "epoch": 20.143377885783718, "grad_norm": 0.9975634217262268, "learning_rate": 2.4888e-05, "loss": 0.0424, "step": 8299 }, { "epoch": 20.14580801944107, "grad_norm": 0.49024057388305664, "learning_rate": 2.4891e-05, "loss": 0.0515, "step": 8300 }, { "epoch": 20.148238153098422, "grad_norm": 0.30873262882232666, "learning_rate": 2.4894e-05, "loss": 0.0294, "step": 8301 }, { "epoch": 20.15066828675577, "grad_norm": 0.5689610242843628, "learning_rate": 2.4897e-05, "loss": 0.0355, "step": 8302 }, { "epoch": 20.153098420413123, "grad_norm": 0.40567511320114136, "learning_rate": 2.49e-05, "loss": 0.0354, "step": 8303 }, { "epoch": 20.155528554070475, "grad_norm": 0.48665618896484375, "learning_rate": 2.4903e-05, "loss": 0.0416, "step": 8304 }, { "epoch": 20.157958687727824, "grad_norm": 0.8463976383209229, "learning_rate": 2.4906000000000002e-05, "loss": 0.0477, "step": 8305 }, { "epoch": 20.160388821385176, "grad_norm": 0.32228681445121765, "learning_rate": 2.4909e-05, "loss": 0.0196, "step": 8306 }, { "epoch": 20.16281895504253, "grad_norm": 0.36995866894721985, "learning_rate": 2.4912e-05, "loss": 0.0377, "step": 8307 }, { "epoch": 20.165249088699877, "grad_norm": 0.4746890664100647, "learning_rate": 2.4915e-05, "loss": 0.0289, "step": 8308 }, { "epoch": 20.16767922235723, "grad_norm": 0.550004243850708, "learning_rate": 2.4918e-05, "loss": 0.0249, "step": 8309 }, { "epoch": 20.17010935601458, "grad_norm": 0.9192832708358765, "learning_rate": 2.4921e-05, "loss": 0.0314, "step": 8310 }, { "epoch": 20.172539489671934, "grad_norm": 0.9212648272514343, "learning_rate": 2.4924e-05, "loss": 0.0301, "step": 8311 }, { "epoch": 20.174969623329282, "grad_norm": 0.4259583652019501, "learning_rate": 2.4927e-05, "loss": 0.0224, "step": 8312 }, { "epoch": 20.177399756986635, "grad_norm": 0.3782539963722229, "learning_rate": 2.493e-05, "loss": 0.0137, 
"step": 8313 }, { "epoch": 20.179829890643987, "grad_norm": 0.4636983871459961, "learning_rate": 2.4933e-05, "loss": 0.0198, "step": 8314 }, { "epoch": 20.182260024301335, "grad_norm": 0.43116575479507446, "learning_rate": 2.4936000000000003e-05, "loss": 0.0293, "step": 8315 }, { "epoch": 20.184690157958688, "grad_norm": 0.3902944028377533, "learning_rate": 2.4939000000000003e-05, "loss": 0.0146, "step": 8316 }, { "epoch": 20.18712029161604, "grad_norm": 0.8100346922874451, "learning_rate": 2.4942000000000003e-05, "loss": 0.0221, "step": 8317 }, { "epoch": 20.18955042527339, "grad_norm": 0.5270625352859497, "learning_rate": 2.4945000000000003e-05, "loss": 0.0428, "step": 8318 }, { "epoch": 20.19198055893074, "grad_norm": 0.4349379539489746, "learning_rate": 2.4948000000000002e-05, "loss": 0.0228, "step": 8319 }, { "epoch": 20.194410692588093, "grad_norm": 0.4071655869483948, "learning_rate": 2.4951e-05, "loss": 0.0185, "step": 8320 }, { "epoch": 20.19684082624544, "grad_norm": 1.506913661956787, "learning_rate": 2.4954e-05, "loss": 0.0454, "step": 8321 }, { "epoch": 20.199270959902794, "grad_norm": 0.3688391149044037, "learning_rate": 2.4957e-05, "loss": 0.015, "step": 8322 }, { "epoch": 20.201701093560146, "grad_norm": 0.5883203148841858, "learning_rate": 2.4959999999999998e-05, "loss": 0.0324, "step": 8323 }, { "epoch": 20.2041312272175, "grad_norm": 0.44033631682395935, "learning_rate": 2.4962999999999998e-05, "loss": 0.0206, "step": 8324 }, { "epoch": 20.206561360874847, "grad_norm": 0.6108126044273376, "learning_rate": 2.4966e-05, "loss": 0.0196, "step": 8325 }, { "epoch": 20.2089914945322, "grad_norm": 0.5987625122070312, "learning_rate": 2.4969e-05, "loss": 0.0206, "step": 8326 }, { "epoch": 20.21142162818955, "grad_norm": 0.4699195921421051, "learning_rate": 2.4972e-05, "loss": 0.0172, "step": 8327 }, { "epoch": 20.2138517618469, "grad_norm": 0.6107085943222046, "learning_rate": 2.4975e-05, "loss": 0.0219, "step": 8328 }, { "epoch": 20.216281895504252, 
"grad_norm": 1.097778558731079, "learning_rate": 2.4978e-05, "loss": 0.0318, "step": 8329 }, { "epoch": 20.218712029161605, "grad_norm": 0.7193655371665955, "learning_rate": 2.4981e-05, "loss": 0.0331, "step": 8330 }, { "epoch": 20.221142162818953, "grad_norm": 1.436213493347168, "learning_rate": 2.4984e-05, "loss": 0.0387, "step": 8331 }, { "epoch": 20.223572296476306, "grad_norm": 0.358245849609375, "learning_rate": 2.4987e-05, "loss": 0.0157, "step": 8332 }, { "epoch": 20.226002430133658, "grad_norm": 0.4847467243671417, "learning_rate": 2.499e-05, "loss": 0.0281, "step": 8333 }, { "epoch": 20.22843256379101, "grad_norm": 0.42823755741119385, "learning_rate": 2.4993e-05, "loss": 0.0191, "step": 8334 }, { "epoch": 20.23086269744836, "grad_norm": 0.47763627767562866, "learning_rate": 2.4996000000000003e-05, "loss": 0.0165, "step": 8335 }, { "epoch": 20.23329283110571, "grad_norm": 0.5203549861907959, "learning_rate": 2.4999000000000002e-05, "loss": 0.0204, "step": 8336 }, { "epoch": 20.235722964763063, "grad_norm": 1.187250018119812, "learning_rate": 2.5002000000000002e-05, "loss": 0.0298, "step": 8337 }, { "epoch": 20.238153098420412, "grad_norm": 0.6960201263427734, "learning_rate": 2.5005000000000002e-05, "loss": 0.0411, "step": 8338 }, { "epoch": 20.240583232077764, "grad_norm": 0.6100462079048157, "learning_rate": 2.5008000000000002e-05, "loss": 0.0292, "step": 8339 }, { "epoch": 20.243013365735116, "grad_norm": 1.2325646877288818, "learning_rate": 2.5011e-05, "loss": 0.0317, "step": 8340 }, { "epoch": 20.245443499392465, "grad_norm": 0.8520324230194092, "learning_rate": 2.5014e-05, "loss": 0.2565, "step": 8341 }, { "epoch": 20.247873633049817, "grad_norm": 0.6215474009513855, "learning_rate": 2.5017e-05, "loss": 0.1545, "step": 8342 }, { "epoch": 20.25030376670717, "grad_norm": 0.5709292888641357, "learning_rate": 2.502e-05, "loss": 0.1471, "step": 8343 }, { "epoch": 20.25273390036452, "grad_norm": 0.6752534508705139, "learning_rate": 2.5023e-05, "loss": 
0.1102, "step": 8344 }, { "epoch": 20.25516403402187, "grad_norm": 0.5761426687240601, "learning_rate": 2.5026e-05, "loss": 0.0958, "step": 8345 }, { "epoch": 20.257594167679223, "grad_norm": 0.600692093372345, "learning_rate": 2.5029e-05, "loss": 0.0917, "step": 8346 }, { "epoch": 20.260024301336575, "grad_norm": 0.4951705038547516, "learning_rate": 2.5032e-05, "loss": 0.084, "step": 8347 }, { "epoch": 20.262454434993924, "grad_norm": 0.4791509211063385, "learning_rate": 2.5035e-05, "loss": 0.0578, "step": 8348 }, { "epoch": 20.264884568651276, "grad_norm": 0.6731346845626831, "learning_rate": 2.5038e-05, "loss": 0.0768, "step": 8349 }, { "epoch": 20.267314702308628, "grad_norm": 0.5087987184524536, "learning_rate": 2.5041e-05, "loss": 0.0583, "step": 8350 }, { "epoch": 20.269744835965977, "grad_norm": 0.5679513216018677, "learning_rate": 2.5044e-05, "loss": 0.0482, "step": 8351 }, { "epoch": 20.27217496962333, "grad_norm": 0.7146996855735779, "learning_rate": 2.5047e-05, "loss": 0.0415, "step": 8352 }, { "epoch": 20.27460510328068, "grad_norm": 0.4727935791015625, "learning_rate": 2.505e-05, "loss": 0.0307, "step": 8353 }, { "epoch": 20.277035236938033, "grad_norm": 0.4209022521972656, "learning_rate": 2.5053e-05, "loss": 0.0249, "step": 8354 }, { "epoch": 20.279465370595382, "grad_norm": 0.4175269901752472, "learning_rate": 2.5056000000000002e-05, "loss": 0.0251, "step": 8355 }, { "epoch": 20.281895504252734, "grad_norm": 0.5762854218482971, "learning_rate": 2.5059000000000002e-05, "loss": 0.0349, "step": 8356 }, { "epoch": 20.284325637910086, "grad_norm": 0.6018350124359131, "learning_rate": 2.5062e-05, "loss": 0.0346, "step": 8357 }, { "epoch": 20.286755771567435, "grad_norm": 0.3978234827518463, "learning_rate": 2.5065e-05, "loss": 0.0285, "step": 8358 }, { "epoch": 20.289185905224787, "grad_norm": 0.39418458938598633, "learning_rate": 2.5068e-05, "loss": 0.0207, "step": 8359 }, { "epoch": 20.29161603888214, "grad_norm": 0.43943747878074646, "learning_rate": 
2.5071e-05, "loss": 0.027, "step": 8360 }, { "epoch": 20.29404617253949, "grad_norm": 0.4985983073711395, "learning_rate": 2.5074e-05, "loss": 0.0207, "step": 8361 }, { "epoch": 20.29647630619684, "grad_norm": 1.3264038562774658, "learning_rate": 2.5077e-05, "loss": 0.0153, "step": 8362 }, { "epoch": 20.298906439854193, "grad_norm": 0.4080738425254822, "learning_rate": 2.508e-05, "loss": 0.0349, "step": 8363 }, { "epoch": 20.30133657351154, "grad_norm": 0.4130866229534149, "learning_rate": 2.5083e-05, "loss": 0.018, "step": 8364 }, { "epoch": 20.303766707168894, "grad_norm": 0.56819087266922, "learning_rate": 2.5086000000000003e-05, "loss": 0.0371, "step": 8365 }, { "epoch": 20.306196840826246, "grad_norm": 0.4847133159637451, "learning_rate": 2.5089000000000003e-05, "loss": 0.0531, "step": 8366 }, { "epoch": 20.308626974483598, "grad_norm": 0.4295268952846527, "learning_rate": 2.5092000000000003e-05, "loss": 0.0238, "step": 8367 }, { "epoch": 20.311057108140947, "grad_norm": 0.405739426612854, "learning_rate": 2.5095000000000003e-05, "loss": 0.0171, "step": 8368 }, { "epoch": 20.3134872417983, "grad_norm": 0.6042236089706421, "learning_rate": 2.5098000000000003e-05, "loss": 0.0214, "step": 8369 }, { "epoch": 20.31591737545565, "grad_norm": 0.5597671866416931, "learning_rate": 2.5101e-05, "loss": 0.031, "step": 8370 }, { "epoch": 20.318347509113, "grad_norm": 0.540360689163208, "learning_rate": 2.5104e-05, "loss": 0.0294, "step": 8371 }, { "epoch": 20.320777642770352, "grad_norm": 0.2505260407924652, "learning_rate": 2.5107e-05, "loss": 0.0109, "step": 8372 }, { "epoch": 20.323207776427704, "grad_norm": 0.4240434169769287, "learning_rate": 2.511e-05, "loss": 0.0288, "step": 8373 }, { "epoch": 20.325637910085053, "grad_norm": 0.8661622405052185, "learning_rate": 2.5112999999999998e-05, "loss": 0.0411, "step": 8374 }, { "epoch": 20.328068043742405, "grad_norm": 0.41662442684173584, "learning_rate": 2.5116e-05, "loss": 0.0167, "step": 8375 }, { "epoch": 
20.330498177399758, "grad_norm": 0.5530422329902649, "learning_rate": 2.5119e-05, "loss": 0.0349, "step": 8376 }, { "epoch": 20.33292831105711, "grad_norm": 1.7513715028762817, "learning_rate": 2.5122e-05, "loss": 0.0837, "step": 8377 }, { "epoch": 20.33535844471446, "grad_norm": 0.47778430581092834, "learning_rate": 2.5125e-05, "loss": 0.0177, "step": 8378 }, { "epoch": 20.33778857837181, "grad_norm": 0.4278005063533783, "learning_rate": 2.5128e-05, "loss": 0.0238, "step": 8379 }, { "epoch": 20.340218712029163, "grad_norm": 0.5263770818710327, "learning_rate": 2.5131e-05, "loss": 0.018, "step": 8380 }, { "epoch": 20.34264884568651, "grad_norm": 0.9471363425254822, "learning_rate": 2.5134e-05, "loss": 0.0385, "step": 8381 }, { "epoch": 20.345078979343864, "grad_norm": 1.0015251636505127, "learning_rate": 2.5137e-05, "loss": 0.0405, "step": 8382 }, { "epoch": 20.347509113001216, "grad_norm": 0.4427868723869324, "learning_rate": 2.514e-05, "loss": 0.0176, "step": 8383 }, { "epoch": 20.349939246658565, "grad_norm": 0.6739123463630676, "learning_rate": 2.5143e-05, "loss": 0.0278, "step": 8384 }, { "epoch": 20.352369380315917, "grad_norm": 0.5852198004722595, "learning_rate": 2.5146e-05, "loss": 0.0313, "step": 8385 }, { "epoch": 20.35479951397327, "grad_norm": 0.627413809299469, "learning_rate": 2.5149000000000003e-05, "loss": 0.0245, "step": 8386 }, { "epoch": 20.35722964763062, "grad_norm": 0.8827821016311646, "learning_rate": 2.5152000000000002e-05, "loss": 0.0264, "step": 8387 }, { "epoch": 20.35965978128797, "grad_norm": 0.9493370056152344, "learning_rate": 2.5155000000000002e-05, "loss": 0.0386, "step": 8388 }, { "epoch": 20.362089914945322, "grad_norm": 2.3548595905303955, "learning_rate": 2.5158000000000002e-05, "loss": 0.0456, "step": 8389 }, { "epoch": 20.364520048602675, "grad_norm": 1.4427821636199951, "learning_rate": 2.5161000000000002e-05, "loss": 0.1021, "step": 8390 }, { "epoch": 20.366950182260023, "grad_norm": 1.0215251445770264, "learning_rate": 
2.5164e-05, "loss": 0.215, "step": 8391 }, { "epoch": 20.369380315917375, "grad_norm": 0.8297132849693298, "learning_rate": 2.5167e-05, "loss": 0.153, "step": 8392 }, { "epoch": 20.371810449574728, "grad_norm": 0.8283843994140625, "learning_rate": 2.517e-05, "loss": 0.1396, "step": 8393 }, { "epoch": 20.374240583232076, "grad_norm": 0.8780248165130615, "learning_rate": 2.5172999999999998e-05, "loss": 0.1482, "step": 8394 }, { "epoch": 20.37667071688943, "grad_norm": 0.774598240852356, "learning_rate": 2.5175999999999997e-05, "loss": 0.1031, "step": 8395 }, { "epoch": 20.37910085054678, "grad_norm": 0.450494647026062, "learning_rate": 2.5179e-05, "loss": 0.0631, "step": 8396 }, { "epoch": 20.381530984204133, "grad_norm": 0.4936244785785675, "learning_rate": 2.5182e-05, "loss": 0.0672, "step": 8397 }, { "epoch": 20.38396111786148, "grad_norm": 0.5578811764717102, "learning_rate": 2.5185e-05, "loss": 0.0703, "step": 8398 }, { "epoch": 20.386391251518834, "grad_norm": 0.6149824261665344, "learning_rate": 2.5188e-05, "loss": 0.0449, "step": 8399 }, { "epoch": 20.388821385176186, "grad_norm": 0.3880974352359772, "learning_rate": 2.5191e-05, "loss": 0.038, "step": 8400 }, { "epoch": 20.391251518833535, "grad_norm": 0.37788957357406616, "learning_rate": 2.5194e-05, "loss": 0.0595, "step": 8401 }, { "epoch": 20.393681652490887, "grad_norm": 0.39179831743240356, "learning_rate": 2.5197e-05, "loss": 0.0266, "step": 8402 }, { "epoch": 20.39611178614824, "grad_norm": 0.6699495315551758, "learning_rate": 2.52e-05, "loss": 0.0411, "step": 8403 }, { "epoch": 20.398541919805588, "grad_norm": 0.3349238932132721, "learning_rate": 2.5203e-05, "loss": 0.0221, "step": 8404 }, { "epoch": 20.40097205346294, "grad_norm": 0.629426121711731, "learning_rate": 2.5206e-05, "loss": 0.0304, "step": 8405 }, { "epoch": 20.403402187120292, "grad_norm": 0.42343869805336, "learning_rate": 2.5209000000000002e-05, "loss": 0.0336, "step": 8406 }, { "epoch": 20.40583232077764, "grad_norm": 
0.3800641596317291, "learning_rate": 2.5212000000000002e-05, "loss": 0.0199, "step": 8407 }, { "epoch": 20.408262454434993, "grad_norm": 0.41580596566200256, "learning_rate": 2.5215e-05, "loss": 0.0335, "step": 8408 }, { "epoch": 20.410692588092346, "grad_norm": 0.44646140933036804, "learning_rate": 2.5218e-05, "loss": 0.0372, "step": 8409 }, { "epoch": 20.413122721749698, "grad_norm": 0.8083697557449341, "learning_rate": 2.5221e-05, "loss": 0.0293, "step": 8410 }, { "epoch": 20.415552855407046, "grad_norm": 0.4701637029647827, "learning_rate": 2.5224e-05, "loss": 0.0334, "step": 8411 }, { "epoch": 20.4179829890644, "grad_norm": 0.35126611590385437, "learning_rate": 2.5227e-05, "loss": 0.0299, "step": 8412 }, { "epoch": 20.42041312272175, "grad_norm": 0.7495234608650208, "learning_rate": 2.523e-05, "loss": 0.0288, "step": 8413 }, { "epoch": 20.4228432563791, "grad_norm": 1.0996577739715576, "learning_rate": 2.5233e-05, "loss": 0.0328, "step": 8414 }, { "epoch": 20.425273390036452, "grad_norm": 0.417842835187912, "learning_rate": 2.5236e-05, "loss": 0.0206, "step": 8415 }, { "epoch": 20.427703523693804, "grad_norm": 0.9222006797790527, "learning_rate": 2.5239000000000003e-05, "loss": 0.0288, "step": 8416 }, { "epoch": 20.430133657351153, "grad_norm": 0.4259709119796753, "learning_rate": 2.5242000000000003e-05, "loss": 0.0158, "step": 8417 }, { "epoch": 20.432563791008505, "grad_norm": 0.596140444278717, "learning_rate": 2.5245000000000003e-05, "loss": 0.021, "step": 8418 }, { "epoch": 20.434993924665857, "grad_norm": 0.5249640345573425, "learning_rate": 2.5248e-05, "loss": 0.0319, "step": 8419 }, { "epoch": 20.43742405832321, "grad_norm": 0.5540350675582886, "learning_rate": 2.5251e-05, "loss": 0.0242, "step": 8420 }, { "epoch": 20.439854191980558, "grad_norm": 0.6761962175369263, "learning_rate": 2.5254e-05, "loss": 0.0318, "step": 8421 }, { "epoch": 20.44228432563791, "grad_norm": 0.41952410340309143, "learning_rate": 2.5257e-05, "loss": 0.0234, "step": 8422 }, { 
"epoch": 20.444714459295263, "grad_norm": 0.8163456916809082, "learning_rate": 2.526e-05, "loss": 0.0328, "step": 8423 }, { "epoch": 20.44714459295261, "grad_norm": 0.36435842514038086, "learning_rate": 2.5263e-05, "loss": 0.0269, "step": 8424 }, { "epoch": 20.449574726609963, "grad_norm": 1.2283616065979004, "learning_rate": 2.5266e-05, "loss": 0.0267, "step": 8425 }, { "epoch": 20.452004860267316, "grad_norm": 0.8706561326980591, "learning_rate": 2.5269e-05, "loss": 0.0488, "step": 8426 }, { "epoch": 20.454434993924664, "grad_norm": 0.7436635494232178, "learning_rate": 2.5272e-05, "loss": 0.028, "step": 8427 }, { "epoch": 20.456865127582017, "grad_norm": 1.5186644792556763, "learning_rate": 2.5275e-05, "loss": 0.0694, "step": 8428 }, { "epoch": 20.45929526123937, "grad_norm": 0.4326508939266205, "learning_rate": 2.5278e-05, "loss": 0.0292, "step": 8429 }, { "epoch": 20.46172539489672, "grad_norm": 0.54393470287323, "learning_rate": 2.5281e-05, "loss": 0.0274, "step": 8430 }, { "epoch": 20.46415552855407, "grad_norm": 0.7217767834663391, "learning_rate": 2.5284e-05, "loss": 0.041, "step": 8431 }, { "epoch": 20.466585662211422, "grad_norm": 0.4456157386302948, "learning_rate": 2.5287e-05, "loss": 0.0261, "step": 8432 }, { "epoch": 20.469015795868774, "grad_norm": 0.6483809351921082, "learning_rate": 2.529e-05, "loss": 0.0152, "step": 8433 }, { "epoch": 20.471445929526123, "grad_norm": 0.4685885012149811, "learning_rate": 2.5293e-05, "loss": 0.0191, "step": 8434 }, { "epoch": 20.473876063183475, "grad_norm": 1.1987086534500122, "learning_rate": 2.5296e-05, "loss": 0.0337, "step": 8435 }, { "epoch": 20.476306196840827, "grad_norm": 0.6296653747558594, "learning_rate": 2.5299000000000003e-05, "loss": 0.0345, "step": 8436 }, { "epoch": 20.478736330498176, "grad_norm": 1.2228827476501465, "learning_rate": 2.5302000000000003e-05, "loss": 0.0465, "step": 8437 }, { "epoch": 20.481166464155528, "grad_norm": 0.8531830906867981, "learning_rate": 2.5305000000000003e-05, 
"loss": 0.0336, "step": 8438 }, { "epoch": 20.48359659781288, "grad_norm": 0.925678014755249, "learning_rate": 2.5308000000000002e-05, "loss": 0.0438, "step": 8439 }, { "epoch": 20.48602673147023, "grad_norm": 1.1910977363586426, "learning_rate": 2.5311000000000002e-05, "loss": 0.0846, "step": 8440 }, { "epoch": 20.48845686512758, "grad_norm": 0.7634583115577698, "learning_rate": 2.5314000000000002e-05, "loss": 0.2216, "step": 8441 }, { "epoch": 20.490886998784934, "grad_norm": 0.6411483287811279, "learning_rate": 2.5317000000000002e-05, "loss": 0.183, "step": 8442 }, { "epoch": 20.493317132442286, "grad_norm": 0.737301230430603, "learning_rate": 2.5319999999999998e-05, "loss": 0.121, "step": 8443 }, { "epoch": 20.495747266099634, "grad_norm": 1.7823175191879272, "learning_rate": 2.5322999999999998e-05, "loss": 0.1215, "step": 8444 }, { "epoch": 20.498177399756987, "grad_norm": 0.744630753993988, "learning_rate": 2.5325999999999998e-05, "loss": 0.1169, "step": 8445 }, { "epoch": 20.50060753341434, "grad_norm": 0.6952568888664246, "learning_rate": 2.5329e-05, "loss": 0.0762, "step": 8446 }, { "epoch": 20.503037667071688, "grad_norm": 0.42349693179130554, "learning_rate": 2.5332e-05, "loss": 0.0707, "step": 8447 }, { "epoch": 20.50546780072904, "grad_norm": 0.6508817076683044, "learning_rate": 2.5335e-05, "loss": 0.0646, "step": 8448 }, { "epoch": 20.507897934386392, "grad_norm": 0.7560366988182068, "learning_rate": 2.5338e-05, "loss": 0.0957, "step": 8449 }, { "epoch": 20.51032806804374, "grad_norm": 0.46009212732315063, "learning_rate": 2.5341e-05, "loss": 0.0307, "step": 8450 }, { "epoch": 20.512758201701093, "grad_norm": 0.4283846914768219, "learning_rate": 2.5344e-05, "loss": 0.0358, "step": 8451 }, { "epoch": 20.515188335358445, "grad_norm": 0.5659103989601135, "learning_rate": 2.5347e-05, "loss": 0.026, "step": 8452 }, { "epoch": 20.517618469015797, "grad_norm": 0.3808794915676117, "learning_rate": 2.535e-05, "loss": 0.0213, "step": 8453 }, { "epoch": 
20.520048602673146, "grad_norm": 0.532661497592926, "learning_rate": 2.5353e-05, "loss": 0.0268, "step": 8454 }, { "epoch": 20.5224787363305, "grad_norm": 0.6546074151992798, "learning_rate": 2.5356e-05, "loss": 0.0578, "step": 8455 }, { "epoch": 20.52490886998785, "grad_norm": 0.5401341915130615, "learning_rate": 2.5359000000000002e-05, "loss": 0.0469, "step": 8456 }, { "epoch": 20.5273390036452, "grad_norm": 0.5715715885162354, "learning_rate": 2.5362000000000002e-05, "loss": 0.0299, "step": 8457 }, { "epoch": 20.52976913730255, "grad_norm": 0.42250096797943115, "learning_rate": 2.5365000000000002e-05, "loss": 0.0219, "step": 8458 }, { "epoch": 20.532199270959904, "grad_norm": 0.4320892095565796, "learning_rate": 2.5368000000000002e-05, "loss": 0.026, "step": 8459 }, { "epoch": 20.534629404617252, "grad_norm": 0.5668535828590393, "learning_rate": 2.5371e-05, "loss": 0.0196, "step": 8460 }, { "epoch": 20.537059538274605, "grad_norm": 0.6868850588798523, "learning_rate": 2.5374e-05, "loss": 0.0248, "step": 8461 }, { "epoch": 20.539489671931957, "grad_norm": 0.6273048520088196, "learning_rate": 2.5377e-05, "loss": 0.0353, "step": 8462 }, { "epoch": 20.54191980558931, "grad_norm": 0.7304115295410156, "learning_rate": 2.538e-05, "loss": 0.079, "step": 8463 }, { "epoch": 20.544349939246658, "grad_norm": 0.6013527512550354, "learning_rate": 2.5383e-05, "loss": 0.0142, "step": 8464 }, { "epoch": 20.54678007290401, "grad_norm": 0.39334428310394287, "learning_rate": 2.5386e-05, "loss": 0.0193, "step": 8465 }, { "epoch": 20.549210206561362, "grad_norm": 0.6689793467521667, "learning_rate": 2.5389000000000004e-05, "loss": 0.0325, "step": 8466 }, { "epoch": 20.55164034021871, "grad_norm": 0.4193921685218811, "learning_rate": 2.5392000000000004e-05, "loss": 0.032, "step": 8467 }, { "epoch": 20.554070473876063, "grad_norm": 0.49278053641319275, "learning_rate": 2.5395e-05, "loss": 0.0333, "step": 8468 }, { "epoch": 20.556500607533415, "grad_norm": 0.5739785432815552, 
"learning_rate": 2.5398e-05, "loss": 0.0227, "step": 8469 }, { "epoch": 20.558930741190764, "grad_norm": 0.7629038691520691, "learning_rate": 2.5401e-05, "loss": 0.026, "step": 8470 }, { "epoch": 20.561360874848116, "grad_norm": 0.3240257203578949, "learning_rate": 2.5404e-05, "loss": 0.016, "step": 8471 }, { "epoch": 20.56379100850547, "grad_norm": 0.587836503982544, "learning_rate": 2.5407e-05, "loss": 0.0275, "step": 8472 }, { "epoch": 20.566221142162817, "grad_norm": 0.40397676825523376, "learning_rate": 2.541e-05, "loss": 0.0221, "step": 8473 }, { "epoch": 20.56865127582017, "grad_norm": 0.6765218377113342, "learning_rate": 2.5413e-05, "loss": 0.0345, "step": 8474 }, { "epoch": 20.57108140947752, "grad_norm": 0.6876322031021118, "learning_rate": 2.5416e-05, "loss": 0.0318, "step": 8475 }, { "epoch": 20.573511543134874, "grad_norm": 1.0139472484588623, "learning_rate": 2.5419000000000002e-05, "loss": 0.04, "step": 8476 }, { "epoch": 20.575941676792223, "grad_norm": 1.0817002058029175, "learning_rate": 2.5422e-05, "loss": 0.0259, "step": 8477 }, { "epoch": 20.578371810449575, "grad_norm": 0.4788045287132263, "learning_rate": 2.5425e-05, "loss": 0.0215, "step": 8478 }, { "epoch": 20.580801944106927, "grad_norm": 0.6970097422599792, "learning_rate": 2.5428e-05, "loss": 0.0189, "step": 8479 }, { "epoch": 20.583232077764276, "grad_norm": 1.0880173444747925, "learning_rate": 2.5431e-05, "loss": 0.0371, "step": 8480 }, { "epoch": 20.585662211421628, "grad_norm": 0.996401309967041, "learning_rate": 2.5434e-05, "loss": 0.0346, "step": 8481 }, { "epoch": 20.58809234507898, "grad_norm": 0.45725804567337036, "learning_rate": 2.5437e-05, "loss": 0.0214, "step": 8482 }, { "epoch": 20.59052247873633, "grad_norm": 0.6119385957717896, "learning_rate": 2.544e-05, "loss": 0.0174, "step": 8483 }, { "epoch": 20.59295261239368, "grad_norm": 0.42162731289863586, "learning_rate": 2.5443e-05, "loss": 0.0216, "step": 8484 }, { "epoch": 20.595382746051033, "grad_norm": 4.560181140899658, 
"learning_rate": 2.5446e-05, "loss": 0.0547, "step": 8485 }, { "epoch": 20.597812879708385, "grad_norm": 1.2809361219406128, "learning_rate": 2.5449000000000003e-05, "loss": 0.0385, "step": 8486 }, { "epoch": 20.600243013365734, "grad_norm": 0.6748340129852295, "learning_rate": 2.5452000000000003e-05, "loss": 0.0346, "step": 8487 }, { "epoch": 20.602673147023086, "grad_norm": 1.3602570295333862, "learning_rate": 2.5455000000000003e-05, "loss": 0.0787, "step": 8488 }, { "epoch": 20.60510328068044, "grad_norm": 1.1320234537124634, "learning_rate": 2.5458000000000003e-05, "loss": 0.0494, "step": 8489 }, { "epoch": 20.607533414337787, "grad_norm": 1.6298327445983887, "learning_rate": 2.5461000000000002e-05, "loss": 0.0686, "step": 8490 }, { "epoch": 20.60996354799514, "grad_norm": 2.088911294937134, "learning_rate": 2.5464000000000002e-05, "loss": 0.3062, "step": 8491 }, { "epoch": 20.61239368165249, "grad_norm": 0.9702487587928772, "learning_rate": 2.5467e-05, "loss": 0.1736, "step": 8492 }, { "epoch": 20.61482381530984, "grad_norm": 0.5447027683258057, "learning_rate": 2.547e-05, "loss": 0.1278, "step": 8493 }, { "epoch": 20.617253948967193, "grad_norm": 0.5678665041923523, "learning_rate": 2.5472999999999998e-05, "loss": 0.1351, "step": 8494 }, { "epoch": 20.619684082624545, "grad_norm": 1.5686498880386353, "learning_rate": 2.5475999999999998e-05, "loss": 0.1741, "step": 8495 }, { "epoch": 20.622114216281897, "grad_norm": 0.6878814697265625, "learning_rate": 2.5479e-05, "loss": 0.1022, "step": 8496 }, { "epoch": 20.624544349939246, "grad_norm": 0.5640206336975098, "learning_rate": 2.5482e-05, "loss": 0.0838, "step": 8497 }, { "epoch": 20.626974483596598, "grad_norm": 0.58148193359375, "learning_rate": 2.5485e-05, "loss": 0.0813, "step": 8498 }, { "epoch": 20.62940461725395, "grad_norm": 0.3682650029659271, "learning_rate": 2.5488e-05, "loss": 0.0387, "step": 8499 }, { "epoch": 20.6318347509113, "grad_norm": 1.3679096698760986, "learning_rate": 2.5491e-05, "loss": 
0.0553, "step": 8500 }, { "epoch": 20.63426488456865, "grad_norm": 0.46770477294921875, "learning_rate": 2.5494e-05, "loss": 0.0414, "step": 8501 }, { "epoch": 20.636695018226003, "grad_norm": 0.6038303971290588, "learning_rate": 2.5497e-05, "loss": 0.057, "step": 8502 }, { "epoch": 20.639125151883352, "grad_norm": 0.5187859535217285, "learning_rate": 2.55e-05, "loss": 0.0363, "step": 8503 }, { "epoch": 20.641555285540704, "grad_norm": 0.7456408143043518, "learning_rate": 2.5503e-05, "loss": 0.0326, "step": 8504 }, { "epoch": 20.643985419198057, "grad_norm": 0.5402576923370361, "learning_rate": 2.5506e-05, "loss": 0.0439, "step": 8505 }, { "epoch": 20.64641555285541, "grad_norm": 0.5403032302856445, "learning_rate": 2.5509e-05, "loss": 0.0225, "step": 8506 }, { "epoch": 20.648845686512757, "grad_norm": 0.5523711442947388, "learning_rate": 2.5512000000000002e-05, "loss": 0.0452, "step": 8507 }, { "epoch": 20.65127582017011, "grad_norm": 0.8210770487785339, "learning_rate": 2.5515000000000002e-05, "loss": 0.0417, "step": 8508 }, { "epoch": 20.653705953827462, "grad_norm": 0.7096823453903198, "learning_rate": 2.5518000000000002e-05, "loss": 0.0282, "step": 8509 }, { "epoch": 20.65613608748481, "grad_norm": 0.4289025366306305, "learning_rate": 2.5521000000000002e-05, "loss": 0.0229, "step": 8510 }, { "epoch": 20.658566221142163, "grad_norm": 0.4154053032398224, "learning_rate": 2.5524e-05, "loss": 0.0264, "step": 8511 }, { "epoch": 20.660996354799515, "grad_norm": 0.4097908139228821, "learning_rate": 2.5527e-05, "loss": 0.0213, "step": 8512 }, { "epoch": 20.663426488456864, "grad_norm": 0.5484100580215454, "learning_rate": 2.553e-05, "loss": 0.0484, "step": 8513 }, { "epoch": 20.665856622114216, "grad_norm": 0.4751526713371277, "learning_rate": 2.5533e-05, "loss": 0.0286, "step": 8514 }, { "epoch": 20.668286755771568, "grad_norm": 0.5939993262290955, "learning_rate": 2.5536e-05, "loss": 0.0232, "step": 8515 }, { "epoch": 20.670716889428917, "grad_norm": 
0.562284529209137, "learning_rate": 2.5539e-05, "loss": 0.0279, "step": 8516 }, { "epoch": 20.67314702308627, "grad_norm": 0.3624751567840576, "learning_rate": 2.5542e-05, "loss": 0.0262, "step": 8517 }, { "epoch": 20.67557715674362, "grad_norm": 0.5132653713226318, "learning_rate": 2.5545e-05, "loss": 0.0308, "step": 8518 }, { "epoch": 20.678007290400974, "grad_norm": 0.39556726813316345, "learning_rate": 2.5548e-05, "loss": 0.0198, "step": 8519 }, { "epoch": 20.680437424058322, "grad_norm": 0.6665555238723755, "learning_rate": 2.5551e-05, "loss": 0.0349, "step": 8520 }, { "epoch": 20.682867557715674, "grad_norm": 0.5416889190673828, "learning_rate": 2.5554e-05, "loss": 0.0287, "step": 8521 }, { "epoch": 20.685297691373027, "grad_norm": 0.5141915082931519, "learning_rate": 2.5557e-05, "loss": 0.018, "step": 8522 }, { "epoch": 20.687727825030375, "grad_norm": 1.859905481338501, "learning_rate": 2.556e-05, "loss": 0.054, "step": 8523 }, { "epoch": 20.690157958687728, "grad_norm": 0.5590251088142395, "learning_rate": 2.5563e-05, "loss": 0.0207, "step": 8524 }, { "epoch": 20.69258809234508, "grad_norm": 0.5366235375404358, "learning_rate": 2.5566e-05, "loss": 0.0251, "step": 8525 }, { "epoch": 20.69501822600243, "grad_norm": 0.677337646484375, "learning_rate": 2.5569e-05, "loss": 0.0778, "step": 8526 }, { "epoch": 20.69744835965978, "grad_norm": 0.710175096988678, "learning_rate": 2.5572000000000002e-05, "loss": 0.0249, "step": 8527 }, { "epoch": 20.699878493317133, "grad_norm": 0.5394506454467773, "learning_rate": 2.5575e-05, "loss": 0.0227, "step": 8528 }, { "epoch": 20.702308626974485, "grad_norm": 0.7114033102989197, "learning_rate": 2.5578e-05, "loss": 0.0341, "step": 8529 }, { "epoch": 20.704738760631834, "grad_norm": 0.7690298557281494, "learning_rate": 2.5581e-05, "loss": 0.0347, "step": 8530 }, { "epoch": 20.707168894289186, "grad_norm": 0.35340145230293274, "learning_rate": 2.5584e-05, "loss": 0.0149, "step": 8531 }, { "epoch": 20.70959902794654, 
"grad_norm": 0.6503624320030212, "learning_rate": 2.5587e-05, "loss": 0.0253, "step": 8532 }, { "epoch": 20.712029161603887, "grad_norm": 0.6473593711853027, "learning_rate": 2.559e-05, "loss": 0.0221, "step": 8533 }, { "epoch": 20.71445929526124, "grad_norm": 1.9955765008926392, "learning_rate": 2.5593e-05, "loss": 0.0266, "step": 8534 }, { "epoch": 20.71688942891859, "grad_norm": 0.6240177750587463, "learning_rate": 2.5596e-05, "loss": 0.028, "step": 8535 }, { "epoch": 20.71931956257594, "grad_norm": 0.5667684674263, "learning_rate": 2.5599e-05, "loss": 0.0254, "step": 8536 }, { "epoch": 20.721749696233292, "grad_norm": 1.1916848421096802, "learning_rate": 2.5602000000000003e-05, "loss": 0.0363, "step": 8537 }, { "epoch": 20.724179829890645, "grad_norm": 5.9023823738098145, "learning_rate": 2.5605000000000003e-05, "loss": 0.0494, "step": 8538 }, { "epoch": 20.726609963547997, "grad_norm": 1.131132960319519, "learning_rate": 2.5608000000000003e-05, "loss": 0.0797, "step": 8539 }, { "epoch": 20.729040097205345, "grad_norm": 1.0105445384979248, "learning_rate": 2.5611000000000003e-05, "loss": 0.0581, "step": 8540 }, { "epoch": 20.731470230862698, "grad_norm": 1.233876347541809, "learning_rate": 2.5614000000000002e-05, "loss": 0.2421, "step": 8541 }, { "epoch": 20.73390036452005, "grad_norm": 0.7250312566757202, "learning_rate": 2.5617e-05, "loss": 0.1655, "step": 8542 }, { "epoch": 20.7363304981774, "grad_norm": 0.5515177845954895, "learning_rate": 2.562e-05, "loss": 0.1276, "step": 8543 }, { "epoch": 20.73876063183475, "grad_norm": 0.7801435589790344, "learning_rate": 2.5623e-05, "loss": 0.1224, "step": 8544 }, { "epoch": 20.741190765492103, "grad_norm": 0.6308315396308899, "learning_rate": 2.5625999999999998e-05, "loss": 0.1244, "step": 8545 }, { "epoch": 20.74362089914945, "grad_norm": 0.6552131772041321, "learning_rate": 2.5628999999999998e-05, "loss": 0.104, "step": 8546 }, { "epoch": 20.746051032806804, "grad_norm": 0.4789849519729614, "learning_rate": 
2.5632e-05, "loss": 0.0663, "step": 8547 }, { "epoch": 20.748481166464156, "grad_norm": 0.5171018838882446, "learning_rate": 2.5635e-05, "loss": 0.0589, "step": 8548 }, { "epoch": 20.75091130012151, "grad_norm": 0.3645718991756439, "learning_rate": 2.5638e-05, "loss": 0.0373, "step": 8549 }, { "epoch": 20.753341433778857, "grad_norm": 0.5287910103797913, "learning_rate": 2.5641e-05, "loss": 0.0618, "step": 8550 }, { "epoch": 20.75577156743621, "grad_norm": 0.3854106068611145, "learning_rate": 2.5644e-05, "loss": 0.0368, "step": 8551 }, { "epoch": 20.75820170109356, "grad_norm": 0.5689924359321594, "learning_rate": 2.5647e-05, "loss": 0.0452, "step": 8552 }, { "epoch": 20.76063183475091, "grad_norm": 0.39794620871543884, "learning_rate": 2.565e-05, "loss": 0.0382, "step": 8553 }, { "epoch": 20.763061968408262, "grad_norm": 0.40652310848236084, "learning_rate": 2.5653e-05, "loss": 0.0256, "step": 8554 }, { "epoch": 20.765492102065615, "grad_norm": 0.42608562111854553, "learning_rate": 2.5656e-05, "loss": 0.0487, "step": 8555 }, { "epoch": 20.767922235722963, "grad_norm": 0.5360633134841919, "learning_rate": 2.5659e-05, "loss": 0.0252, "step": 8556 }, { "epoch": 20.770352369380316, "grad_norm": 0.278642863035202, "learning_rate": 2.5662000000000003e-05, "loss": 0.0259, "step": 8557 }, { "epoch": 20.772782503037668, "grad_norm": 0.3441081941127777, "learning_rate": 2.5665000000000002e-05, "loss": 0.0168, "step": 8558 }, { "epoch": 20.775212636695016, "grad_norm": 0.397007554769516, "learning_rate": 2.5668000000000002e-05, "loss": 0.0256, "step": 8559 }, { "epoch": 20.77764277035237, "grad_norm": 0.5580462217330933, "learning_rate": 2.5671000000000002e-05, "loss": 0.0282, "step": 8560 }, { "epoch": 20.78007290400972, "grad_norm": 0.4610484540462494, "learning_rate": 2.5674000000000002e-05, "loss": 0.0259, "step": 8561 }, { "epoch": 20.782503037667073, "grad_norm": 0.36960649490356445, "learning_rate": 2.5677e-05, "loss": 0.027, "step": 8562 }, { "epoch": 
20.784933171324422, "grad_norm": 0.44637301564216614, "learning_rate": 2.568e-05, "loss": 0.0281, "step": 8563 }, { "epoch": 20.787363304981774, "grad_norm": 0.24565747380256653, "learning_rate": 2.5683e-05, "loss": 0.013, "step": 8564 }, { "epoch": 20.789793438639126, "grad_norm": 1.1625030040740967, "learning_rate": 2.5686e-05, "loss": 0.0326, "step": 8565 }, { "epoch": 20.792223572296475, "grad_norm": 0.4552002549171448, "learning_rate": 2.5688999999999997e-05, "loss": 0.0148, "step": 8566 }, { "epoch": 20.794653705953827, "grad_norm": 0.6725385785102844, "learning_rate": 2.5692e-05, "loss": 0.0204, "step": 8567 }, { "epoch": 20.79708383961118, "grad_norm": 0.482356458902359, "learning_rate": 2.5695e-05, "loss": 0.0216, "step": 8568 }, { "epoch": 20.799513973268528, "grad_norm": 0.4165922701358795, "learning_rate": 2.5698e-05, "loss": 0.0146, "step": 8569 }, { "epoch": 20.80194410692588, "grad_norm": 0.3729911148548126, "learning_rate": 2.5701e-05, "loss": 0.0164, "step": 8570 }, { "epoch": 20.804374240583233, "grad_norm": 0.7542480230331421, "learning_rate": 2.5704e-05, "loss": 0.0375, "step": 8571 }, { "epoch": 20.806804374240585, "grad_norm": 0.6781923770904541, "learning_rate": 2.5707e-05, "loss": 0.0215, "step": 8572 }, { "epoch": 20.809234507897933, "grad_norm": 0.3562571704387665, "learning_rate": 2.571e-05, "loss": 0.0177, "step": 8573 }, { "epoch": 20.811664641555286, "grad_norm": 1.053838849067688, "learning_rate": 2.5713e-05, "loss": 0.0915, "step": 8574 }, { "epoch": 20.814094775212638, "grad_norm": 0.6393757462501526, "learning_rate": 2.5716e-05, "loss": 0.0205, "step": 8575 }, { "epoch": 20.816524908869987, "grad_norm": 0.5306834578514099, "learning_rate": 2.5719e-05, "loss": 0.0205, "step": 8576 }, { "epoch": 20.81895504252734, "grad_norm": 0.3917033076286316, "learning_rate": 2.5722000000000002e-05, "loss": 0.0156, "step": 8577 }, { "epoch": 20.82138517618469, "grad_norm": 1.0005815029144287, "learning_rate": 2.5725000000000002e-05, "loss": 
0.0309, "step": 8578 }, { "epoch": 20.82381530984204, "grad_norm": 0.6336923837661743, "learning_rate": 2.5728e-05, "loss": 0.0371, "step": 8579 }, { "epoch": 20.826245443499392, "grad_norm": 0.6090542078018188, "learning_rate": 2.5731e-05, "loss": 0.0297, "step": 8580 }, { "epoch": 20.828675577156744, "grad_norm": 0.6133360266685486, "learning_rate": 2.5734e-05, "loss": 0.0314, "step": 8581 }, { "epoch": 20.831105710814096, "grad_norm": 1.2194370031356812, "learning_rate": 2.5737e-05, "loss": 0.0272, "step": 8582 }, { "epoch": 20.833535844471445, "grad_norm": 0.8934197425842285, "learning_rate": 2.574e-05, "loss": 0.0815, "step": 8583 }, { "epoch": 20.835965978128797, "grad_norm": 0.6792068481445312, "learning_rate": 2.5743e-05, "loss": 0.0355, "step": 8584 }, { "epoch": 20.83839611178615, "grad_norm": 0.8116639256477356, "learning_rate": 2.5746e-05, "loss": 0.038, "step": 8585 }, { "epoch": 20.8408262454435, "grad_norm": 1.0828194618225098, "learning_rate": 2.5749e-05, "loss": 0.0408, "step": 8586 }, { "epoch": 20.84325637910085, "grad_norm": 1.0493770837783813, "learning_rate": 2.5752000000000003e-05, "loss": 0.0462, "step": 8587 }, { "epoch": 20.845686512758203, "grad_norm": 1.0756052732467651, "learning_rate": 2.5755000000000003e-05, "loss": 0.0381, "step": 8588 }, { "epoch": 20.84811664641555, "grad_norm": 1.0631918907165527, "learning_rate": 2.5758000000000003e-05, "loss": 0.0459, "step": 8589 }, { "epoch": 20.850546780072904, "grad_norm": 1.9720218181610107, "learning_rate": 2.5761000000000003e-05, "loss": 0.0782, "step": 8590 }, { "epoch": 20.852976913730256, "grad_norm": 1.3978822231292725, "learning_rate": 2.5764e-05, "loss": 0.2657, "step": 8591 }, { "epoch": 20.855407047387608, "grad_norm": 0.7089318633079529, "learning_rate": 2.5767e-05, "loss": 0.155, "step": 8592 }, { "epoch": 20.857837181044957, "grad_norm": 0.6637341976165771, "learning_rate": 2.577e-05, "loss": 0.1594, "step": 8593 }, { "epoch": 20.86026731470231, "grad_norm": 0.7507243752479553, 
"learning_rate": 2.5773e-05, "loss": 0.1285, "step": 8594 }, { "epoch": 20.86269744835966, "grad_norm": 0.6649810671806335, "learning_rate": 2.5776e-05, "loss": 0.0935, "step": 8595 }, { "epoch": 20.86512758201701, "grad_norm": 0.6377226710319519, "learning_rate": 2.5779e-05, "loss": 0.1007, "step": 8596 }, { "epoch": 20.867557715674362, "grad_norm": 0.7752053737640381, "learning_rate": 2.5782e-05, "loss": 0.0949, "step": 8597 }, { "epoch": 20.869987849331714, "grad_norm": 0.652450442314148, "learning_rate": 2.5785e-05, "loss": 0.0691, "step": 8598 }, { "epoch": 20.872417982989063, "grad_norm": 0.6380149126052856, "learning_rate": 2.5788e-05, "loss": 0.0662, "step": 8599 }, { "epoch": 20.874848116646415, "grad_norm": 0.5387003421783447, "learning_rate": 2.5791e-05, "loss": 0.0431, "step": 8600 }, { "epoch": 20.877278250303767, "grad_norm": 0.5355325937271118, "learning_rate": 2.5794e-05, "loss": 0.0498, "step": 8601 }, { "epoch": 20.879708383961116, "grad_norm": 0.4812292158603668, "learning_rate": 2.5797e-05, "loss": 0.0386, "step": 8602 }, { "epoch": 20.88213851761847, "grad_norm": 0.5157501101493835, "learning_rate": 2.58e-05, "loss": 0.0531, "step": 8603 }, { "epoch": 20.88456865127582, "grad_norm": 0.6132616996765137, "learning_rate": 2.5803e-05, "loss": 0.046, "step": 8604 }, { "epoch": 20.886998784933173, "grad_norm": 0.3579166829586029, "learning_rate": 2.5806e-05, "loss": 0.0257, "step": 8605 }, { "epoch": 20.88942891859052, "grad_norm": 0.45151689648628235, "learning_rate": 2.5809e-05, "loss": 0.0307, "step": 8606 }, { "epoch": 20.891859052247874, "grad_norm": 0.9126431345939636, "learning_rate": 2.5812000000000003e-05, "loss": 0.0433, "step": 8607 }, { "epoch": 20.894289185905226, "grad_norm": 0.39111456274986267, "learning_rate": 2.5815000000000003e-05, "loss": 0.0367, "step": 8608 }, { "epoch": 20.896719319562575, "grad_norm": 0.48203369975090027, "learning_rate": 2.5818000000000003e-05, "loss": 0.0299, "step": 8609 }, { "epoch": 20.899149453219927, 
"grad_norm": 0.3701839745044708, "learning_rate": 2.5821000000000002e-05, "loss": 0.0246, "step": 8610 }, { "epoch": 20.90157958687728, "grad_norm": 0.8306729793548584, "learning_rate": 2.5824000000000002e-05, "loss": 0.0241, "step": 8611 }, { "epoch": 20.904009720534628, "grad_norm": 0.5613278150558472, "learning_rate": 2.5827000000000002e-05, "loss": 0.0305, "step": 8612 }, { "epoch": 20.90643985419198, "grad_norm": 0.6790593266487122, "learning_rate": 2.5830000000000002e-05, "loss": 0.0372, "step": 8613 }, { "epoch": 20.908869987849332, "grad_norm": 0.45695367455482483, "learning_rate": 2.5833e-05, "loss": 0.0243, "step": 8614 }, { "epoch": 20.911300121506684, "grad_norm": 1.2005963325500488, "learning_rate": 2.5835999999999998e-05, "loss": 0.0537, "step": 8615 }, { "epoch": 20.913730255164033, "grad_norm": 0.6401060819625854, "learning_rate": 2.5838999999999998e-05, "loss": 0.0374, "step": 8616 }, { "epoch": 20.916160388821385, "grad_norm": 0.4468563199043274, "learning_rate": 2.5842e-05, "loss": 0.0167, "step": 8617 }, { "epoch": 20.918590522478738, "grad_norm": 0.8590715527534485, "learning_rate": 2.5845e-05, "loss": 0.0512, "step": 8618 }, { "epoch": 20.921020656136086, "grad_norm": 0.5108348727226257, "learning_rate": 2.5848e-05, "loss": 0.0315, "step": 8619 }, { "epoch": 20.92345078979344, "grad_norm": 0.3374035358428955, "learning_rate": 2.5851e-05, "loss": 0.0199, "step": 8620 }, { "epoch": 20.92588092345079, "grad_norm": 0.5616416335105896, "learning_rate": 2.5854e-05, "loss": 0.0415, "step": 8621 }, { "epoch": 20.92831105710814, "grad_norm": 0.6773972511291504, "learning_rate": 2.5857e-05, "loss": 0.0235, "step": 8622 }, { "epoch": 20.93074119076549, "grad_norm": 0.3405913710594177, "learning_rate": 2.586e-05, "loss": 0.0207, "step": 8623 }, { "epoch": 20.933171324422844, "grad_norm": 0.6067806482315063, "learning_rate": 2.5863e-05, "loss": 0.0415, "step": 8624 }, { "epoch": 20.935601458080196, "grad_norm": 0.5750149488449097, "learning_rate": 
2.5866e-05, "loss": 0.031, "step": 8625 }, { "epoch": 20.938031591737545, "grad_norm": 0.4546016454696655, "learning_rate": 2.5869e-05, "loss": 0.0311, "step": 8626 }, { "epoch": 20.940461725394897, "grad_norm": 0.5351956486701965, "learning_rate": 2.5872000000000002e-05, "loss": 0.026, "step": 8627 }, { "epoch": 20.94289185905225, "grad_norm": 0.2785778045654297, "learning_rate": 2.5875000000000002e-05, "loss": 0.0137, "step": 8628 }, { "epoch": 20.945321992709598, "grad_norm": 0.5574973225593567, "learning_rate": 2.5878000000000002e-05, "loss": 0.0179, "step": 8629 }, { "epoch": 20.94775212636695, "grad_norm": 0.5580597519874573, "learning_rate": 2.5881000000000002e-05, "loss": 0.0206, "step": 8630 }, { "epoch": 20.950182260024302, "grad_norm": 0.6671426296234131, "learning_rate": 2.5884e-05, "loss": 0.0287, "step": 8631 }, { "epoch": 20.95261239368165, "grad_norm": 0.6353328227996826, "learning_rate": 2.5887e-05, "loss": 0.0273, "step": 8632 }, { "epoch": 20.955042527339003, "grad_norm": 0.5768901705741882, "learning_rate": 2.589e-05, "loss": 0.0221, "step": 8633 }, { "epoch": 20.957472660996356, "grad_norm": 0.7381832599639893, "learning_rate": 2.5893e-05, "loss": 0.0453, "step": 8634 }, { "epoch": 20.959902794653708, "grad_norm": 0.9497979283332825, "learning_rate": 2.5896e-05, "loss": 0.0444, "step": 8635 }, { "epoch": 20.962332928311056, "grad_norm": 0.8057687282562256, "learning_rate": 2.5899e-05, "loss": 0.0363, "step": 8636 }, { "epoch": 20.96476306196841, "grad_norm": 0.8528950214385986, "learning_rate": 2.5902e-05, "loss": 0.0445, "step": 8637 }, { "epoch": 20.96719319562576, "grad_norm": 1.0738869905471802, "learning_rate": 2.5905000000000004e-05, "loss": 0.0484, "step": 8638 }, { "epoch": 20.96962332928311, "grad_norm": 1.250295639038086, "learning_rate": 2.5908000000000003e-05, "loss": 0.0389, "step": 8639 }, { "epoch": 20.972053462940462, "grad_norm": 0.8448507785797119, "learning_rate": 2.5911e-05, "loss": 0.0463, "step": 8640 }, { "epoch": 
20.974483596597814, "grad_norm": 0.8989359140396118, "learning_rate": 2.5914e-05, "loss": 0.1687, "step": 8641 }, { "epoch": 20.976913730255163, "grad_norm": 0.7078938484191895, "learning_rate": 2.5917e-05, "loss": 0.0999, "step": 8642 }, { "epoch": 20.979343863912515, "grad_norm": 0.47016647458076477, "learning_rate": 2.592e-05, "loss": 0.0445, "step": 8643 }, { "epoch": 20.981773997569867, "grad_norm": 0.5528625249862671, "learning_rate": 2.5923e-05, "loss": 0.0446, "step": 8644 }, { "epoch": 20.984204131227216, "grad_norm": 0.8240121603012085, "learning_rate": 2.5926e-05, "loss": 0.0293, "step": 8645 }, { "epoch": 20.986634264884568, "grad_norm": 0.5727821588516235, "learning_rate": 2.5929e-05, "loss": 0.0215, "step": 8646 }, { "epoch": 20.98906439854192, "grad_norm": 0.4879133105278015, "learning_rate": 2.5932e-05, "loss": 0.0343, "step": 8647 }, { "epoch": 20.991494532199273, "grad_norm": 0.5432649254798889, "learning_rate": 2.5935e-05, "loss": 0.03, "step": 8648 }, { "epoch": 20.99392466585662, "grad_norm": 0.6900424361228943, "learning_rate": 2.5938e-05, "loss": 0.049, "step": 8649 }, { "epoch": 20.996354799513973, "grad_norm": 0.8761054873466492, "learning_rate": 2.5941e-05, "loss": 0.0359, "step": 8650 }, { "epoch": 20.998784933171326, "grad_norm": 1.1090493202209473, "learning_rate": 2.5944e-05, "loss": 0.0575, "step": 8651 }, { "epoch": 21.0, "grad_norm": 1.179013967514038, "learning_rate": 2.5947e-05, "loss": 0.0314, "step": 8652 }, { "epoch": 21.002430133657352, "grad_norm": 1.047216534614563, "learning_rate": 2.595e-05, "loss": 0.226, "step": 8653 }, { "epoch": 21.0048602673147, "grad_norm": 0.5640441179275513, "learning_rate": 2.5953e-05, "loss": 0.1605, "step": 8654 }, { "epoch": 21.007290400972053, "grad_norm": 0.7148782014846802, "learning_rate": 2.5956e-05, "loss": 0.1433, "step": 8655 }, { "epoch": 21.009720534629405, "grad_norm": 0.9124997854232788, "learning_rate": 2.5959e-05, "loss": 0.1109, "step": 8656 }, { "epoch": 21.012150668286754, 
"grad_norm": 1.1448801755905151, "learning_rate": 2.5962e-05, "loss": 0.1302, "step": 8657 }, { "epoch": 21.014580801944106, "grad_norm": 0.503182590007782, "learning_rate": 2.5965000000000003e-05, "loss": 0.0771, "step": 8658 }, { "epoch": 21.01701093560146, "grad_norm": 0.4805222153663635, "learning_rate": 2.5968000000000003e-05, "loss": 0.0664, "step": 8659 }, { "epoch": 21.01944106925881, "grad_norm": 0.7720831632614136, "learning_rate": 2.5971000000000003e-05, "loss": 0.0778, "step": 8660 }, { "epoch": 21.02187120291616, "grad_norm": 0.6488167643547058, "learning_rate": 2.5974000000000002e-05, "loss": 0.0474, "step": 8661 }, { "epoch": 21.02430133657351, "grad_norm": 0.6616986989974976, "learning_rate": 2.5977000000000002e-05, "loss": 0.0418, "step": 8662 }, { "epoch": 21.026731470230864, "grad_norm": 0.4310075044631958, "learning_rate": 2.5980000000000002e-05, "loss": 0.0328, "step": 8663 }, { "epoch": 21.029161603888213, "grad_norm": 1.0166341066360474, "learning_rate": 2.5983000000000002e-05, "loss": 0.0498, "step": 8664 }, { "epoch": 21.031591737545565, "grad_norm": 0.563409149646759, "learning_rate": 2.5985999999999998e-05, "loss": 0.0349, "step": 8665 }, { "epoch": 21.034021871202917, "grad_norm": 0.42990842461586, "learning_rate": 2.5988999999999998e-05, "loss": 0.0242, "step": 8666 }, { "epoch": 21.036452004860266, "grad_norm": 0.5568361282348633, "learning_rate": 2.5991999999999998e-05, "loss": 0.0356, "step": 8667 }, { "epoch": 21.038882138517618, "grad_norm": 0.4061245322227478, "learning_rate": 2.5995e-05, "loss": 0.02, "step": 8668 }, { "epoch": 21.04131227217497, "grad_norm": 0.463795930147171, "learning_rate": 2.5998e-05, "loss": 0.0307, "step": 8669 }, { "epoch": 21.043742405832322, "grad_norm": 0.5790329575538635, "learning_rate": 2.6001e-05, "loss": 0.0536, "step": 8670 }, { "epoch": 21.04617253948967, "grad_norm": 0.5208329558372498, "learning_rate": 2.6004e-05, "loss": 0.0235, "step": 8671 }, { "epoch": 21.048602673147023, "grad_norm": 
0.6232632398605347, "learning_rate": 2.6007e-05, "loss": 0.0207, "step": 8672 }, { "epoch": 21.051032806804375, "grad_norm": 0.42016658186912537, "learning_rate": 2.601e-05, "loss": 0.019, "step": 8673 }, { "epoch": 21.053462940461724, "grad_norm": 0.713867723941803, "learning_rate": 2.6013e-05, "loss": 0.0244, "step": 8674 }, { "epoch": 21.055893074119076, "grad_norm": 0.48059430718421936, "learning_rate": 2.6016e-05, "loss": 0.0284, "step": 8675 }, { "epoch": 21.05832320777643, "grad_norm": 0.4625176191329956, "learning_rate": 2.6019e-05, "loss": 0.0275, "step": 8676 }, { "epoch": 21.060753341433777, "grad_norm": 0.3665856420993805, "learning_rate": 2.6022e-05, "loss": 0.0185, "step": 8677 }, { "epoch": 21.06318347509113, "grad_norm": 0.6172884106636047, "learning_rate": 2.6025000000000002e-05, "loss": 0.024, "step": 8678 }, { "epoch": 21.06561360874848, "grad_norm": 0.48159000277519226, "learning_rate": 2.6028000000000002e-05, "loss": 0.0248, "step": 8679 }, { "epoch": 21.068043742405834, "grad_norm": 0.3943046033382416, "learning_rate": 2.6031000000000002e-05, "loss": 0.0194, "step": 8680 }, { "epoch": 21.070473876063183, "grad_norm": 0.7158656716346741, "learning_rate": 2.6034000000000002e-05, "loss": 0.0286, "step": 8681 }, { "epoch": 21.072904009720535, "grad_norm": 0.4490000307559967, "learning_rate": 2.6037e-05, "loss": 0.0205, "step": 8682 }, { "epoch": 21.075334143377887, "grad_norm": 0.3998022973537445, "learning_rate": 2.604e-05, "loss": 0.0114, "step": 8683 }, { "epoch": 21.077764277035236, "grad_norm": 0.8712895512580872, "learning_rate": 2.6043e-05, "loss": 0.0229, "step": 8684 }, { "epoch": 21.080194410692588, "grad_norm": 0.5571554899215698, "learning_rate": 2.6046e-05, "loss": 0.0172, "step": 8685 }, { "epoch": 21.08262454434994, "grad_norm": 0.7425393462181091, "learning_rate": 2.6049e-05, "loss": 0.0233, "step": 8686 }, { "epoch": 21.08505467800729, "grad_norm": 0.40640905499458313, "learning_rate": 2.6052e-05, "loss": 0.0177, "step": 8687 }, { 
"epoch": 21.08748481166464, "grad_norm": 0.43605154752731323, "learning_rate": 2.6055000000000004e-05, "loss": 0.0164, "step": 8688 }, { "epoch": 21.089914945321993, "grad_norm": 0.7626784443855286, "learning_rate": 2.6058e-05, "loss": 0.0203, "step": 8689 }, { "epoch": 21.092345078979346, "grad_norm": 0.5615953207015991, "learning_rate": 2.6061e-05, "loss": 0.0148, "step": 8690 }, { "epoch": 21.094775212636694, "grad_norm": 0.5508103370666504, "learning_rate": 2.6064e-05, "loss": 0.0223, "step": 8691 }, { "epoch": 21.097205346294047, "grad_norm": 0.4680829346179962, "learning_rate": 2.6067e-05, "loss": 0.0195, "step": 8692 }, { "epoch": 21.0996354799514, "grad_norm": 0.33229267597198486, "learning_rate": 2.607e-05, "loss": 0.0112, "step": 8693 }, { "epoch": 21.102065613608747, "grad_norm": 1.0723010301589966, "learning_rate": 2.6073e-05, "loss": 0.0337, "step": 8694 }, { "epoch": 21.1044957472661, "grad_norm": 0.6356741189956665, "learning_rate": 2.6076e-05, "loss": 0.0394, "step": 8695 }, { "epoch": 21.106925880923452, "grad_norm": 0.7509401440620422, "learning_rate": 2.6079e-05, "loss": 0.0664, "step": 8696 }, { "epoch": 21.1093560145808, "grad_norm": 0.43733081221580505, "learning_rate": 2.6082e-05, "loss": 0.0173, "step": 8697 }, { "epoch": 21.111786148238153, "grad_norm": 0.5679511427879333, "learning_rate": 2.6085000000000002e-05, "loss": 0.0272, "step": 8698 }, { "epoch": 21.114216281895505, "grad_norm": 0.7064818143844604, "learning_rate": 2.6088e-05, "loss": 0.0289, "step": 8699 }, { "epoch": 21.116646415552854, "grad_norm": 0.8303583860397339, "learning_rate": 2.6091e-05, "loss": 0.0449, "step": 8700 }, { "epoch": 21.119076549210206, "grad_norm": 1.1424957513809204, "learning_rate": 2.6094e-05, "loss": 0.081, "step": 8701 }, { "epoch": 21.121506682867558, "grad_norm": 0.8533111810684204, "learning_rate": 2.6097e-05, "loss": 0.0219, "step": 8702 }, { "epoch": 21.12393681652491, "grad_norm": 0.7413297891616821, "learning_rate": 2.61e-05, "loss": 0.2216, 
"step": 8703 }, { "epoch": 21.12636695018226, "grad_norm": 0.5960639119148254, "learning_rate": 2.6103e-05, "loss": 0.1921, "step": 8704 }, { "epoch": 21.12879708383961, "grad_norm": 0.4392397701740265, "learning_rate": 2.6106e-05, "loss": 0.1134, "step": 8705 }, { "epoch": 21.131227217496964, "grad_norm": 0.5534359216690063, "learning_rate": 2.6109e-05, "loss": 0.0955, "step": 8706 }, { "epoch": 21.133657351154312, "grad_norm": 0.5692007541656494, "learning_rate": 2.6112e-05, "loss": 0.0794, "step": 8707 }, { "epoch": 21.136087484811664, "grad_norm": 0.6170440316200256, "learning_rate": 2.6115000000000003e-05, "loss": 0.0774, "step": 8708 }, { "epoch": 21.138517618469017, "grad_norm": 0.4678882956504822, "learning_rate": 2.6118000000000003e-05, "loss": 0.0683, "step": 8709 }, { "epoch": 21.140947752126365, "grad_norm": 0.5111761689186096, "learning_rate": 2.6121000000000003e-05, "loss": 0.0639, "step": 8710 }, { "epoch": 21.143377885783718, "grad_norm": 0.6444820165634155, "learning_rate": 2.6124000000000003e-05, "loss": 0.0581, "step": 8711 }, { "epoch": 21.14580801944107, "grad_norm": 0.5040888786315918, "learning_rate": 2.6127000000000002e-05, "loss": 0.0424, "step": 8712 }, { "epoch": 21.148238153098422, "grad_norm": 0.5395974516868591, "learning_rate": 2.6130000000000002e-05, "loss": 0.0349, "step": 8713 }, { "epoch": 21.15066828675577, "grad_norm": 0.6409444212913513, "learning_rate": 2.6133e-05, "loss": 0.0321, "step": 8714 }, { "epoch": 21.153098420413123, "grad_norm": 0.39859965443611145, "learning_rate": 2.6136e-05, "loss": 0.0272, "step": 8715 }, { "epoch": 21.155528554070475, "grad_norm": 0.3194633722305298, "learning_rate": 2.6138999999999998e-05, "loss": 0.0317, "step": 8716 }, { "epoch": 21.157958687727824, "grad_norm": 0.5307056307792664, "learning_rate": 2.6141999999999998e-05, "loss": 0.0271, "step": 8717 }, { "epoch": 21.160388821385176, "grad_norm": 0.3618778586387634, "learning_rate": 2.6145e-05, "loss": 0.031, "step": 8718 }, { "epoch": 
21.16281895504253, "grad_norm": 1.3113096952438354, "learning_rate": 2.6148e-05, "loss": 0.0349, "step": 8719 }, { "epoch": 21.165249088699877, "grad_norm": 0.5673553943634033, "learning_rate": 2.6151e-05, "loss": 0.0293, "step": 8720 }, { "epoch": 21.16767922235723, "grad_norm": 0.3303031623363495, "learning_rate": 2.6154e-05, "loss": 0.0129, "step": 8721 }, { "epoch": 21.17010935601458, "grad_norm": 0.49027219414711, "learning_rate": 2.6157e-05, "loss": 0.0277, "step": 8722 }, { "epoch": 21.172539489671934, "grad_norm": 0.2945215404033661, "learning_rate": 2.616e-05, "loss": 0.0239, "step": 8723 }, { "epoch": 21.174969623329282, "grad_norm": 0.4649626910686493, "learning_rate": 2.6163e-05, "loss": 0.0264, "step": 8724 }, { "epoch": 21.177399756986635, "grad_norm": 0.6161024570465088, "learning_rate": 2.6166e-05, "loss": 0.067, "step": 8725 }, { "epoch": 21.179829890643987, "grad_norm": 0.22235223650932312, "learning_rate": 2.6169e-05, "loss": 0.0204, "step": 8726 }, { "epoch": 21.182260024301335, "grad_norm": 0.42879074811935425, "learning_rate": 2.6172e-05, "loss": 0.0237, "step": 8727 }, { "epoch": 21.184690157958688, "grad_norm": 0.7068672776222229, "learning_rate": 2.6175000000000003e-05, "loss": 0.0431, "step": 8728 }, { "epoch": 21.18712029161604, "grad_norm": 0.1768544465303421, "learning_rate": 2.6178000000000002e-05, "loss": 0.012, "step": 8729 }, { "epoch": 21.18955042527339, "grad_norm": 0.6994725465774536, "learning_rate": 2.6181000000000002e-05, "loss": 0.0281, "step": 8730 }, { "epoch": 21.19198055893074, "grad_norm": 1.26518714427948, "learning_rate": 2.6184000000000002e-05, "loss": 0.0455, "step": 8731 }, { "epoch": 21.194410692588093, "grad_norm": 0.3359588384628296, "learning_rate": 2.6187000000000002e-05, "loss": 0.0217, "step": 8732 }, { "epoch": 21.19684082624544, "grad_norm": 0.44036728143692017, "learning_rate": 2.619e-05, "loss": 0.03, "step": 8733 }, { "epoch": 21.199270959902794, "grad_norm": 0.5361719131469727, "learning_rate": 
2.6193e-05, "loss": 0.016, "step": 8734 }, { "epoch": 21.201701093560146, "grad_norm": 0.34589076042175293, "learning_rate": 2.6196e-05, "loss": 0.0175, "step": 8735 }, { "epoch": 21.2041312272175, "grad_norm": 0.46071338653564453, "learning_rate": 2.6199e-05, "loss": 0.0289, "step": 8736 }, { "epoch": 21.206561360874847, "grad_norm": 0.6681661009788513, "learning_rate": 2.6202e-05, "loss": 0.0192, "step": 8737 }, { "epoch": 21.2089914945322, "grad_norm": 0.31452926993370056, "learning_rate": 2.6205e-05, "loss": 0.0121, "step": 8738 }, { "epoch": 21.21142162818955, "grad_norm": 0.4302792251110077, "learning_rate": 2.6208e-05, "loss": 0.0114, "step": 8739 }, { "epoch": 21.2138517618469, "grad_norm": 0.5475883483886719, "learning_rate": 2.6211e-05, "loss": 0.0154, "step": 8740 }, { "epoch": 21.216281895504252, "grad_norm": 0.3734358251094818, "learning_rate": 2.6214e-05, "loss": 0.0172, "step": 8741 }, { "epoch": 21.218712029161605, "grad_norm": 0.8146844506263733, "learning_rate": 2.6217e-05, "loss": 0.0271, "step": 8742 }, { "epoch": 21.221142162818953, "grad_norm": 0.9428277611732483, "learning_rate": 2.622e-05, "loss": 0.0375, "step": 8743 }, { "epoch": 21.223572296476306, "grad_norm": 0.7292643189430237, "learning_rate": 2.6223e-05, "loss": 0.0183, "step": 8744 }, { "epoch": 21.226002430133658, "grad_norm": 0.49221155047416687, "learning_rate": 2.6226e-05, "loss": 0.0198, "step": 8745 }, { "epoch": 21.22843256379101, "grad_norm": 0.69056236743927, "learning_rate": 2.6229e-05, "loss": 0.0315, "step": 8746 }, { "epoch": 21.23086269744836, "grad_norm": 1.394136667251587, "learning_rate": 2.6232e-05, "loss": 0.0288, "step": 8747 }, { "epoch": 21.23329283110571, "grad_norm": 1.3693219423294067, "learning_rate": 2.6235000000000002e-05, "loss": 0.0342, "step": 8748 }, { "epoch": 21.235722964763063, "grad_norm": 0.6261851787567139, "learning_rate": 2.6238000000000002e-05, "loss": 0.0291, "step": 8749 }, { "epoch": 21.238153098420412, "grad_norm": 1.7145041227340698, 
"learning_rate": 2.6241e-05, "loss": 0.0524, "step": 8750 }, { "epoch": 21.240583232077764, "grad_norm": 0.8724861741065979, "learning_rate": 2.6244e-05, "loss": 0.0218, "step": 8751 }, { "epoch": 21.243013365735116, "grad_norm": 1.969539761543274, "learning_rate": 2.6247e-05, "loss": 0.1771, "step": 8752 }, { "epoch": 21.245443499392465, "grad_norm": 0.6935800313949585, "learning_rate": 2.625e-05, "loss": 0.2105, "step": 8753 }, { "epoch": 21.247873633049817, "grad_norm": 0.5346603393554688, "learning_rate": 2.6253e-05, "loss": 0.1134, "step": 8754 }, { "epoch": 21.25030376670717, "grad_norm": 0.7539044618606567, "learning_rate": 2.6256e-05, "loss": 0.1071, "step": 8755 }, { "epoch": 21.25273390036452, "grad_norm": 0.8745567202568054, "learning_rate": 2.6259e-05, "loss": 0.1615, "step": 8756 }, { "epoch": 21.25516403402187, "grad_norm": 0.7071897387504578, "learning_rate": 2.6262e-05, "loss": 0.112, "step": 8757 }, { "epoch": 21.257594167679223, "grad_norm": 0.5810978412628174, "learning_rate": 2.6265e-05, "loss": 0.0856, "step": 8758 }, { "epoch": 21.260024301336575, "grad_norm": 0.5567011833190918, "learning_rate": 2.6268000000000003e-05, "loss": 0.0708, "step": 8759 }, { "epoch": 21.262454434993924, "grad_norm": 0.7769723534584045, "learning_rate": 2.6271000000000003e-05, "loss": 0.0622, "step": 8760 }, { "epoch": 21.264884568651276, "grad_norm": 0.9520085453987122, "learning_rate": 2.6274000000000003e-05, "loss": 0.0581, "step": 8761 }, { "epoch": 21.267314702308628, "grad_norm": 0.7809876799583435, "learning_rate": 2.6277000000000003e-05, "loss": 0.0527, "step": 8762 }, { "epoch": 21.269744835965977, "grad_norm": 0.3407071530818939, "learning_rate": 2.628e-05, "loss": 0.0366, "step": 8763 }, { "epoch": 21.27217496962333, "grad_norm": 0.558090090751648, "learning_rate": 2.6283e-05, "loss": 0.0287, "step": 8764 }, { "epoch": 21.27460510328068, "grad_norm": 0.44126468896865845, "learning_rate": 2.6286e-05, "loss": 0.0283, "step": 8765 }, { "epoch": 
21.277035236938033, "grad_norm": 0.6052604913711548, "learning_rate": 2.6289e-05, "loss": 0.0375, "step": 8766 }, { "epoch": 21.279465370595382, "grad_norm": 1.140924334526062, "learning_rate": 2.6292e-05, "loss": 0.0276, "step": 8767 }, { "epoch": 21.281895504252734, "grad_norm": 0.3668643534183502, "learning_rate": 2.6294999999999998e-05, "loss": 0.0208, "step": 8768 }, { "epoch": 21.284325637910086, "grad_norm": 0.3165355920791626, "learning_rate": 2.6298e-05, "loss": 0.0213, "step": 8769 }, { "epoch": 21.286755771567435, "grad_norm": 0.32542136311531067, "learning_rate": 2.6301e-05, "loss": 0.0241, "step": 8770 }, { "epoch": 21.289185905224787, "grad_norm": 0.3860298991203308, "learning_rate": 2.6304e-05, "loss": 0.0309, "step": 8771 }, { "epoch": 21.29161603888214, "grad_norm": 0.7050008773803711, "learning_rate": 2.6307e-05, "loss": 0.0368, "step": 8772 }, { "epoch": 21.29404617253949, "grad_norm": 0.4092705249786377, "learning_rate": 2.631e-05, "loss": 0.0138, "step": 8773 }, { "epoch": 21.29647630619684, "grad_norm": 0.4667247235774994, "learning_rate": 2.6313e-05, "loss": 0.017, "step": 8774 }, { "epoch": 21.298906439854193, "grad_norm": 0.7392244338989258, "learning_rate": 2.6316e-05, "loss": 0.0162, "step": 8775 }, { "epoch": 21.30133657351154, "grad_norm": 0.3977757394313812, "learning_rate": 2.6319e-05, "loss": 0.0099, "step": 8776 }, { "epoch": 21.303766707168894, "grad_norm": 0.45436009764671326, "learning_rate": 2.6322e-05, "loss": 0.0179, "step": 8777 }, { "epoch": 21.306196840826246, "grad_norm": 0.29092705249786377, "learning_rate": 2.6325e-05, "loss": 0.0139, "step": 8778 }, { "epoch": 21.308626974483598, "grad_norm": 0.632591962814331, "learning_rate": 2.6328000000000003e-05, "loss": 0.0364, "step": 8779 }, { "epoch": 21.311057108140947, "grad_norm": 0.52787184715271, "learning_rate": 2.6331000000000003e-05, "loss": 0.0411, "step": 8780 }, { "epoch": 21.3134872417983, "grad_norm": 0.2565099000930786, "learning_rate": 2.6334000000000002e-05, 
"loss": 0.008, "step": 8781 }, { "epoch": 21.31591737545565, "grad_norm": 0.6590870022773743, "learning_rate": 2.6337000000000002e-05, "loss": 0.0175, "step": 8782 }, { "epoch": 21.318347509113, "grad_norm": 0.5421954989433289, "learning_rate": 2.6340000000000002e-05, "loss": 0.0291, "step": 8783 }, { "epoch": 21.320777642770352, "grad_norm": 0.8391918540000916, "learning_rate": 2.6343000000000002e-05, "loss": 0.0358, "step": 8784 }, { "epoch": 21.323207776427704, "grad_norm": 0.3431958258152008, "learning_rate": 2.6346e-05, "loss": 0.018, "step": 8785 }, { "epoch": 21.325637910085053, "grad_norm": 0.46837350726127625, "learning_rate": 2.6349e-05, "loss": 0.0176, "step": 8786 }, { "epoch": 21.328068043742405, "grad_norm": 0.6247471570968628, "learning_rate": 2.6351999999999998e-05, "loss": 0.0698, "step": 8787 }, { "epoch": 21.330498177399758, "grad_norm": 0.6452547311782837, "learning_rate": 2.6354999999999998e-05, "loss": 0.037, "step": 8788 }, { "epoch": 21.33292831105711, "grad_norm": 0.5415076613426208, "learning_rate": 2.6358e-05, "loss": 0.024, "step": 8789 }, { "epoch": 21.33535844471446, "grad_norm": 0.8960994482040405, "learning_rate": 2.6361e-05, "loss": 0.0269, "step": 8790 }, { "epoch": 21.33778857837181, "grad_norm": 0.5149415731430054, "learning_rate": 2.6364e-05, "loss": 0.0196, "step": 8791 }, { "epoch": 21.340218712029163, "grad_norm": 0.8056175708770752, "learning_rate": 2.6367e-05, "loss": 0.0264, "step": 8792 }, { "epoch": 21.34264884568651, "grad_norm": 0.44282734394073486, "learning_rate": 2.637e-05, "loss": 0.0254, "step": 8793 }, { "epoch": 21.345078979343864, "grad_norm": 0.7120469808578491, "learning_rate": 2.6373e-05, "loss": 0.0277, "step": 8794 }, { "epoch": 21.347509113001216, "grad_norm": 0.773059606552124, "learning_rate": 2.6376e-05, "loss": 0.0312, "step": 8795 }, { "epoch": 21.349939246658565, "grad_norm": 1.057259440422058, "learning_rate": 2.6379e-05, "loss": 0.0373, "step": 8796 }, { "epoch": 21.352369380315917, "grad_norm": 
0.5717472434043884, "learning_rate": 2.6382e-05, "loss": 0.0193, "step": 8797 }, { "epoch": 21.35479951397327, "grad_norm": 0.4670064151287079, "learning_rate": 2.6385e-05, "loss": 0.0238, "step": 8798 }, { "epoch": 21.35722964763062, "grad_norm": 0.33922910690307617, "learning_rate": 2.6388000000000002e-05, "loss": 0.0163, "step": 8799 }, { "epoch": 21.35965978128797, "grad_norm": 0.6915819644927979, "learning_rate": 2.6391000000000002e-05, "loss": 0.0214, "step": 8800 }, { "epoch": 21.362089914945322, "grad_norm": 0.8649219870567322, "learning_rate": 2.6394000000000002e-05, "loss": 0.0353, "step": 8801 }, { "epoch": 21.364520048602675, "grad_norm": 3.0740697383880615, "learning_rate": 2.6397e-05, "loss": 0.0812, "step": 8802 }, { "epoch": 21.366950182260023, "grad_norm": 0.7894485592842102, "learning_rate": 2.64e-05, "loss": 0.1926, "step": 8803 }, { "epoch": 21.369380315917375, "grad_norm": 0.6947154402732849, "learning_rate": 2.6403e-05, "loss": 0.1243, "step": 8804 }, { "epoch": 21.371810449574728, "grad_norm": 0.5909550189971924, "learning_rate": 2.6406e-05, "loss": 0.1139, "step": 8805 }, { "epoch": 21.374240583232076, "grad_norm": 0.6680501103401184, "learning_rate": 2.6409e-05, "loss": 0.1214, "step": 8806 }, { "epoch": 21.37667071688943, "grad_norm": 0.7031213045120239, "learning_rate": 2.6412e-05, "loss": 0.0941, "step": 8807 }, { "epoch": 21.37910085054678, "grad_norm": 0.6466382741928101, "learning_rate": 2.6415e-05, "loss": 0.0851, "step": 8808 }, { "epoch": 21.381530984204133, "grad_norm": 0.440460205078125, "learning_rate": 2.6418000000000004e-05, "loss": 0.0768, "step": 8809 }, { "epoch": 21.38396111786148, "grad_norm": 0.5891575217247009, "learning_rate": 2.6421000000000003e-05, "loss": 0.0582, "step": 8810 }, { "epoch": 21.386391251518834, "grad_norm": 0.5906314253807068, "learning_rate": 2.6424000000000003e-05, "loss": 0.0708, "step": 8811 }, { "epoch": 21.388821385176186, "grad_norm": 0.35045576095581055, "learning_rate": 2.6427e-05, "loss": 
0.0269, "step": 8812 }, { "epoch": 21.391251518833535, "grad_norm": 0.6045961976051331, "learning_rate": 2.643e-05, "loss": 0.0439, "step": 8813 }, { "epoch": 21.393681652490887, "grad_norm": 0.463703989982605, "learning_rate": 2.6433e-05, "loss": 0.0358, "step": 8814 }, { "epoch": 21.39611178614824, "grad_norm": 0.5346675515174866, "learning_rate": 2.6436e-05, "loss": 0.0399, "step": 8815 }, { "epoch": 21.398541919805588, "grad_norm": 0.3435998260974884, "learning_rate": 2.6439e-05, "loss": 0.0302, "step": 8816 }, { "epoch": 21.40097205346294, "grad_norm": 0.3924773037433624, "learning_rate": 2.6442e-05, "loss": 0.0201, "step": 8817 }, { "epoch": 21.403402187120292, "grad_norm": 0.45457327365875244, "learning_rate": 2.6445e-05, "loss": 0.047, "step": 8818 }, { "epoch": 21.40583232077764, "grad_norm": 0.42222708463668823, "learning_rate": 2.6448e-05, "loss": 0.0256, "step": 8819 }, { "epoch": 21.408262454434993, "grad_norm": 0.4688030779361725, "learning_rate": 2.6451e-05, "loss": 0.0256, "step": 8820 }, { "epoch": 21.410692588092346, "grad_norm": 0.4715801775455475, "learning_rate": 2.6454e-05, "loss": 0.0287, "step": 8821 }, { "epoch": 21.413122721749698, "grad_norm": 0.4830717146396637, "learning_rate": 2.6457e-05, "loss": 0.0209, "step": 8822 }, { "epoch": 21.415552855407046, "grad_norm": 0.43308088183403015, "learning_rate": 2.646e-05, "loss": 0.0211, "step": 8823 }, { "epoch": 21.4179829890644, "grad_norm": 0.3247332274913788, "learning_rate": 2.6463e-05, "loss": 0.019, "step": 8824 }, { "epoch": 21.42041312272175, "grad_norm": 0.5197294354438782, "learning_rate": 2.6466e-05, "loss": 0.0188, "step": 8825 }, { "epoch": 21.4228432563791, "grad_norm": 0.7148971557617188, "learning_rate": 2.6469e-05, "loss": 0.0225, "step": 8826 }, { "epoch": 21.425273390036452, "grad_norm": 0.37961816787719727, "learning_rate": 2.6472e-05, "loss": 0.0156, "step": 8827 }, { "epoch": 21.427703523693804, "grad_norm": 0.40020832419395447, "learning_rate": 2.6475e-05, "loss": 0.0221, 
"step": 8828 }, { "epoch": 21.430133657351153, "grad_norm": 0.5807344913482666, "learning_rate": 2.6478000000000003e-05, "loss": 0.0196, "step": 8829 }, { "epoch": 21.432563791008505, "grad_norm": 0.6704575419425964, "learning_rate": 2.6481000000000003e-05, "loss": 0.0277, "step": 8830 }, { "epoch": 21.434993924665857, "grad_norm": 0.4406690299510956, "learning_rate": 2.6484000000000003e-05, "loss": 0.0221, "step": 8831 }, { "epoch": 21.43742405832321, "grad_norm": 0.5101807117462158, "learning_rate": 2.6487000000000002e-05, "loss": 0.0178, "step": 8832 }, { "epoch": 21.439854191980558, "grad_norm": 0.5849614143371582, "learning_rate": 2.6490000000000002e-05, "loss": 0.0175, "step": 8833 }, { "epoch": 21.44228432563791, "grad_norm": 0.5298033952713013, "learning_rate": 2.6493000000000002e-05, "loss": 0.0312, "step": 8834 }, { "epoch": 21.444714459295263, "grad_norm": 0.3958880305290222, "learning_rate": 2.6496000000000002e-05, "loss": 0.0167, "step": 8835 }, { "epoch": 21.44714459295261, "grad_norm": 0.4919697344303131, "learning_rate": 2.6499e-05, "loss": 0.0226, "step": 8836 }, { "epoch": 21.449574726609963, "grad_norm": 0.5229672789573669, "learning_rate": 2.6501999999999998e-05, "loss": 0.0249, "step": 8837 }, { "epoch": 21.452004860267316, "grad_norm": 0.6977758407592773, "learning_rate": 2.6504999999999998e-05, "loss": 0.0716, "step": 8838 }, { "epoch": 21.454434993924664, "grad_norm": 0.5834621787071228, "learning_rate": 2.6508e-05, "loss": 0.0289, "step": 8839 }, { "epoch": 21.456865127582017, "grad_norm": 0.4353491961956024, "learning_rate": 2.6511e-05, "loss": 0.0156, "step": 8840 }, { "epoch": 21.45929526123937, "grad_norm": 0.5694493055343628, "learning_rate": 2.6514e-05, "loss": 0.0199, "step": 8841 }, { "epoch": 21.46172539489672, "grad_norm": 1.0103462934494019, "learning_rate": 2.6517e-05, "loss": 0.0325, "step": 8842 }, { "epoch": 21.46415552855407, "grad_norm": 0.5552119612693787, "learning_rate": 2.652e-05, "loss": 0.0298, "step": 8843 }, { 
"epoch": 21.466585662211422, "grad_norm": 0.3246440589427948, "learning_rate": 2.6523e-05, "loss": 0.017, "step": 8844 }, { "epoch": 21.469015795868774, "grad_norm": 1.2527296543121338, "learning_rate": 2.6526e-05, "loss": 0.0294, "step": 8845 }, { "epoch": 21.471445929526123, "grad_norm": 0.8233214616775513, "learning_rate": 2.6529e-05, "loss": 0.0374, "step": 8846 }, { "epoch": 21.473876063183475, "grad_norm": 1.0079741477966309, "learning_rate": 2.6532e-05, "loss": 0.0295, "step": 8847 }, { "epoch": 21.476306196840827, "grad_norm": 1.2014570236206055, "learning_rate": 2.6535e-05, "loss": 0.0322, "step": 8848 }, { "epoch": 21.478736330498176, "grad_norm": 0.7033734917640686, "learning_rate": 2.6538000000000002e-05, "loss": 0.0239, "step": 8849 }, { "epoch": 21.481166464155528, "grad_norm": 1.618135929107666, "learning_rate": 2.6541000000000002e-05, "loss": 0.0749, "step": 8850 }, { "epoch": 21.48359659781288, "grad_norm": 1.3423815965652466, "learning_rate": 2.6544000000000002e-05, "loss": 0.0262, "step": 8851 }, { "epoch": 21.48602673147023, "grad_norm": 2.992962121963501, "learning_rate": 2.6547000000000002e-05, "loss": 0.0856, "step": 8852 }, { "epoch": 21.48845686512758, "grad_norm": 1.8675565719604492, "learning_rate": 2.655e-05, "loss": 0.2248, "step": 8853 }, { "epoch": 21.490886998784934, "grad_norm": 0.7065326571464539, "learning_rate": 2.6553e-05, "loss": 0.1486, "step": 8854 }, { "epoch": 21.493317132442286, "grad_norm": 0.47560253739356995, "learning_rate": 2.6556e-05, "loss": 0.1066, "step": 8855 }, { "epoch": 21.495747266099634, "grad_norm": 0.7397502064704895, "learning_rate": 2.6559e-05, "loss": 0.0994, "step": 8856 }, { "epoch": 21.498177399756987, "grad_norm": 0.8568607568740845, "learning_rate": 2.6562e-05, "loss": 0.1065, "step": 8857 }, { "epoch": 21.50060753341434, "grad_norm": 0.6537235975265503, "learning_rate": 2.6565e-05, "loss": 0.0842, "step": 8858 }, { "epoch": 21.503037667071688, "grad_norm": 0.48367202281951904, "learning_rate": 
2.6568000000000004e-05, "loss": 0.0618, "step": 8859 }, { "epoch": 21.50546780072904, "grad_norm": 0.4990442097187042, "learning_rate": 2.6571000000000004e-05, "loss": 0.0434, "step": 8860 }, { "epoch": 21.507897934386392, "grad_norm": 0.6113587021827698, "learning_rate": 2.6574e-05, "loss": 0.0538, "step": 8861 }, { "epoch": 21.51032806804374, "grad_norm": 0.7166478633880615, "learning_rate": 2.6577e-05, "loss": 0.0667, "step": 8862 }, { "epoch": 21.512758201701093, "grad_norm": 0.4910966753959656, "learning_rate": 2.658e-05, "loss": 0.0378, "step": 8863 }, { "epoch": 21.515188335358445, "grad_norm": 0.31095239520072937, "learning_rate": 2.6583e-05, "loss": 0.0332, "step": 8864 }, { "epoch": 21.517618469015797, "grad_norm": 0.3846375048160553, "learning_rate": 2.6586e-05, "loss": 0.0352, "step": 8865 }, { "epoch": 21.520048602673146, "grad_norm": 0.7425712943077087, "learning_rate": 2.6589e-05, "loss": 0.0582, "step": 8866 }, { "epoch": 21.5224787363305, "grad_norm": 0.469360888004303, "learning_rate": 2.6592e-05, "loss": 0.0286, "step": 8867 }, { "epoch": 21.52490886998785, "grad_norm": 0.2676602005958557, "learning_rate": 2.6595e-05, "loss": 0.0183, "step": 8868 }, { "epoch": 21.5273390036452, "grad_norm": 0.4187355041503906, "learning_rate": 2.6598000000000002e-05, "loss": 0.0277, "step": 8869 }, { "epoch": 21.52976913730255, "grad_norm": 0.44275394082069397, "learning_rate": 2.6601e-05, "loss": 0.0233, "step": 8870 }, { "epoch": 21.532199270959904, "grad_norm": 0.23657408356666565, "learning_rate": 2.6604e-05, "loss": 0.0176, "step": 8871 }, { "epoch": 21.534629404617252, "grad_norm": 0.3844781517982483, "learning_rate": 2.6607e-05, "loss": 0.0298, "step": 8872 }, { "epoch": 21.537059538274605, "grad_norm": 0.2313869595527649, "learning_rate": 2.661e-05, "loss": 0.0125, "step": 8873 }, { "epoch": 21.539489671931957, "grad_norm": 0.2777453064918518, "learning_rate": 2.6613e-05, "loss": 0.0243, "step": 8874 }, { "epoch": 21.54191980558931, "grad_norm": 
0.32181933522224426, "learning_rate": 2.6616e-05, "loss": 0.0148, "step": 8875 }, { "epoch": 21.544349939246658, "grad_norm": 0.4378986358642578, "learning_rate": 2.6619e-05, "loss": 0.0189, "step": 8876 }, { "epoch": 21.54678007290401, "grad_norm": 1.1299424171447754, "learning_rate": 2.6622e-05, "loss": 0.0298, "step": 8877 }, { "epoch": 21.549210206561362, "grad_norm": 0.6141592860221863, "learning_rate": 2.6625e-05, "loss": 0.0243, "step": 8878 }, { "epoch": 21.55164034021871, "grad_norm": 0.26943573355674744, "learning_rate": 2.6628e-05, "loss": 0.0134, "step": 8879 }, { "epoch": 21.554070473876063, "grad_norm": 0.42949625849723816, "learning_rate": 2.6631000000000003e-05, "loss": 0.0171, "step": 8880 }, { "epoch": 21.556500607533415, "grad_norm": 0.3092747926712036, "learning_rate": 2.6634000000000003e-05, "loss": 0.0165, "step": 8881 }, { "epoch": 21.558930741190764, "grad_norm": 0.37844833731651306, "learning_rate": 2.6637000000000003e-05, "loss": 0.0136, "step": 8882 }, { "epoch": 21.561360874848116, "grad_norm": 1.0939656496047974, "learning_rate": 2.6640000000000002e-05, "loss": 0.0264, "step": 8883 }, { "epoch": 21.56379100850547, "grad_norm": 0.8327828645706177, "learning_rate": 2.6643000000000002e-05, "loss": 0.0369, "step": 8884 }, { "epoch": 21.566221142162817, "grad_norm": 0.7392838001251221, "learning_rate": 2.6646000000000002e-05, "loss": 0.0194, "step": 8885 }, { "epoch": 21.56865127582017, "grad_norm": 0.8964910507202148, "learning_rate": 2.6649e-05, "loss": 0.0372, "step": 8886 }, { "epoch": 21.57108140947752, "grad_norm": 0.8118704557418823, "learning_rate": 2.6651999999999998e-05, "loss": 0.0274, "step": 8887 }, { "epoch": 21.573511543134874, "grad_norm": 0.3269222676753998, "learning_rate": 2.6654999999999998e-05, "loss": 0.015, "step": 8888 }, { "epoch": 21.575941676792223, "grad_norm": 0.9399827718734741, "learning_rate": 2.6657999999999998e-05, "loss": 0.0776, "step": 8889 }, { "epoch": 21.578371810449575, "grad_norm": 
0.25221994519233704, "learning_rate": 2.6661e-05, "loss": 0.013, "step": 8890 }, { "epoch": 21.580801944106927, "grad_norm": 0.717494547367096, "learning_rate": 2.6664e-05, "loss": 0.0269, "step": 8891 }, { "epoch": 21.583232077764276, "grad_norm": 0.45183229446411133, "learning_rate": 2.6667e-05, "loss": 0.0319, "step": 8892 }, { "epoch": 21.585662211421628, "grad_norm": 0.7109895348548889, "learning_rate": 2.667e-05, "loss": 0.0244, "step": 8893 }, { "epoch": 21.58809234507898, "grad_norm": 0.5128263831138611, "learning_rate": 2.6673e-05, "loss": 0.013, "step": 8894 }, { "epoch": 21.59052247873633, "grad_norm": 0.7232389450073242, "learning_rate": 2.6676e-05, "loss": 0.0292, "step": 8895 }, { "epoch": 21.59295261239368, "grad_norm": 0.7248286008834839, "learning_rate": 2.6679e-05, "loss": 0.0306, "step": 8896 }, { "epoch": 21.595382746051033, "grad_norm": 0.9483417868614197, "learning_rate": 2.6682e-05, "loss": 0.0341, "step": 8897 }, { "epoch": 21.597812879708385, "grad_norm": 1.3343651294708252, "learning_rate": 2.6685e-05, "loss": 0.0353, "step": 8898 }, { "epoch": 21.600243013365734, "grad_norm": 0.7469567060470581, "learning_rate": 2.6688e-05, "loss": 0.0297, "step": 8899 }, { "epoch": 21.602673147023086, "grad_norm": 0.9249729514122009, "learning_rate": 2.6691000000000002e-05, "loss": 0.0431, "step": 8900 }, { "epoch": 21.60510328068044, "grad_norm": 1.1489198207855225, "learning_rate": 2.6694000000000002e-05, "loss": 0.0274, "step": 8901 }, { "epoch": 21.607533414337787, "grad_norm": 1.1308220624923706, "learning_rate": 2.6697000000000002e-05, "loss": 0.0377, "step": 8902 }, { "epoch": 21.60996354799514, "grad_norm": 1.723168134689331, "learning_rate": 2.6700000000000002e-05, "loss": 0.3126, "step": 8903 }, { "epoch": 21.61239368165249, "grad_norm": 0.6869703531265259, "learning_rate": 2.6703e-05, "loss": 0.1753, "step": 8904 }, { "epoch": 21.61482381530984, "grad_norm": 0.8522907495498657, "learning_rate": 2.6706e-05, "loss": 0.1414, "step": 8905 }, { 
"epoch": 21.617253948967193, "grad_norm": 0.6537105441093445, "learning_rate": 2.6709e-05, "loss": 0.1318, "step": 8906 }, { "epoch": 21.619684082624545, "grad_norm": 0.630742073059082, "learning_rate": 2.6712e-05, "loss": 0.1087, "step": 8907 }, { "epoch": 21.622114216281897, "grad_norm": 0.6754755973815918, "learning_rate": 2.6715e-05, "loss": 0.0889, "step": 8908 }, { "epoch": 21.624544349939246, "grad_norm": 0.7334146499633789, "learning_rate": 2.6718e-05, "loss": 0.0886, "step": 8909 }, { "epoch": 21.626974483596598, "grad_norm": 0.4750928282737732, "learning_rate": 2.6721e-05, "loss": 0.059, "step": 8910 }, { "epoch": 21.62940461725395, "grad_norm": 0.6325929164886475, "learning_rate": 2.6724e-05, "loss": 0.074, "step": 8911 }, { "epoch": 21.6318347509113, "grad_norm": 0.5779300332069397, "learning_rate": 2.6727e-05, "loss": 0.0573, "step": 8912 }, { "epoch": 21.63426488456865, "grad_norm": 0.45807722210884094, "learning_rate": 2.673e-05, "loss": 0.0391, "step": 8913 }, { "epoch": 21.636695018226003, "grad_norm": 0.4070265293121338, "learning_rate": 2.6733e-05, "loss": 0.0381, "step": 8914 }, { "epoch": 21.639125151883352, "grad_norm": 0.8227746486663818, "learning_rate": 2.6736e-05, "loss": 0.0343, "step": 8915 }, { "epoch": 21.641555285540704, "grad_norm": 0.21424701809883118, "learning_rate": 2.6739e-05, "loss": 0.0113, "step": 8916 }, { "epoch": 21.643985419198057, "grad_norm": 0.41379478573799133, "learning_rate": 2.6742e-05, "loss": 0.0317, "step": 8917 }, { "epoch": 21.64641555285541, "grad_norm": 0.40762975811958313, "learning_rate": 2.6745e-05, "loss": 0.0354, "step": 8918 }, { "epoch": 21.648845686512757, "grad_norm": 0.5132858157157898, "learning_rate": 2.6748e-05, "loss": 0.0564, "step": 8919 }, { "epoch": 21.65127582017011, "grad_norm": 0.47608184814453125, "learning_rate": 2.6751000000000002e-05, "loss": 0.0286, "step": 8920 }, { "epoch": 21.653705953827462, "grad_norm": 0.32650521397590637, "learning_rate": 2.6754e-05, "loss": 0.0259, "step": 
8921 }, { "epoch": 21.65613608748481, "grad_norm": 0.4230479896068573, "learning_rate": 2.6757e-05, "loss": 0.0251, "step": 8922 }, { "epoch": 21.658566221142163, "grad_norm": 0.369822233915329, "learning_rate": 2.676e-05, "loss": 0.0234, "step": 8923 }, { "epoch": 21.660996354799515, "grad_norm": 0.823035717010498, "learning_rate": 2.6763e-05, "loss": 0.0253, "step": 8924 }, { "epoch": 21.663426488456864, "grad_norm": 0.5026037693023682, "learning_rate": 2.6766e-05, "loss": 0.0172, "step": 8925 }, { "epoch": 21.665856622114216, "grad_norm": 0.4080108106136322, "learning_rate": 2.6769e-05, "loss": 0.02, "step": 8926 }, { "epoch": 21.668286755771568, "grad_norm": 0.5640299320220947, "learning_rate": 2.6772e-05, "loss": 0.0283, "step": 8927 }, { "epoch": 21.670716889428917, "grad_norm": 0.2745102643966675, "learning_rate": 2.6775e-05, "loss": 0.0205, "step": 8928 }, { "epoch": 21.67314702308627, "grad_norm": 0.39803725481033325, "learning_rate": 2.6778e-05, "loss": 0.0174, "step": 8929 }, { "epoch": 21.67557715674362, "grad_norm": 0.6040463447570801, "learning_rate": 2.6781000000000003e-05, "loss": 0.0297, "step": 8930 }, { "epoch": 21.678007290400974, "grad_norm": 0.6220476627349854, "learning_rate": 2.6784000000000003e-05, "loss": 0.0198, "step": 8931 }, { "epoch": 21.680437424058322, "grad_norm": 0.3372099995613098, "learning_rate": 2.6787000000000003e-05, "loss": 0.0234, "step": 8932 }, { "epoch": 21.682867557715674, "grad_norm": 0.4250766932964325, "learning_rate": 2.6790000000000003e-05, "loss": 0.0316, "step": 8933 }, { "epoch": 21.685297691373027, "grad_norm": 0.8296391367912292, "learning_rate": 2.6793000000000002e-05, "loss": 0.036, "step": 8934 }, { "epoch": 21.687727825030375, "grad_norm": 0.466210275888443, "learning_rate": 2.6796e-05, "loss": 0.0231, "step": 8935 }, { "epoch": 21.690157958687728, "grad_norm": 0.5071157217025757, "learning_rate": 2.6799e-05, "loss": 0.0318, "step": 8936 }, { "epoch": 21.69258809234508, "grad_norm": 0.3840351104736328, 
"learning_rate": 2.6802e-05, "loss": 0.0187, "step": 8937 }, { "epoch": 21.69501822600243, "grad_norm": 0.6655980944633484, "learning_rate": 2.6805e-05, "loss": 0.0352, "step": 8938 }, { "epoch": 21.69744835965978, "grad_norm": 0.4207666516304016, "learning_rate": 2.6807999999999998e-05, "loss": 0.0213, "step": 8939 }, { "epoch": 21.699878493317133, "grad_norm": 1.117967128753662, "learning_rate": 2.6811e-05, "loss": 0.0911, "step": 8940 }, { "epoch": 21.702308626974485, "grad_norm": 0.4718795418739319, "learning_rate": 2.6814e-05, "loss": 0.0214, "step": 8941 }, { "epoch": 21.704738760631834, "grad_norm": 0.353760689496994, "learning_rate": 2.6817e-05, "loss": 0.0218, "step": 8942 }, { "epoch": 21.707168894289186, "grad_norm": 0.30516645312309265, "learning_rate": 2.682e-05, "loss": 0.0143, "step": 8943 }, { "epoch": 21.70959902794654, "grad_norm": 0.5145136713981628, "learning_rate": 2.6823e-05, "loss": 0.019, "step": 8944 }, { "epoch": 21.712029161603887, "grad_norm": 0.8474366664886475, "learning_rate": 2.6826e-05, "loss": 0.0333, "step": 8945 }, { "epoch": 21.71445929526124, "grad_norm": 0.9301607608795166, "learning_rate": 2.6829e-05, "loss": 0.0486, "step": 8946 }, { "epoch": 21.71688942891859, "grad_norm": 0.7454046010971069, "learning_rate": 2.6832e-05, "loss": 0.0294, "step": 8947 }, { "epoch": 21.71931956257594, "grad_norm": 0.619623064994812, "learning_rate": 2.6835e-05, "loss": 0.0215, "step": 8948 }, { "epoch": 21.721749696233292, "grad_norm": 0.5453692674636841, "learning_rate": 2.6838e-05, "loss": 0.0227, "step": 8949 }, { "epoch": 21.724179829890645, "grad_norm": 1.2791105508804321, "learning_rate": 2.6841000000000003e-05, "loss": 0.0556, "step": 8950 }, { "epoch": 21.726609963547997, "grad_norm": 0.6190446615219116, "learning_rate": 2.6844000000000003e-05, "loss": 0.0217, "step": 8951 }, { "epoch": 21.729040097205345, "grad_norm": 1.2501368522644043, "learning_rate": 2.6847000000000002e-05, "loss": 0.0414, "step": 8952 }, { "epoch": 
21.731470230862698, "grad_norm": 1.2670066356658936, "learning_rate": 2.6850000000000002e-05, "loss": 0.2175, "step": 8953 }, { "epoch": 21.73390036452005, "grad_norm": 0.6967774033546448, "learning_rate": 2.6853000000000002e-05, "loss": 0.155, "step": 8954 }, { "epoch": 21.7363304981774, "grad_norm": 0.6629874110221863, "learning_rate": 2.6856000000000002e-05, "loss": 0.1472, "step": 8955 }, { "epoch": 21.73876063183475, "grad_norm": 0.8865278959274292, "learning_rate": 2.6859e-05, "loss": 0.1331, "step": 8956 }, { "epoch": 21.741190765492103, "grad_norm": 0.658656656742096, "learning_rate": 2.6862e-05, "loss": 0.085, "step": 8957 }, { "epoch": 21.74362089914945, "grad_norm": 0.6686931848526001, "learning_rate": 2.6865e-05, "loss": 0.0849, "step": 8958 }, { "epoch": 21.746051032806804, "grad_norm": 0.5208371877670288, "learning_rate": 2.6867999999999998e-05, "loss": 0.06, "step": 8959 }, { "epoch": 21.748481166464156, "grad_norm": 0.7693209052085876, "learning_rate": 2.6871e-05, "loss": 0.0592, "step": 8960 }, { "epoch": 21.75091130012151, "grad_norm": 0.6995670795440674, "learning_rate": 2.6874e-05, "loss": 0.0991, "step": 8961 }, { "epoch": 21.753341433778857, "grad_norm": 0.5852293968200684, "learning_rate": 2.6877e-05, "loss": 0.0375, "step": 8962 }, { "epoch": 21.75577156743621, "grad_norm": 0.6045831441879272, "learning_rate": 2.688e-05, "loss": 0.0487, "step": 8963 }, { "epoch": 21.75820170109356, "grad_norm": 0.527851939201355, "learning_rate": 2.6883e-05, "loss": 0.0401, "step": 8964 }, { "epoch": 21.76063183475091, "grad_norm": 0.7087315917015076, "learning_rate": 2.6886e-05, "loss": 0.0369, "step": 8965 }, { "epoch": 21.763061968408262, "grad_norm": 0.3451478183269501, "learning_rate": 2.6889e-05, "loss": 0.0292, "step": 8966 }, { "epoch": 21.765492102065615, "grad_norm": 0.4634805917739868, "learning_rate": 2.6892e-05, "loss": 0.0266, "step": 8967 }, { "epoch": 21.767922235722963, "grad_norm": 0.3855695128440857, "learning_rate": 2.6895e-05, "loss": 
0.0246, "step": 8968 }, { "epoch": 21.770352369380316, "grad_norm": 0.4019244611263275, "learning_rate": 2.6898e-05, "loss": 0.0313, "step": 8969 }, { "epoch": 21.772782503037668, "grad_norm": 0.5271849632263184, "learning_rate": 2.6901000000000002e-05, "loss": 0.0267, "step": 8970 }, { "epoch": 21.775212636695016, "grad_norm": 0.44070377945899963, "learning_rate": 2.6904000000000002e-05, "loss": 0.0179, "step": 8971 }, { "epoch": 21.77764277035237, "grad_norm": 0.6150839924812317, "learning_rate": 2.6907000000000002e-05, "loss": 0.0214, "step": 8972 }, { "epoch": 21.78007290400972, "grad_norm": 0.3579559624195099, "learning_rate": 2.691e-05, "loss": 0.0263, "step": 8973 }, { "epoch": 21.782503037667073, "grad_norm": 0.6058394312858582, "learning_rate": 2.6913e-05, "loss": 0.0195, "step": 8974 }, { "epoch": 21.784933171324422, "grad_norm": 0.40669095516204834, "learning_rate": 2.6916e-05, "loss": 0.0239, "step": 8975 }, { "epoch": 21.787363304981774, "grad_norm": 0.5056467652320862, "learning_rate": 2.6919e-05, "loss": 0.0317, "step": 8976 }, { "epoch": 21.789793438639126, "grad_norm": 0.47385162115097046, "learning_rate": 2.6922e-05, "loss": 0.0354, "step": 8977 }, { "epoch": 21.792223572296475, "grad_norm": 0.4399922490119934, "learning_rate": 2.6925e-05, "loss": 0.034, "step": 8978 }, { "epoch": 21.794653705953827, "grad_norm": 0.5591046214103699, "learning_rate": 2.6928e-05, "loss": 0.0326, "step": 8979 }, { "epoch": 21.79708383961118, "grad_norm": 0.3178257346153259, "learning_rate": 2.6931000000000004e-05, "loss": 0.016, "step": 8980 }, { "epoch": 21.799513973268528, "grad_norm": 0.7605111002922058, "learning_rate": 2.6934000000000003e-05, "loss": 0.0348, "step": 8981 }, { "epoch": 21.80194410692588, "grad_norm": 0.40564221143722534, "learning_rate": 2.6937000000000003e-05, "loss": 0.0213, "step": 8982 }, { "epoch": 21.804374240583233, "grad_norm": 0.5672736167907715, "learning_rate": 2.6940000000000003e-05, "loss": 0.0297, "step": 8983 }, { "epoch": 
21.806804374240585, "grad_norm": 0.42542576789855957, "learning_rate": 2.6943e-05, "loss": 0.0178, "step": 8984 }, { "epoch": 21.809234507897933, "grad_norm": 0.6446245312690735, "learning_rate": 2.6946e-05, "loss": 0.02, "step": 8985 }, { "epoch": 21.811664641555286, "grad_norm": 0.3476441204547882, "learning_rate": 2.6949e-05, "loss": 0.0123, "step": 8986 }, { "epoch": 21.814094775212638, "grad_norm": 0.8355202674865723, "learning_rate": 2.6952e-05, "loss": 0.0343, "step": 8987 }, { "epoch": 21.816524908869987, "grad_norm": 0.4097154438495636, "learning_rate": 2.6955e-05, "loss": 0.0154, "step": 8988 }, { "epoch": 21.81895504252734, "grad_norm": 0.3835155665874481, "learning_rate": 2.6958e-05, "loss": 0.0193, "step": 8989 }, { "epoch": 21.82138517618469, "grad_norm": 0.3526805341243744, "learning_rate": 2.6961e-05, "loss": 0.0189, "step": 8990 }, { "epoch": 21.82381530984204, "grad_norm": 0.7536855340003967, "learning_rate": 2.6964e-05, "loss": 0.041, "step": 8991 }, { "epoch": 21.826245443499392, "grad_norm": 0.8681060671806335, "learning_rate": 2.6967e-05, "loss": 0.034, "step": 8992 }, { "epoch": 21.828675577156744, "grad_norm": 0.7648541927337646, "learning_rate": 2.697e-05, "loss": 0.0323, "step": 8993 }, { "epoch": 21.831105710814096, "grad_norm": 0.5237388610839844, "learning_rate": 2.6973e-05, "loss": 0.0254, "step": 8994 }, { "epoch": 21.833535844471445, "grad_norm": 1.8884304761886597, "learning_rate": 2.6976e-05, "loss": 0.0286, "step": 8995 }, { "epoch": 21.835965978128797, "grad_norm": 1.1796109676361084, "learning_rate": 2.6979e-05, "loss": 0.0267, "step": 8996 }, { "epoch": 21.83839611178615, "grad_norm": 0.7822946310043335, "learning_rate": 2.6982e-05, "loss": 0.0277, "step": 8997 }, { "epoch": 21.8408262454435, "grad_norm": 0.6991837620735168, "learning_rate": 2.6985e-05, "loss": 0.0377, "step": 8998 }, { "epoch": 21.84325637910085, "grad_norm": 0.6209947466850281, "learning_rate": 2.6988e-05, "loss": 0.0287, "step": 8999 }, { "epoch": 
21.845686512758203, "grad_norm": 0.5691967606544495, "learning_rate": 2.6991000000000003e-05, "loss": 0.0262, "step": 9000 }, { "epoch": 21.845686512758203, "eval_cer": 0.09244237410649747, "eval_loss": 0.2880706489086151, "eval_runtime": 7.5295, "eval_samples_per_second": 13.414, "eval_steps_per_second": 0.531, "eval_wer": 0.2896825396825397, "step": 9000 }, { "epoch": 21.84811664641555, "grad_norm": 1.2997273206710815, "learning_rate": 2.6994000000000003e-05, "loss": 0.0461, "step": 9001 }, { "epoch": 21.850546780072904, "grad_norm": 0.986766517162323, "learning_rate": 2.6997000000000003e-05, "loss": 0.0454, "step": 9002 }, { "epoch": 21.852976913730256, "grad_norm": 0.776432454586029, "learning_rate": 2.7000000000000002e-05, "loss": 0.2617, "step": 9003 }, { "epoch": 21.855407047387608, "grad_norm": 0.7234309315681458, "learning_rate": 2.7003000000000002e-05, "loss": 0.1942, "step": 9004 }, { "epoch": 21.857837181044957, "grad_norm": 0.5457703471183777, "learning_rate": 2.7006000000000002e-05, "loss": 0.1388, "step": 9005 }, { "epoch": 21.86026731470231, "grad_norm": 0.6180237531661987, "learning_rate": 2.7009000000000002e-05, "loss": 0.1295, "step": 9006 }, { "epoch": 21.86269744835966, "grad_norm": 0.8394423127174377, "learning_rate": 2.7012e-05, "loss": 0.1431, "step": 9007 }, { "epoch": 21.86512758201701, "grad_norm": 0.42754465341567993, "learning_rate": 2.7015e-05, "loss": 0.0654, "step": 9008 }, { "epoch": 21.867557715674362, "grad_norm": 0.6665739417076111, "learning_rate": 2.7017999999999998e-05, "loss": 0.0956, "step": 9009 }, { "epoch": 21.869987849331714, "grad_norm": 0.4896669089794159, "learning_rate": 2.7020999999999998e-05, "loss": 0.0485, "step": 9010 }, { "epoch": 21.872417982989063, "grad_norm": 0.48538610339164734, "learning_rate": 2.7024e-05, "loss": 0.0378, "step": 9011 }, { "epoch": 21.874848116646415, "grad_norm": 0.4776100814342499, "learning_rate": 2.7027e-05, "loss": 0.0468, "step": 9012 }, { "epoch": 21.877278250303767, "grad_norm": 
0.6930302977561951, "learning_rate": 2.703e-05, "loss": 0.05, "step": 9013 }, { "epoch": 21.879708383961116, "grad_norm": 0.5490346550941467, "learning_rate": 2.7033e-05, "loss": 0.0544, "step": 9014 }, { "epoch": 21.88213851761847, "grad_norm": 1.004469394683838, "learning_rate": 2.7036e-05, "loss": 0.0282, "step": 9015 }, { "epoch": 21.88456865127582, "grad_norm": 0.4163796901702881, "learning_rate": 2.7039e-05, "loss": 0.0201, "step": 9016 }, { "epoch": 21.886998784933173, "grad_norm": 0.6712565422058105, "learning_rate": 2.7042e-05, "loss": 0.0356, "step": 9017 }, { "epoch": 21.88942891859052, "grad_norm": 0.311028391122818, "learning_rate": 2.7045e-05, "loss": 0.0222, "step": 9018 }, { "epoch": 21.891859052247874, "grad_norm": 0.834632158279419, "learning_rate": 2.7048e-05, "loss": 0.0268, "step": 9019 }, { "epoch": 21.894289185905226, "grad_norm": 0.3735630214214325, "learning_rate": 2.7051e-05, "loss": 0.0305, "step": 9020 }, { "epoch": 21.896719319562575, "grad_norm": 0.4897262454032898, "learning_rate": 2.7054000000000002e-05, "loss": 0.019, "step": 9021 }, { "epoch": 21.899149453219927, "grad_norm": 0.6114760041236877, "learning_rate": 2.7057000000000002e-05, "loss": 0.0286, "step": 9022 }, { "epoch": 21.90157958687728, "grad_norm": 0.41966041922569275, "learning_rate": 2.7060000000000002e-05, "loss": 0.0271, "step": 9023 }, { "epoch": 21.904009720534628, "grad_norm": 0.6473264098167419, "learning_rate": 2.7063e-05, "loss": 0.0267, "step": 9024 }, { "epoch": 21.90643985419198, "grad_norm": 0.6661794185638428, "learning_rate": 2.7066e-05, "loss": 0.045, "step": 9025 }, { "epoch": 21.908869987849332, "grad_norm": 0.47921299934387207, "learning_rate": 2.7069e-05, "loss": 0.0363, "step": 9026 }, { "epoch": 21.911300121506684, "grad_norm": 0.45432746410369873, "learning_rate": 2.7072e-05, "loss": 0.023, "step": 9027 }, { "epoch": 21.913730255164033, "grad_norm": 0.6552165746688843, "learning_rate": 2.7075e-05, "loss": 0.0515, "step": 9028 }, { "epoch": 
21.916160388821385, "grad_norm": 0.7031326293945312, "learning_rate": 2.7078e-05, "loss": 0.0339, "step": 9029 }, { "epoch": 21.918590522478738, "grad_norm": 0.37875017523765564, "learning_rate": 2.7081e-05, "loss": 0.0129, "step": 9030 }, { "epoch": 21.921020656136086, "grad_norm": 0.5325044989585876, "learning_rate": 2.7084000000000004e-05, "loss": 0.0276, "step": 9031 }, { "epoch": 21.92345078979344, "grad_norm": 0.288046270608902, "learning_rate": 2.7087000000000003e-05, "loss": 0.014, "step": 9032 }, { "epoch": 21.92588092345079, "grad_norm": 0.9276859164237976, "learning_rate": 2.709e-05, "loss": 0.0398, "step": 9033 }, { "epoch": 21.92831105710814, "grad_norm": 0.505490779876709, "learning_rate": 2.7093e-05, "loss": 0.0232, "step": 9034 }, { "epoch": 21.93074119076549, "grad_norm": 0.8374305963516235, "learning_rate": 2.7096e-05, "loss": 0.0316, "step": 9035 }, { "epoch": 21.933171324422844, "grad_norm": 0.5778748393058777, "learning_rate": 2.7099e-05, "loss": 0.0256, "step": 9036 }, { "epoch": 21.935601458080196, "grad_norm": 0.42540720105171204, "learning_rate": 2.7102e-05, "loss": 0.0297, "step": 9037 }, { "epoch": 21.938031591737545, "grad_norm": 0.38557976484298706, "learning_rate": 2.7105e-05, "loss": 0.0338, "step": 9038 }, { "epoch": 21.940461725394897, "grad_norm": 0.9157170653343201, "learning_rate": 2.7108e-05, "loss": 0.0268, "step": 9039 }, { "epoch": 21.94289185905225, "grad_norm": 1.500709891319275, "learning_rate": 2.7111e-05, "loss": 0.0314, "step": 9040 }, { "epoch": 21.945321992709598, "grad_norm": 0.4635179936885834, "learning_rate": 2.7114e-05, "loss": 0.0279, "step": 9041 }, { "epoch": 21.94775212636695, "grad_norm": 0.8140537142753601, "learning_rate": 2.7117e-05, "loss": 0.0378, "step": 9042 }, { "epoch": 21.950182260024302, "grad_norm": 0.7978588342666626, "learning_rate": 2.712e-05, "loss": 0.0247, "step": 9043 }, { "epoch": 21.95261239368165, "grad_norm": 0.43925580382347107, "learning_rate": 2.7123e-05, "loss": 0.0254, "step": 
9044 }, { "epoch": 21.955042527339003, "grad_norm": 0.5190321803092957, "learning_rate": 2.7126e-05, "loss": 0.0227, "step": 9045 }, { "epoch": 21.957472660996356, "grad_norm": 0.595950186252594, "learning_rate": 2.7129e-05, "loss": 0.0285, "step": 9046 }, { "epoch": 21.959902794653708, "grad_norm": 1.1894590854644775, "learning_rate": 2.7132e-05, "loss": 0.0341, "step": 9047 }, { "epoch": 21.962332928311056, "grad_norm": 1.595035433769226, "learning_rate": 2.7135e-05, "loss": 0.0293, "step": 9048 }, { "epoch": 21.96476306196841, "grad_norm": 1.0422252416610718, "learning_rate": 2.7138e-05, "loss": 0.0578, "step": 9049 }, { "epoch": 21.96719319562576, "grad_norm": 1.2575236558914185, "learning_rate": 2.7141e-05, "loss": 0.0514, "step": 9050 }, { "epoch": 21.96962332928311, "grad_norm": 0.7373253107070923, "learning_rate": 2.7144000000000003e-05, "loss": 0.0303, "step": 9051 }, { "epoch": 21.972053462940462, "grad_norm": 0.9911895394325256, "learning_rate": 2.7147000000000003e-05, "loss": 0.0358, "step": 9052 }, { "epoch": 21.974483596597814, "grad_norm": 0.7535138726234436, "learning_rate": 2.7150000000000003e-05, "loss": 0.1506, "step": 9053 }, { "epoch": 21.976913730255163, "grad_norm": 0.571287989616394, "learning_rate": 2.7153000000000002e-05, "loss": 0.0635, "step": 9054 }, { "epoch": 21.979343863912515, "grad_norm": 0.4761521518230438, "learning_rate": 2.7156000000000002e-05, "loss": 0.0449, "step": 9055 }, { "epoch": 21.981773997569867, "grad_norm": 0.5196619629859924, "learning_rate": 2.7159000000000002e-05, "loss": 0.0237, "step": 9056 }, { "epoch": 21.984204131227216, "grad_norm": 0.40173304080963135, "learning_rate": 2.7162000000000002e-05, "loss": 0.0297, "step": 9057 }, { "epoch": 21.986634264884568, "grad_norm": 0.4757731854915619, "learning_rate": 2.7164999999999998e-05, "loss": 0.0326, "step": 9058 }, { "epoch": 21.98906439854192, "grad_norm": 0.4535802900791168, "learning_rate": 2.7167999999999998e-05, "loss": 0.0196, "step": 9059 }, { "epoch": 
21.991494532199273, "grad_norm": 0.7422687411308289, "learning_rate": 2.7170999999999998e-05, "loss": 0.0435, "step": 9060 }, { "epoch": 21.99392466585662, "grad_norm": 0.6340906023979187, "learning_rate": 2.7174e-05, "loss": 0.0348, "step": 9061 }, { "epoch": 21.996354799513973, "grad_norm": 1.0142827033996582, "learning_rate": 2.7177e-05, "loss": 0.0381, "step": 9062 }, { "epoch": 21.998784933171326, "grad_norm": 0.6945249438285828, "learning_rate": 2.718e-05, "loss": 0.0343, "step": 9063 }, { "epoch": 22.0, "grad_norm": 0.4480014443397522, "learning_rate": 2.7183e-05, "loss": 0.0143, "step": 9064 }, { "epoch": 22.002430133657352, "grad_norm": 0.9511511325836182, "learning_rate": 2.7186e-05, "loss": 0.2203, "step": 9065 }, { "epoch": 22.0048602673147, "grad_norm": 0.5301051735877991, "learning_rate": 2.7189e-05, "loss": 0.1319, "step": 9066 }, { "epoch": 22.007290400972053, "grad_norm": 0.633561909198761, "learning_rate": 2.7192e-05, "loss": 0.1002, "step": 9067 }, { "epoch": 22.009720534629405, "grad_norm": 0.6880051493644714, "learning_rate": 2.7195e-05, "loss": 0.116, "step": 9068 }, { "epoch": 22.012150668286754, "grad_norm": 0.7487694621086121, "learning_rate": 2.7198e-05, "loss": 0.0936, "step": 9069 }, { "epoch": 22.014580801944106, "grad_norm": 0.5650671124458313, "learning_rate": 2.7201e-05, "loss": 0.0695, "step": 9070 }, { "epoch": 22.01701093560146, "grad_norm": 0.5343952178955078, "learning_rate": 2.7204000000000002e-05, "loss": 0.0606, "step": 9071 }, { "epoch": 22.01944106925881, "grad_norm": 0.5908486843109131, "learning_rate": 2.7207000000000002e-05, "loss": 0.059, "step": 9072 }, { "epoch": 22.02187120291616, "grad_norm": 0.5681141018867493, "learning_rate": 2.7210000000000002e-05, "loss": 0.0722, "step": 9073 }, { "epoch": 22.02430133657351, "grad_norm": 0.5589855313301086, "learning_rate": 2.7213000000000002e-05, "loss": 0.0381, "step": 9074 }, { "epoch": 22.026731470230864, "grad_norm": 0.4319134056568146, "learning_rate": 2.7216e-05, "loss": 
0.0457, "step": 9075 }, { "epoch": 22.029161603888213, "grad_norm": 0.351603239774704, "learning_rate": 2.7219e-05, "loss": 0.0218, "step": 9076 }, { "epoch": 22.031591737545565, "grad_norm": 0.3045240342617035, "learning_rate": 2.7222e-05, "loss": 0.0162, "step": 9077 }, { "epoch": 22.034021871202917, "grad_norm": 0.3709683120250702, "learning_rate": 2.7225e-05, "loss": 0.0185, "step": 9078 }, { "epoch": 22.036452004860266, "grad_norm": 0.47619718313217163, "learning_rate": 2.7228e-05, "loss": 0.0308, "step": 9079 }, { "epoch": 22.038882138517618, "grad_norm": 0.4416239261627197, "learning_rate": 2.7231e-05, "loss": 0.0208, "step": 9080 }, { "epoch": 22.04131227217497, "grad_norm": 0.3841886818408966, "learning_rate": 2.7234000000000004e-05, "loss": 0.0261, "step": 9081 }, { "epoch": 22.043742405832322, "grad_norm": 0.39370399713516235, "learning_rate": 2.7237e-05, "loss": 0.0329, "step": 9082 }, { "epoch": 22.04617253948967, "grad_norm": 0.23531976342201233, "learning_rate": 2.724e-05, "loss": 0.0098, "step": 9083 }, { "epoch": 22.048602673147023, "grad_norm": 0.34251177310943604, "learning_rate": 2.7243e-05, "loss": 0.015, "step": 9084 }, { "epoch": 22.051032806804375, "grad_norm": 0.779796838760376, "learning_rate": 2.7246e-05, "loss": 0.0232, "step": 9085 }, { "epoch": 22.053462940461724, "grad_norm": 0.4427812993526459, "learning_rate": 2.7249e-05, "loss": 0.0215, "step": 9086 }, { "epoch": 22.055893074119076, "grad_norm": 0.4005337655544281, "learning_rate": 2.7252e-05, "loss": 0.0244, "step": 9087 }, { "epoch": 22.05832320777643, "grad_norm": 0.7882911562919617, "learning_rate": 2.7255e-05, "loss": 0.0181, "step": 9088 }, { "epoch": 22.060753341433777, "grad_norm": 0.8749749064445496, "learning_rate": 2.7258e-05, "loss": 0.0226, "step": 9089 }, { "epoch": 22.06318347509113, "grad_norm": 2.6578660011291504, "learning_rate": 2.7261e-05, "loss": 0.0281, "step": 9090 }, { "epoch": 22.06561360874848, "grad_norm": 0.4923504590988159, "learning_rate": 
2.7264000000000002e-05, "loss": 0.0256, "step": 9091 }, { "epoch": 22.068043742405834, "grad_norm": 0.3714867830276489, "learning_rate": 2.7267e-05, "loss": 0.0242, "step": 9092 }, { "epoch": 22.070473876063183, "grad_norm": 0.43977582454681396, "learning_rate": 2.727e-05, "loss": 0.028, "step": 9093 }, { "epoch": 22.072904009720535, "grad_norm": 0.4625437557697296, "learning_rate": 2.7273e-05, "loss": 0.0344, "step": 9094 }, { "epoch": 22.075334143377887, "grad_norm": 0.6076465249061584, "learning_rate": 2.7276e-05, "loss": 0.0238, "step": 9095 }, { "epoch": 22.077764277035236, "grad_norm": 0.46338003873825073, "learning_rate": 2.7279e-05, "loss": 0.0279, "step": 9096 }, { "epoch": 22.080194410692588, "grad_norm": 0.3351605236530304, "learning_rate": 2.7282e-05, "loss": 0.0119, "step": 9097 }, { "epoch": 22.08262454434994, "grad_norm": 0.46385082602500916, "learning_rate": 2.7285e-05, "loss": 0.0223, "step": 9098 }, { "epoch": 22.08505467800729, "grad_norm": 0.9344703555107117, "learning_rate": 2.7288e-05, "loss": 0.0271, "step": 9099 }, { "epoch": 22.08748481166464, "grad_norm": 0.5923449993133545, "learning_rate": 2.7291e-05, "loss": 0.0331, "step": 9100 }, { "epoch": 22.089914945321993, "grad_norm": 0.6995188593864441, "learning_rate": 2.7294000000000003e-05, "loss": 0.0204, "step": 9101 }, { "epoch": 22.092345078979346, "grad_norm": 0.4928041100502014, "learning_rate": 2.7297000000000003e-05, "loss": 0.0576, "step": 9102 }, { "epoch": 22.094775212636694, "grad_norm": 0.35640180110931396, "learning_rate": 2.7300000000000003e-05, "loss": 0.0154, "step": 9103 }, { "epoch": 22.097205346294047, "grad_norm": 1.0731875896453857, "learning_rate": 2.7303000000000003e-05, "loss": 0.0395, "step": 9104 }, { "epoch": 22.0996354799514, "grad_norm": 0.5619587302207947, "learning_rate": 2.7306000000000002e-05, "loss": 0.0211, "step": 9105 }, { "epoch": 22.102065613608747, "grad_norm": 0.3274923264980316, "learning_rate": 2.7309000000000002e-05, "loss": 0.0145, "step": 9106 }, 
{ "epoch": 22.1044957472661, "grad_norm": 0.5606045722961426, "learning_rate": 2.7312e-05, "loss": 0.0295, "step": 9107 }, { "epoch": 22.106925880923452, "grad_norm": 0.5717912316322327, "learning_rate": 2.7315e-05, "loss": 0.0657, "step": 9108 }, { "epoch": 22.1093560145808, "grad_norm": 0.4934445023536682, "learning_rate": 2.7318e-05, "loss": 0.0185, "step": 9109 }, { "epoch": 22.111786148238153, "grad_norm": 0.789330005645752, "learning_rate": 2.7320999999999998e-05, "loss": 0.0223, "step": 9110 }, { "epoch": 22.114216281895505, "grad_norm": 0.6041492223739624, "learning_rate": 2.7324e-05, "loss": 0.0185, "step": 9111 }, { "epoch": 22.116646415552854, "grad_norm": 2.348498582839966, "learning_rate": 2.7327e-05, "loss": 0.0236, "step": 9112 }, { "epoch": 22.119076549210206, "grad_norm": 1.210397481918335, "learning_rate": 2.733e-05, "loss": 0.0615, "step": 9113 }, { "epoch": 22.121506682867558, "grad_norm": 1.460889220237732, "learning_rate": 2.7333e-05, "loss": 0.0343, "step": 9114 }, { "epoch": 22.12393681652491, "grad_norm": 0.7296109795570374, "learning_rate": 2.7336e-05, "loss": 0.1943, "step": 9115 }, { "epoch": 22.12636695018226, "grad_norm": 0.6210430860519409, "learning_rate": 2.7339e-05, "loss": 0.1097, "step": 9116 }, { "epoch": 22.12879708383961, "grad_norm": 0.5451358556747437, "learning_rate": 2.7342e-05, "loss": 0.0964, "step": 9117 }, { "epoch": 22.131227217496964, "grad_norm": 0.5677196383476257, "learning_rate": 2.7345e-05, "loss": 0.0921, "step": 9118 }, { "epoch": 22.133657351154312, "grad_norm": 0.546833336353302, "learning_rate": 2.7348e-05, "loss": 0.0868, "step": 9119 }, { "epoch": 22.136087484811664, "grad_norm": 0.5009256601333618, "learning_rate": 2.7351e-05, "loss": 0.0776, "step": 9120 }, { "epoch": 22.138517618469017, "grad_norm": 0.6843969225883484, "learning_rate": 2.7354000000000003e-05, "loss": 0.0705, "step": 9121 }, { "epoch": 22.140947752126365, "grad_norm": 0.4905213713645935, "learning_rate": 2.7357000000000003e-05, "loss": 
0.0587, "step": 9122 }, { "epoch": 22.143377885783718, "grad_norm": 0.4835207164287567, "learning_rate": 2.7360000000000002e-05, "loss": 0.0403, "step": 9123 }, { "epoch": 22.14580801944107, "grad_norm": 0.42670586705207825, "learning_rate": 2.7363000000000002e-05, "loss": 0.034, "step": 9124 }, { "epoch": 22.148238153098422, "grad_norm": 0.3491132855415344, "learning_rate": 2.7366000000000002e-05, "loss": 0.0242, "step": 9125 }, { "epoch": 22.15066828675577, "grad_norm": 0.41183701157569885, "learning_rate": 2.7369000000000002e-05, "loss": 0.0341, "step": 9126 }, { "epoch": 22.153098420413123, "grad_norm": 0.4514673352241516, "learning_rate": 2.7372e-05, "loss": 0.0301, "step": 9127 }, { "epoch": 22.155528554070475, "grad_norm": 0.30717551708221436, "learning_rate": 2.7375e-05, "loss": 0.0169, "step": 9128 }, { "epoch": 22.157958687727824, "grad_norm": 1.802146077156067, "learning_rate": 2.7378e-05, "loss": 0.0474, "step": 9129 }, { "epoch": 22.160388821385176, "grad_norm": 0.4152325391769409, "learning_rate": 2.7381e-05, "loss": 0.0242, "step": 9130 }, { "epoch": 22.16281895504253, "grad_norm": 0.3787498474121094, "learning_rate": 2.7383999999999997e-05, "loss": 0.0233, "step": 9131 }, { "epoch": 22.165249088699877, "grad_norm": 0.36588597297668457, "learning_rate": 2.7387e-05, "loss": 0.0181, "step": 9132 }, { "epoch": 22.16767922235723, "grad_norm": 0.3805668354034424, "learning_rate": 2.739e-05, "loss": 0.0144, "step": 9133 }, { "epoch": 22.17010935601458, "grad_norm": 0.5706822276115417, "learning_rate": 2.7393e-05, "loss": 0.0292, "step": 9134 }, { "epoch": 22.172539489671934, "grad_norm": 0.48496368527412415, "learning_rate": 2.7396e-05, "loss": 0.0256, "step": 9135 }, { "epoch": 22.174969623329282, "grad_norm": 0.6916058659553528, "learning_rate": 2.7399e-05, "loss": 0.0128, "step": 9136 }, { "epoch": 22.177399756986635, "grad_norm": 0.5585574507713318, "learning_rate": 2.7402e-05, "loss": 0.0203, "step": 9137 }, { "epoch": 22.179829890643987, "grad_norm": 
0.3503684103488922, "learning_rate": 2.7405e-05, "loss": 0.0224, "step": 9138 }, { "epoch": 22.182260024301335, "grad_norm": 0.44774141907691956, "learning_rate": 2.7408e-05, "loss": 0.0179, "step": 9139 }, { "epoch": 22.184690157958688, "grad_norm": 0.2631155252456665, "learning_rate": 2.7411e-05, "loss": 0.0173, "step": 9140 }, { "epoch": 22.18712029161604, "grad_norm": 0.41791832447052, "learning_rate": 2.7414e-05, "loss": 0.0225, "step": 9141 }, { "epoch": 22.18955042527339, "grad_norm": 0.4198698103427887, "learning_rate": 2.7417000000000002e-05, "loss": 0.029, "step": 9142 }, { "epoch": 22.19198055893074, "grad_norm": 0.4027808606624603, "learning_rate": 2.7420000000000002e-05, "loss": 0.0152, "step": 9143 }, { "epoch": 22.194410692588093, "grad_norm": 0.30487239360809326, "learning_rate": 2.7423e-05, "loss": 0.0125, "step": 9144 }, { "epoch": 22.19684082624544, "grad_norm": 0.5031389594078064, "learning_rate": 2.7426e-05, "loss": 0.0208, "step": 9145 }, { "epoch": 22.199270959902794, "grad_norm": 0.36822375655174255, "learning_rate": 2.7429e-05, "loss": 0.0195, "step": 9146 }, { "epoch": 22.201701093560146, "grad_norm": 0.6431712508201599, "learning_rate": 2.7432e-05, "loss": 0.0254, "step": 9147 }, { "epoch": 22.2041312272175, "grad_norm": 0.8500660061836243, "learning_rate": 2.7435e-05, "loss": 0.0326, "step": 9148 }, { "epoch": 22.206561360874847, "grad_norm": 0.6049516797065735, "learning_rate": 2.7438e-05, "loss": 0.0239, "step": 9149 }, { "epoch": 22.2089914945322, "grad_norm": 0.42740440368652344, "learning_rate": 2.7441e-05, "loss": 0.0162, "step": 9150 }, { "epoch": 22.21142162818955, "grad_norm": 0.9438063502311707, "learning_rate": 2.7444e-05, "loss": 0.078, "step": 9151 }, { "epoch": 22.2138517618469, "grad_norm": 0.4154859483242035, "learning_rate": 2.7447000000000003e-05, "loss": 0.0178, "step": 9152 }, { "epoch": 22.216281895504252, "grad_norm": 0.6682385802268982, "learning_rate": 2.7450000000000003e-05, "loss": 0.0155, "step": 9153 }, { 
"epoch": 22.218712029161605, "grad_norm": 0.5199863314628601, "learning_rate": 2.7453000000000003e-05, "loss": 0.0195, "step": 9154 }, { "epoch": 22.221142162818953, "grad_norm": 0.2998788356781006, "learning_rate": 2.7456000000000003e-05, "loss": 0.0136, "step": 9155 }, { "epoch": 22.223572296476306, "grad_norm": 0.4928586184978485, "learning_rate": 2.7459e-05, "loss": 0.0174, "step": 9156 }, { "epoch": 22.226002430133658, "grad_norm": 0.3904030919075012, "learning_rate": 2.7462e-05, "loss": 0.0223, "step": 9157 }, { "epoch": 22.22843256379101, "grad_norm": 0.29962587356567383, "learning_rate": 2.7465e-05, "loss": 0.0096, "step": 9158 }, { "epoch": 22.23086269744836, "grad_norm": 0.4876345098018646, "learning_rate": 2.7468e-05, "loss": 0.0262, "step": 9159 }, { "epoch": 22.23329283110571, "grad_norm": 1.0892653465270996, "learning_rate": 2.7471e-05, "loss": 0.0272, "step": 9160 }, { "epoch": 22.235722964763063, "grad_norm": 0.9175124168395996, "learning_rate": 2.7473999999999998e-05, "loss": 0.0213, "step": 9161 }, { "epoch": 22.238153098420412, "grad_norm": 0.6489455103874207, "learning_rate": 2.7477e-05, "loss": 0.0189, "step": 9162 }, { "epoch": 22.240583232077764, "grad_norm": 1.1571154594421387, "learning_rate": 2.748e-05, "loss": 0.0301, "step": 9163 }, { "epoch": 22.243013365735116, "grad_norm": 1.32268488407135, "learning_rate": 2.7483e-05, "loss": 0.0402, "step": 9164 }, { "epoch": 22.245443499392465, "grad_norm": 0.7586017847061157, "learning_rate": 2.7486e-05, "loss": 0.224, "step": 9165 }, { "epoch": 22.247873633049817, "grad_norm": 0.6482843160629272, "learning_rate": 2.7489e-05, "loss": 0.1733, "step": 9166 }, { "epoch": 22.25030376670717, "grad_norm": 0.6450307369232178, "learning_rate": 2.7492e-05, "loss": 0.1219, "step": 9167 }, { "epoch": 22.25273390036452, "grad_norm": 0.5920550227165222, "learning_rate": 2.7495e-05, "loss": 0.1124, "step": 9168 }, { "epoch": 22.25516403402187, "grad_norm": 0.6254917979240417, "learning_rate": 2.7498e-05, 
"loss": 0.0913, "step": 9169 }, { "epoch": 22.257594167679223, "grad_norm": 0.48402583599090576, "learning_rate": 2.7501e-05, "loss": 0.071, "step": 9170 }, { "epoch": 22.260024301336575, "grad_norm": 0.45802560448646545, "learning_rate": 2.7504e-05, "loss": 0.0758, "step": 9171 }, { "epoch": 22.262454434993924, "grad_norm": 0.5078534483909607, "learning_rate": 2.7507000000000003e-05, "loss": 0.051, "step": 9172 }, { "epoch": 22.264884568651276, "grad_norm": 0.35433781147003174, "learning_rate": 2.7510000000000003e-05, "loss": 0.0364, "step": 9173 }, { "epoch": 22.267314702308628, "grad_norm": 0.4933721125125885, "learning_rate": 2.7513000000000002e-05, "loss": 0.0332, "step": 9174 }, { "epoch": 22.269744835965977, "grad_norm": 0.693749725818634, "learning_rate": 2.7516000000000002e-05, "loss": 0.0302, "step": 9175 }, { "epoch": 22.27217496962333, "grad_norm": 0.4784693419933319, "learning_rate": 2.7519000000000002e-05, "loss": 0.0354, "step": 9176 }, { "epoch": 22.27460510328068, "grad_norm": 0.308346152305603, "learning_rate": 2.7522000000000002e-05, "loss": 0.0194, "step": 9177 }, { "epoch": 22.277035236938033, "grad_norm": 0.3500645160675049, "learning_rate": 2.7525e-05, "loss": 0.022, "step": 9178 }, { "epoch": 22.279465370595382, "grad_norm": 0.39488473534584045, "learning_rate": 2.7528e-05, "loss": 0.0241, "step": 9179 }, { "epoch": 22.281895504252734, "grad_norm": 0.5870954990386963, "learning_rate": 2.7531e-05, "loss": 0.0224, "step": 9180 }, { "epoch": 22.284325637910086, "grad_norm": 0.36992520093917847, "learning_rate": 2.7533999999999998e-05, "loss": 0.0166, "step": 9181 }, { "epoch": 22.286755771567435, "grad_norm": 0.48305484652519226, "learning_rate": 2.7537e-05, "loss": 0.0378, "step": 9182 }, { "epoch": 22.289185905224787, "grad_norm": 0.595952033996582, "learning_rate": 2.754e-05, "loss": 0.0224, "step": 9183 }, { "epoch": 22.29161603888214, "grad_norm": 0.8251422047615051, "learning_rate": 2.7543e-05, "loss": 0.027, "step": 9184 }, { "epoch": 
22.29404617253949, "grad_norm": 0.5844799876213074, "learning_rate": 2.7546e-05, "loss": 0.0234, "step": 9185 }, { "epoch": 22.29647630619684, "grad_norm": 0.7183060050010681, "learning_rate": 2.7549e-05, "loss": 0.0286, "step": 9186 }, { "epoch": 22.298906439854193, "grad_norm": 0.6018632650375366, "learning_rate": 2.7552e-05, "loss": 0.0234, "step": 9187 }, { "epoch": 22.30133657351154, "grad_norm": 0.41447004675865173, "learning_rate": 2.7555e-05, "loss": 0.0253, "step": 9188 }, { "epoch": 22.303766707168894, "grad_norm": 0.27857449650764465, "learning_rate": 2.7558e-05, "loss": 0.0185, "step": 9189 }, { "epoch": 22.306196840826246, "grad_norm": 0.5695845484733582, "learning_rate": 2.7561e-05, "loss": 0.0539, "step": 9190 }, { "epoch": 22.308626974483598, "grad_norm": 1.3180394172668457, "learning_rate": 2.7564e-05, "loss": 0.0322, "step": 9191 }, { "epoch": 22.311057108140947, "grad_norm": 0.5440097451210022, "learning_rate": 2.7567000000000002e-05, "loss": 0.0358, "step": 9192 }, { "epoch": 22.3134872417983, "grad_norm": 0.5186916589736938, "learning_rate": 2.7570000000000002e-05, "loss": 0.0238, "step": 9193 }, { "epoch": 22.31591737545565, "grad_norm": 0.5924410820007324, "learning_rate": 2.7573000000000002e-05, "loss": 0.0151, "step": 9194 }, { "epoch": 22.318347509113, "grad_norm": 0.9278886914253235, "learning_rate": 2.7576e-05, "loss": 0.0331, "step": 9195 }, { "epoch": 22.320777642770352, "grad_norm": 0.31742894649505615, "learning_rate": 2.7579e-05, "loss": 0.0141, "step": 9196 }, { "epoch": 22.323207776427704, "grad_norm": 0.43500128388404846, "learning_rate": 2.7582e-05, "loss": 0.0166, "step": 9197 }, { "epoch": 22.325637910085053, "grad_norm": 0.4780687689781189, "learning_rate": 2.7585e-05, "loss": 0.0266, "step": 9198 }, { "epoch": 22.328068043742405, "grad_norm": 0.8260059356689453, "learning_rate": 2.7588e-05, "loss": 0.0354, "step": 9199 }, { "epoch": 22.330498177399758, "grad_norm": 0.8500795960426331, "learning_rate": 2.7591e-05, "loss": 
0.074, "step": 9200 }, { "epoch": 22.33292831105711, "grad_norm": 0.5206941366195679, "learning_rate": 2.7594e-05, "loss": 0.027, "step": 9201 }, { "epoch": 22.33535844471446, "grad_norm": 0.40933939814567566, "learning_rate": 2.7597000000000004e-05, "loss": 0.0141, "step": 9202 }, { "epoch": 22.33778857837181, "grad_norm": 0.4974237382411957, "learning_rate": 2.7600000000000003e-05, "loss": 0.0208, "step": 9203 }, { "epoch": 22.340218712029163, "grad_norm": 0.8004574775695801, "learning_rate": 2.7603000000000003e-05, "loss": 0.0229, "step": 9204 }, { "epoch": 22.34264884568651, "grad_norm": 0.8847511410713196, "learning_rate": 2.7606e-05, "loss": 0.0248, "step": 9205 }, { "epoch": 22.345078979343864, "grad_norm": 0.4805704355239868, "learning_rate": 2.7609e-05, "loss": 0.0205, "step": 9206 }, { "epoch": 22.347509113001216, "grad_norm": 0.5974154472351074, "learning_rate": 2.7612e-05, "loss": 0.0233, "step": 9207 }, { "epoch": 22.349939246658565, "grad_norm": 0.566660463809967, "learning_rate": 2.7615e-05, "loss": 0.0311, "step": 9208 }, { "epoch": 22.352369380315917, "grad_norm": 0.902804434299469, "learning_rate": 2.7618e-05, "loss": 0.034, "step": 9209 }, { "epoch": 22.35479951397327, "grad_norm": 0.8905074000358582, "learning_rate": 2.7621e-05, "loss": 0.0301, "step": 9210 }, { "epoch": 22.35722964763062, "grad_norm": 0.7586123943328857, "learning_rate": 2.7624e-05, "loss": 0.0203, "step": 9211 }, { "epoch": 22.35965978128797, "grad_norm": 0.910898745059967, "learning_rate": 2.7627e-05, "loss": 0.0515, "step": 9212 }, { "epoch": 22.362089914945322, "grad_norm": 0.5926105380058289, "learning_rate": 2.763e-05, "loss": 0.0256, "step": 9213 }, { "epoch": 22.364520048602675, "grad_norm": 0.8123162984848022, "learning_rate": 2.7633e-05, "loss": 0.0325, "step": 9214 }, { "epoch": 22.366950182260023, "grad_norm": 1.193516492843628, "learning_rate": 2.7636e-05, "loss": 0.2471, "step": 9215 }, { "epoch": 22.369380315917375, "grad_norm": 0.6934013366699219, 
"learning_rate": 2.7639e-05, "loss": 0.15, "step": 9216 }, { "epoch": 22.371810449574728, "grad_norm": 0.6055653095245361, "learning_rate": 2.7642e-05, "loss": 0.1075, "step": 9217 }, { "epoch": 22.374240583232076, "grad_norm": 0.5830343961715698, "learning_rate": 2.7645e-05, "loss": 0.0851, "step": 9218 }, { "epoch": 22.37667071688943, "grad_norm": 0.7221181392669678, "learning_rate": 2.7648e-05, "loss": 0.0873, "step": 9219 }, { "epoch": 22.37910085054678, "grad_norm": 0.5769820213317871, "learning_rate": 2.7651e-05, "loss": 0.0799, "step": 9220 }, { "epoch": 22.381530984204133, "grad_norm": 0.6505403518676758, "learning_rate": 2.7654e-05, "loss": 0.0685, "step": 9221 }, { "epoch": 22.38396111786148, "grad_norm": 0.4131193161010742, "learning_rate": 2.7657000000000003e-05, "loss": 0.0386, "step": 9222 }, { "epoch": 22.386391251518834, "grad_norm": 0.5084986090660095, "learning_rate": 2.7660000000000003e-05, "loss": 0.0471, "step": 9223 }, { "epoch": 22.388821385176186, "grad_norm": 0.38422611355781555, "learning_rate": 2.7663000000000003e-05, "loss": 0.0267, "step": 9224 }, { "epoch": 22.391251518833535, "grad_norm": 0.49498891830444336, "learning_rate": 2.7666000000000002e-05, "loss": 0.0351, "step": 9225 }, { "epoch": 22.393681652490887, "grad_norm": 0.5666396021842957, "learning_rate": 2.7669000000000002e-05, "loss": 0.0331, "step": 9226 }, { "epoch": 22.39611178614824, "grad_norm": 0.48002371191978455, "learning_rate": 2.7672000000000002e-05, "loss": 0.0281, "step": 9227 }, { "epoch": 22.398541919805588, "grad_norm": 0.6061299443244934, "learning_rate": 2.7675000000000002e-05, "loss": 0.0399, "step": 9228 }, { "epoch": 22.40097205346294, "grad_norm": 0.4366823434829712, "learning_rate": 2.7678e-05, "loss": 0.0298, "step": 9229 }, { "epoch": 22.403402187120292, "grad_norm": 0.30503225326538086, "learning_rate": 2.7680999999999998e-05, "loss": 0.015, "step": 9230 }, { "epoch": 22.40583232077764, "grad_norm": 0.5061365962028503, "learning_rate": 
2.7683999999999998e-05, "loss": 0.019, "step": 9231 }, { "epoch": 22.408262454434993, "grad_norm": 0.5706414580345154, "learning_rate": 2.7687e-05, "loss": 0.0373, "step": 9232 }, { "epoch": 22.410692588092346, "grad_norm": 0.3501380980014801, "learning_rate": 2.769e-05, "loss": 0.0176, "step": 9233 }, { "epoch": 22.413122721749698, "grad_norm": 0.4872841238975525, "learning_rate": 2.7693e-05, "loss": 0.04, "step": 9234 }, { "epoch": 22.415552855407046, "grad_norm": 0.2970481812953949, "learning_rate": 2.7696e-05, "loss": 0.0125, "step": 9235 }, { "epoch": 22.4179829890644, "grad_norm": 0.34971851110458374, "learning_rate": 2.7699e-05, "loss": 0.0216, "step": 9236 }, { "epoch": 22.42041312272175, "grad_norm": 0.4063538610935211, "learning_rate": 2.7702e-05, "loss": 0.0214, "step": 9237 }, { "epoch": 22.4228432563791, "grad_norm": 0.2683359682559967, "learning_rate": 2.7705e-05, "loss": 0.0136, "step": 9238 }, { "epoch": 22.425273390036452, "grad_norm": 0.474384605884552, "learning_rate": 2.7708e-05, "loss": 0.0366, "step": 9239 }, { "epoch": 22.427703523693804, "grad_norm": 0.7933268547058105, "learning_rate": 2.7711e-05, "loss": 0.0275, "step": 9240 }, { "epoch": 22.430133657351153, "grad_norm": 0.44769200682640076, "learning_rate": 2.7714e-05, "loss": 0.0155, "step": 9241 }, { "epoch": 22.432563791008505, "grad_norm": 0.7219977974891663, "learning_rate": 2.7717000000000002e-05, "loss": 0.0247, "step": 9242 }, { "epoch": 22.434993924665857, "grad_norm": 0.5839986205101013, "learning_rate": 2.7720000000000002e-05, "loss": 0.0254, "step": 9243 }, { "epoch": 22.43742405832321, "grad_norm": 0.5842862129211426, "learning_rate": 2.7723000000000002e-05, "loss": 0.0233, "step": 9244 }, { "epoch": 22.439854191980558, "grad_norm": 0.3436921536922455, "learning_rate": 2.7726000000000002e-05, "loss": 0.0158, "step": 9245 }, { "epoch": 22.44228432563791, "grad_norm": 1.5048154592514038, "learning_rate": 2.7729e-05, "loss": 0.035, "step": 9246 }, { "epoch": 22.444714459295263, 
"grad_norm": 0.29647499322891235, "learning_rate": 2.7732e-05, "loss": 0.0105, "step": 9247 }, { "epoch": 22.44714459295261, "grad_norm": 0.45145437121391296, "learning_rate": 2.7735e-05, "loss": 0.0168, "step": 9248 }, { "epoch": 22.449574726609963, "grad_norm": 0.6491940021514893, "learning_rate": 2.7738e-05, "loss": 0.0305, "step": 9249 }, { "epoch": 22.452004860267316, "grad_norm": 1.0945988893508911, "learning_rate": 2.7741e-05, "loss": 0.0207, "step": 9250 }, { "epoch": 22.454434993924664, "grad_norm": 0.41506659984588623, "learning_rate": 2.7744e-05, "loss": 0.0149, "step": 9251 }, { "epoch": 22.456865127582017, "grad_norm": 0.5094269514083862, "learning_rate": 2.7747000000000004e-05, "loss": 0.017, "step": 9252 }, { "epoch": 22.45929526123937, "grad_norm": 0.5314305424690247, "learning_rate": 2.7750000000000004e-05, "loss": 0.0208, "step": 9253 }, { "epoch": 22.46172539489672, "grad_norm": 0.48230695724487305, "learning_rate": 2.7753e-05, "loss": 0.0203, "step": 9254 }, { "epoch": 22.46415552855407, "grad_norm": 0.4365916848182678, "learning_rate": 2.7756e-05, "loss": 0.0206, "step": 9255 }, { "epoch": 22.466585662211422, "grad_norm": 0.7527480721473694, "learning_rate": 2.7759e-05, "loss": 0.0214, "step": 9256 }, { "epoch": 22.469015795868774, "grad_norm": 0.6245054006576538, "learning_rate": 2.7762e-05, "loss": 0.0284, "step": 9257 }, { "epoch": 22.471445929526123, "grad_norm": 1.3616434335708618, "learning_rate": 2.7765e-05, "loss": 0.0317, "step": 9258 }, { "epoch": 22.473876063183475, "grad_norm": 0.5271835327148438, "learning_rate": 2.7768e-05, "loss": 0.0199, "step": 9259 }, { "epoch": 22.476306196840827, "grad_norm": 0.5403119325637817, "learning_rate": 2.7771e-05, "loss": 0.0167, "step": 9260 }, { "epoch": 22.478736330498176, "grad_norm": 0.4718811810016632, "learning_rate": 2.7774e-05, "loss": 0.0213, "step": 9261 }, { "epoch": 22.481166464155528, "grad_norm": 0.7632173895835876, "learning_rate": 2.7777e-05, "loss": 0.0291, "step": 9262 }, { 
"epoch": 22.48359659781288, "grad_norm": 0.9469254016876221, "learning_rate": 2.778e-05, "loss": 0.032, "step": 9263 }, { "epoch": 22.48602673147023, "grad_norm": 3.094468355178833, "learning_rate": 2.7783e-05, "loss": 0.0617, "step": 9264 }, { "epoch": 22.48845686512758, "grad_norm": 1.1693143844604492, "learning_rate": 2.7786e-05, "loss": 0.2001, "step": 9265 }, { "epoch": 22.490886998784934, "grad_norm": 0.6980845332145691, "learning_rate": 2.7789e-05, "loss": 0.1663, "step": 9266 }, { "epoch": 22.493317132442286, "grad_norm": 0.6631114482879639, "learning_rate": 2.7792e-05, "loss": 0.1359, "step": 9267 }, { "epoch": 22.495747266099634, "grad_norm": 0.49252408742904663, "learning_rate": 2.7795e-05, "loss": 0.0962, "step": 9268 }, { "epoch": 22.498177399756987, "grad_norm": 0.6530789136886597, "learning_rate": 2.7798e-05, "loss": 0.1241, "step": 9269 }, { "epoch": 22.50060753341434, "grad_norm": 0.5647223591804504, "learning_rate": 2.7801e-05, "loss": 0.0798, "step": 9270 }, { "epoch": 22.503037667071688, "grad_norm": 0.6573798656463623, "learning_rate": 2.7804e-05, "loss": 0.0624, "step": 9271 }, { "epoch": 22.50546780072904, "grad_norm": 0.3972306251525879, "learning_rate": 2.7807e-05, "loss": 0.0539, "step": 9272 }, { "epoch": 22.507897934386392, "grad_norm": 0.37657853960990906, "learning_rate": 2.7810000000000003e-05, "loss": 0.0353, "step": 9273 }, { "epoch": 22.51032806804374, "grad_norm": 0.3960722088813782, "learning_rate": 2.7813000000000003e-05, "loss": 0.049, "step": 9274 }, { "epoch": 22.512758201701093, "grad_norm": 0.340494304895401, "learning_rate": 2.7816000000000003e-05, "loss": 0.0269, "step": 9275 }, { "epoch": 22.515188335358445, "grad_norm": 0.5564839839935303, "learning_rate": 2.7819000000000002e-05, "loss": 0.041, "step": 9276 }, { "epoch": 22.517618469015797, "grad_norm": 0.5186761617660522, "learning_rate": 2.7822000000000002e-05, "loss": 0.0245, "step": 9277 }, { "epoch": 22.520048602673146, "grad_norm": 0.5570184588432312, 
"learning_rate": 2.7825000000000002e-05, "loss": 0.0334, "step": 9278 }, { "epoch": 22.5224787363305, "grad_norm": 0.29423847794532776, "learning_rate": 2.7828e-05, "loss": 0.0276, "step": 9279 }, { "epoch": 22.52490886998785, "grad_norm": 0.3717619776725769, "learning_rate": 2.7831e-05, "loss": 0.0191, "step": 9280 }, { "epoch": 22.5273390036452, "grad_norm": 0.4400074779987335, "learning_rate": 2.7833999999999998e-05, "loss": 0.0288, "step": 9281 }, { "epoch": 22.52976913730255, "grad_norm": 0.4011082351207733, "learning_rate": 2.7836999999999998e-05, "loss": 0.0195, "step": 9282 }, { "epoch": 22.532199270959904, "grad_norm": 0.4125007092952728, "learning_rate": 2.784e-05, "loss": 0.0285, "step": 9283 }, { "epoch": 22.534629404617252, "grad_norm": 0.5023068189620972, "learning_rate": 2.7843e-05, "loss": 0.0235, "step": 9284 }, { "epoch": 22.537059538274605, "grad_norm": 0.6183760762214661, "learning_rate": 2.7846e-05, "loss": 0.0288, "step": 9285 }, { "epoch": 22.539489671931957, "grad_norm": 1.1764925718307495, "learning_rate": 2.7849e-05, "loss": 0.0156, "step": 9286 }, { "epoch": 22.54191980558931, "grad_norm": 0.4188726544380188, "learning_rate": 2.7852e-05, "loss": 0.016, "step": 9287 }, { "epoch": 22.544349939246658, "grad_norm": 0.7331187129020691, "learning_rate": 2.7855e-05, "loss": 0.0285, "step": 9288 }, { "epoch": 22.54678007290401, "grad_norm": 0.40832069516181946, "learning_rate": 2.7858e-05, "loss": 0.0181, "step": 9289 }, { "epoch": 22.549210206561362, "grad_norm": 0.5224248766899109, "learning_rate": 2.7861e-05, "loss": 0.0249, "step": 9290 }, { "epoch": 22.55164034021871, "grad_norm": 0.4503021836280823, "learning_rate": 2.7864e-05, "loss": 0.0216, "step": 9291 }, { "epoch": 22.554070473876063, "grad_norm": 0.6234816908836365, "learning_rate": 2.7867e-05, "loss": 0.0245, "step": 9292 }, { "epoch": 22.556500607533415, "grad_norm": 0.4400221109390259, "learning_rate": 2.7870000000000003e-05, "loss": 0.018, "step": 9293 }, { "epoch": 
22.558930741190764, "grad_norm": 0.5883082151412964, "learning_rate": 2.7873000000000002e-05, "loss": 0.0101, "step": 9294 }, { "epoch": 22.561360874848116, "grad_norm": 0.3209976553916931, "learning_rate": 2.7876000000000002e-05, "loss": 0.0195, "step": 9295 }, { "epoch": 22.56379100850547, "grad_norm": 1.0651991367340088, "learning_rate": 2.7879000000000002e-05, "loss": 0.0217, "step": 9296 }, { "epoch": 22.566221142162817, "grad_norm": 0.7212660908699036, "learning_rate": 2.7882000000000002e-05, "loss": 0.0235, "step": 9297 }, { "epoch": 22.56865127582017, "grad_norm": 0.31023678183555603, "learning_rate": 2.7885e-05, "loss": 0.0113, "step": 9298 }, { "epoch": 22.57108140947752, "grad_norm": 0.45629262924194336, "learning_rate": 2.7888e-05, "loss": 0.0659, "step": 9299 }, { "epoch": 22.573511543134874, "grad_norm": 0.5893154144287109, "learning_rate": 2.7891e-05, "loss": 0.0308, "step": 9300 }, { "epoch": 22.575941676792223, "grad_norm": 0.6285452246665955, "learning_rate": 2.7894e-05, "loss": 0.0235, "step": 9301 }, { "epoch": 22.578371810449575, "grad_norm": 0.4832080006599426, "learning_rate": 2.7897e-05, "loss": 0.0252, "step": 9302 }, { "epoch": 22.580801944106927, "grad_norm": 0.5075706839561462, "learning_rate": 2.79e-05, "loss": 0.0168, "step": 9303 }, { "epoch": 22.583232077764276, "grad_norm": 0.4756624102592468, "learning_rate": 2.7903e-05, "loss": 0.0246, "step": 9304 }, { "epoch": 22.585662211421628, "grad_norm": 0.30192118883132935, "learning_rate": 2.7906e-05, "loss": 0.0095, "step": 9305 }, { "epoch": 22.58809234507898, "grad_norm": 0.4943375587463379, "learning_rate": 2.7909e-05, "loss": 0.0245, "step": 9306 }, { "epoch": 22.59052247873633, "grad_norm": 0.6123420596122742, "learning_rate": 2.7912e-05, "loss": 0.0374, "step": 9307 }, { "epoch": 22.59295261239368, "grad_norm": 0.5497276186943054, "learning_rate": 2.7915e-05, "loss": 0.0309, "step": 9308 }, { "epoch": 22.595382746051033, "grad_norm": 0.46815380454063416, "learning_rate": 
2.7918e-05, "loss": 0.0186, "step": 9309 }, { "epoch": 22.597812879708385, "grad_norm": 0.5451845526695251, "learning_rate": 2.7921e-05, "loss": 0.0199, "step": 9310 }, { "epoch": 22.600243013365734, "grad_norm": 1.598708152770996, "learning_rate": 2.7924e-05, "loss": 0.029, "step": 9311 }, { "epoch": 22.602673147023086, "grad_norm": 1.24983811378479, "learning_rate": 2.7927e-05, "loss": 0.0268, "step": 9312 }, { "epoch": 22.60510328068044, "grad_norm": 1.0216436386108398, "learning_rate": 2.7930000000000002e-05, "loss": 0.038, "step": 9313 }, { "epoch": 22.607533414337787, "grad_norm": 0.9176215529441833, "learning_rate": 2.7933000000000002e-05, "loss": 0.0815, "step": 9314 }, { "epoch": 22.60996354799514, "grad_norm": 1.3020471334457397, "learning_rate": 2.7936e-05, "loss": 0.2192, "step": 9315 }, { "epoch": 22.61239368165249, "grad_norm": 0.6484166383743286, "learning_rate": 2.7939e-05, "loss": 0.1603, "step": 9316 }, { "epoch": 22.61482381530984, "grad_norm": 0.5293410420417786, "learning_rate": 2.7942e-05, "loss": 0.108, "step": 9317 }, { "epoch": 22.617253948967193, "grad_norm": 0.5187203884124756, "learning_rate": 2.7945e-05, "loss": 0.1084, "step": 9318 }, { "epoch": 22.619684082624545, "grad_norm": 0.8823221921920776, "learning_rate": 2.7948e-05, "loss": 0.1468, "step": 9319 }, { "epoch": 22.622114216281897, "grad_norm": 0.6995418071746826, "learning_rate": 2.7951e-05, "loss": 0.0879, "step": 9320 }, { "epoch": 22.624544349939246, "grad_norm": 0.6561338305473328, "learning_rate": 2.7954e-05, "loss": 0.0748, "step": 9321 }, { "epoch": 22.626974483596598, "grad_norm": 0.6966776847839355, "learning_rate": 2.7957e-05, "loss": 0.069, "step": 9322 }, { "epoch": 22.62940461725395, "grad_norm": 0.5141066908836365, "learning_rate": 2.7960000000000003e-05, "loss": 0.035, "step": 9323 }, { "epoch": 22.6318347509113, "grad_norm": 0.5810403823852539, "learning_rate": 2.7963000000000003e-05, "loss": 0.0493, "step": 9324 }, { "epoch": 22.63426488456865, "grad_norm": 
0.4932030439376831, "learning_rate": 2.7966000000000003e-05, "loss": 0.0499, "step": 9325 }, { "epoch": 22.636695018226003, "grad_norm": 0.2928663194179535, "learning_rate": 2.7969000000000003e-05, "loss": 0.0293, "step": 9326 }, { "epoch": 22.639125151883352, "grad_norm": 0.44927459955215454, "learning_rate": 2.7972000000000003e-05, "loss": 0.0455, "step": 9327 }, { "epoch": 22.641555285540704, "grad_norm": 0.24488972127437592, "learning_rate": 2.7975e-05, "loss": 0.0176, "step": 9328 }, { "epoch": 22.643985419198057, "grad_norm": 0.3953351676464081, "learning_rate": 2.7978e-05, "loss": 0.027, "step": 9329 }, { "epoch": 22.64641555285541, "grad_norm": 0.6846179366111755, "learning_rate": 2.7981e-05, "loss": 0.0544, "step": 9330 }, { "epoch": 22.648845686512757, "grad_norm": 0.26763325929641724, "learning_rate": 2.7984e-05, "loss": 0.0198, "step": 9331 }, { "epoch": 22.65127582017011, "grad_norm": 0.4463162124156952, "learning_rate": 2.7986999999999998e-05, "loss": 0.0344, "step": 9332 }, { "epoch": 22.653705953827462, "grad_norm": 0.4693339765071869, "learning_rate": 2.799e-05, "loss": 0.0264, "step": 9333 }, { "epoch": 22.65613608748481, "grad_norm": 0.6122652888298035, "learning_rate": 2.7993e-05, "loss": 0.0334, "step": 9334 }, { "epoch": 22.658566221142163, "grad_norm": 0.30721843242645264, "learning_rate": 2.7996e-05, "loss": 0.0148, "step": 9335 }, { "epoch": 22.660996354799515, "grad_norm": 0.6713215708732605, "learning_rate": 2.7999e-05, "loss": 0.0282, "step": 9336 }, { "epoch": 22.663426488456864, "grad_norm": 0.29405027627944946, "learning_rate": 2.8002e-05, "loss": 0.0205, "step": 9337 }, { "epoch": 22.665856622114216, "grad_norm": 0.7730758190155029, "learning_rate": 2.8005e-05, "loss": 0.0222, "step": 9338 }, { "epoch": 22.668286755771568, "grad_norm": 0.47438108921051025, "learning_rate": 2.8008e-05, "loss": 0.0152, "step": 9339 }, { "epoch": 22.670716889428917, "grad_norm": 1.0020861625671387, "learning_rate": 2.8011e-05, "loss": 0.0427, "step": 
9340 }, { "epoch": 22.67314702308627, "grad_norm": 0.9341611862182617, "learning_rate": 2.8014e-05, "loss": 0.0306, "step": 9341 }, { "epoch": 22.67557715674362, "grad_norm": 0.4241943955421448, "learning_rate": 2.8017e-05, "loss": 0.0192, "step": 9342 }, { "epoch": 22.678007290400974, "grad_norm": 0.6016605496406555, "learning_rate": 2.8020000000000003e-05, "loss": 0.0219, "step": 9343 }, { "epoch": 22.680437424058322, "grad_norm": 0.8628275394439697, "learning_rate": 2.8023000000000003e-05, "loss": 0.0146, "step": 9344 }, { "epoch": 22.682867557715674, "grad_norm": 0.9867396950721741, "learning_rate": 2.8026000000000002e-05, "loss": 0.0237, "step": 9345 }, { "epoch": 22.685297691373027, "grad_norm": 0.49751609563827515, "learning_rate": 2.8029000000000002e-05, "loss": 0.0183, "step": 9346 }, { "epoch": 22.687727825030375, "grad_norm": 1.1012883186340332, "learning_rate": 2.8032000000000002e-05, "loss": 0.0336, "step": 9347 }, { "epoch": 22.690157958687728, "grad_norm": 0.6060912609100342, "learning_rate": 2.8035000000000002e-05, "loss": 0.0296, "step": 9348 }, { "epoch": 22.69258809234508, "grad_norm": 0.36981600522994995, "learning_rate": 2.8038e-05, "loss": 0.0187, "step": 9349 }, { "epoch": 22.69501822600243, "grad_norm": 0.399387001991272, "learning_rate": 2.8041e-05, "loss": 0.0239, "step": 9350 }, { "epoch": 22.69744835965978, "grad_norm": 0.36690255999565125, "learning_rate": 2.8044e-05, "loss": 0.0155, "step": 9351 }, { "epoch": 22.699878493317133, "grad_norm": 0.6519044041633606, "learning_rate": 2.8047e-05, "loss": 0.0273, "step": 9352 }, { "epoch": 22.702308626974485, "grad_norm": 0.5096431970596313, "learning_rate": 2.805e-05, "loss": 0.0212, "step": 9353 }, { "epoch": 22.704738760631834, "grad_norm": 0.6430704593658447, "learning_rate": 2.8053e-05, "loss": 0.0317, "step": 9354 }, { "epoch": 22.707168894289186, "grad_norm": 0.6916409134864807, "learning_rate": 2.8056e-05, "loss": 0.0216, "step": 9355 }, { "epoch": 22.70959902794654, "grad_norm": 
0.6632130146026611, "learning_rate": 2.8059e-05, "loss": 0.0263, "step": 9356 }, { "epoch": 22.712029161603887, "grad_norm": 0.47448408603668213, "learning_rate": 2.8062e-05, "loss": 0.0273, "step": 9357 }, { "epoch": 22.71445929526124, "grad_norm": 0.2979868948459625, "learning_rate": 2.8065e-05, "loss": 0.0136, "step": 9358 }, { "epoch": 22.71688942891859, "grad_norm": 0.9389346837997437, "learning_rate": 2.8068e-05, "loss": 0.0364, "step": 9359 }, { "epoch": 22.71931956257594, "grad_norm": 0.9410580992698669, "learning_rate": 2.8071e-05, "loss": 0.032, "step": 9360 }, { "epoch": 22.721749696233292, "grad_norm": 3.217963457107544, "learning_rate": 2.8074e-05, "loss": 0.0358, "step": 9361 }, { "epoch": 22.724179829890645, "grad_norm": 0.6943159103393555, "learning_rate": 2.8077e-05, "loss": 0.0295, "step": 9362 }, { "epoch": 22.726609963547997, "grad_norm": 0.5410875678062439, "learning_rate": 2.8080000000000002e-05, "loss": 0.0163, "step": 9363 }, { "epoch": 22.729040097205345, "grad_norm": 2.1501495838165283, "learning_rate": 2.8083000000000002e-05, "loss": 0.0729, "step": 9364 }, { "epoch": 22.731470230862698, "grad_norm": 0.9641919136047363, "learning_rate": 2.8086000000000002e-05, "loss": 0.2331, "step": 9365 }, { "epoch": 22.73390036452005, "grad_norm": 1.7116528749465942, "learning_rate": 2.8089e-05, "loss": 0.1605, "step": 9366 }, { "epoch": 22.7363304981774, "grad_norm": 0.7709829807281494, "learning_rate": 2.8092e-05, "loss": 0.1277, "step": 9367 }, { "epoch": 22.73876063183475, "grad_norm": 0.8970803618431091, "learning_rate": 2.8095e-05, "loss": 0.1145, "step": 9368 }, { "epoch": 22.741190765492103, "grad_norm": 0.8573577404022217, "learning_rate": 2.8098e-05, "loss": 0.1039, "step": 9369 }, { "epoch": 22.74362089914945, "grad_norm": 0.6778486371040344, "learning_rate": 2.8101e-05, "loss": 0.1083, "step": 9370 }, { "epoch": 22.746051032806804, "grad_norm": 0.4432999789714813, "learning_rate": 2.8104e-05, "loss": 0.0621, "step": 9371 }, { "epoch": 
22.748481166464156, "grad_norm": 1.382272481918335, "learning_rate": 2.8107e-05, "loss": 0.0696, "step": 9372 }, { "epoch": 22.75091130012151, "grad_norm": 0.965857982635498, "learning_rate": 2.8110000000000004e-05, "loss": 0.0622, "step": 9373 }, { "epoch": 22.753341433778857, "grad_norm": 0.8091889023780823, "learning_rate": 2.8113000000000003e-05, "loss": 0.0592, "step": 9374 }, { "epoch": 22.75577156743621, "grad_norm": 0.4894338548183441, "learning_rate": 2.8116000000000003e-05, "loss": 0.0328, "step": 9375 }, { "epoch": 22.75820170109356, "grad_norm": 0.4893525540828705, "learning_rate": 2.8119000000000003e-05, "loss": 0.048, "step": 9376 }, { "epoch": 22.76063183475091, "grad_norm": 0.4641259014606476, "learning_rate": 2.8122e-05, "loss": 0.0382, "step": 9377 }, { "epoch": 22.763061968408262, "grad_norm": 0.36884376406669617, "learning_rate": 2.8125e-05, "loss": 0.0324, "step": 9378 }, { "epoch": 22.765492102065615, "grad_norm": 0.5786806344985962, "learning_rate": 2.8128e-05, "loss": 0.0409, "step": 9379 }, { "epoch": 22.767922235722963, "grad_norm": 0.4237705171108246, "learning_rate": 2.8131e-05, "loss": 0.0262, "step": 9380 }, { "epoch": 22.770352369380316, "grad_norm": 0.5643070936203003, "learning_rate": 2.8134e-05, "loss": 0.0417, "step": 9381 }, { "epoch": 22.772782503037668, "grad_norm": 0.3885447680950165, "learning_rate": 2.8137e-05, "loss": 0.0266, "step": 9382 }, { "epoch": 22.775212636695016, "grad_norm": 0.731120765209198, "learning_rate": 2.8139999999999998e-05, "loss": 0.0378, "step": 9383 }, { "epoch": 22.77764277035237, "grad_norm": 0.36511409282684326, "learning_rate": 2.8143e-05, "loss": 0.0188, "step": 9384 }, { "epoch": 22.78007290400972, "grad_norm": 0.49961376190185547, "learning_rate": 2.8146e-05, "loss": 0.0224, "step": 9385 }, { "epoch": 22.782503037667073, "grad_norm": 0.522123396396637, "learning_rate": 2.8149e-05, "loss": 0.0418, "step": 9386 }, { "epoch": 22.784933171324422, "grad_norm": 0.4678845703601837, "learning_rate": 
2.8152e-05, "loss": 0.0511, "step": 9387 }, { "epoch": 22.787363304981774, "grad_norm": 0.4056362509727478, "learning_rate": 2.8155e-05, "loss": 0.0228, "step": 9388 }, { "epoch": 22.789793438639126, "grad_norm": 0.4782094657421112, "learning_rate": 2.8158e-05, "loss": 0.0287, "step": 9389 }, { "epoch": 22.792223572296475, "grad_norm": 0.4575834274291992, "learning_rate": 2.8161e-05, "loss": 0.0154, "step": 9390 }, { "epoch": 22.794653705953827, "grad_norm": 0.43615227937698364, "learning_rate": 2.8164e-05, "loss": 0.0257, "step": 9391 }, { "epoch": 22.79708383961118, "grad_norm": 0.8591693043708801, "learning_rate": 2.8167e-05, "loss": 0.0197, "step": 9392 }, { "epoch": 22.799513973268528, "grad_norm": 0.3827569782733917, "learning_rate": 2.817e-05, "loss": 0.0192, "step": 9393 }, { "epoch": 22.80194410692588, "grad_norm": 0.6243499517440796, "learning_rate": 2.8173000000000003e-05, "loss": 0.0317, "step": 9394 }, { "epoch": 22.804374240583233, "grad_norm": 0.6299105882644653, "learning_rate": 2.8176000000000003e-05, "loss": 0.0244, "step": 9395 }, { "epoch": 22.806804374240585, "grad_norm": 0.3539683222770691, "learning_rate": 2.8179000000000002e-05, "loss": 0.0167, "step": 9396 }, { "epoch": 22.809234507897933, "grad_norm": 0.690250039100647, "learning_rate": 2.8182000000000002e-05, "loss": 0.0241, "step": 9397 }, { "epoch": 22.811664641555286, "grad_norm": 0.9647139310836792, "learning_rate": 2.8185000000000002e-05, "loss": 0.0264, "step": 9398 }, { "epoch": 22.814094775212638, "grad_norm": 0.29806557297706604, "learning_rate": 2.8188000000000002e-05, "loss": 0.0165, "step": 9399 }, { "epoch": 22.816524908869987, "grad_norm": 0.8239002227783203, "learning_rate": 2.8191e-05, "loss": 0.0336, "step": 9400 }, { "epoch": 22.81895504252734, "grad_norm": 0.3648075461387634, "learning_rate": 2.8194e-05, "loss": 0.0182, "step": 9401 }, { "epoch": 22.82138517618469, "grad_norm": 1.0636276006698608, "learning_rate": 2.8196999999999998e-05, "loss": 0.0456, "step": 9402 }, 
{ "epoch": 22.82381530984204, "grad_norm": 1.6598310470581055, "learning_rate": 2.8199999999999998e-05, "loss": 0.0553, "step": 9403 }, { "epoch": 22.826245443499392, "grad_norm": 1.1525994539260864, "learning_rate": 2.8203e-05, "loss": 0.0201, "step": 9404 }, { "epoch": 22.828675577156744, "grad_norm": 0.44960400462150574, "learning_rate": 2.8206e-05, "loss": 0.0246, "step": 9405 }, { "epoch": 22.831105710814096, "grad_norm": 0.29651591181755066, "learning_rate": 2.8209e-05, "loss": 0.0132, "step": 9406 }, { "epoch": 22.833535844471445, "grad_norm": 0.5023442506790161, "learning_rate": 2.8212e-05, "loss": 0.0184, "step": 9407 }, { "epoch": 22.835965978128797, "grad_norm": 0.6020722389221191, "learning_rate": 2.8215e-05, "loss": 0.026, "step": 9408 }, { "epoch": 22.83839611178615, "grad_norm": 0.6313143968582153, "learning_rate": 2.8218e-05, "loss": 0.0256, "step": 9409 }, { "epoch": 22.8408262454435, "grad_norm": 0.7962828874588013, "learning_rate": 2.8221e-05, "loss": 0.0236, "step": 9410 }, { "epoch": 22.84325637910085, "grad_norm": 0.7888681292533875, "learning_rate": 2.8224e-05, "loss": 0.0343, "step": 9411 }, { "epoch": 22.845686512758203, "grad_norm": 1.2632921934127808, "learning_rate": 2.8227e-05, "loss": 0.0291, "step": 9412 }, { "epoch": 22.84811664641555, "grad_norm": 0.9216704964637756, "learning_rate": 2.823e-05, "loss": 0.032, "step": 9413 }, { "epoch": 22.850546780072904, "grad_norm": 1.0101237297058105, "learning_rate": 2.8233000000000002e-05, "loss": 0.0364, "step": 9414 }, { "epoch": 22.852976913730256, "grad_norm": 0.8496021032333374, "learning_rate": 2.8236000000000002e-05, "loss": 0.2335, "step": 9415 }, { "epoch": 22.855407047387608, "grad_norm": 0.6947756409645081, "learning_rate": 2.8239000000000002e-05, "loss": 0.1432, "step": 9416 }, { "epoch": 22.857837181044957, "grad_norm": 0.4704570174217224, "learning_rate": 2.8242e-05, "loss": 0.0997, "step": 9417 }, { "epoch": 22.86026731470231, "grad_norm": 0.5403448343276978, "learning_rate": 
2.8245e-05, "loss": 0.1286, "step": 9418 }, { "epoch": 22.86269744835966, "grad_norm": 0.7318487167358398, "learning_rate": 2.8248e-05, "loss": 0.1179, "step": 9419 }, { "epoch": 22.86512758201701, "grad_norm": 0.49083393812179565, "learning_rate": 2.8251e-05, "loss": 0.0761, "step": 9420 }, { "epoch": 22.867557715674362, "grad_norm": 0.614996075630188, "learning_rate": 2.8254e-05, "loss": 0.0618, "step": 9421 }, { "epoch": 22.869987849331714, "grad_norm": 0.5688278079032898, "learning_rate": 2.8257e-05, "loss": 0.0558, "step": 9422 }, { "epoch": 22.872417982989063, "grad_norm": 0.5487556457519531, "learning_rate": 2.826e-05, "loss": 0.0374, "step": 9423 }, { "epoch": 22.874848116646415, "grad_norm": 0.6004343628883362, "learning_rate": 2.8263000000000004e-05, "loss": 0.0423, "step": 9424 }, { "epoch": 22.877278250303767, "grad_norm": 0.566051721572876, "learning_rate": 2.8266000000000003e-05, "loss": 0.0399, "step": 9425 }, { "epoch": 22.879708383961116, "grad_norm": 0.3323514759540558, "learning_rate": 2.8269e-05, "loss": 0.021, "step": 9426 }, { "epoch": 22.88213851761847, "grad_norm": 0.648038387298584, "learning_rate": 2.8272e-05, "loss": 0.0438, "step": 9427 }, { "epoch": 22.88456865127582, "grad_norm": 0.48871010541915894, "learning_rate": 2.8275e-05, "loss": 0.0298, "step": 9428 }, { "epoch": 22.886998784933173, "grad_norm": 0.741241991519928, "learning_rate": 2.8278e-05, "loss": 0.0263, "step": 9429 }, { "epoch": 22.88942891859052, "grad_norm": 0.4598803222179413, "learning_rate": 2.8281e-05, "loss": 0.0242, "step": 9430 }, { "epoch": 22.891859052247874, "grad_norm": 0.670743465423584, "learning_rate": 2.8284e-05, "loss": 0.0284, "step": 9431 }, { "epoch": 22.894289185905226, "grad_norm": 0.5211281776428223, "learning_rate": 2.8287e-05, "loss": 0.0219, "step": 9432 }, { "epoch": 22.896719319562575, "grad_norm": 2.1477580070495605, "learning_rate": 2.829e-05, "loss": 0.0223, "step": 9433 }, { "epoch": 22.899149453219927, "grad_norm": 1.1769518852233887, 
"learning_rate": 2.8293e-05, "loss": 0.0437, "step": 9434 }, { "epoch": 22.90157958687728, "grad_norm": 0.381596177816391, "learning_rate": 2.8296e-05, "loss": 0.021, "step": 9435 }, { "epoch": 22.904009720534628, "grad_norm": 0.4303818643093109, "learning_rate": 2.8299e-05, "loss": 0.018, "step": 9436 }, { "epoch": 22.90643985419198, "grad_norm": 0.4355303943157196, "learning_rate": 2.8302e-05, "loss": 0.0223, "step": 9437 }, { "epoch": 22.908869987849332, "grad_norm": 0.4276711940765381, "learning_rate": 2.8305e-05, "loss": 0.0175, "step": 9438 }, { "epoch": 22.911300121506684, "grad_norm": 1.0638747215270996, "learning_rate": 2.8308e-05, "loss": 0.018, "step": 9439 }, { "epoch": 22.913730255164033, "grad_norm": 0.5336689949035645, "learning_rate": 2.8311e-05, "loss": 0.0244, "step": 9440 }, { "epoch": 22.916160388821385, "grad_norm": 0.27811896800994873, "learning_rate": 2.8314e-05, "loss": 0.0141, "step": 9441 }, { "epoch": 22.918590522478738, "grad_norm": 0.5784602165222168, "learning_rate": 2.8317e-05, "loss": 0.024, "step": 9442 }, { "epoch": 22.921020656136086, "grad_norm": 0.4955185353755951, "learning_rate": 2.832e-05, "loss": 0.0341, "step": 9443 }, { "epoch": 22.92345078979344, "grad_norm": 1.2101373672485352, "learning_rate": 2.8323000000000003e-05, "loss": 0.0313, "step": 9444 }, { "epoch": 22.92588092345079, "grad_norm": 0.498500794172287, "learning_rate": 2.8326000000000003e-05, "loss": 0.0229, "step": 9445 }, { "epoch": 22.92831105710814, "grad_norm": 0.2996995151042938, "learning_rate": 2.8329000000000003e-05, "loss": 0.0132, "step": 9446 }, { "epoch": 22.93074119076549, "grad_norm": 0.4673643112182617, "learning_rate": 2.8332000000000002e-05, "loss": 0.0248, "step": 9447 }, { "epoch": 22.933171324422844, "grad_norm": 0.367348849773407, "learning_rate": 2.8335000000000002e-05, "loss": 0.0192, "step": 9448 }, { "epoch": 22.935601458080196, "grad_norm": 0.6791804432868958, "learning_rate": 2.8338000000000002e-05, "loss": 0.0148, "step": 9449 }, { 
"epoch": 22.938031591737545, "grad_norm": 0.3515191376209259, "learning_rate": 2.8341000000000002e-05, "loss": 0.0177, "step": 9450 }, { "epoch": 22.940461725394897, "grad_norm": 2.51188325881958, "learning_rate": 2.8344e-05, "loss": 0.033, "step": 9451 }, { "epoch": 22.94289185905225, "grad_norm": 0.8694812655448914, "learning_rate": 2.8346999999999998e-05, "loss": 0.0232, "step": 9452 }, { "epoch": 22.945321992709598, "grad_norm": 0.7809587121009827, "learning_rate": 2.8349999999999998e-05, "loss": 0.0318, "step": 9453 }, { "epoch": 22.94775212636695, "grad_norm": 0.9121000170707703, "learning_rate": 2.8353e-05, "loss": 0.0357, "step": 9454 }, { "epoch": 22.950182260024302, "grad_norm": 0.6000341773033142, "learning_rate": 2.8356e-05, "loss": 0.0211, "step": 9455 }, { "epoch": 22.95261239368165, "grad_norm": 1.295871615409851, "learning_rate": 2.8359e-05, "loss": 0.0235, "step": 9456 }, { "epoch": 22.955042527339003, "grad_norm": 0.6452890038490295, "learning_rate": 2.8362e-05, "loss": 0.028, "step": 9457 }, { "epoch": 22.957472660996356, "grad_norm": 0.520392894744873, "learning_rate": 2.8365e-05, "loss": 0.0265, "step": 9458 }, { "epoch": 22.959902794653708, "grad_norm": 0.808550238609314, "learning_rate": 2.8368e-05, "loss": 0.0403, "step": 9459 }, { "epoch": 22.962332928311056, "grad_norm": 0.6250235438346863, "learning_rate": 2.8371e-05, "loss": 0.0251, "step": 9460 }, { "epoch": 22.96476306196841, "grad_norm": 0.7097616195678711, "learning_rate": 2.8374e-05, "loss": 0.0298, "step": 9461 }, { "epoch": 22.96719319562576, "grad_norm": 1.4343596696853638, "learning_rate": 2.8377e-05, "loss": 0.045, "step": 9462 }, { "epoch": 22.96962332928311, "grad_norm": 0.8437432646751404, "learning_rate": 2.838e-05, "loss": 0.0246, "step": 9463 }, { "epoch": 22.972053462940462, "grad_norm": 1.0240205526351929, "learning_rate": 2.8383000000000003e-05, "loss": 0.0575, "step": 9464 }, { "epoch": 22.974483596597814, "grad_norm": 0.7677366137504578, "learning_rate": 
2.8386000000000002e-05, "loss": 0.1629, "step": 9465 }, { "epoch": 22.976913730255163, "grad_norm": 0.6210299730300903, "learning_rate": 2.8389000000000002e-05, "loss": 0.0713, "step": 9466 }, { "epoch": 22.979343863912515, "grad_norm": 0.3806281089782715, "learning_rate": 2.8392000000000002e-05, "loss": 0.0367, "step": 9467 }, { "epoch": 22.981773997569867, "grad_norm": 0.5358873605728149, "learning_rate": 2.8395000000000002e-05, "loss": 0.0325, "step": 9468 }, { "epoch": 22.984204131227216, "grad_norm": 1.3514670133590698, "learning_rate": 2.8398e-05, "loss": 0.0425, "step": 9469 }, { "epoch": 22.986634264884568, "grad_norm": 0.4905816614627838, "learning_rate": 2.8401e-05, "loss": 0.0218, "step": 9470 }, { "epoch": 22.98906439854192, "grad_norm": 0.5260843634605408, "learning_rate": 2.8404e-05, "loss": 0.0226, "step": 9471 }, { "epoch": 22.991494532199273, "grad_norm": 0.8495868444442749, "learning_rate": 2.8407e-05, "loss": 0.0324, "step": 9472 }, { "epoch": 22.99392466585662, "grad_norm": 0.39350762963294983, "learning_rate": 2.841e-05, "loss": 0.0221, "step": 9473 }, { "epoch": 22.996354799513973, "grad_norm": 0.5451349020004272, "learning_rate": 2.8413000000000004e-05, "loss": 0.0194, "step": 9474 }, { "epoch": 22.998784933171326, "grad_norm": 0.5893639326095581, "learning_rate": 2.8416e-05, "loss": 0.0278, "step": 9475 }, { "epoch": 23.0, "grad_norm": 1.343565583229065, "learning_rate": 2.8419e-05, "loss": 0.0607, "step": 9476 }, { "epoch": 23.002430133657352, "grad_norm": 1.1660603284835815, "learning_rate": 2.8422e-05, "loss": 0.2055, "step": 9477 }, { "epoch": 23.0048602673147, "grad_norm": 0.9283016324043274, "learning_rate": 2.8425e-05, "loss": 0.1224, "step": 9478 }, { "epoch": 23.007290400972053, "grad_norm": 0.4372737407684326, "learning_rate": 2.8428e-05, "loss": 0.089, "step": 9479 }, { "epoch": 23.009720534629405, "grad_norm": 0.7654767036437988, "learning_rate": 2.8431e-05, "loss": 0.1148, "step": 9480 }, { "epoch": 23.012150668286754, 
"grad_norm": 0.8734636902809143, "learning_rate": 2.8434e-05, "loss": 0.1043, "step": 9481 }, { "epoch": 23.014580801944106, "grad_norm": 0.9045001268386841, "learning_rate": 2.8437e-05, "loss": 0.0819, "step": 9482 }, { "epoch": 23.01701093560146, "grad_norm": 0.720622181892395, "learning_rate": 2.844e-05, "loss": 0.0773, "step": 9483 }, { "epoch": 23.01944106925881, "grad_norm": 0.44730719923973083, "learning_rate": 2.8443000000000002e-05, "loss": 0.0498, "step": 9484 }, { "epoch": 23.02187120291616, "grad_norm": 0.6341466903686523, "learning_rate": 2.8446000000000002e-05, "loss": 0.0427, "step": 9485 }, { "epoch": 23.02430133657351, "grad_norm": 0.6713523268699646, "learning_rate": 2.8449e-05, "loss": 0.0861, "step": 9486 }, { "epoch": 23.026731470230864, "grad_norm": 0.6554563045501709, "learning_rate": 2.8452e-05, "loss": 0.0424, "step": 9487 }, { "epoch": 23.029161603888213, "grad_norm": 0.4606843888759613, "learning_rate": 2.8455e-05, "loss": 0.0268, "step": 9488 }, { "epoch": 23.031591737545565, "grad_norm": 0.4122871160507202, "learning_rate": 2.8458e-05, "loss": 0.0268, "step": 9489 }, { "epoch": 23.034021871202917, "grad_norm": 0.3034082353115082, "learning_rate": 2.8461e-05, "loss": 0.0256, "step": 9490 }, { "epoch": 23.036452004860266, "grad_norm": 0.40595242381095886, "learning_rate": 2.8464e-05, "loss": 0.0313, "step": 9491 }, { "epoch": 23.038882138517618, "grad_norm": 0.36841171979904175, "learning_rate": 2.8467e-05, "loss": 0.0142, "step": 9492 }, { "epoch": 23.04131227217497, "grad_norm": 0.49264591932296753, "learning_rate": 2.847e-05, "loss": 0.021, "step": 9493 }, { "epoch": 23.043742405832322, "grad_norm": 0.28425416350364685, "learning_rate": 2.8473000000000003e-05, "loss": 0.0165, "step": 9494 }, { "epoch": 23.04617253948967, "grad_norm": 0.5000314712524414, "learning_rate": 2.8476000000000003e-05, "loss": 0.0304, "step": 9495 }, { "epoch": 23.048602673147023, "grad_norm": 0.436055451631546, "learning_rate": 2.8479000000000003e-05, "loss": 
0.0252, "step": 9496 }, { "epoch": 23.051032806804375, "grad_norm": 0.3492831587791443, "learning_rate": 2.8482000000000003e-05, "loss": 0.0148, "step": 9497 }, { "epoch": 23.053462940461724, "grad_norm": 0.4163864254951477, "learning_rate": 2.8485000000000003e-05, "loss": 0.0105, "step": 9498 }, { "epoch": 23.055893074119076, "grad_norm": 0.5151399970054626, "learning_rate": 2.8488000000000002e-05, "loss": 0.0169, "step": 9499 }, { "epoch": 23.05832320777643, "grad_norm": 0.7082837820053101, "learning_rate": 2.8491e-05, "loss": 0.0212, "step": 9500 }, { "epoch": 23.060753341433777, "grad_norm": 0.5986821055412292, "learning_rate": 2.8494e-05, "loss": 0.0202, "step": 9501 }, { "epoch": 23.06318347509113, "grad_norm": 0.5387284159660339, "learning_rate": 2.8497e-05, "loss": 0.0205, "step": 9502 }, { "epoch": 23.06561360874848, "grad_norm": 0.2771790325641632, "learning_rate": 2.8499999999999998e-05, "loss": 0.0136, "step": 9503 }, { "epoch": 23.068043742405834, "grad_norm": 0.45849311351776123, "learning_rate": 2.8502999999999998e-05, "loss": 0.0155, "step": 9504 }, { "epoch": 23.070473876063183, "grad_norm": 0.4336565434932709, "learning_rate": 2.8506e-05, "loss": 0.0222, "step": 9505 }, { "epoch": 23.072904009720535, "grad_norm": 0.4672035574913025, "learning_rate": 2.8509e-05, "loss": 0.0216, "step": 9506 }, { "epoch": 23.075334143377887, "grad_norm": 0.407081663608551, "learning_rate": 2.8512e-05, "loss": 0.0215, "step": 9507 }, { "epoch": 23.077764277035236, "grad_norm": 0.7343034744262695, "learning_rate": 2.8515e-05, "loss": 0.0223, "step": 9508 }, { "epoch": 23.080194410692588, "grad_norm": 0.5822822451591492, "learning_rate": 2.8518e-05, "loss": 0.0216, "step": 9509 }, { "epoch": 23.08262454434994, "grad_norm": 0.35504862666130066, "learning_rate": 2.8521e-05, "loss": 0.0099, "step": 9510 }, { "epoch": 23.08505467800729, "grad_norm": 0.32035449147224426, "learning_rate": 2.8524e-05, "loss": 0.0129, "step": 9511 }, { "epoch": 23.08748481166464, "grad_norm": 
0.37946006655693054, "learning_rate": 2.8527e-05, "loss": 0.0158, "step": 9512 }, { "epoch": 23.089914945321993, "grad_norm": 0.43072885274887085, "learning_rate": 2.853e-05, "loss": 0.0142, "step": 9513 }, { "epoch": 23.092345078979346, "grad_norm": 0.5147731304168701, "learning_rate": 2.8533e-05, "loss": 0.0171, "step": 9514 }, { "epoch": 23.094775212636694, "grad_norm": 0.40962740778923035, "learning_rate": 2.8536000000000003e-05, "loss": 0.0121, "step": 9515 }, { "epoch": 23.097205346294047, "grad_norm": 0.7077569365501404, "learning_rate": 2.8539000000000002e-05, "loss": 0.0369, "step": 9516 }, { "epoch": 23.0996354799514, "grad_norm": 0.3777700960636139, "learning_rate": 2.8542000000000002e-05, "loss": 0.0159, "step": 9517 }, { "epoch": 23.102065613608747, "grad_norm": 0.45212823152542114, "learning_rate": 2.8545000000000002e-05, "loss": 0.0116, "step": 9518 }, { "epoch": 23.1044957472661, "grad_norm": 0.8842912316322327, "learning_rate": 2.8548000000000002e-05, "loss": 0.0148, "step": 9519 }, { "epoch": 23.106925880923452, "grad_norm": 0.34436267614364624, "learning_rate": 2.8551e-05, "loss": 0.0147, "step": 9520 }, { "epoch": 23.1093560145808, "grad_norm": 0.8853698968887329, "learning_rate": 2.8554e-05, "loss": 0.018, "step": 9521 }, { "epoch": 23.111786148238153, "grad_norm": 0.538921058177948, "learning_rate": 2.8557e-05, "loss": 0.0205, "step": 9522 }, { "epoch": 23.114216281895505, "grad_norm": 0.366825670003891, "learning_rate": 2.856e-05, "loss": 0.0181, "step": 9523 }, { "epoch": 23.116646415552854, "grad_norm": 1.3063428401947021, "learning_rate": 2.8563e-05, "loss": 0.0288, "step": 9524 }, { "epoch": 23.119076549210206, "grad_norm": 2.898780107498169, "learning_rate": 2.8566e-05, "loss": 0.0437, "step": 9525 }, { "epoch": 23.121506682867558, "grad_norm": 0.9766759872436523, "learning_rate": 2.8569e-05, "loss": 0.0361, "step": 9526 }, { "epoch": 23.12393681652491, "grad_norm": 0.8211612701416016, "learning_rate": 2.8572e-05, "loss": 0.2127, "step": 
9527 }, { "epoch": 23.12636695018226, "grad_norm": 0.7159101366996765, "learning_rate": 2.8575e-05, "loss": 0.1721, "step": 9528 }, { "epoch": 23.12879708383961, "grad_norm": 0.46314769983291626, "learning_rate": 2.8578e-05, "loss": 0.0935, "step": 9529 }, { "epoch": 23.131227217496964, "grad_norm": 0.5885925889015198, "learning_rate": 2.8581e-05, "loss": 0.091, "step": 9530 }, { "epoch": 23.133657351154312, "grad_norm": 0.586782693862915, "learning_rate": 2.8584e-05, "loss": 0.0776, "step": 9531 }, { "epoch": 23.136087484811664, "grad_norm": 0.5335400700569153, "learning_rate": 2.8587e-05, "loss": 0.0664, "step": 9532 }, { "epoch": 23.138517618469017, "grad_norm": 0.5252411365509033, "learning_rate": 2.859e-05, "loss": 0.0641, "step": 9533 }, { "epoch": 23.140947752126365, "grad_norm": 0.5249719023704529, "learning_rate": 2.8593e-05, "loss": 0.0408, "step": 9534 }, { "epoch": 23.143377885783718, "grad_norm": 0.407583624124527, "learning_rate": 2.8596000000000002e-05, "loss": 0.0335, "step": 9535 }, { "epoch": 23.14580801944107, "grad_norm": 0.5255374908447266, "learning_rate": 2.8599000000000002e-05, "loss": 0.0318, "step": 9536 }, { "epoch": 23.148238153098422, "grad_norm": 0.3808148205280304, "learning_rate": 2.8602e-05, "loss": 0.0239, "step": 9537 }, { "epoch": 23.15066828675577, "grad_norm": 0.56357741355896, "learning_rate": 2.8605e-05, "loss": 0.0282, "step": 9538 }, { "epoch": 23.153098420413123, "grad_norm": 0.8101863265037537, "learning_rate": 2.8608e-05, "loss": 0.0319, "step": 9539 }, { "epoch": 23.155528554070475, "grad_norm": 0.3424091637134552, "learning_rate": 2.8611e-05, "loss": 0.0272, "step": 9540 }, { "epoch": 23.157958687727824, "grad_norm": 0.32339268922805786, "learning_rate": 2.8614e-05, "loss": 0.0206, "step": 9541 }, { "epoch": 23.160388821385176, "grad_norm": 0.39631614089012146, "learning_rate": 2.8617e-05, "loss": 0.0174, "step": 9542 }, { "epoch": 23.16281895504253, "grad_norm": 0.3710317313671112, "learning_rate": 2.862e-05, "loss": 
0.0209, "step": 9543 }, { "epoch": 23.165249088699877, "grad_norm": 0.2813839316368103, "learning_rate": 2.8623e-05, "loss": 0.0188, "step": 9544 }, { "epoch": 23.16767922235723, "grad_norm": 0.3835070729255676, "learning_rate": 2.8626000000000003e-05, "loss": 0.0202, "step": 9545 }, { "epoch": 23.17010935601458, "grad_norm": 0.613288164138794, "learning_rate": 2.8629000000000003e-05, "loss": 0.0339, "step": 9546 }, { "epoch": 23.172539489671934, "grad_norm": 0.37356656789779663, "learning_rate": 2.8632000000000003e-05, "loss": 0.0278, "step": 9547 }, { "epoch": 23.174969623329282, "grad_norm": 0.7100823521614075, "learning_rate": 2.8635000000000003e-05, "loss": 0.0201, "step": 9548 }, { "epoch": 23.177399756986635, "grad_norm": 0.35147973895072937, "learning_rate": 2.8638e-05, "loss": 0.0174, "step": 9549 }, { "epoch": 23.179829890643987, "grad_norm": 0.4862484037876129, "learning_rate": 2.8641e-05, "loss": 0.0101, "step": 9550 }, { "epoch": 23.182260024301335, "grad_norm": 0.2584484815597534, "learning_rate": 2.8644e-05, "loss": 0.0152, "step": 9551 }, { "epoch": 23.184690157958688, "grad_norm": 0.2463264912366867, "learning_rate": 2.8647e-05, "loss": 0.0129, "step": 9552 }, { "epoch": 23.18712029161604, "grad_norm": 0.7121842503547668, "learning_rate": 2.865e-05, "loss": 0.0329, "step": 9553 }, { "epoch": 23.18955042527339, "grad_norm": 0.40462374687194824, "learning_rate": 2.8652999999999998e-05, "loss": 0.0229, "step": 9554 }, { "epoch": 23.19198055893074, "grad_norm": 0.3820768892765045, "learning_rate": 2.8656e-05, "loss": 0.0117, "step": 9555 }, { "epoch": 23.194410692588093, "grad_norm": 0.2516772747039795, "learning_rate": 2.8659e-05, "loss": 0.0114, "step": 9556 }, { "epoch": 23.19684082624544, "grad_norm": 0.5206205248832703, "learning_rate": 2.8662e-05, "loss": 0.0252, "step": 9557 }, { "epoch": 23.199270959902794, "grad_norm": 0.5527470707893372, "learning_rate": 2.8665e-05, "loss": 0.0296, "step": 9558 }, { "epoch": 23.201701093560146, "grad_norm": 
0.4516434371471405, "learning_rate": 2.8668e-05, "loss": 0.0136, "step": 9559 }, { "epoch": 23.2041312272175, "grad_norm": 0.6201870441436768, "learning_rate": 2.8671e-05, "loss": 0.0217, "step": 9560 }, { "epoch": 23.206561360874847, "grad_norm": 0.4090719223022461, "learning_rate": 2.8674e-05, "loss": 0.0121, "step": 9561 }, { "epoch": 23.2089914945322, "grad_norm": 0.4101414680480957, "learning_rate": 2.8677e-05, "loss": 0.017, "step": 9562 }, { "epoch": 23.21142162818955, "grad_norm": 0.9152660369873047, "learning_rate": 2.868e-05, "loss": 0.0248, "step": 9563 }, { "epoch": 23.2138517618469, "grad_norm": 0.9485266804695129, "learning_rate": 2.8683e-05, "loss": 0.0148, "step": 9564 }, { "epoch": 23.216281895504252, "grad_norm": 0.6215488314628601, "learning_rate": 2.8686000000000003e-05, "loss": 0.0125, "step": 9565 }, { "epoch": 23.218712029161605, "grad_norm": 0.40615391731262207, "learning_rate": 2.8689000000000003e-05, "loss": 0.0121, "step": 9566 }, { "epoch": 23.221142162818953, "grad_norm": 0.3814718723297119, "learning_rate": 2.8692000000000002e-05, "loss": 0.016, "step": 9567 }, { "epoch": 23.223572296476306, "grad_norm": 0.5636463761329651, "learning_rate": 2.8695000000000002e-05, "loss": 0.0251, "step": 9568 }, { "epoch": 23.226002430133658, "grad_norm": 0.3716985881328583, "learning_rate": 2.8698000000000002e-05, "loss": 0.0198, "step": 9569 }, { "epoch": 23.22843256379101, "grad_norm": 0.8845388889312744, "learning_rate": 2.8701000000000002e-05, "loss": 0.032, "step": 9570 }, { "epoch": 23.23086269744836, "grad_norm": 0.5753739476203918, "learning_rate": 2.8704e-05, "loss": 0.0185, "step": 9571 }, { "epoch": 23.23329283110571, "grad_norm": 1.3396326303482056, "learning_rate": 2.8707e-05, "loss": 0.0594, "step": 9572 }, { "epoch": 23.235722964763063, "grad_norm": 0.8857982158660889, "learning_rate": 2.871e-05, "loss": 0.0299, "step": 9573 }, { "epoch": 23.238153098420412, "grad_norm": 1.0062994956970215, "learning_rate": 2.8712999999999998e-05, 
"loss": 0.0274, "step": 9574 }, { "epoch": 23.240583232077764, "grad_norm": 0.80854332447052, "learning_rate": 2.8716e-05, "loss": 0.0214, "step": 9575 }, { "epoch": 23.243013365735116, "grad_norm": 1.1415010690689087, "learning_rate": 2.8719e-05, "loss": 0.0434, "step": 9576 }, { "epoch": 23.245443499392465, "grad_norm": 0.6873866319656372, "learning_rate": 2.8722e-05, "loss": 0.183, "step": 9577 }, { "epoch": 23.247873633049817, "grad_norm": 0.5782778859138489, "learning_rate": 2.8725e-05, "loss": 0.141, "step": 9578 }, { "epoch": 23.25030376670717, "grad_norm": 0.5602619051933289, "learning_rate": 2.8728e-05, "loss": 0.098, "step": 9579 }, { "epoch": 23.25273390036452, "grad_norm": 0.6403773427009583, "learning_rate": 2.8731e-05, "loss": 0.082, "step": 9580 }, { "epoch": 23.25516403402187, "grad_norm": 1.3115606307983398, "learning_rate": 2.8734e-05, "loss": 0.0776, "step": 9581 }, { "epoch": 23.257594167679223, "grad_norm": 0.9019761681556702, "learning_rate": 2.8737e-05, "loss": 0.0702, "step": 9582 }, { "epoch": 23.260024301336575, "grad_norm": 0.5122117400169373, "learning_rate": 2.874e-05, "loss": 0.0567, "step": 9583 }, { "epoch": 23.262454434993924, "grad_norm": 0.8391720652580261, "learning_rate": 2.8743e-05, "loss": 0.0661, "step": 9584 }, { "epoch": 23.264884568651276, "grad_norm": 0.570813000202179, "learning_rate": 2.8746000000000002e-05, "loss": 0.0479, "step": 9585 }, { "epoch": 23.267314702308628, "grad_norm": 0.47888001799583435, "learning_rate": 2.8749000000000002e-05, "loss": 0.0468, "step": 9586 }, { "epoch": 23.269744835965977, "grad_norm": 0.5563444495201111, "learning_rate": 2.8752000000000002e-05, "loss": 0.0401, "step": 9587 }, { "epoch": 23.27217496962333, "grad_norm": 0.4184245765209198, "learning_rate": 2.8755e-05, "loss": 0.0341, "step": 9588 }, { "epoch": 23.27460510328068, "grad_norm": 0.3650723993778229, "learning_rate": 2.8758e-05, "loss": 0.0265, "step": 9589 }, { "epoch": 23.277035236938033, "grad_norm": 0.4061654210090637, 
"learning_rate": 2.8761e-05, "loss": 0.0222, "step": 9590 }, { "epoch": 23.279465370595382, "grad_norm": 0.6556171774864197, "learning_rate": 2.8764e-05, "loss": 0.0293, "step": 9591 }, { "epoch": 23.281895504252734, "grad_norm": 0.3156546950340271, "learning_rate": 2.8767e-05, "loss": 0.0187, "step": 9592 }, { "epoch": 23.284325637910086, "grad_norm": 0.6387895941734314, "learning_rate": 2.877e-05, "loss": 0.0167, "step": 9593 }, { "epoch": 23.286755771567435, "grad_norm": 0.5840867161750793, "learning_rate": 2.8773e-05, "loss": 0.0316, "step": 9594 }, { "epoch": 23.289185905224787, "grad_norm": 0.4241392910480499, "learning_rate": 2.8776000000000004e-05, "loss": 0.0113, "step": 9595 }, { "epoch": 23.29161603888214, "grad_norm": 0.6075844764709473, "learning_rate": 2.8779000000000003e-05, "loss": 0.0299, "step": 9596 }, { "epoch": 23.29404617253949, "grad_norm": 0.41279059648513794, "learning_rate": 2.8782000000000003e-05, "loss": 0.015, "step": 9597 }, { "epoch": 23.29647630619684, "grad_norm": 0.7651547193527222, "learning_rate": 2.8785e-05, "loss": 0.0364, "step": 9598 }, { "epoch": 23.298906439854193, "grad_norm": 0.9493576884269714, "learning_rate": 2.8788e-05, "loss": 0.0395, "step": 9599 }, { "epoch": 23.30133657351154, "grad_norm": 0.7574448585510254, "learning_rate": 2.8791e-05, "loss": 0.0332, "step": 9600 }, { "epoch": 23.303766707168894, "grad_norm": 0.42393672466278076, "learning_rate": 2.8794e-05, "loss": 0.0167, "step": 9601 }, { "epoch": 23.306196840826246, "grad_norm": 0.5548346042633057, "learning_rate": 2.8797e-05, "loss": 0.0167, "step": 9602 }, { "epoch": 23.308626974483598, "grad_norm": 0.6436389088630676, "learning_rate": 2.88e-05, "loss": 0.0197, "step": 9603 }, { "epoch": 23.311057108140947, "grad_norm": 0.23040179908275604, "learning_rate": 2.8803e-05, "loss": 0.0108, "step": 9604 }, { "epoch": 23.3134872417983, "grad_norm": 0.5335226655006409, "learning_rate": 2.8806e-05, "loss": 0.017, "step": 9605 }, { "epoch": 23.31591737545565, 
"grad_norm": 0.6085913181304932, "learning_rate": 2.8809e-05, "loss": 0.0165, "step": 9606 }, { "epoch": 23.318347509113, "grad_norm": 0.5854588150978088, "learning_rate": 2.8812e-05, "loss": 0.0369, "step": 9607 }, { "epoch": 23.320777642770352, "grad_norm": 0.238523930311203, "learning_rate": 2.8815e-05, "loss": 0.0122, "step": 9608 }, { "epoch": 23.323207776427704, "grad_norm": 0.3803897798061371, "learning_rate": 2.8818e-05, "loss": 0.0122, "step": 9609 }, { "epoch": 23.325637910085053, "grad_norm": 0.2713606655597687, "learning_rate": 2.8821e-05, "loss": 0.0118, "step": 9610 }, { "epoch": 23.328068043742405, "grad_norm": 0.7343933582305908, "learning_rate": 2.8824e-05, "loss": 0.0308, "step": 9611 }, { "epoch": 23.330498177399758, "grad_norm": 0.8208174705505371, "learning_rate": 2.8827e-05, "loss": 0.0713, "step": 9612 }, { "epoch": 23.33292831105711, "grad_norm": 0.6275103092193604, "learning_rate": 2.883e-05, "loss": 0.0176, "step": 9613 }, { "epoch": 23.33535844471446, "grad_norm": 0.5239596366882324, "learning_rate": 2.8833e-05, "loss": 0.0204, "step": 9614 }, { "epoch": 23.33778857837181, "grad_norm": 0.6754969954490662, "learning_rate": 2.8836000000000003e-05, "loss": 0.0132, "step": 9615 }, { "epoch": 23.340218712029163, "grad_norm": 1.1751443147659302, "learning_rate": 2.8839000000000003e-05, "loss": 0.0381, "step": 9616 }, { "epoch": 23.34264884568651, "grad_norm": 0.41812625527381897, "learning_rate": 2.8842000000000003e-05, "loss": 0.0208, "step": 9617 }, { "epoch": 23.345078979343864, "grad_norm": 0.5406580567359924, "learning_rate": 2.8845000000000003e-05, "loss": 0.0233, "step": 9618 }, { "epoch": 23.347509113001216, "grad_norm": 0.5426051020622253, "learning_rate": 2.8848000000000002e-05, "loss": 0.0149, "step": 9619 }, { "epoch": 23.349939246658565, "grad_norm": 0.6300780773162842, "learning_rate": 2.8851000000000002e-05, "loss": 0.0282, "step": 9620 }, { "epoch": 23.352369380315917, "grad_norm": 0.975771963596344, "learning_rate": 
2.8854000000000002e-05, "loss": 0.0427, "step": 9621 }, { "epoch": 23.35479951397327, "grad_norm": 0.8385307788848877, "learning_rate": 2.8857000000000002e-05, "loss": 0.0339, "step": 9622 }, { "epoch": 23.35722964763062, "grad_norm": 4.794836521148682, "learning_rate": 2.8859999999999998e-05, "loss": 0.0394, "step": 9623 }, { "epoch": 23.35965978128797, "grad_norm": 0.8830786943435669, "learning_rate": 2.8862999999999998e-05, "loss": 0.0413, "step": 9624 }, { "epoch": 23.362089914945322, "grad_norm": 1.9888947010040283, "learning_rate": 2.8866e-05, "loss": 0.0451, "step": 9625 }, { "epoch": 23.364520048602675, "grad_norm": 0.9226353764533997, "learning_rate": 2.8869e-05, "loss": 0.0805, "step": 9626 }, { "epoch": 23.366950182260023, "grad_norm": 1.0794264078140259, "learning_rate": 2.8872e-05, "loss": 0.2087, "step": 9627 }, { "epoch": 23.369380315917375, "grad_norm": 0.6071457862854004, "learning_rate": 2.8875e-05, "loss": 0.1378, "step": 9628 }, { "epoch": 23.371810449574728, "grad_norm": 0.5712815523147583, "learning_rate": 2.8878e-05, "loss": 0.1075, "step": 9629 }, { "epoch": 23.374240583232076, "grad_norm": 0.668440580368042, "learning_rate": 2.8881e-05, "loss": 0.0873, "step": 9630 }, { "epoch": 23.37667071688943, "grad_norm": 0.6137204170227051, "learning_rate": 2.8884e-05, "loss": 0.0981, "step": 9631 }, { "epoch": 23.37910085054678, "grad_norm": 0.9029156565666199, "learning_rate": 2.8887e-05, "loss": 0.0779, "step": 9632 }, { "epoch": 23.381530984204133, "grad_norm": 0.7034996151924133, "learning_rate": 2.889e-05, "loss": 0.0753, "step": 9633 }, { "epoch": 23.38396111786148, "grad_norm": 0.6585910320281982, "learning_rate": 2.8893e-05, "loss": 0.0509, "step": 9634 }, { "epoch": 23.386391251518834, "grad_norm": 0.5057927966117859, "learning_rate": 2.8896e-05, "loss": 0.0451, "step": 9635 }, { "epoch": 23.388821385176186, "grad_norm": 0.410820335149765, "learning_rate": 2.8899000000000002e-05, "loss": 0.0272, "step": 9636 }, { "epoch": 23.391251518833535, 
"grad_norm": 0.4803529381752014, "learning_rate": 2.8902000000000002e-05, "loss": 0.038, "step": 9637 }, { "epoch": 23.393681652490887, "grad_norm": 0.28240472078323364, "learning_rate": 2.8905000000000002e-05, "loss": 0.0241, "step": 9638 }, { "epoch": 23.39611178614824, "grad_norm": 0.42024198174476624, "learning_rate": 2.8908000000000002e-05, "loss": 0.0259, "step": 9639 }, { "epoch": 23.398541919805588, "grad_norm": 0.35355156660079956, "learning_rate": 2.8911e-05, "loss": 0.0228, "step": 9640 }, { "epoch": 23.40097205346294, "grad_norm": 0.3293326199054718, "learning_rate": 2.8914e-05, "loss": 0.0311, "step": 9641 }, { "epoch": 23.403402187120292, "grad_norm": 0.6761553287506104, "learning_rate": 2.8917e-05, "loss": 0.038, "step": 9642 }, { "epoch": 23.40583232077764, "grad_norm": 0.5534124970436096, "learning_rate": 2.892e-05, "loss": 0.0361, "step": 9643 }, { "epoch": 23.408262454434993, "grad_norm": 0.3856518268585205, "learning_rate": 2.8923e-05, "loss": 0.0247, "step": 9644 }, { "epoch": 23.410692588092346, "grad_norm": 0.3452407121658325, "learning_rate": 2.8926e-05, "loss": 0.0206, "step": 9645 }, { "epoch": 23.413122721749698, "grad_norm": 0.24326658248901367, "learning_rate": 2.8929000000000004e-05, "loss": 0.0131, "step": 9646 }, { "epoch": 23.415552855407046, "grad_norm": 0.4790770411491394, "learning_rate": 2.8932e-05, "loss": 0.0245, "step": 9647 }, { "epoch": 23.4179829890644, "grad_norm": 0.47346651554107666, "learning_rate": 2.8935e-05, "loss": 0.0271, "step": 9648 }, { "epoch": 23.42041312272175, "grad_norm": 0.35833004117012024, "learning_rate": 2.8938e-05, "loss": 0.022, "step": 9649 }, { "epoch": 23.4228432563791, "grad_norm": 0.4131792187690735, "learning_rate": 2.8941e-05, "loss": 0.0169, "step": 9650 }, { "epoch": 23.425273390036452, "grad_norm": 0.8230151534080505, "learning_rate": 2.8944e-05, "loss": 0.0182, "step": 9651 }, { "epoch": 23.427703523693804, "grad_norm": 0.5830249190330505, "learning_rate": 2.8947e-05, "loss": 0.0304, 
"step": 9652 }, { "epoch": 23.430133657351153, "grad_norm": 0.42366233468055725, "learning_rate": 2.895e-05, "loss": 0.0266, "step": 9653 }, { "epoch": 23.432563791008505, "grad_norm": 0.4328458309173584, "learning_rate": 2.8953e-05, "loss": 0.0212, "step": 9654 }, { "epoch": 23.434993924665857, "grad_norm": 0.8896652460098267, "learning_rate": 2.8956e-05, "loss": 0.0439, "step": 9655 }, { "epoch": 23.43742405832321, "grad_norm": 0.3761303722858429, "learning_rate": 2.8959000000000002e-05, "loss": 0.0149, "step": 9656 }, { "epoch": 23.439854191980558, "grad_norm": 0.5800397992134094, "learning_rate": 2.8962e-05, "loss": 0.0275, "step": 9657 }, { "epoch": 23.44228432563791, "grad_norm": 0.40498682856559753, "learning_rate": 2.8965e-05, "loss": 0.0156, "step": 9658 }, { "epoch": 23.444714459295263, "grad_norm": 0.8809854388237, "learning_rate": 2.8968e-05, "loss": 0.0255, "step": 9659 }, { "epoch": 23.44714459295261, "grad_norm": 0.647111177444458, "learning_rate": 2.8971e-05, "loss": 0.0294, "step": 9660 }, { "epoch": 23.449574726609963, "grad_norm": 0.27657097578048706, "learning_rate": 2.8974e-05, "loss": 0.0132, "step": 9661 }, { "epoch": 23.452004860267316, "grad_norm": 0.5283920168876648, "learning_rate": 2.8977e-05, "loss": 0.0151, "step": 9662 }, { "epoch": 23.454434993924664, "grad_norm": 0.5492156147956848, "learning_rate": 2.898e-05, "loss": 0.0198, "step": 9663 }, { "epoch": 23.456865127582017, "grad_norm": 0.6241753101348877, "learning_rate": 2.8983e-05, "loss": 0.0256, "step": 9664 }, { "epoch": 23.45929526123937, "grad_norm": 0.568877100944519, "learning_rate": 2.8986e-05, "loss": 0.0211, "step": 9665 }, { "epoch": 23.46172539489672, "grad_norm": 0.43151479959487915, "learning_rate": 2.8989000000000003e-05, "loss": 0.023, "step": 9666 }, { "epoch": 23.46415552855407, "grad_norm": 0.6391026377677917, "learning_rate": 2.8992000000000003e-05, "loss": 0.0206, "step": 9667 }, { "epoch": 23.466585662211422, "grad_norm": 0.3502877652645111, "learning_rate": 
2.8995000000000003e-05, "loss": 0.0211, "step": 9668 }, { "epoch": 23.469015795868774, "grad_norm": 0.4584041237831116, "learning_rate": 2.8998000000000003e-05, "loss": 0.0163, "step": 9669 }, { "epoch": 23.471445929526123, "grad_norm": 0.44004014134407043, "learning_rate": 2.9001000000000002e-05, "loss": 0.0174, "step": 9670 }, { "epoch": 23.473876063183475, "grad_norm": 0.5165678262710571, "learning_rate": 2.9004000000000002e-05, "loss": 0.0286, "step": 9671 }, { "epoch": 23.476306196840827, "grad_norm": 0.7395194172859192, "learning_rate": 2.9007e-05, "loss": 0.0247, "step": 9672 }, { "epoch": 23.478736330498176, "grad_norm": 0.6405801773071289, "learning_rate": 2.901e-05, "loss": 0.0181, "step": 9673 }, { "epoch": 23.481166464155528, "grad_norm": 1.2357646226882935, "learning_rate": 2.9012999999999998e-05, "loss": 0.0452, "step": 9674 }, { "epoch": 23.48359659781288, "grad_norm": 1.2022294998168945, "learning_rate": 2.9015999999999998e-05, "loss": 0.0986, "step": 9675 }, { "epoch": 23.48602673147023, "grad_norm": 2.8735268115997314, "learning_rate": 2.9019e-05, "loss": 0.0811, "step": 9676 }, { "epoch": 23.48845686512758, "grad_norm": 1.2473499774932861, "learning_rate": 2.9022e-05, "loss": 0.2456, "step": 9677 }, { "epoch": 23.490886998784934, "grad_norm": 0.548064649105072, "learning_rate": 2.9025e-05, "loss": 0.1366, "step": 9678 }, { "epoch": 23.493317132442286, "grad_norm": 0.4759877622127533, "learning_rate": 2.9028e-05, "loss": 0.1058, "step": 9679 }, { "epoch": 23.495747266099634, "grad_norm": 0.621716320514679, "learning_rate": 2.9031e-05, "loss": 0.1067, "step": 9680 }, { "epoch": 23.498177399756987, "grad_norm": 0.6535392999649048, "learning_rate": 2.9034e-05, "loss": 0.1262, "step": 9681 }, { "epoch": 23.50060753341434, "grad_norm": 0.7160303592681885, "learning_rate": 2.9037e-05, "loss": 0.0533, "step": 9682 }, { "epoch": 23.503037667071688, "grad_norm": 0.5300166606903076, "learning_rate": 2.904e-05, "loss": 0.072, "step": 9683 }, { "epoch": 
23.50546780072904, "grad_norm": 0.7013402581214905, "learning_rate": 2.9043e-05, "loss": 0.0686, "step": 9684 }, { "epoch": 23.507897934386392, "grad_norm": 0.5861819386482239, "learning_rate": 2.9046e-05, "loss": 0.0458, "step": 9685 }, { "epoch": 23.51032806804374, "grad_norm": 0.6155208349227905, "learning_rate": 2.9049000000000003e-05, "loss": 0.0419, "step": 9686 }, { "epoch": 23.512758201701093, "grad_norm": 0.7653539776802063, "learning_rate": 2.9052000000000002e-05, "loss": 0.0329, "step": 9687 }, { "epoch": 23.515188335358445, "grad_norm": 0.5316774845123291, "learning_rate": 2.9055000000000002e-05, "loss": 0.0338, "step": 9688 }, { "epoch": 23.517618469015797, "grad_norm": 0.698459804058075, "learning_rate": 2.9058000000000002e-05, "loss": 0.0409, "step": 9689 }, { "epoch": 23.520048602673146, "grad_norm": 0.2881585955619812, "learning_rate": 2.9061000000000002e-05, "loss": 0.0148, "step": 9690 }, { "epoch": 23.5224787363305, "grad_norm": 0.34171515703201294, "learning_rate": 2.9064e-05, "loss": 0.022, "step": 9691 }, { "epoch": 23.52490886998785, "grad_norm": 0.53977370262146, "learning_rate": 2.9067e-05, "loss": 0.0504, "step": 9692 }, { "epoch": 23.5273390036452, "grad_norm": 0.5080909132957458, "learning_rate": 2.907e-05, "loss": 0.0287, "step": 9693 }, { "epoch": 23.52976913730255, "grad_norm": 0.33268478512763977, "learning_rate": 2.9073e-05, "loss": 0.0217, "step": 9694 }, { "epoch": 23.532199270959904, "grad_norm": 0.386142373085022, "learning_rate": 2.9076e-05, "loss": 0.0158, "step": 9695 }, { "epoch": 23.534629404617252, "grad_norm": 0.7102218866348267, "learning_rate": 2.9079e-05, "loss": 0.0302, "step": 9696 }, { "epoch": 23.537059538274605, "grad_norm": 0.29084300994873047, "learning_rate": 2.9082e-05, "loss": 0.0156, "step": 9697 }, { "epoch": 23.539489671931957, "grad_norm": 0.36879634857177734, "learning_rate": 2.9085e-05, "loss": 0.016, "step": 9698 }, { "epoch": 23.54191980558931, "grad_norm": 1.1723427772521973, "learning_rate": 
2.9088e-05, "loss": 0.0289, "step": 9699 }, { "epoch": 23.544349939246658, "grad_norm": 0.26978057622909546, "learning_rate": 2.9091e-05, "loss": 0.0101, "step": 9700 }, { "epoch": 23.54678007290401, "grad_norm": 0.8133243322372437, "learning_rate": 2.9094e-05, "loss": 0.0424, "step": 9701 }, { "epoch": 23.549210206561362, "grad_norm": 0.5535023808479309, "learning_rate": 2.9097e-05, "loss": 0.0587, "step": 9702 }, { "epoch": 23.55164034021871, "grad_norm": 0.6279627680778503, "learning_rate": 2.91e-05, "loss": 0.0216, "step": 9703 }, { "epoch": 23.554070473876063, "grad_norm": 0.40593987703323364, "learning_rate": 2.9103e-05, "loss": 0.0225, "step": 9704 }, { "epoch": 23.556500607533415, "grad_norm": 0.47269830107688904, "learning_rate": 2.9106e-05, "loss": 0.0121, "step": 9705 }, { "epoch": 23.558930741190764, "grad_norm": 0.4068252742290497, "learning_rate": 2.9109000000000002e-05, "loss": 0.0202, "step": 9706 }, { "epoch": 23.561360874848116, "grad_norm": 0.8117912411689758, "learning_rate": 2.9112000000000002e-05, "loss": 0.031, "step": 9707 }, { "epoch": 23.56379100850547, "grad_norm": 0.5735695958137512, "learning_rate": 2.9115e-05, "loss": 0.0205, "step": 9708 }, { "epoch": 23.566221142162817, "grad_norm": 0.7906632423400879, "learning_rate": 2.9118e-05, "loss": 0.0231, "step": 9709 }, { "epoch": 23.56865127582017, "grad_norm": 0.41654008626937866, "learning_rate": 2.9121e-05, "loss": 0.0179, "step": 9710 }, { "epoch": 23.57108140947752, "grad_norm": 0.3521312177181244, "learning_rate": 2.9124e-05, "loss": 0.0123, "step": 9711 }, { "epoch": 23.573511543134874, "grad_norm": 1.2348921298980713, "learning_rate": 2.9127e-05, "loss": 0.0295, "step": 9712 }, { "epoch": 23.575941676792223, "grad_norm": 0.5364603996276855, "learning_rate": 2.913e-05, "loss": 0.0231, "step": 9713 }, { "epoch": 23.578371810449575, "grad_norm": 1.5887782573699951, "learning_rate": 2.9133e-05, "loss": 0.0886, "step": 9714 }, { "epoch": 23.580801944106927, "grad_norm": 
0.5395091772079468, "learning_rate": 2.9136e-05, "loss": 0.0198, "step": 9715 }, { "epoch": 23.583232077764276, "grad_norm": 0.5961582064628601, "learning_rate": 2.9139000000000003e-05, "loss": 0.0251, "step": 9716 }, { "epoch": 23.585662211421628, "grad_norm": 0.39282020926475525, "learning_rate": 2.9142000000000003e-05, "loss": 0.0218, "step": 9717 }, { "epoch": 23.58809234507898, "grad_norm": 0.5934118032455444, "learning_rate": 2.9145000000000003e-05, "loss": 0.0271, "step": 9718 }, { "epoch": 23.59052247873633, "grad_norm": 0.5858859419822693, "learning_rate": 2.9148000000000003e-05, "loss": 0.0256, "step": 9719 }, { "epoch": 23.59295261239368, "grad_norm": 0.40199416875839233, "learning_rate": 2.9151000000000003e-05, "loss": 0.0223, "step": 9720 }, { "epoch": 23.595382746051033, "grad_norm": 0.5413501262664795, "learning_rate": 2.9154e-05, "loss": 0.0161, "step": 9721 }, { "epoch": 23.597812879708385, "grad_norm": 0.5323272943496704, "learning_rate": 2.9157e-05, "loss": 0.0249, "step": 9722 }, { "epoch": 23.600243013365734, "grad_norm": 0.7767282724380493, "learning_rate": 2.916e-05, "loss": 0.0407, "step": 9723 }, { "epoch": 23.602673147023086, "grad_norm": 0.9992089867591858, "learning_rate": 2.9163e-05, "loss": 0.033, "step": 9724 }, { "epoch": 23.60510328068044, "grad_norm": 2.8780910968780518, "learning_rate": 2.9165999999999998e-05, "loss": 0.0421, "step": 9725 }, { "epoch": 23.607533414337787, "grad_norm": 2.6460227966308594, "learning_rate": 2.9169e-05, "loss": 0.077, "step": 9726 }, { "epoch": 23.60996354799514, "grad_norm": 1.0183157920837402, "learning_rate": 2.9172e-05, "loss": 0.2225, "step": 9727 }, { "epoch": 23.61239368165249, "grad_norm": 0.6719390749931335, "learning_rate": 2.9175e-05, "loss": 0.1546, "step": 9728 }, { "epoch": 23.61482381530984, "grad_norm": 0.5591229796409607, "learning_rate": 2.9178e-05, "loss": 0.1395, "step": 9729 }, { "epoch": 23.617253948967193, "grad_norm": 0.6904690265655518, "learning_rate": 2.9181e-05, "loss": 
0.1185, "step": 9730 }, { "epoch": 23.619684082624545, "grad_norm": 0.5317046046257019, "learning_rate": 2.9184e-05, "loss": 0.0704, "step": 9731 }, { "epoch": 23.622114216281897, "grad_norm": 0.8343048691749573, "learning_rate": 2.9187e-05, "loss": 0.0809, "step": 9732 }, { "epoch": 23.624544349939246, "grad_norm": 0.8251399993896484, "learning_rate": 2.919e-05, "loss": 0.0593, "step": 9733 }, { "epoch": 23.626974483596598, "grad_norm": 0.44254639744758606, "learning_rate": 2.9193e-05, "loss": 0.0462, "step": 9734 }, { "epoch": 23.62940461725395, "grad_norm": 0.47851499915122986, "learning_rate": 2.9196e-05, "loss": 0.0564, "step": 9735 }, { "epoch": 23.6318347509113, "grad_norm": 0.35191673040390015, "learning_rate": 2.9199000000000003e-05, "loss": 0.0379, "step": 9736 }, { "epoch": 23.63426488456865, "grad_norm": 0.5423150658607483, "learning_rate": 2.9202000000000003e-05, "loss": 0.0519, "step": 9737 }, { "epoch": 23.636695018226003, "grad_norm": 1.5521440505981445, "learning_rate": 2.9205000000000002e-05, "loss": 0.0283, "step": 9738 }, { "epoch": 23.639125151883352, "grad_norm": 0.4850662648677826, "learning_rate": 2.9208000000000002e-05, "loss": 0.0227, "step": 9739 }, { "epoch": 23.641555285540704, "grad_norm": 0.44650954008102417, "learning_rate": 2.9211000000000002e-05, "loss": 0.0303, "step": 9740 }, { "epoch": 23.643985419198057, "grad_norm": 0.3681548833847046, "learning_rate": 2.9214000000000002e-05, "loss": 0.0282, "step": 9741 }, { "epoch": 23.64641555285541, "grad_norm": 0.5586681962013245, "learning_rate": 2.9217e-05, "loss": 0.0259, "step": 9742 }, { "epoch": 23.648845686512757, "grad_norm": 0.4882606565952301, "learning_rate": 2.922e-05, "loss": 0.0351, "step": 9743 }, { "epoch": 23.65127582017011, "grad_norm": 0.28481706976890564, "learning_rate": 2.9223e-05, "loss": 0.0256, "step": 9744 }, { "epoch": 23.653705953827462, "grad_norm": 0.5695919990539551, "learning_rate": 2.9226e-05, "loss": 0.0392, "step": 9745 }, { "epoch": 23.65613608748481, 
"grad_norm": 0.5409908294677734, "learning_rate": 2.9229e-05, "loss": 0.0276, "step": 9746 }, { "epoch": 23.658566221142163, "grad_norm": 0.39018410444259644, "learning_rate": 2.9232e-05, "loss": 0.0242, "step": 9747 }, { "epoch": 23.660996354799515, "grad_norm": 0.3665667176246643, "learning_rate": 2.9235e-05, "loss": 0.0174, "step": 9748 }, { "epoch": 23.663426488456864, "grad_norm": 0.3752090334892273, "learning_rate": 2.9238e-05, "loss": 0.0405, "step": 9749 }, { "epoch": 23.665856622114216, "grad_norm": 0.6053075790405273, "learning_rate": 2.9241e-05, "loss": 0.0216, "step": 9750 }, { "epoch": 23.668286755771568, "grad_norm": 0.4889114797115326, "learning_rate": 2.9244e-05, "loss": 0.0193, "step": 9751 }, { "epoch": 23.670716889428917, "grad_norm": 0.6583289504051208, "learning_rate": 2.9247e-05, "loss": 0.0167, "step": 9752 }, { "epoch": 23.67314702308627, "grad_norm": 0.5511744022369385, "learning_rate": 2.925e-05, "loss": 0.018, "step": 9753 }, { "epoch": 23.67557715674362, "grad_norm": 0.3677159547805786, "learning_rate": 2.9253e-05, "loss": 0.021, "step": 9754 }, { "epoch": 23.678007290400974, "grad_norm": 0.5387144684791565, "learning_rate": 2.9256e-05, "loss": 0.0193, "step": 9755 }, { "epoch": 23.680437424058322, "grad_norm": 0.4320157766342163, "learning_rate": 2.9259e-05, "loss": 0.0191, "step": 9756 }, { "epoch": 23.682867557715674, "grad_norm": 0.49855899810791016, "learning_rate": 2.9262000000000002e-05, "loss": 0.0253, "step": 9757 }, { "epoch": 23.685297691373027, "grad_norm": 0.37047243118286133, "learning_rate": 2.9265000000000002e-05, "loss": 0.0186, "step": 9758 }, { "epoch": 23.687727825030375, "grad_norm": 0.4614339768886566, "learning_rate": 2.9268e-05, "loss": 0.0225, "step": 9759 }, { "epoch": 23.690157958687728, "grad_norm": 0.5844979882240295, "learning_rate": 2.9271e-05, "loss": 0.0338, "step": 9760 }, { "epoch": 23.69258809234508, "grad_norm": 2.0222020149230957, "learning_rate": 2.9274e-05, "loss": 0.0315, "step": 9761 }, { 
"epoch": 23.69501822600243, "grad_norm": 0.5035544037818909, "learning_rate": 2.9277e-05, "loss": 0.0211, "step": 9762 }, { "epoch": 23.69744835965978, "grad_norm": 0.5199309587478638, "learning_rate": 2.928e-05, "loss": 0.0298, "step": 9763 }, { "epoch": 23.699878493317133, "grad_norm": 0.4754001200199127, "learning_rate": 2.9283e-05, "loss": 0.0181, "step": 9764 }, { "epoch": 23.702308626974485, "grad_norm": 0.3960065245628357, "learning_rate": 2.9286e-05, "loss": 0.0233, "step": 9765 }, { "epoch": 23.704738760631834, "grad_norm": 0.670845091342926, "learning_rate": 2.9289e-05, "loss": 0.0262, "step": 9766 }, { "epoch": 23.707168894289186, "grad_norm": 0.7684738039970398, "learning_rate": 2.9292000000000003e-05, "loss": 0.0211, "step": 9767 }, { "epoch": 23.70959902794654, "grad_norm": 0.6569257378578186, "learning_rate": 2.9295000000000003e-05, "loss": 0.0233, "step": 9768 }, { "epoch": 23.712029161603887, "grad_norm": 0.8300051689147949, "learning_rate": 2.9298000000000003e-05, "loss": 0.0263, "step": 9769 }, { "epoch": 23.71445929526124, "grad_norm": 0.5456178784370422, "learning_rate": 2.9301e-05, "loss": 0.0265, "step": 9770 }, { "epoch": 23.71688942891859, "grad_norm": 0.9441277384757996, "learning_rate": 2.9304e-05, "loss": 0.0331, "step": 9771 }, { "epoch": 23.71931956257594, "grad_norm": 0.5359410047531128, "learning_rate": 2.9307e-05, "loss": 0.0239, "step": 9772 }, { "epoch": 23.721749696233292, "grad_norm": 0.7556407451629639, "learning_rate": 2.931e-05, "loss": 0.0199, "step": 9773 }, { "epoch": 23.724179829890645, "grad_norm": 0.48832234740257263, "learning_rate": 2.9313e-05, "loss": 0.0151, "step": 9774 }, { "epoch": 23.726609963547997, "grad_norm": 0.6040066480636597, "learning_rate": 2.9316e-05, "loss": 0.019, "step": 9775 }, { "epoch": 23.729040097205345, "grad_norm": 1.319182276725769, "learning_rate": 2.9318999999999998e-05, "loss": 0.0398, "step": 9776 }, { "epoch": 23.731470230862698, "grad_norm": 1.1660488843917847, "learning_rate": 
2.9322e-05, "loss": 0.2049, "step": 9777 }, { "epoch": 23.73390036452005, "grad_norm": 0.5856745839118958, "learning_rate": 2.9325e-05, "loss": 0.1453, "step": 9778 }, { "epoch": 23.7363304981774, "grad_norm": 0.5525960326194763, "learning_rate": 2.9328e-05, "loss": 0.0995, "step": 9779 }, { "epoch": 23.73876063183475, "grad_norm": 0.8269253373146057, "learning_rate": 2.9331e-05, "loss": 0.1018, "step": 9780 }, { "epoch": 23.741190765492103, "grad_norm": 0.9633733034133911, "learning_rate": 2.9334e-05, "loss": 0.1096, "step": 9781 }, { "epoch": 23.74362089914945, "grad_norm": 0.42450854182243347, "learning_rate": 2.9337e-05, "loss": 0.0616, "step": 9782 }, { "epoch": 23.746051032806804, "grad_norm": 0.5468454360961914, "learning_rate": 2.934e-05, "loss": 0.055, "step": 9783 }, { "epoch": 23.748481166464156, "grad_norm": 0.5478569269180298, "learning_rate": 2.9343e-05, "loss": 0.0462, "step": 9784 }, { "epoch": 23.75091130012151, "grad_norm": 0.6656402945518494, "learning_rate": 2.9346e-05, "loss": 0.0473, "step": 9785 }, { "epoch": 23.753341433778857, "grad_norm": 0.8019596338272095, "learning_rate": 2.9349e-05, "loss": 0.042, "step": 9786 }, { "epoch": 23.75577156743621, "grad_norm": 0.39570003747940063, "learning_rate": 2.9352000000000003e-05, "loss": 0.0344, "step": 9787 }, { "epoch": 23.75820170109356, "grad_norm": 1.1908681392669678, "learning_rate": 2.9355000000000003e-05, "loss": 0.0456, "step": 9788 }, { "epoch": 23.76063183475091, "grad_norm": 0.5716912746429443, "learning_rate": 2.9358000000000003e-05, "loss": 0.0349, "step": 9789 }, { "epoch": 23.763061968408262, "grad_norm": 0.4773416519165039, "learning_rate": 2.9361000000000002e-05, "loss": 0.0288, "step": 9790 }, { "epoch": 23.765492102065615, "grad_norm": 0.7236493825912476, "learning_rate": 2.9364000000000002e-05, "loss": 0.0453, "step": 9791 }, { "epoch": 23.767922235722963, "grad_norm": 0.48692256212234497, "learning_rate": 2.9367000000000002e-05, "loss": 0.0267, "step": 9792 }, { "epoch": 
23.770352369380316, "grad_norm": 0.429821640253067, "learning_rate": 2.9370000000000002e-05, "loss": 0.0161, "step": 9793 }, { "epoch": 23.772782503037668, "grad_norm": 0.43303483724594116, "learning_rate": 2.9373e-05, "loss": 0.0336, "step": 9794 }, { "epoch": 23.775212636695016, "grad_norm": 0.6785426735877991, "learning_rate": 2.9375999999999998e-05, "loss": 0.0371, "step": 9795 }, { "epoch": 23.77764277035237, "grad_norm": 0.37803182005882263, "learning_rate": 2.9378999999999998e-05, "loss": 0.0267, "step": 9796 }, { "epoch": 23.78007290400972, "grad_norm": 0.41682520508766174, "learning_rate": 2.9382e-05, "loss": 0.0217, "step": 9797 }, { "epoch": 23.782503037667073, "grad_norm": 1.1797287464141846, "learning_rate": 2.9385e-05, "loss": 0.0355, "step": 9798 }, { "epoch": 23.784933171324422, "grad_norm": 0.37888360023498535, "learning_rate": 2.9388e-05, "loss": 0.0214, "step": 9799 }, { "epoch": 23.787363304981774, "grad_norm": 0.3333470821380615, "learning_rate": 2.9391e-05, "loss": 0.0211, "step": 9800 }, { "epoch": 23.789793438639126, "grad_norm": 0.25966203212738037, "learning_rate": 2.9394e-05, "loss": 0.0118, "step": 9801 }, { "epoch": 23.792223572296475, "grad_norm": 0.7254958152770996, "learning_rate": 2.9397e-05, "loss": 0.0191, "step": 9802 }, { "epoch": 23.794653705953827, "grad_norm": 0.41203823685646057, "learning_rate": 2.94e-05, "loss": 0.0199, "step": 9803 }, { "epoch": 23.79708383961118, "grad_norm": 1.3573963642120361, "learning_rate": 2.9403e-05, "loss": 0.0384, "step": 9804 }, { "epoch": 23.799513973268528, "grad_norm": 0.33749598264694214, "learning_rate": 2.9406e-05, "loss": 0.0195, "step": 9805 }, { "epoch": 23.80194410692588, "grad_norm": 0.3983689844608307, "learning_rate": 2.9409e-05, "loss": 0.0176, "step": 9806 }, { "epoch": 23.804374240583233, "grad_norm": 0.41641131043434143, "learning_rate": 2.9412000000000002e-05, "loss": 0.0258, "step": 9807 }, { "epoch": 23.806804374240585, "grad_norm": 0.7433393001556396, "learning_rate": 
2.9415000000000002e-05, "loss": 0.0269, "step": 9808 }, { "epoch": 23.809234507897933, "grad_norm": 0.5460963845252991, "learning_rate": 2.9418000000000002e-05, "loss": 0.0272, "step": 9809 }, { "epoch": 23.811664641555286, "grad_norm": 0.45626235008239746, "learning_rate": 2.9421000000000002e-05, "loss": 0.0208, "step": 9810 }, { "epoch": 23.814094775212638, "grad_norm": 0.6292518973350525, "learning_rate": 2.9424e-05, "loss": 0.0702, "step": 9811 }, { "epoch": 23.816524908869987, "grad_norm": 0.5060350894927979, "learning_rate": 2.9427e-05, "loss": 0.0272, "step": 9812 }, { "epoch": 23.81895504252734, "grad_norm": 0.46770960092544556, "learning_rate": 2.943e-05, "loss": 0.0177, "step": 9813 }, { "epoch": 23.82138517618469, "grad_norm": 0.6717051863670349, "learning_rate": 2.9433e-05, "loss": 0.0598, "step": 9814 }, { "epoch": 23.82381530984204, "grad_norm": 0.423366516828537, "learning_rate": 2.9436e-05, "loss": 0.0245, "step": 9815 }, { "epoch": 23.826245443499392, "grad_norm": 0.41734546422958374, "learning_rate": 2.9439e-05, "loss": 0.0144, "step": 9816 }, { "epoch": 23.828675577156744, "grad_norm": 0.6120598316192627, "learning_rate": 2.9442000000000004e-05, "loss": 0.0145, "step": 9817 }, { "epoch": 23.831105710814096, "grad_norm": 1.4860261678695679, "learning_rate": 2.9445000000000004e-05, "loss": 0.0221, "step": 9818 }, { "epoch": 23.833535844471445, "grad_norm": 0.8400955200195312, "learning_rate": 2.9448e-05, "loss": 0.0323, "step": 9819 }, { "epoch": 23.835965978128797, "grad_norm": 0.8069465756416321, "learning_rate": 2.9451e-05, "loss": 0.0789, "step": 9820 }, { "epoch": 23.83839611178615, "grad_norm": 0.6100181937217712, "learning_rate": 2.9454e-05, "loss": 0.0328, "step": 9821 }, { "epoch": 23.8408262454435, "grad_norm": 0.3310921788215637, "learning_rate": 2.9457e-05, "loss": 0.011, "step": 9822 }, { "epoch": 23.84325637910085, "grad_norm": 0.6037766933441162, "learning_rate": 2.946e-05, "loss": 0.0221, "step": 9823 }, { "epoch": 
23.845686512758203, "grad_norm": 0.7906667590141296, "learning_rate": 2.9463e-05, "loss": 0.0255, "step": 9824 }, { "epoch": 23.84811664641555, "grad_norm": 0.9986674785614014, "learning_rate": 2.9466e-05, "loss": 0.0235, "step": 9825 }, { "epoch": 23.850546780072904, "grad_norm": 1.015489935874939, "learning_rate": 2.9469e-05, "loss": 0.022, "step": 9826 }, { "epoch": 23.852976913730256, "grad_norm": 1.5159664154052734, "learning_rate": 2.9472000000000002e-05, "loss": 0.2457, "step": 9827 }, { "epoch": 23.855407047387608, "grad_norm": 0.6959315538406372, "learning_rate": 2.9475e-05, "loss": 0.1549, "step": 9828 }, { "epoch": 23.857837181044957, "grad_norm": 0.6551058292388916, "learning_rate": 2.9478e-05, "loss": 0.103, "step": 9829 }, { "epoch": 23.86026731470231, "grad_norm": 0.691125750541687, "learning_rate": 2.9481e-05, "loss": 0.1251, "step": 9830 }, { "epoch": 23.86269744835966, "grad_norm": 0.7372992634773254, "learning_rate": 2.9484e-05, "loss": 0.1039, "step": 9831 }, { "epoch": 23.86512758201701, "grad_norm": 0.6133431792259216, "learning_rate": 2.9487e-05, "loss": 0.0919, "step": 9832 }, { "epoch": 23.867557715674362, "grad_norm": 0.5315920114517212, "learning_rate": 2.949e-05, "loss": 0.0735, "step": 9833 }, { "epoch": 23.869987849331714, "grad_norm": 0.47883689403533936, "learning_rate": 2.9493e-05, "loss": 0.0597, "step": 9834 }, { "epoch": 23.872417982989063, "grad_norm": 0.40157800912857056, "learning_rate": 2.9496e-05, "loss": 0.0335, "step": 9835 }, { "epoch": 23.874848116646415, "grad_norm": 1.0539487600326538, "learning_rate": 2.9499e-05, "loss": 0.0301, "step": 9836 }, { "epoch": 23.877278250303767, "grad_norm": 0.532812774181366, "learning_rate": 2.9502000000000003e-05, "loss": 0.0268, "step": 9837 }, { "epoch": 23.879708383961116, "grad_norm": 0.4234025776386261, "learning_rate": 2.9505000000000003e-05, "loss": 0.0306, "step": 9838 }, { "epoch": 23.88213851761847, "grad_norm": 0.5158641934394836, "learning_rate": 2.9508000000000003e-05, 
"loss": 0.0311, "step": 9839 }, { "epoch": 23.88456865127582, "grad_norm": 0.31592923402786255, "learning_rate": 2.9511000000000003e-05, "loss": 0.0213, "step": 9840 }, { "epoch": 23.886998784933173, "grad_norm": 0.8613980412483215, "learning_rate": 2.9514000000000002e-05, "loss": 0.0329, "step": 9841 }, { "epoch": 23.88942891859052, "grad_norm": 1.1918686628341675, "learning_rate": 2.9517000000000002e-05, "loss": 0.0254, "step": 9842 }, { "epoch": 23.891859052247874, "grad_norm": 0.4444010853767395, "learning_rate": 2.9520000000000002e-05, "loss": 0.0309, "step": 9843 }, { "epoch": 23.894289185905226, "grad_norm": 0.4754830300807953, "learning_rate": 2.9523e-05, "loss": 0.0255, "step": 9844 }, { "epoch": 23.896719319562575, "grad_norm": 0.31241655349731445, "learning_rate": 2.9525999999999998e-05, "loss": 0.0212, "step": 9845 }, { "epoch": 23.899149453219927, "grad_norm": 0.7459553480148315, "learning_rate": 2.9528999999999998e-05, "loss": 0.0267, "step": 9846 }, { "epoch": 23.90157958687728, "grad_norm": 0.3264201283454895, "learning_rate": 2.9532e-05, "loss": 0.0284, "step": 9847 }, { "epoch": 23.904009720534628, "grad_norm": 0.33510714769363403, "learning_rate": 2.9535e-05, "loss": 0.0198, "step": 9848 }, { "epoch": 23.90643985419198, "grad_norm": 0.4002806842327118, "learning_rate": 2.9538e-05, "loss": 0.0249, "step": 9849 }, { "epoch": 23.908869987849332, "grad_norm": 0.5702695846557617, "learning_rate": 2.9541e-05, "loss": 0.0244, "step": 9850 }, { "epoch": 23.911300121506684, "grad_norm": 0.44201546907424927, "learning_rate": 2.9544e-05, "loss": 0.0243, "step": 9851 }, { "epoch": 23.913730255164033, "grad_norm": 0.4536641538143158, "learning_rate": 2.9547e-05, "loss": 0.022, "step": 9852 }, { "epoch": 23.916160388821385, "grad_norm": 0.748887836933136, "learning_rate": 2.955e-05, "loss": 0.0194, "step": 9853 }, { "epoch": 23.918590522478738, "grad_norm": 0.5660763382911682, "learning_rate": 2.9553e-05, "loss": 0.0221, "step": 9854 }, { "epoch": 
23.921020656136086, "grad_norm": 0.4161059260368347, "learning_rate": 2.9556e-05, "loss": 0.0168, "step": 9855 }, { "epoch": 23.92345078979344, "grad_norm": 0.28617411851882935, "learning_rate": 2.9559e-05, "loss": 0.0112, "step": 9856 }, { "epoch": 23.92588092345079, "grad_norm": 0.6592788696289062, "learning_rate": 2.9562000000000003e-05, "loss": 0.0238, "step": 9857 }, { "epoch": 23.92831105710814, "grad_norm": 0.6187601089477539, "learning_rate": 2.9565000000000002e-05, "loss": 0.017, "step": 9858 }, { "epoch": 23.93074119076549, "grad_norm": 0.38406604528427124, "learning_rate": 2.9568000000000002e-05, "loss": 0.0136, "step": 9859 }, { "epoch": 23.933171324422844, "grad_norm": 1.0113978385925293, "learning_rate": 2.9571000000000002e-05, "loss": 0.0296, "step": 9860 }, { "epoch": 23.935601458080196, "grad_norm": 0.39745429158210754, "learning_rate": 2.9574000000000002e-05, "loss": 0.0194, "step": 9861 }, { "epoch": 23.938031591737545, "grad_norm": 0.6709166765213013, "learning_rate": 2.9577e-05, "loss": 0.0233, "step": 9862 }, { "epoch": 23.940461725394897, "grad_norm": 0.6449007391929626, "learning_rate": 2.958e-05, "loss": 0.0209, "step": 9863 }, { "epoch": 23.94289185905225, "grad_norm": 0.46655064821243286, "learning_rate": 2.9583e-05, "loss": 0.021, "step": 9864 }, { "epoch": 23.945321992709598, "grad_norm": 0.8893966674804688, "learning_rate": 2.9586e-05, "loss": 0.029, "step": 9865 }, { "epoch": 23.94775212636695, "grad_norm": 0.6730701327323914, "learning_rate": 2.9589e-05, "loss": 0.0214, "step": 9866 }, { "epoch": 23.950182260024302, "grad_norm": 0.3987506031990051, "learning_rate": 2.9592000000000004e-05, "loss": 0.0157, "step": 9867 }, { "epoch": 23.95261239368165, "grad_norm": 0.33383598923683167, "learning_rate": 2.9595e-05, "loss": 0.0161, "step": 9868 }, { "epoch": 23.955042527339003, "grad_norm": 0.3505139648914337, "learning_rate": 2.9598e-05, "loss": 0.0199, "step": 9869 }, { "epoch": 23.957472660996356, "grad_norm": 0.6897822618484497, 
"learning_rate": 2.9601e-05, "loss": 0.0269, "step": 9870 }, { "epoch": 23.959902794653708, "grad_norm": 0.8322789072990417, "learning_rate": 2.9604e-05, "loss": 0.0291, "step": 9871 }, { "epoch": 23.962332928311056, "grad_norm": 0.4849978983402252, "learning_rate": 2.9607e-05, "loss": 0.0152, "step": 9872 }, { "epoch": 23.96476306196841, "grad_norm": 0.979604959487915, "learning_rate": 2.961e-05, "loss": 0.0391, "step": 9873 }, { "epoch": 23.96719319562576, "grad_norm": 0.60432368516922, "learning_rate": 2.9613e-05, "loss": 0.0234, "step": 9874 }, { "epoch": 23.96962332928311, "grad_norm": 0.6836251616477966, "learning_rate": 2.9616e-05, "loss": 0.0248, "step": 9875 }, { "epoch": 23.972053462940462, "grad_norm": 2.064427137374878, "learning_rate": 2.9619e-05, "loss": 0.0589, "step": 9876 }, { "epoch": 23.974483596597814, "grad_norm": 1.0212223529815674, "learning_rate": 2.9622000000000002e-05, "loss": 0.1631, "step": 9877 }, { "epoch": 23.976913730255163, "grad_norm": 1.0419306755065918, "learning_rate": 2.9625000000000002e-05, "loss": 0.0692, "step": 9878 }, { "epoch": 23.979343863912515, "grad_norm": 0.5604053735733032, "learning_rate": 2.9628e-05, "loss": 0.0294, "step": 9879 }, { "epoch": 23.981773997569867, "grad_norm": 0.575038731098175, "learning_rate": 2.9631e-05, "loss": 0.0297, "step": 9880 }, { "epoch": 23.984204131227216, "grad_norm": 0.7771075367927551, "learning_rate": 2.9634e-05, "loss": 0.0366, "step": 9881 }, { "epoch": 23.986634264884568, "grad_norm": 0.2930268943309784, "learning_rate": 2.9637e-05, "loss": 0.0156, "step": 9882 }, { "epoch": 23.98906439854192, "grad_norm": 0.43041276931762695, "learning_rate": 2.964e-05, "loss": 0.0264, "step": 9883 }, { "epoch": 23.991494532199273, "grad_norm": 0.4629424214363098, "learning_rate": 2.9643e-05, "loss": 0.0252, "step": 9884 }, { "epoch": 23.99392466585662, "grad_norm": 0.3498910367488861, "learning_rate": 2.9646e-05, "loss": 0.0246, "step": 9885 }, { "epoch": 23.996354799513973, "grad_norm": 
0.8796498775482178, "learning_rate": 2.9649e-05, "loss": 0.0335, "step": 9886 }, { "epoch": 23.998784933171326, "grad_norm": 0.38483354449272156, "learning_rate": 2.9652e-05, "loss": 0.0214, "step": 9887 }, { "epoch": 24.0, "grad_norm": 1.128991961479187, "learning_rate": 2.9655000000000003e-05, "loss": 0.0321, "step": 9888 }, { "epoch": 24.002430133657352, "grad_norm": 0.8160355687141418, "learning_rate": 2.9658000000000003e-05, "loss": 0.2172, "step": 9889 }, { "epoch": 24.0048602673147, "grad_norm": 0.545640766620636, "learning_rate": 2.9661000000000003e-05, "loss": 0.1323, "step": 9890 }, { "epoch": 24.007290400972053, "grad_norm": 0.45471683144569397, "learning_rate": 2.9664000000000003e-05, "loss": 0.0944, "step": 9891 }, { "epoch": 24.009720534629405, "grad_norm": 0.6201019287109375, "learning_rate": 2.9667000000000002e-05, "loss": 0.107, "step": 9892 }, { "epoch": 24.012150668286754, "grad_norm": 0.6689167618751526, "learning_rate": 2.967e-05, "loss": 0.0916, "step": 9893 }, { "epoch": 24.014580801944106, "grad_norm": 0.4836483299732208, "learning_rate": 2.9673e-05, "loss": 0.0841, "step": 9894 }, { "epoch": 24.01701093560146, "grad_norm": 0.4399881958961487, "learning_rate": 2.9676e-05, "loss": 0.0475, "step": 9895 }, { "epoch": 24.01944106925881, "grad_norm": 0.5732226967811584, "learning_rate": 2.9678999999999998e-05, "loss": 0.0437, "step": 9896 }, { "epoch": 24.02187120291616, "grad_norm": 0.7089022397994995, "learning_rate": 2.9681999999999998e-05, "loss": 0.037, "step": 9897 }, { "epoch": 24.02430133657351, "grad_norm": 0.6471890807151794, "learning_rate": 2.9685e-05, "loss": 0.0216, "step": 9898 }, { "epoch": 24.026731470230864, "grad_norm": 0.4606384336948395, "learning_rate": 2.9688e-05, "loss": 0.0268, "step": 9899 }, { "epoch": 24.029161603888213, "grad_norm": 0.5236558318138123, "learning_rate": 2.9691e-05, "loss": 0.0313, "step": 9900 }, { "epoch": 24.031591737545565, "grad_norm": 0.6360529661178589, "learning_rate": 2.9694e-05, "loss": 
0.0264, "step": 9901 }, { "epoch": 24.034021871202917, "grad_norm": 0.7007638812065125, "learning_rate": 2.9697e-05, "loss": 0.0423, "step": 9902 }, { "epoch": 24.036452004860266, "grad_norm": 0.3145723342895508, "learning_rate": 2.97e-05, "loss": 0.0173, "step": 9903 }, { "epoch": 24.038882138517618, "grad_norm": 0.5800657272338867, "learning_rate": 2.9703e-05, "loss": 0.027, "step": 9904 }, { "epoch": 24.04131227217497, "grad_norm": 0.49561187624931335, "learning_rate": 2.9706e-05, "loss": 0.014, "step": 9905 }, { "epoch": 24.043742405832322, "grad_norm": 0.4119066596031189, "learning_rate": 2.9709e-05, "loss": 0.021, "step": 9906 }, { "epoch": 24.04617253948967, "grad_norm": 0.5853497385978699, "learning_rate": 2.9712e-05, "loss": 0.0175, "step": 9907 }, { "epoch": 24.048602673147023, "grad_norm": 0.3216400444507599, "learning_rate": 2.9715000000000003e-05, "loss": 0.0122, "step": 9908 }, { "epoch": 24.051032806804375, "grad_norm": 0.371423602104187, "learning_rate": 2.9718000000000002e-05, "loss": 0.0187, "step": 9909 }, { "epoch": 24.053462940461724, "grad_norm": 0.3816506266593933, "learning_rate": 2.9721000000000002e-05, "loss": 0.0156, "step": 9910 }, { "epoch": 24.055893074119076, "grad_norm": 0.3789234757423401, "learning_rate": 2.9724000000000002e-05, "loss": 0.0195, "step": 9911 }, { "epoch": 24.05832320777643, "grad_norm": 0.39552634954452515, "learning_rate": 2.9727000000000002e-05, "loss": 0.0201, "step": 9912 }, { "epoch": 24.060753341433777, "grad_norm": 1.8847105503082275, "learning_rate": 2.973e-05, "loss": 0.0166, "step": 9913 }, { "epoch": 24.06318347509113, "grad_norm": 0.4605678915977478, "learning_rate": 2.9733e-05, "loss": 0.0128, "step": 9914 }, { "epoch": 24.06561360874848, "grad_norm": 0.857143759727478, "learning_rate": 2.9736e-05, "loss": 0.0157, "step": 9915 }, { "epoch": 24.068043742405834, "grad_norm": 0.1408373862504959, "learning_rate": 2.9739e-05, "loss": 0.0078, "step": 9916 }, { "epoch": 24.070473876063183, "grad_norm": 
0.3907758891582489, "learning_rate": 2.9742e-05, "loss": 0.0131, "step": 9917 }, { "epoch": 24.072904009720535, "grad_norm": 0.33860448002815247, "learning_rate": 2.9745e-05, "loss": 0.0154, "step": 9918 }, { "epoch": 24.075334143377887, "grad_norm": 0.21638265252113342, "learning_rate": 2.9748e-05, "loss": 0.0139, "step": 9919 }, { "epoch": 24.077764277035236, "grad_norm": 0.33306241035461426, "learning_rate": 2.9751e-05, "loss": 0.0182, "step": 9920 }, { "epoch": 24.080194410692588, "grad_norm": 0.3401792347431183, "learning_rate": 2.9754e-05, "loss": 0.0121, "step": 9921 }, { "epoch": 24.08262454434994, "grad_norm": 0.7117286920547485, "learning_rate": 2.9757e-05, "loss": 0.0274, "step": 9922 }, { "epoch": 24.08505467800729, "grad_norm": 0.5690158009529114, "learning_rate": 2.976e-05, "loss": 0.0663, "step": 9923 }, { "epoch": 24.08748481166464, "grad_norm": 0.34988507628440857, "learning_rate": 2.9763e-05, "loss": 0.0191, "step": 9924 }, { "epoch": 24.089914945321993, "grad_norm": 0.38838130235671997, "learning_rate": 2.9766e-05, "loss": 0.0149, "step": 9925 }, { "epoch": 24.092345078979346, "grad_norm": 0.6634572744369507, "learning_rate": 2.9769e-05, "loss": 0.0699, "step": 9926 }, { "epoch": 24.094775212636694, "grad_norm": 1.367710828781128, "learning_rate": 2.9772e-05, "loss": 0.0167, "step": 9927 }, { "epoch": 24.097205346294047, "grad_norm": 0.4709543287754059, "learning_rate": 2.9775000000000002e-05, "loss": 0.0232, "step": 9928 }, { "epoch": 24.0996354799514, "grad_norm": 0.6120651960372925, "learning_rate": 2.9778000000000002e-05, "loss": 0.0203, "step": 9929 }, { "epoch": 24.102065613608747, "grad_norm": 1.6344505548477173, "learning_rate": 2.9781e-05, "loss": 0.0313, "step": 9930 }, { "epoch": 24.1044957472661, "grad_norm": 0.43032193183898926, "learning_rate": 2.9784e-05, "loss": 0.0223, "step": 9931 }, { "epoch": 24.106925880923452, "grad_norm": 0.4586420953273773, "learning_rate": 2.9787e-05, "loss": 0.0229, "step": 9932 }, { "epoch": 
24.1093560145808, "grad_norm": 0.6385231614112854, "learning_rate": 2.979e-05, "loss": 0.0177, "step": 9933 }, { "epoch": 24.111786148238153, "grad_norm": 0.592793345451355, "learning_rate": 2.9793e-05, "loss": 0.0164, "step": 9934 }, { "epoch": 24.114216281895505, "grad_norm": 0.5401715040206909, "learning_rate": 2.9796e-05, "loss": 0.021, "step": 9935 }, { "epoch": 24.116646415552854, "grad_norm": 0.9308722019195557, "learning_rate": 2.9799e-05, "loss": 0.0393, "step": 9936 }, { "epoch": 24.119076549210206, "grad_norm": 1.6132588386535645, "learning_rate": 2.9802e-05, "loss": 0.0706, "step": 9937 }, { "epoch": 24.121506682867558, "grad_norm": 1.5767607688903809, "learning_rate": 2.9805000000000003e-05, "loss": 0.0349, "step": 9938 }, { "epoch": 24.12393681652491, "grad_norm": 0.8149644732475281, "learning_rate": 2.9808000000000003e-05, "loss": 0.2113, "step": 9939 }, { "epoch": 24.12636695018226, "grad_norm": 0.5573172569274902, "learning_rate": 2.9811000000000003e-05, "loss": 0.1298, "step": 9940 }, { "epoch": 24.12879708383961, "grad_norm": 0.408971905708313, "learning_rate": 2.9814000000000003e-05, "loss": 0.0959, "step": 9941 }, { "epoch": 24.131227217496964, "grad_norm": 1.744966745376587, "learning_rate": 2.9817e-05, "loss": 0.1169, "step": 9942 }, { "epoch": 24.133657351154312, "grad_norm": 0.6319958567619324, "learning_rate": 2.982e-05, "loss": 0.0732, "step": 9943 }, { "epoch": 24.136087484811664, "grad_norm": 0.451099693775177, "learning_rate": 2.9823e-05, "loss": 0.0641, "step": 9944 }, { "epoch": 24.138517618469017, "grad_norm": 0.5564274787902832, "learning_rate": 2.9826e-05, "loss": 0.058, "step": 9945 }, { "epoch": 24.140947752126365, "grad_norm": 0.3943633437156677, "learning_rate": 2.9829e-05, "loss": 0.0399, "step": 9946 }, { "epoch": 24.143377885783718, "grad_norm": 0.6718392372131348, "learning_rate": 2.9831999999999998e-05, "loss": 0.0266, "step": 9947 }, { "epoch": 24.14580801944107, "grad_norm": 0.4752497971057892, "learning_rate": 
2.9835e-05, "loss": 0.0269, "step": 9948 }, { "epoch": 24.148238153098422, "grad_norm": 0.46581774950027466, "learning_rate": 2.9838e-05, "loss": 0.0369, "step": 9949 }, { "epoch": 24.15066828675577, "grad_norm": 0.5211724638938904, "learning_rate": 2.9841e-05, "loss": 0.0425, "step": 9950 }, { "epoch": 24.153098420413123, "grad_norm": 0.6592776775360107, "learning_rate": 2.9844e-05, "loss": 0.0187, "step": 9951 }, { "epoch": 24.155528554070475, "grad_norm": 0.32224977016448975, "learning_rate": 2.9847e-05, "loss": 0.0185, "step": 9952 }, { "epoch": 24.157958687727824, "grad_norm": 0.35219889879226685, "learning_rate": 2.985e-05, "loss": 0.0304, "step": 9953 }, { "epoch": 24.160388821385176, "grad_norm": 0.30147749185562134, "learning_rate": 2.9853e-05, "loss": 0.0178, "step": 9954 }, { "epoch": 24.16281895504253, "grad_norm": 0.4236515760421753, "learning_rate": 2.9856e-05, "loss": 0.0203, "step": 9955 }, { "epoch": 24.165249088699877, "grad_norm": 0.8889481425285339, "learning_rate": 2.9859e-05, "loss": 0.036, "step": 9956 }, { "epoch": 24.16767922235723, "grad_norm": 0.19643396139144897, "learning_rate": 2.9862e-05, "loss": 0.009, "step": 9957 }, { "epoch": 24.17010935601458, "grad_norm": 0.72061687707901, "learning_rate": 2.9865000000000003e-05, "loss": 0.0276, "step": 9958 }, { "epoch": 24.172539489671934, "grad_norm": 0.29193300008773804, "learning_rate": 2.9868000000000003e-05, "loss": 0.0186, "step": 9959 }, { "epoch": 24.174969623329282, "grad_norm": 0.45433545112609863, "learning_rate": 2.9871000000000003e-05, "loss": 0.0162, "step": 9960 }, { "epoch": 24.177399756986635, "grad_norm": 0.5580081939697266, "learning_rate": 2.9874000000000002e-05, "loss": 0.0165, "step": 9961 }, { "epoch": 24.179829890643987, "grad_norm": 0.20160312950611115, "learning_rate": 2.9877000000000002e-05, "loss": 0.0082, "step": 9962 }, { "epoch": 24.182260024301335, "grad_norm": 0.42397162318229675, "learning_rate": 2.9880000000000002e-05, "loss": 0.0277, "step": 9963 }, { 
"epoch": 24.184690157958688, "grad_norm": 0.5721720457077026, "learning_rate": 2.9883000000000002e-05, "loss": 0.0321, "step": 9964 }, { "epoch": 24.18712029161604, "grad_norm": 0.6567350029945374, "learning_rate": 2.9886e-05, "loss": 0.0122, "step": 9965 }, { "epoch": 24.18955042527339, "grad_norm": 0.43051183223724365, "learning_rate": 2.9889e-05, "loss": 0.0133, "step": 9966 }, { "epoch": 24.19198055893074, "grad_norm": 0.8555141091346741, "learning_rate": 2.9891999999999998e-05, "loss": 0.0212, "step": 9967 }, { "epoch": 24.194410692588093, "grad_norm": 0.6625598669052124, "learning_rate": 2.9895e-05, "loss": 0.0211, "step": 9968 }, { "epoch": 24.19684082624544, "grad_norm": 0.46556973457336426, "learning_rate": 2.9898e-05, "loss": 0.0233, "step": 9969 }, { "epoch": 24.199270959902794, "grad_norm": 0.5816047787666321, "learning_rate": 2.9901e-05, "loss": 0.0259, "step": 9970 }, { "epoch": 24.201701093560146, "grad_norm": 0.6936691403388977, "learning_rate": 2.9904e-05, "loss": 0.0212, "step": 9971 }, { "epoch": 24.2041312272175, "grad_norm": 0.545774519443512, "learning_rate": 2.9907e-05, "loss": 0.0155, "step": 9972 }, { "epoch": 24.206561360874847, "grad_norm": 0.45313653349876404, "learning_rate": 2.991e-05, "loss": 0.0179, "step": 9973 }, { "epoch": 24.2089914945322, "grad_norm": 0.37018996477127075, "learning_rate": 2.9913e-05, "loss": 0.0131, "step": 9974 }, { "epoch": 24.21142162818955, "grad_norm": 0.6694053411483765, "learning_rate": 2.9916e-05, "loss": 0.0258, "step": 9975 }, { "epoch": 24.2138517618469, "grad_norm": 0.19510594010353088, "learning_rate": 2.9919e-05, "loss": 0.0086, "step": 9976 }, { "epoch": 24.216281895504252, "grad_norm": 0.5796852111816406, "learning_rate": 2.9922e-05, "loss": 0.0211, "step": 9977 }, { "epoch": 24.218712029161605, "grad_norm": 0.45481279492378235, "learning_rate": 2.9925000000000002e-05, "loss": 0.022, "step": 9978 }, { "epoch": 24.221142162818953, "grad_norm": 0.45854249596595764, "learning_rate": 
2.9928000000000002e-05, "loss": 0.0124, "step": 9979 }, { "epoch": 24.223572296476306, "grad_norm": 0.5010560750961304, "learning_rate": 2.9931000000000002e-05, "loss": 0.0179, "step": 9980 }, { "epoch": 24.226002430133658, "grad_norm": 0.841099202632904, "learning_rate": 2.9934000000000002e-05, "loss": 0.0331, "step": 9981 }, { "epoch": 24.22843256379101, "grad_norm": 0.5056720972061157, "learning_rate": 2.9937e-05, "loss": 0.0157, "step": 9982 }, { "epoch": 24.23086269744836, "grad_norm": 0.9435467720031738, "learning_rate": 2.994e-05, "loss": 0.0283, "step": 9983 }, { "epoch": 24.23329283110571, "grad_norm": 0.728722095489502, "learning_rate": 2.9943e-05, "loss": 0.0217, "step": 9984 }, { "epoch": 24.235722964763063, "grad_norm": 0.5526447892189026, "learning_rate": 2.9946e-05, "loss": 0.0142, "step": 9985 }, { "epoch": 24.238153098420412, "grad_norm": 0.8447514176368713, "learning_rate": 2.9949e-05, "loss": 0.0242, "step": 9986 }, { "epoch": 24.240583232077764, "grad_norm": 0.9752268195152283, "learning_rate": 2.9952e-05, "loss": 0.0304, "step": 9987 }, { "epoch": 24.243013365735116, "grad_norm": 0.9920390248298645, "learning_rate": 2.9955000000000004e-05, "loss": 0.0247, "step": 9988 }, { "epoch": 24.245443499392465, "grad_norm": 0.9220691323280334, "learning_rate": 2.9958000000000004e-05, "loss": 0.1981, "step": 9989 }, { "epoch": 24.247873633049817, "grad_norm": 0.5186508893966675, "learning_rate": 2.9961000000000003e-05, "loss": 0.1413, "step": 9990 }, { "epoch": 24.25030376670717, "grad_norm": 0.4418710470199585, "learning_rate": 2.9964e-05, "loss": 0.0889, "step": 9991 }, { "epoch": 24.25273390036452, "grad_norm": 0.6531668901443481, "learning_rate": 2.9967e-05, "loss": 0.0992, "step": 9992 }, { "epoch": 24.25516403402187, "grad_norm": 0.6637730002403259, "learning_rate": 2.997e-05, "loss": 0.0757, "step": 9993 }, { "epoch": 24.257594167679223, "grad_norm": 0.684723436832428, "learning_rate": 2.9973e-05, "loss": 0.0693, "step": 9994 }, { "epoch": 
24.260024301336575, "grad_norm": 0.5931289792060852, "learning_rate": 2.9976e-05, "loss": 0.0493, "step": 9995 }, { "epoch": 24.262454434993924, "grad_norm": 0.40869542956352234, "learning_rate": 2.9979e-05, "loss": 0.0401, "step": 9996 }, { "epoch": 24.264884568651276, "grad_norm": 0.4746434986591339, "learning_rate": 2.9982e-05, "loss": 0.0346, "step": 9997 }, { "epoch": 24.267314702308628, "grad_norm": 0.33596956729888916, "learning_rate": 2.9985000000000002e-05, "loss": 0.0252, "step": 9998 }, { "epoch": 24.269744835965977, "grad_norm": 0.32638514041900635, "learning_rate": 2.9988e-05, "loss": 0.02, "step": 9999 }, { "epoch": 24.27217496962333, "grad_norm": 0.5542601943016052, "learning_rate": 2.9991e-05, "loss": 0.0288, "step": 10000 }, { "epoch": 24.27217496962333, "eval_cer": 0.09549433780419243, "eval_loss": 0.30927562713623047, "eval_runtime": 8.3831, "eval_samples_per_second": 12.048, "eval_steps_per_second": 0.477, "eval_wer": 0.3002645502645503, "step": 10000 }, { "epoch": 24.27217496962333, "step": 10000, "total_flos": 1.0154443101225655e+20, "train_loss": 0.4037482573915273, "train_runtime": 15149.5057, "train_samples_per_second": 422.456, "train_steps_per_second": 6.601 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 244, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0154443101225655e+20, "train_batch_size": 32, "trial_name": null, "trial_params": null }