{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 406, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024630541871921183, "grad_norm": 11.540680885314941, "learning_rate": 1e-05, "loss": 0.9851, "step": 1 }, { "epoch": 0.0049261083743842365, "grad_norm": 4.776151657104492, "learning_rate": 9.999850312505222e-06, "loss": 0.9615, "step": 2 }, { "epoch": 0.007389162561576354, "grad_norm": 3.0982847213745117, "learning_rate": 9.999401258983426e-06, "loss": 0.9231, "step": 3 }, { "epoch": 0.009852216748768473, "grad_norm": 2.3222382068634033, "learning_rate": 9.998652866321688e-06, "loss": 0.7308, "step": 4 }, { "epoch": 0.012315270935960592, "grad_norm": 1.997889757156372, "learning_rate": 9.997605179330018e-06, "loss": 0.7855, "step": 5 }, { "epoch": 0.014778325123152709, "grad_norm": 1.7656501531600952, "learning_rate": 9.996258260738676e-06, "loss": 0.8678, "step": 6 }, { "epoch": 0.017241379310344827, "grad_norm": 1.3362767696380615, "learning_rate": 9.994612191194407e-06, "loss": 0.7193, "step": 7 }, { "epoch": 0.019704433497536946, "grad_norm": 2.2548017501831055, "learning_rate": 9.99266706925562e-06, "loss": 0.6996, "step": 8 }, { "epoch": 0.022167487684729065, "grad_norm": 1.2499150037765503, "learning_rate": 9.990423011386489e-06, "loss": 0.7632, "step": 9 }, { "epoch": 0.024630541871921183, "grad_norm": 1.1288288831710815, "learning_rate": 9.987880151949976e-06, "loss": 0.7409, "step": 10 }, { "epoch": 0.027093596059113302, "grad_norm": 1.0057581663131714, "learning_rate": 9.98503864319978e-06, "loss": 0.7303, "step": 11 }, { "epoch": 0.029556650246305417, "grad_norm": 0.968294084072113, "learning_rate": 9.981898655271237e-06, "loss": 0.6851, "step": 12 }, { "epoch": 0.03201970443349754, "grad_norm": 1.1969448328018188, "learning_rate": 9.978460376171113e-06, "loss": 0.7647, "step": 13 }, { "epoch": 0.034482758620689655, "grad_norm": 0.983593225479126, "learning_rate": 9.974724011766364e-06, "loss": 0.7308, "step": 14 }, { "epoch": 0.03694581280788178, "grad_norm": 1.9946510791778564, "learning_rate": 9.970689785771798e-06, "loss": 0.6388, "step": 15 }, { "epoch": 0.03940886699507389, "grad_norm": 1.070317029953003, "learning_rate": 9.966357939736692e-06, "loss": 0.6595, "step": 16 }, { "epoch": 0.04187192118226601, "grad_norm": 1.241529941558838, "learning_rate": 9.961728733030318e-06, "loss": 0.8895, "step": 17 }, { "epoch": 0.04433497536945813, "grad_norm": 1.0109727382659912, "learning_rate": 9.956802442826417e-06, "loss": 0.7378, "step": 18 }, { "epoch": 0.046798029556650245, "grad_norm": 1.079637050628662, "learning_rate": 9.951579364086603e-06, "loss": 0.7178, "step": 19 }, { "epoch": 0.04926108374384237, "grad_norm": 1.2305184602737427, "learning_rate": 9.946059809542706e-06, "loss": 0.5985, "step": 20 }, { "epoch": 0.05172413793103448, "grad_norm": 11.067344665527344, "learning_rate": 9.940244109678043e-06, "loss": 0.681, "step": 21 }, { "epoch": 0.054187192118226604, "grad_norm": 1.4613230228424072, "learning_rate": 9.934132612707631e-06, "loss": 0.6258, "step": 22 }, { "epoch": 0.05665024630541872, "grad_norm": 1.041548728942871, "learning_rate": 9.927725684557339e-06, "loss": 0.5983, "step": 23 }, { "epoch": 0.059113300492610835, "grad_norm": 1.0944949388504028, "learning_rate": 9.921023708841975e-06, "loss": 0.6303, "step": 24 }, { "epoch": 0.06157635467980296, "grad_norm": 0.964139461517334, "learning_rate": 9.914027086842323e-06, "loss": 0.6394, "step": 25 }, { "epoch": 0.06403940886699508, "grad_norm": 1.1896731853485107, "learning_rate": 9.90673623748111e-06, "loss": 0.6009, "step": 26 }, { "epoch": 0.0665024630541872, "grad_norm": 1.1914570331573486, "learning_rate": 9.899151597297923e-06, "loss": 0.6132, "step": 27 }, { "epoch": 0.06896551724137931, "grad_norm": 0.910982608795166, "learning_rate": 9.891273620423083e-06, "loss": 0.6051, "step": 28 }, { "epoch": 0.07142857142857142, "grad_norm": 0.9912497401237488, "learning_rate": 9.883102778550434e-06, "loss": 0.5512, "step": 29 }, { "epoch": 0.07389162561576355, "grad_norm": 1.612174153327942, "learning_rate": 9.874639560909118e-06, "loss": 0.5836, "step": 30 }, { "epoch": 0.07635467980295567, "grad_norm": 0.967313289642334, "learning_rate": 9.865884474234275e-06, "loss": 0.6504, "step": 31 }, { "epoch": 0.07881773399014778, "grad_norm": 4.314541339874268, "learning_rate": 9.856838042736698e-06, "loss": 0.654, "step": 32 }, { "epoch": 0.0812807881773399, "grad_norm": 0.9903213977813721, "learning_rate": 9.847500808071458e-06, "loss": 0.5655, "step": 33 }, { "epoch": 0.08374384236453201, "grad_norm": 0.9885880947113037, "learning_rate": 9.837873329305458e-06, "loss": 0.6559, "step": 34 }, { "epoch": 0.08620689655172414, "grad_norm": 0.9196619987487793, "learning_rate": 9.82795618288397e-06, "loss": 0.5714, "step": 35 }, { "epoch": 0.08866995073891626, "grad_norm": 0.9677406549453735, "learning_rate": 9.817749962596115e-06, "loss": 0.5687, "step": 36 }, { "epoch": 0.09113300492610837, "grad_norm": 1.034245491027832, "learning_rate": 9.807255279539313e-06, "loss": 0.6626, "step": 37 }, { "epoch": 0.09359605911330049, "grad_norm": 1.936618685722351, "learning_rate": 9.796472762082687e-06, "loss": 0.6928, "step": 38 }, { "epoch": 0.0960591133004926, "grad_norm": 0.9693952202796936, "learning_rate": 9.78540305582945e-06, "loss": 0.5772, "step": 39 }, { "epoch": 0.09852216748768473, "grad_norm": 1.231882929801941, "learning_rate": 9.77404682357824e-06, "loss": 0.6339, "step": 40 }, { "epoch": 0.10098522167487685, "grad_norm": 0.9423834681510925, "learning_rate": 9.762404745283439e-06, "loss": 0.5313, "step": 41 }, { "epoch": 0.10344827586206896, "grad_norm": 1.3421170711517334, "learning_rate": 9.75047751801446e-06, "loss": 0.6631, "step": 42 }, { "epoch": 0.10591133004926108, "grad_norm": 1.0659502744674683, "learning_rate": 9.738265855914014e-06, "loss": 0.6041, "step": 43 }, { "epoch": 0.10837438423645321, "grad_norm": 14.726941108703613, "learning_rate": 9.725770490155338e-06, "loss": 0.4993, "step": 44 }, { "epoch": 0.11083743842364532, "grad_norm": 1.113370418548584, "learning_rate": 9.712992168898436e-06, "loss": 0.6032, "step": 45 }, { "epoch": 0.11330049261083744, "grad_norm": 4.380392074584961, "learning_rate": 9.699931657245264e-06, "loss": 0.5649, "step": 46 }, { "epoch": 0.11576354679802955, "grad_norm": 1.0713157653808594, "learning_rate": 9.686589737193929e-06, "loss": 0.5833, "step": 47 }, { "epoch": 0.11822660098522167, "grad_norm": 0.79201340675354, "learning_rate": 9.67296720759187e-06, "loss": 0.6057, "step": 48 }, { "epoch": 0.1206896551724138, "grad_norm": 0.8670496940612793, "learning_rate": 9.659064884088017e-06, "loss": 0.5203, "step": 49 }, { "epoch": 0.12315270935960591, "grad_norm": 0.9468395709991455, "learning_rate": 9.644883599083959e-06, "loss": 0.6177, "step": 50 }, { "epoch": 0.12561576354679804, "grad_norm": 1.1626007556915283, "learning_rate": 9.630424201684105e-06, "loss": 0.5259, "step": 51 }, { "epoch": 0.12807881773399016, "grad_norm": 0.8885717391967773, "learning_rate": 9.615687557644848e-06, "loss": 0.5835, "step": 52 }, { "epoch": 0.13054187192118227, "grad_norm": 0.9217110872268677, "learning_rate": 9.600674549322716e-06, "loss": 0.5965, "step": 53 }, { "epoch": 0.1330049261083744, "grad_norm": 0.9233891367912292, "learning_rate": 9.585386075621553e-06, "loss": 0.5679, "step": 54 }, { "epoch": 0.1354679802955665, "grad_norm": 1.285866141319275, "learning_rate": 9.569823051938689e-06, "loss": 0.5387, "step": 55 }, { "epoch": 0.13793103448275862, "grad_norm": 0.8282471895217896, "learning_rate": 9.553986410110135e-06, "loss": 0.476, "step": 56 }, { "epoch": 0.14039408866995073, "grad_norm": 0.9375513792037964, "learning_rate": 9.537877098354787e-06, "loss": 0.5456, "step": 57 }, { "epoch": 0.14285714285714285, "grad_norm": 1.073931097984314, "learning_rate": 9.521496081217652e-06, "loss": 0.4895, "step": 58 }, { "epoch": 0.14532019704433496, "grad_norm": 0.9736307263374329, "learning_rate": 9.504844339512096e-06, "loss": 0.6053, "step": 59 }, { "epoch": 0.1477832512315271, "grad_norm": 1.0761698484420776, "learning_rate": 9.487922870261123e-06, "loss": 0.4654, "step": 60 }, { "epoch": 0.15024630541871922, "grad_norm": 0.9952367544174194, "learning_rate": 9.470732686637665e-06, "loss": 0.5585, "step": 61 }, { "epoch": 0.15270935960591134, "grad_norm": 0.8771824836730957, "learning_rate": 9.453274817903932e-06, "loss": 0.5827, "step": 62 }, { "epoch": 0.15517241379310345, "grad_norm": 1.1052207946777344, "learning_rate": 9.435550309349776e-06, "loss": 0.5705, "step": 63 }, { "epoch": 0.15763546798029557, "grad_norm": 0.8488280177116394, "learning_rate": 9.417560222230115e-06, "loss": 0.5058, "step": 64 }, { "epoch": 0.16009852216748768, "grad_norm": 0.9430592656135559, "learning_rate": 9.399305633701372e-06, "loss": 0.506, "step": 65 }, { "epoch": 0.1625615763546798, "grad_norm": 1.6385177373886108, "learning_rate": 9.380787636757002e-06, "loss": 0.6934, "step": 66 }, { "epoch": 0.16502463054187191, "grad_norm": 0.9341111183166504, "learning_rate": 9.36200734016203e-06, "loss": 0.6388, "step": 67 }, { "epoch": 0.16748768472906403, "grad_norm": 0.7698712944984436, "learning_rate": 9.342965868386674e-06, "loss": 0.4639, "step": 68 }, { "epoch": 0.16995073891625614, "grad_norm": 0.8888192176818848, "learning_rate": 9.32366436153902e-06, "loss": 0.5409, "step": 69 }, { "epoch": 0.1724137931034483, "grad_norm": 1.2890245914459229, "learning_rate": 9.30410397529675e-06, "loss": 0.6643, "step": 70 }, { "epoch": 0.1748768472906404, "grad_norm": 0.8433436155319214, "learning_rate": 9.284285880837947e-06, "loss": 0.5345, "step": 71 }, { "epoch": 0.17733990147783252, "grad_norm": 0.8337154388427734, "learning_rate": 9.264211264770977e-06, "loss": 0.4866, "step": 72 }, { "epoch": 0.17980295566502463, "grad_norm": 0.8894972801208496, "learning_rate": 9.243881329063436e-06, "loss": 0.5478, "step": 73 }, { "epoch": 0.18226600985221675, "grad_norm": 0.8818243741989136, "learning_rate": 9.22329729097018e-06, "loss": 0.5189, "step": 74 }, { "epoch": 0.18472906403940886, "grad_norm": 1.0139381885528564, "learning_rate": 9.202460382960449e-06, "loss": 0.5413, "step": 75 }, { "epoch": 0.18719211822660098, "grad_norm": 1.144716739654541, "learning_rate": 9.181371852644063e-06, "loss": 0.5689, "step": 76 }, { "epoch": 0.1896551724137931, "grad_norm": 1.1229621171951294, "learning_rate": 9.160032962696734e-06, "loss": 0.5018, "step": 77 }, { "epoch": 0.1921182266009852, "grad_norm": 1.20614755153656, "learning_rate": 9.138444990784455e-06, "loss": 0.5297, "step": 78 }, { "epoch": 0.19458128078817735, "grad_norm": 1.7722558975219727, "learning_rate": 9.116609229486992e-06, "loss": 0.4937, "step": 79 }, { "epoch": 0.19704433497536947, "grad_norm": 1.1039201021194458, "learning_rate": 9.094526986220513e-06, "loss": 0.5181, "step": 80 }, { "epoch": 0.19950738916256158, "grad_norm": 1.0505284070968628, "learning_rate": 9.072199583159285e-06, "loss": 0.6021, "step": 81 }, { "epoch": 0.2019704433497537, "grad_norm": 0.9036574959754944, "learning_rate": 9.049628357156522e-06, "loss": 0.5434, "step": 82 }, { "epoch": 0.2044334975369458, "grad_norm": 1.5033870935440063, "learning_rate": 9.026814659664331e-06, "loss": 0.462, "step": 83 }, { "epoch": 0.20689655172413793, "grad_norm": 0.9125608205795288, "learning_rate": 9.003759856652803e-06, "loss": 0.456, "step": 84 }, { "epoch": 0.20935960591133004, "grad_norm": 3.8365941047668457, "learning_rate": 8.98046532852822e-06, "loss": 0.4463, "step": 85 }, { "epoch": 0.21182266009852216, "grad_norm": 1.0002238750457764, "learning_rate": 8.956932470050405e-06, "loss": 0.5616, "step": 86 }, { "epoch": 0.21428571428571427, "grad_norm": 0.807572603225708, "learning_rate": 8.93316269024921e-06, "loss": 0.4795, "step": 87 }, { "epoch": 0.21674876847290642, "grad_norm": 0.7959713935852051, "learning_rate": 8.90915741234015e-06, "loss": 0.4098, "step": 88 }, { "epoch": 0.21921182266009853, "grad_norm": 2.6254165172576904, "learning_rate": 8.88491807363919e-06, "loss": 0.6164, "step": 89 }, { "epoch": 0.22167487684729065, "grad_norm": 0.8573901653289795, "learning_rate": 8.860446125476688e-06, "loss": 0.5426, "step": 90 }, { "epoch": 0.22413793103448276, "grad_norm": 0.8792327046394348, "learning_rate": 8.835743033110482e-06, "loss": 0.5441, "step": 91 }, { "epoch": 0.22660098522167488, "grad_norm": 1.1924870014190674, "learning_rate": 8.810810275638183e-06, "loss": 0.4542, "step": 92 }, { "epoch": 0.229064039408867, "grad_norm": 0.7673595547676086, "learning_rate": 8.78564934590859e-06, "loss": 0.4864, "step": 93 }, { "epoch": 0.2315270935960591, "grad_norm": 1.037055492401123, "learning_rate": 8.760261750432312e-06, "loss": 0.5286, "step": 94 }, { "epoch": 0.23399014778325122, "grad_norm": 1.1282660961151123, "learning_rate": 8.734649009291586e-06, "loss": 0.5892, "step": 95 }, { "epoch": 0.23645320197044334, "grad_norm": 0.9695258140563965, "learning_rate": 8.708812656049227e-06, "loss": 0.5851, "step": 96 }, { "epoch": 0.23891625615763548, "grad_norm": 0.9574100971221924, "learning_rate": 8.68275423765683e-06, "loss": 0.5058, "step": 97 }, { "epoch": 0.2413793103448276, "grad_norm": 0.8745037317276001, "learning_rate": 8.656475314362149e-06, "loss": 0.4157, "step": 98 }, { "epoch": 0.2438423645320197, "grad_norm": 0.8950684070587158, "learning_rate": 8.629977459615655e-06, "loss": 0.4444, "step": 99 }, { "epoch": 0.24630541871921183, "grad_norm": 1.0956140756607056, "learning_rate": 8.603262259976348e-06, "loss": 0.5102, "step": 100 }, { "epoch": 0.24876847290640394, "grad_norm": 0.9333466291427612, "learning_rate": 8.576331315016753e-06, "loss": 0.5257, "step": 101 }, { "epoch": 0.2512315270935961, "grad_norm": 0.9434043169021606, "learning_rate": 8.549186237227138e-06, "loss": 0.532, "step": 102 }, { "epoch": 0.2536945812807882, "grad_norm": 1.0310450792312622, "learning_rate": 8.521828651918983e-06, "loss": 0.5588, "step": 103 }, { "epoch": 0.2561576354679803, "grad_norm": 1.0325108766555786, "learning_rate": 8.49426019712765e-06, "loss": 0.4569, "step": 104 }, { "epoch": 0.25862068965517243, "grad_norm": 1.030055284500122, "learning_rate": 8.46648252351431e-06, "loss": 0.5172, "step": 105 }, { "epoch": 0.26108374384236455, "grad_norm": 0.8964868783950806, "learning_rate": 8.438497294267117e-06, "loss": 0.5143, "step": 106 }, { "epoch": 0.26354679802955666, "grad_norm": 1.2119203805923462, "learning_rate": 8.41030618500161e-06, "loss": 0.4589, "step": 107 }, { "epoch": 0.2660098522167488, "grad_norm": 0.880455732345581, "learning_rate": 8.3819108836604e-06, "loss": 0.4789, "step": 108 }, { "epoch": 0.2684729064039409, "grad_norm": 1.026513695716858, "learning_rate": 8.353313090412093e-06, "loss": 0.4132, "step": 109 }, { "epoch": 0.270935960591133, "grad_norm": 0.8435222506523132, "learning_rate": 8.3245145175495e-06, "loss": 0.4623, "step": 110 }, { "epoch": 0.2733990147783251, "grad_norm": 0.8828105926513672, "learning_rate": 8.295516889387115e-06, "loss": 0.4267, "step": 111 }, { "epoch": 0.27586206896551724, "grad_norm": 0.7920165061950684, "learning_rate": 8.26632194215786e-06, "loss": 0.4472, "step": 112 }, { "epoch": 0.27832512315270935, "grad_norm": 1.1182401180267334, "learning_rate": 8.23693142390914e-06, "loss": 0.4812, "step": 113 }, { "epoch": 0.28078817733990147, "grad_norm": 1.444700002670288, "learning_rate": 8.207347094398173e-06, "loss": 0.4815, "step": 114 }, { "epoch": 0.2832512315270936, "grad_norm": 0.9819019436836243, "learning_rate": 8.177570724986627e-06, "loss": 0.47, "step": 115 }, { "epoch": 0.2857142857142857, "grad_norm": 0.84306401014328, "learning_rate": 8.14760409853456e-06, "loss": 0.547, "step": 116 }, { "epoch": 0.2881773399014778, "grad_norm": 0.9470686912536621, "learning_rate": 8.117449009293668e-06, "loss": 0.4874, "step": 117 }, { "epoch": 0.29064039408866993, "grad_norm": 0.7772971987724304, "learning_rate": 8.087107262799856e-06, "loss": 0.5282, "step": 118 }, { "epoch": 0.29310344827586204, "grad_norm": 0.9155469536781311, "learning_rate": 8.05658067576513e-06, "loss": 0.4475, "step": 119 }, { "epoch": 0.2955665024630542, "grad_norm": 0.823267936706543, "learning_rate": 8.025871075968828e-06, "loss": 0.4273, "step": 120 }, { "epoch": 0.29802955665024633, "grad_norm": 0.921981155872345, "learning_rate": 7.99498030214817e-06, "loss": 0.5247, "step": 121 }, { "epoch": 0.30049261083743845, "grad_norm": 0.8674582839012146, "learning_rate": 7.963910203888177e-06, "loss": 0.4733, "step": 122 }, { "epoch": 0.30295566502463056, "grad_norm": 0.895481526851654, "learning_rate": 7.932662641510915e-06, "loss": 0.5376, "step": 123 }, { "epoch": 0.3054187192118227, "grad_norm": 0.7914291024208069, "learning_rate": 7.90123948596412e-06, "loss": 0.4572, "step": 124 }, { "epoch": 0.3078817733990148, "grad_norm": 1.2109417915344238, "learning_rate": 7.869642618709162e-06, "loss": 0.4873, "step": 125 }, { "epoch": 0.3103448275862069, "grad_norm": 0.9359944462776184, "learning_rate": 7.8378739316084e-06, "loss": 0.4277, "step": 126 }, { "epoch": 0.312807881773399, "grad_norm": 0.8828278183937073, "learning_rate": 7.805935326811913e-06, "loss": 0.5501, "step": 127 }, { "epoch": 0.31527093596059114, "grad_norm": 1.0584925413131714, "learning_rate": 7.773828716643592e-06, "loss": 0.4268, "step": 128 }, { "epoch": 0.31773399014778325, "grad_norm": 1.5920400619506836, "learning_rate": 7.741556023486655e-06, "loss": 0.5066, "step": 129 }, { "epoch": 0.32019704433497537, "grad_norm": 0.8200149536132812, "learning_rate": 7.709119179668538e-06, "loss": 0.419, "step": 130 }, { "epoch": 0.3226600985221675, "grad_norm": 1.1859734058380127, "learning_rate": 7.676520127345198e-06, "loss": 0.496, "step": 131 }, { "epoch": 0.3251231527093596, "grad_norm": 0.9637341499328613, "learning_rate": 7.64376081838482e-06, "loss": 0.5875, "step": 132 }, { "epoch": 0.3275862068965517, "grad_norm": 0.9584794640541077, "learning_rate": 7.610843214250964e-06, "loss": 0.4876, "step": 133 }, { "epoch": 0.33004926108374383, "grad_norm": 0.9975658059120178, "learning_rate": 7.57776928588511e-06, "loss": 0.4481, "step": 134 }, { "epoch": 0.33251231527093594, "grad_norm": 1.1614975929260254, "learning_rate": 7.5445410135886455e-06, "loss": 0.5802, "step": 135 }, { "epoch": 0.33497536945812806, "grad_norm": 2.236684799194336, "learning_rate": 7.511160386904306e-06, "loss": 0.4955, "step": 136 }, { "epoch": 0.3374384236453202, "grad_norm": 0.8656754493713379, "learning_rate": 7.477629404497048e-06, "loss": 0.5312, "step": 137 }, { "epoch": 0.3399014778325123, "grad_norm": 0.7941805124282837, "learning_rate": 7.4439500740343685e-06, "loss": 0.4797, "step": 138 }, { "epoch": 0.34236453201970446, "grad_norm": 0.8105161190032959, "learning_rate": 7.4101244120661105e-06, "loss": 0.4181, "step": 139 }, { "epoch": 0.3448275862068966, "grad_norm": 2.2261204719543457, "learning_rate": 7.376154443903714e-06, "loss": 0.5089, "step": 140 }, { "epoch": 0.3472906403940887, "grad_norm": 0.9390556812286377, "learning_rate": 7.342042203498952e-06, "loss": 0.5437, "step": 141 }, { "epoch": 0.3497536945812808, "grad_norm": 0.8173288702964783, "learning_rate": 7.307789733322146e-06, "loss": 0.4328, "step": 142 }, { "epoch": 0.3522167487684729, "grad_norm": 1.0669867992401123, "learning_rate": 7.273399084239878e-06, "loss": 0.4557, "step": 143 }, { "epoch": 0.35467980295566504, "grad_norm": 2.4072816371917725, "learning_rate": 7.238872315392189e-06, "loss": 0.4371, "step": 144 }, { "epoch": 0.35714285714285715, "grad_norm": 0.8072436451911926, "learning_rate": 7.204211494069292e-06, "loss": 0.3932, "step": 145 }, { "epoch": 0.35960591133004927, "grad_norm": 1.3799082040786743, "learning_rate": 7.169418695587791e-06, "loss": 0.4526, "step": 146 }, { "epoch": 0.3620689655172414, "grad_norm": 0.9200634956359863, "learning_rate": 7.134496003166423e-06, "loss": 0.4379, "step": 147 }, { "epoch": 0.3645320197044335, "grad_norm": 2.47597336769104, "learning_rate": 7.099445507801324e-06, "loss": 0.4638, "step": 148 }, { "epoch": 0.3669950738916256, "grad_norm": 0.9331015348434448, "learning_rate": 7.06426930814083e-06, "loss": 0.4635, "step": 149 }, { "epoch": 0.3694581280788177, "grad_norm": 0.8710178732872009, "learning_rate": 7.028969510359821e-06, "loss": 0.3501, "step": 150 }, { "epoch": 0.37192118226600984, "grad_norm": 1.0825324058532715, "learning_rate": 6.993548228033618e-06, "loss": 0.4367, "step": 151 }, { "epoch": 0.37438423645320196, "grad_norm": 0.8021703362464905, "learning_rate": 6.9580075820114255e-06, "loss": 0.5561, "step": 152 }, { "epoch": 0.3768472906403941, "grad_norm": 0.799213707447052, "learning_rate": 6.922349700289348e-06, "loss": 0.4704, "step": 153 }, { "epoch": 0.3793103448275862, "grad_norm": 1.3380686044692993, "learning_rate": 6.886576717882982e-06, "loss": 0.4525, "step": 154 }, { "epoch": 0.3817733990147783, "grad_norm": 0.807714581489563, "learning_rate": 6.850690776699574e-06, "loss": 0.3812, "step": 155 }, { "epoch": 0.3842364532019704, "grad_norm": 0.8078760504722595, "learning_rate": 6.814694025409773e-06, "loss": 0.4816, "step": 156 }, { "epoch": 0.3866995073891626, "grad_norm": 0.884308934211731, "learning_rate": 6.7785886193189936e-06, "loss": 0.5723, "step": 157 }, { "epoch": 0.3891625615763547, "grad_norm": 0.8629475831985474, "learning_rate": 6.742376720238346e-06, "loss": 0.4831, "step": 158 }, { "epoch": 0.3916256157635468, "grad_norm": 0.8618782162666321, "learning_rate": 6.7060604963552125e-06, "loss": 0.4798, "step": 159 }, { "epoch": 0.39408866995073893, "grad_norm": 0.7644566893577576, "learning_rate": 6.669642122103423e-06, "loss": 0.4965, "step": 160 }, { "epoch": 0.39655172413793105, "grad_norm": 1.092326283454895, "learning_rate": 6.633123778033061e-06, "loss": 0.4821, "step": 161 }, { "epoch": 0.39901477832512317, "grad_norm": 0.9640597105026245, "learning_rate": 6.5965076506799e-06, "loss": 0.39, "step": 162 }, { "epoch": 0.4014778325123153, "grad_norm": 1.1585789918899536, "learning_rate": 6.559795932434489e-06, "loss": 0.4484, "step": 163 }, { "epoch": 0.4039408866995074, "grad_norm": 0.8530002236366272, "learning_rate": 6.522990821410881e-06, "loss": 0.3783, "step": 164 }, { "epoch": 0.4064039408866995, "grad_norm": 0.8914724588394165, "learning_rate": 6.486094521315022e-06, "loss": 0.4685, "step": 165 }, { "epoch": 0.4088669950738916, "grad_norm": 0.7827121019363403, "learning_rate": 6.449109241312803e-06, "loss": 0.4192, "step": 166 }, { "epoch": 0.41133004926108374, "grad_norm": 1.0978766679763794, "learning_rate": 6.412037195897786e-06, "loss": 0.408, "step": 167 }, { "epoch": 0.41379310344827586, "grad_norm": 0.8382684588432312, "learning_rate": 6.3748806047586155e-06, "loss": 0.457, "step": 168 }, { "epoch": 0.41625615763546797, "grad_norm": 1.073023796081543, "learning_rate": 6.337641692646106e-06, "loss": 0.4697, "step": 169 }, { "epoch": 0.4187192118226601, "grad_norm": 0.9375430941581726, "learning_rate": 6.300322689240042e-06, "loss": 0.4769, "step": 170 }, { "epoch": 0.4211822660098522, "grad_norm": 0.8222770690917969, "learning_rate": 6.262925829015675e-06, "loss": 0.4652, "step": 171 }, { "epoch": 0.4236453201970443, "grad_norm": 0.9433236718177795, "learning_rate": 6.2254533511099345e-06, "loss": 0.4507, "step": 172 }, { "epoch": 0.42610837438423643, "grad_norm": 0.8848790526390076, "learning_rate": 6.187907499187357e-06, "loss": 0.4219, "step": 173 }, { "epoch": 0.42857142857142855, "grad_norm": 0.8615171313285828, "learning_rate": 6.150290521305746e-06, "loss": 0.4878, "step": 174 }, { "epoch": 0.43103448275862066, "grad_norm": 0.8783863186836243, "learning_rate": 6.112604669781572e-06, "loss": 0.4351, "step": 175 }, { "epoch": 0.43349753694581283, "grad_norm": 0.8565014004707336, "learning_rate": 6.074852201055121e-06, "loss": 0.4027, "step": 176 }, { "epoch": 0.43596059113300495, "grad_norm": 0.8122984766960144, "learning_rate": 6.037035375555376e-06, "loss": 0.4788, "step": 177 }, { "epoch": 0.43842364532019706, "grad_norm": 0.8158956170082092, "learning_rate": 5.9991564575646855e-06, "loss": 0.4449, "step": 178 }, { "epoch": 0.4408866995073892, "grad_norm": 0.8699731826782227, "learning_rate": 5.961217715083185e-06, "loss": 0.4864, "step": 179 }, { "epoch": 0.4433497536945813, "grad_norm": 0.9397453665733337, "learning_rate": 5.923221419693002e-06, "loss": 0.518, "step": 180 }, { "epoch": 0.4458128078817734, "grad_norm": 0.8958430290222168, "learning_rate": 5.885169846422242e-06, "loss": 0.4862, "step": 181 }, { "epoch": 0.4482758620689655, "grad_norm": 0.8838856816291809, "learning_rate": 5.847065273608777e-06, "loss": 0.4701, "step": 182 }, { "epoch": 0.45073891625615764, "grad_norm": 0.8200072050094604, "learning_rate": 5.808909982763825e-06, "loss": 0.3679, "step": 183 }, { "epoch": 0.45320197044334976, "grad_norm": 0.931821346282959, "learning_rate": 5.770706258435342e-06, "loss": 0.4082, "step": 184 }, { "epoch": 0.45566502463054187, "grad_norm": 1.0416653156280518, "learning_rate": 5.732456388071247e-06, "loss": 0.4608, "step": 185 }, { "epoch": 0.458128078817734, "grad_norm": 0.8697349429130554, "learning_rate": 5.6941626618824445e-06, "loss": 0.5462, "step": 186 }, { "epoch": 0.4605911330049261, "grad_norm": 0.760611355304718, "learning_rate": 5.655827372705712e-06, "loss": 0.4358, "step": 187 }, { "epoch": 0.4630541871921182, "grad_norm": 0.8112332820892334, "learning_rate": 5.61745281586641e-06, "loss": 0.5072, "step": 188 }, { "epoch": 0.46551724137931033, "grad_norm": 0.857423722743988, "learning_rate": 5.579041289041045e-06, "loss": 0.4662, "step": 189 }, { "epoch": 0.46798029556650245, "grad_norm": 0.9442394375801086, "learning_rate": 5.540595092119709e-06, "loss": 0.5124, "step": 190 }, { "epoch": 0.47044334975369456, "grad_norm": 0.7853634357452393, "learning_rate": 5.502116527068363e-06, "loss": 0.4048, "step": 191 }, { "epoch": 0.4729064039408867, "grad_norm": 1.0823391675949097, "learning_rate": 5.463607897791006e-06, "loss": 0.4303, "step": 192 }, { "epoch": 0.4753694581280788, "grad_norm": 1.0174390077590942, "learning_rate": 5.425071509991737e-06, "loss": 0.467, "step": 193 }, { "epoch": 0.47783251231527096, "grad_norm": 0.9144822955131531, "learning_rate": 5.386509671036695e-06, "loss": 0.4526, "step": 194 }, { "epoch": 0.4802955665024631, "grad_norm": 0.9582109451293945, "learning_rate": 5.347924689815906e-06, "loss": 0.4019, "step": 195 }, { "epoch": 0.4827586206896552, "grad_norm": 0.7563319802284241, "learning_rate": 5.309318876605043e-06, "loss": 0.4331, "step": 196 }, { "epoch": 0.4852216748768473, "grad_norm": 0.8050902485847473, "learning_rate": 5.270694542927089e-06, "loss": 0.4731, "step": 197 }, { "epoch": 0.4876847290640394, "grad_norm": 0.9717739820480347, "learning_rate": 5.2320540014139405e-06, "loss": 0.4701, "step": 198 }, { "epoch": 0.49014778325123154, "grad_norm": 0.9432171583175659, "learning_rate": 5.193399565667945e-06, "loss": 0.3581, "step": 199 }, { "epoch": 0.49261083743842365, "grad_norm": 1.2452778816223145, "learning_rate": 5.154733550123357e-06, "loss": 0.4548, "step": 200 }, { "epoch": 0.49507389162561577, "grad_norm": 0.78749680519104, "learning_rate": 5.116058269907779e-06, "loss": 0.421, "step": 201 }, { "epoch": 0.4975369458128079, "grad_norm": 0.8748036026954651, "learning_rate": 5.077376040703533e-06, "loss": 0.4386, "step": 202 }, { "epoch": 0.5, "grad_norm": 0.8578478693962097, "learning_rate": 5.038689178609011e-06, "loss": 0.5316, "step": 203 }, { "epoch": 0.5024630541871922, "grad_norm": 0.7721994519233704, "learning_rate": 5e-06, "loss": 0.4235, "step": 204 }, { "epoch": 0.5049261083743842, "grad_norm": 0.7942141890525818, "learning_rate": 4.96131082139099e-06, "loss": 0.4798, "step": 205 }, { "epoch": 0.5073891625615764, "grad_norm": 1.1543359756469727, "learning_rate": 4.922623959296469e-06, "loss": 0.4402, "step": 206 }, { "epoch": 0.5098522167487685, "grad_norm": 0.9413177967071533, "learning_rate": 4.883941730092222e-06, "loss": 0.5872, "step": 207 }, { "epoch": 0.5123152709359606, "grad_norm": 0.7829416394233704, "learning_rate": 4.845266449876646e-06, "loss": 0.4577, "step": 208 }, { "epoch": 0.5147783251231527, "grad_norm": 0.7771807312965393, "learning_rate": 4.806600434332056e-06, "loss": 0.4063, "step": 209 }, { "epoch": 0.5172413793103449, "grad_norm": 0.8723726868629456, "learning_rate": 4.76794599858606e-06, "loss": 0.4745, "step": 210 }, { "epoch": 0.5197044334975369, "grad_norm": 0.8430385589599609, "learning_rate": 4.729305457072913e-06, "loss": 0.4174, "step": 211 }, { "epoch": 0.5221674876847291, "grad_norm": 0.6947190165519714, "learning_rate": 4.690681123394959e-06, "loss": 0.378, "step": 212 }, { "epoch": 0.5246305418719212, "grad_norm": 0.932201623916626, "learning_rate": 4.6520753101840945e-06, "loss": 0.4578, "step": 213 }, { "epoch": 0.5270935960591133, "grad_norm": 0.8713343739509583, "learning_rate": 4.613490328963307e-06, "loss": 0.39, "step": 214 }, { "epoch": 0.5295566502463054, "grad_norm": 0.7538230419158936, "learning_rate": 4.574928490008264e-06, "loss": 0.3857, "step": 215 }, { "epoch": 0.5320197044334976, "grad_norm": 0.8916656970977783, "learning_rate": 4.536392102208998e-06, "loss": 0.5202, "step": 216 }, { "epoch": 0.5344827586206896, "grad_norm": 0.8922424912452698, "learning_rate": 4.497883472931639e-06, "loss": 0.4081, "step": 217 }, { "epoch": 0.5369458128078818, "grad_norm": 1.8260999917984009, "learning_rate": 4.459404907880293e-06, "loss": 0.4833, "step": 218 }, { "epoch": 0.5394088669950738, "grad_norm": 1.0410164594650269, "learning_rate": 4.4209587109589565e-06, "loss": 0.3233, "step": 219 }, { "epoch": 0.541871921182266, "grad_norm": 0.9134721755981445, "learning_rate": 4.382547184133593e-06, "loss": 0.373, "step": 220 }, { "epoch": 0.5443349753694581, "grad_norm": 0.8863281607627869, "learning_rate": 4.3441726272942895e-06, "loss": 0.3112, "step": 221 }, { "epoch": 0.5467980295566502, "grad_norm": 0.7182394862174988, "learning_rate": 4.305837338117557e-06, "loss": 0.4257, "step": 222 }, { "epoch": 0.5492610837438424, "grad_norm": 1.0430289506912231, "learning_rate": 4.267543611928755e-06, "loss": 0.4123, "step": 223 }, { "epoch": 0.5517241379310345, "grad_norm": 0.9563698768615723, "learning_rate": 4.229293741564658e-06, "loss": 0.5008, "step": 224 }, { "epoch": 0.5541871921182266, "grad_norm": 1.4074031114578247, "learning_rate": 4.191090017236177e-06, "loss": 0.4318, "step": 225 }, { "epoch": 0.5566502463054187, "grad_norm": 0.8157017230987549, "learning_rate": 4.152934726391223e-06, "loss": 0.5214, "step": 226 }, { "epoch": 0.5591133004926109, "grad_norm": 0.7647298574447632, "learning_rate": 4.114830153577759e-06, "loss": 0.3445, "step": 227 }, { "epoch": 0.5615763546798029, "grad_norm": 1.0865991115570068, "learning_rate": 4.076778580306999e-06, "loss": 0.3936, "step": 228 }, { "epoch": 0.5640394088669951, "grad_norm": 1.014061450958252, "learning_rate": 4.0387822849168165e-06, "loss": 0.3716, "step": 229 }, { "epoch": 0.5665024630541872, "grad_norm": 0.7353817224502563, "learning_rate": 4.000843542435315e-06, "loss": 0.3828, "step": 230 }, { "epoch": 0.5689655172413793, "grad_norm": 1.005282998085022, "learning_rate": 3.962964624444625e-06, "loss": 0.4572, "step": 231 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9712240099906921, "learning_rate": 3.92514779894488e-06, "loss": 0.3327, "step": 232 }, { "epoch": 0.5738916256157636, "grad_norm": 0.7928905487060547, "learning_rate": 3.887395330218429e-06, "loss": 0.3351, "step": 233 }, { "epoch": 0.5763546798029556, "grad_norm": 0.8094484806060791, "learning_rate": 3.849709478694256e-06, "loss": 0.4305, "step": 234 }, { "epoch": 0.5788177339901478, "grad_norm": 3.6431384086608887, "learning_rate": 3.8120925008126457e-06, "loss": 0.4964, "step": 235 }, { "epoch": 0.5812807881773399, "grad_norm": 0.9250912666320801, "learning_rate": 3.7745466488900663e-06, "loss": 0.4114, "step": 236 }, { "epoch": 0.583743842364532, "grad_norm": 1.0961583852767944, "learning_rate": 3.7370741709843263e-06, "loss": 0.4588, "step": 237 }, { "epoch": 0.5862068965517241, "grad_norm": 0.857764482498169, "learning_rate": 3.6996773107599605e-06, "loss": 0.3729, "step": 238 }, { "epoch": 0.5886699507389163, "grad_norm": 1.0068374872207642, "learning_rate": 3.662358307353897e-06, "loss": 0.4843, "step": 239 }, { "epoch": 0.5911330049261084, "grad_norm": 0.8471587300300598, "learning_rate": 3.6251193952413866e-06, "loss": 0.4503, "step": 240 }, { "epoch": 0.5935960591133005, "grad_norm": 0.8168319463729858, "learning_rate": 3.587962804102214e-06, "loss": 0.3802, "step": 241 }, { "epoch": 0.5960591133004927, "grad_norm": 0.8261502981185913, "learning_rate": 3.550890758687199e-06, "loss": 0.4659, "step": 242 }, { "epoch": 0.5985221674876847, "grad_norm": 1.202130913734436, "learning_rate": 3.5139054786849787e-06, "loss": 0.4397, "step": 243 }, { "epoch": 0.6009852216748769, "grad_norm": 0.8035978078842163, "learning_rate": 3.4770091785891207e-06, "loss": 0.4111, "step": 244 }, { "epoch": 0.603448275862069, "grad_norm": 2.1057868003845215, "learning_rate": 3.440204067565511e-06, "loss": 0.4438, "step": 245 }, { "epoch": 0.6059113300492611, "grad_norm": 0.8185580372810364, "learning_rate": 3.403492349320101e-06, "loss": 0.3776, "step": 246 }, { "epoch": 0.6083743842364532, "grad_norm": 0.8482059240341187, "learning_rate": 3.3668762219669393e-06, "loss": 0.4184, "step": 247 }, { "epoch": 0.6108374384236454, "grad_norm": 0.8905600309371948, "learning_rate": 3.330357877896577e-06, "loss": 0.4145, "step": 248 }, { "epoch": 0.6133004926108374, "grad_norm": 0.7447388172149658, "learning_rate": 3.293939503644788e-06, "loss": 0.4, "step": 249 }, { "epoch": 0.6157635467980296, "grad_norm": 1.7505443096160889, "learning_rate": 3.2576232797616556e-06, "loss": 0.3793, "step": 250 }, { "epoch": 0.6182266009852216, "grad_norm": 0.9543902277946472, "learning_rate": 3.2214113806810077e-06, "loss": 0.3999, "step": 251 }, { "epoch": 0.6206896551724138, "grad_norm": 0.889468789100647, "learning_rate": 3.1853059745902287e-06, "loss": 0.3603, "step": 252 }, { "epoch": 0.6231527093596059, "grad_norm": 0.9314963817596436, "learning_rate": 3.149309223300428e-06, "loss": 0.4, "step": 253 }, { "epoch": 0.625615763546798, "grad_norm": 0.7808181047439575, "learning_rate": 3.1134232821170202e-06, "loss": 0.4358, "step": 254 }, { "epoch": 0.6280788177339901, "grad_norm": 0.8075929284095764, "learning_rate": 3.0776502997106526e-06, "loss": 0.4397, "step": 255 }, { "epoch": 0.6305418719211823, "grad_norm": 0.991936445236206, "learning_rate": 3.041992417988577e-06, "loss": 0.3703, "step": 256 }, { "epoch": 0.6330049261083743, "grad_norm": 0.9331387877464294, "learning_rate": 3.0064517719663833e-06, "loss": 0.4161, "step": 257 }, { "epoch": 0.6354679802955665, "grad_norm": 0.9041942358016968, "learning_rate": 2.9710304896401803e-06, "loss": 0.3642, "step": 258 }, { "epoch": 0.6379310344827587, "grad_norm": 1.105132818222046, "learning_rate": 2.935730691859172e-06, "loss": 0.3843, "step": 259 }, { "epoch": 0.6403940886699507, "grad_norm": 0.7953276634216309, "learning_rate": 2.9005544921986774e-06, "loss": 0.3583, "step": 260 }, { "epoch": 0.6428571428571429, "grad_norm": 0.9104849696159363, "learning_rate": 2.8655039968335774e-06, "loss": 0.4262, "step": 261 }, { "epoch": 0.645320197044335, "grad_norm": 1.0020502805709839, "learning_rate": 2.83058130441221e-06, "loss": 0.3959, "step": 262 }, { "epoch": 0.6477832512315271, "grad_norm": 1.0070898532867432, "learning_rate": 2.7957885059307097e-06, "loss": 0.4128, "step": 263 }, { "epoch": 0.6502463054187192, "grad_norm": 0.8092946410179138, "learning_rate": 2.761127684607811e-06, "loss": 0.4152, "step": 264 }, { "epoch": 0.6527093596059114, "grad_norm": 0.9570103883743286, "learning_rate": 2.7266009157601226e-06, "loss": 0.4408, "step": 265 }, { "epoch": 0.6551724137931034, "grad_norm": 0.9353964924812317, "learning_rate": 2.692210266677855e-06, "loss": 0.4548, "step": 266 }, { "epoch": 0.6576354679802956, "grad_norm": 0.7735761404037476, "learning_rate": 2.65795779650105e-06, "loss": 0.4139, "step": 267 }, { "epoch": 0.6600985221674877, "grad_norm": 0.8776670098304749, "learning_rate": 2.6238455560962884e-06, "loss": 0.3719, "step": 268 }, { "epoch": 0.6625615763546798, "grad_norm": 0.9098663330078125, "learning_rate": 2.589875587933892e-06, "loss": 0.4113, "step": 269 }, { "epoch": 0.6650246305418719, "grad_norm": 3.0552971363067627, "learning_rate": 2.5560499259656323e-06, "loss": 0.4058, "step": 270 }, { "epoch": 0.6674876847290641, "grad_norm": 0.7983712553977966, "learning_rate": 2.522370595502954e-06, "loss": 0.4385, "step": 271 }, { "epoch": 0.6699507389162561, "grad_norm": 0.695418119430542, "learning_rate": 2.488839613095695e-06, "loss": 0.3056, "step": 272 }, { "epoch": 0.6724137931034483, "grad_norm": 1.0092236995697021, "learning_rate": 2.4554589864113566e-06, "loss": 0.3816, "step": 273 }, { "epoch": 0.6748768472906403, "grad_norm": 0.7949216961860657, "learning_rate": 2.422230714114891e-06, "loss": 0.3537, "step": 274 }, { "epoch": 0.6773399014778325, "grad_norm": 0.9313486218452454, "learning_rate": 2.3891567857490373e-06, "loss": 0.4997, "step": 275 }, { "epoch": 0.6798029556650246, "grad_norm": 0.8018554449081421, "learning_rate": 2.3562391816151807e-06, "loss": 0.417, "step": 276 }, { "epoch": 0.6822660098522167, "grad_norm": 0.9989979267120361, "learning_rate": 2.323479872654805e-06, "loss": 0.4071, "step": 277 }, { "epoch": 0.6847290640394089, "grad_norm": 0.7565258741378784, "learning_rate": 2.2908808203314637e-06, "loss": 0.3126, "step": 278 }, { "epoch": 0.687192118226601, "grad_norm": 0.9910544157028198, "learning_rate": 2.2584439765133453e-06, "loss": 0.4361, "step": 279 }, { "epoch": 0.6896551724137931, "grad_norm": 0.7968178987503052, "learning_rate": 2.226171283356409e-06, "loss": 0.3032, "step": 280 }, { "epoch": 0.6921182266009852, "grad_norm": 0.8218366503715515, "learning_rate": 2.1940646731880887e-06, "loss": 0.3931, "step": 281 }, { "epoch": 0.6945812807881774, "grad_norm": 0.8241420984268188, "learning_rate": 2.162126068391601e-06, "loss": 0.4846, "step": 282 }, { "epoch": 0.6970443349753694, "grad_norm": 0.9964359998703003, "learning_rate": 2.1303573812908383e-06, "loss": 0.4059, "step": 283 }, { "epoch": 0.6995073891625616, "grad_norm": 0.8862767219543457, "learning_rate": 2.0987605140358823e-06, "loss": 0.387, "step": 284 }, { "epoch": 0.7019704433497537, "grad_norm": 0.8447444438934326, "learning_rate": 2.0673373584890847e-06, "loss": 0.367, "step": 285 }, { "epoch": 0.7044334975369458, "grad_norm": 0.719472348690033, "learning_rate": 2.036089796111825e-06, "loss": 0.3834, "step": 286 }, { "epoch": 0.7068965517241379, "grad_norm": 0.8975826501846313, "learning_rate": 2.0050196978518323e-06, "loss": 0.3538, "step": 287 }, { "epoch": 0.7093596059113301, "grad_norm": 1.0086376667022705, "learning_rate": 1.9741289240311757e-06, "loss": 0.498, "step": 288 }, { "epoch": 0.7118226600985221, "grad_norm": 0.9242886304855347, "learning_rate": 1.943419324234871e-06, "loss": 0.442, "step": 289 }, { "epoch": 0.7142857142857143, "grad_norm": 0.8589749932289124, "learning_rate": 1.9128927372001456e-06, "loss": 0.3315, "step": 290 }, { "epoch": 0.7167487684729064, "grad_norm": 1.2725121974945068, "learning_rate": 1.8825509907063328e-06, "loss": 0.5441, "step": 291 }, { "epoch": 0.7192118226600985, "grad_norm": 0.975878119468689, "learning_rate": 1.852395901465441e-06, "loss": 0.4301, "step": 292 }, { "epoch": 0.7216748768472906, "grad_norm": 0.9230057001113892, "learning_rate": 1.8224292750133743e-06, "loss": 0.4157, "step": 293 }, { "epoch": 0.7241379310344828, "grad_norm": 0.9983015656471252, "learning_rate": 1.79265290560183e-06, "loss": 0.4511, "step": 294 }, { "epoch": 0.7266009852216748, "grad_norm": 0.8537778258323669, "learning_rate": 1.7630685760908623e-06, "loss": 0.4724, "step": 295 }, { "epoch": 0.729064039408867, "grad_norm": 0.8379884958267212, "learning_rate": 1.733678057842142e-06, "loss": 0.3558, "step": 296 }, { "epoch": 0.7315270935960592, "grad_norm": 0.835649311542511, "learning_rate": 1.7044831106128867e-06, "loss": 0.3725, "step": 297 }, { "epoch": 0.7339901477832512, "grad_norm": 0.7726622819900513, "learning_rate": 1.675485482450499e-06, "loss": 0.3666, "step": 298 }, { "epoch": 0.7364532019704434, "grad_norm": 0.9081035256385803, "learning_rate": 1.6466869095879079e-06, "loss": 0.3409, "step": 299 }, { "epoch": 0.7389162561576355, "grad_norm": 0.7757613658905029, "learning_rate": 1.6180891163396013e-06, "loss": 0.3568, "step": 300 }, { "epoch": 0.7413793103448276, "grad_norm": 0.9632595777511597, "learning_rate": 1.589693814998391e-06, "loss": 0.4876, "step": 301 }, { "epoch": 0.7438423645320197, "grad_norm": 0.7990944385528564, "learning_rate": 1.561502705732883e-06, "loss": 0.4158, "step": 302 }, { "epoch": 0.7463054187192119, "grad_norm": 0.9227884411811829, "learning_rate": 1.533517476485691e-06, "loss": 0.3277, "step": 303 }, { "epoch": 0.7487684729064039, "grad_norm": 1.0761818885803223, "learning_rate": 1.5057398028723514e-06, "loss": 0.3425, "step": 304 }, { "epoch": 0.7512315270935961, "grad_norm": 0.7749660015106201, "learning_rate": 1.4781713480810184e-06, "loss": 0.3679, "step": 305 }, { "epoch": 0.7536945812807881, "grad_norm": 2.2163593769073486, "learning_rate": 1.450813762772863e-06, "loss": 0.407, "step": 306 }, { "epoch": 0.7561576354679803, "grad_norm": 1.069761037826538, "learning_rate": 1.4236686849832497e-06, "loss": 0.3972, "step": 307 }, { "epoch": 0.7586206896551724, "grad_norm": 0.9363300800323486, "learning_rate": 1.3967377400236515e-06, "loss": 0.4288, "step": 308 }, { "epoch": 0.7610837438423645, "grad_norm": 0.8307360410690308, "learning_rate": 1.370022540384347e-06, "loss": 0.5181, "step": 309 }, { "epoch": 0.7635467980295566, "grad_norm": 0.9606369137763977, "learning_rate": 1.3435246856378524e-06, "loss": 0.5306, "step": 310 }, { "epoch": 0.7660098522167488, "grad_norm": 0.9734224677085876, "learning_rate": 1.3172457623431706e-06, "loss": 0.53, "step": 311 }, { "epoch": 0.7684729064039408, "grad_norm": 0.9703786373138428, "learning_rate": 1.2911873439507766e-06, "loss": 0.4361, "step": 312 }, { "epoch": 0.770935960591133, "grad_norm": 1.0127205848693848, "learning_rate": 1.2653509907084171e-06, "loss": 0.3778, "step": 313 }, { "epoch": 0.7733990147783252, "grad_norm": 0.6820623874664307, "learning_rate": 1.2397382495676873e-06, "loss": 0.3225, "step": 314 }, { "epoch": 0.7758620689655172, "grad_norm": 1.9675172567367554, "learning_rate": 1.214350654091413e-06, "loss": 0.4524, "step": 315 }, { "epoch": 0.7783251231527094, "grad_norm": 0.8468368649482727, "learning_rate": 1.1891897243618184e-06, "loss": 0.4673, "step": 316 }, { "epoch": 0.7807881773399015, "grad_norm": 1.0746312141418457, "learning_rate": 1.1642569668895171e-06, "loss": 0.4392, "step": 317 }, { "epoch": 0.7832512315270936, "grad_norm": 1.0559433698654175, "learning_rate": 1.139553874523313e-06, "loss": 0.4199, "step": 318 }, { "epoch": 0.7857142857142857, "grad_norm": 0.7627314925193787, "learning_rate": 1.1150819263608098e-06, "loss": 0.3582, "step": 319 }, { "epoch": 0.7881773399014779, "grad_norm": 0.8994935750961304, "learning_rate": 1.0908425876598512e-06, "loss": 0.3603, "step": 320 }, { "epoch": 0.7906403940886699, "grad_norm": 0.8457913398742676, "learning_rate": 1.0668373097507922e-06, "loss": 0.3291, "step": 321 }, { "epoch": 0.7931034482758621, "grad_norm": 0.8527902364730835, "learning_rate": 1.0430675299495973e-06, "loss": 0.3809, "step": 322 }, { "epoch": 0.7955665024630542, "grad_norm": 0.7819089889526367, "learning_rate": 1.0195346714717813e-06, "loss": 0.4148, "step": 323 }, { "epoch": 0.7980295566502463, "grad_norm": 0.878807544708252, "learning_rate": 9.962401433471985e-07, "loss": 0.3385, "step": 324 }, { "epoch": 0.8004926108374384, "grad_norm": 1.3636759519577026, "learning_rate": 9.731853403356705e-07, "loss": 0.378, "step": 325 }, { "epoch": 0.8029556650246306, "grad_norm": 22.09219741821289, "learning_rate": 9.5037164284348e-07, "loss": 0.3528, "step": 326 }, { "epoch": 0.8054187192118226, "grad_norm": 0.763201117515564, "learning_rate": 9.278004168407151e-07, "loss": 0.4066, "step": 327 }, { "epoch": 0.8078817733990148, "grad_norm": 0.7910692095756531, "learning_rate": 9.054730137794887e-07, "loss": 0.2703, "step": 328 }, { "epoch": 0.8103448275862069, "grad_norm": 0.9270244836807251, "learning_rate": 8.833907705130091e-07, "loss": 0.4674, "step": 329 }, { "epoch": 0.812807881773399, "grad_norm": 0.7657389044761658, "learning_rate": 8.615550092155478e-07, "loss": 0.3766, "step": 330 }, { "epoch": 0.8152709359605911, "grad_norm": 0.9968824982643127, "learning_rate": 8.399670373032665e-07, "loss": 0.443, "step": 331 }, { "epoch": 0.8177339901477833, "grad_norm": 0.9094595313072205, "learning_rate": 8.186281473559382e-07, "loss": 0.4102, "step": 332 }, { "epoch": 0.8201970443349754, "grad_norm": 0.9319184422492981, "learning_rate": 7.975396170395522e-07, "loss": 0.3985, "step": 333 }, { "epoch": 0.8226600985221675, "grad_norm": 0.8672550916671753, "learning_rate": 7.767027090298207e-07, "loss": 0.4297, "step": 334 }, { "epoch": 0.8251231527093597, "grad_norm": 1.1561846733093262, "learning_rate": 7.561186709365653e-07, "loss": 0.5102, "step": 335 }, { "epoch": 0.8275862068965517, "grad_norm": 1.1552369594573975, "learning_rate": 7.357887352290227e-07, "loss": 0.4518, "step": 336 }, { "epoch": 0.8300492610837439, "grad_norm": 8.381624221801758, "learning_rate": 7.157141191620548e-07, "loss": 0.4467, "step": 337 }, { "epoch": 0.8325123152709359, "grad_norm": 0.7928099632263184, "learning_rate": 6.958960247032515e-07, "loss": 0.3273, "step": 338 }, { "epoch": 0.8349753694581281, "grad_norm": 1.1730235815048218, "learning_rate": 6.763356384609809e-07, "loss": 0.4065, "step": 339 }, { "epoch": 0.8374384236453202, "grad_norm": 1.0426114797592163, "learning_rate": 6.570341316133272e-07, "loss": 0.4301, "step": 340 }, { "epoch": 0.8399014778325123, "grad_norm": 0.7004266381263733, "learning_rate": 6.379926598379727e-07, "loss": 0.2412, "step": 341 }, { "epoch": 0.8423645320197044, "grad_norm": 0.8835524320602417, "learning_rate": 6.192123632429986e-07, "loss": 0.4428, "step": 342 }, { "epoch": 0.8448275862068966, "grad_norm": 0.7402558922767639, "learning_rate": 6.006943662986275e-07, "loss": 0.3081, "step": 343 }, { "epoch": 0.8472906403940886, "grad_norm": 0.9316273927688599, "learning_rate": 5.824397777698859e-07, "loss": 0.3839, "step": 344 }, { "epoch": 0.8497536945812808, "grad_norm": 1.0180613994598389, "learning_rate": 5.644496906502233e-07, "loss": 0.3383, "step": 345 }, { "epoch": 0.8522167487684729, "grad_norm": 0.9619033932685852, "learning_rate": 5.4672518209607e-07, "loss": 0.3643, "step": 346 }, { "epoch": 0.854679802955665, "grad_norm": 1.3424688577651978, "learning_rate": 5.292673133623372e-07, "loss": 0.5491, "step": 347 }, { "epoch": 0.8571428571428571, "grad_norm": 1.1439658403396606, "learning_rate": 5.120771297388788e-07, "loss": 0.3623, "step": 348 }, { "epoch": 0.8596059113300493, "grad_norm": 0.9782744646072388, "learning_rate": 4.951556604879049e-07, "loss": 0.5006, "step": 349 }, { "epoch": 0.8620689655172413, "grad_norm": 0.9459072351455688, "learning_rate": 4.785039187823503e-07, "loss": 0.4076, "step": 350 }, { "epoch": 0.8645320197044335, "grad_norm": 0.9256353378295898, "learning_rate": 4.6212290164521554e-07, "loss": 0.4268, "step": 351 }, { "epoch": 0.8669950738916257, "grad_norm": 0.9616571068763733, "learning_rate": 4.46013589889866e-07, "loss": 0.2967, "step": 352 }, { "epoch": 0.8694581280788177, "grad_norm": 0.7549436688423157, "learning_rate": 4.3017694806131163e-07, "loss": 0.4012, "step": 353 }, { "epoch": 0.8719211822660099, "grad_norm": 0.829087495803833, "learning_rate": 4.146139243784475e-07, "loss": 0.3399, "step": 354 }, { "epoch": 0.874384236453202, "grad_norm": 0.885208785533905, "learning_rate": 3.9932545067728366e-07, "loss": 0.3451, "step": 355 }, { "epoch": 0.8768472906403941, "grad_norm": 0.9004522562026978, "learning_rate": 3.8431244235515366e-07, "loss": 0.3754, "step": 356 }, { "epoch": 0.8793103448275862, "grad_norm": 0.9170466065406799, "learning_rate": 3.695757983158954e-07, "loss": 0.3638, "step": 357 }, { "epoch": 0.8817733990147784, "grad_norm": 1.3507622480392456, "learning_rate": 3.5511640091604293e-07, "loss": 0.3724, "step": 358 }, { "epoch": 0.8842364532019704, "grad_norm": 0.8250304460525513, "learning_rate": 3.409351159119845e-07, "loss": 0.3516, "step": 359 }, { "epoch": 0.8866995073891626, "grad_norm": 0.7117695212364197, "learning_rate": 3.270327924081301e-07, "loss": 0.402, "step": 360 }, { "epoch": 0.8891625615763546, "grad_norm": 1.0209925174713135, "learning_rate": 3.134102628060698e-07, "loss": 0.5029, "step": 361 }, { "epoch": 0.8916256157635468, "grad_norm": 1.013433814048767, "learning_rate": 3.000683427547374e-07, "loss": 0.4095, "step": 362 }, { "epoch": 0.8940886699507389, "grad_norm": 0.9493584036827087, "learning_rate": 2.8700783110156507e-07, "loss": 0.4316, "step": 363 }, { "epoch": 0.896551724137931, "grad_norm": 0.8495911955833435, "learning_rate": 2.742295098446623e-07, "loss": 0.3361, "step": 364 }, { "epoch": 0.8990147783251231, "grad_norm": 0.8885901570320129, "learning_rate": 2.617341440859883e-07, "loss": 0.4157, "step": 365 }, { "epoch": 0.9014778325123153, "grad_norm": 1.3665013313293457, "learning_rate": 2.4952248198554075e-07, "loss": 0.4276, "step": 366 }, { "epoch": 0.9039408866995073, "grad_norm": 0.9520452618598938, "learning_rate": 2.3759525471656163e-07, "loss": 0.3771, "step": 367 }, { "epoch": 0.9064039408866995, "grad_norm": 0.8242120742797852, "learning_rate": 2.259531764217604e-07, "loss": 0.361, "step": 368 }, { "epoch": 0.9088669950738916, "grad_norm": 0.9279873371124268, "learning_rate": 2.1459694417055033e-07, "loss": 0.4935, "step": 369 }, { "epoch": 0.9113300492610837, "grad_norm": 1.1166878938674927, "learning_rate": 2.0352723791731366e-07, "loss": 0.3956, "step": 370 }, { "epoch": 0.9137931034482759, "grad_norm": 0.8491089344024658, "learning_rate": 1.9274472046068805e-07, "loss": 0.2792, "step": 371 }, { "epoch": 0.916256157635468, "grad_norm": 1.089476227760315, "learning_rate": 1.8225003740388546e-07, "loss": 0.3604, "step": 372 }, { "epoch": 0.9187192118226601, "grad_norm": 0.9467745423316956, "learning_rate": 1.7204381711603046e-07, "loss": 0.3956, "step": 373 }, { "epoch": 0.9211822660098522, "grad_norm": 0.9419111013412476, "learning_rate": 1.621266706945429e-07, "loss": 0.4508, "step": 374 }, { "epoch": 0.9236453201970444, "grad_norm": 2.099689245223999, "learning_rate": 1.524991919285429e-07, "loss": 0.5749, "step": 375 }, { "epoch": 0.9261083743842364, "grad_norm": 1.3350390195846558, "learning_rate": 1.431619572633014e-07, "loss": 0.4098, "step": 376 }, { "epoch": 0.9285714285714286, "grad_norm": 0.912725567817688, "learning_rate": 1.3411552576572562e-07, "loss": 0.4331, "step": 377 }, { "epoch": 0.9310344827586207, "grad_norm": 0.9236518144607544, "learning_rate": 1.253604390908819e-07, "loss": 0.3197, "step": 378 }, { "epoch": 0.9334975369458128, "grad_norm": 0.9659357070922852, "learning_rate": 1.1689722144956672e-07, "loss": 0.4172, "step": 379 }, { "epoch": 0.9359605911330049, "grad_norm": 0.7131395936012268, "learning_rate": 1.0872637957691834e-07, "loss": 0.3812, "step": 380 }, { "epoch": 0.9384236453201971, "grad_norm": 1.1041388511657715, "learning_rate": 1.008484027020773e-07, "loss": 0.4566, "step": 381 }, { "epoch": 0.9408866995073891, "grad_norm": 0.7971521615982056, "learning_rate": 9.326376251889202e-08, "loss": 0.343, "step": 382 }, { "epoch": 0.9433497536945813, "grad_norm": 0.7595178484916687, "learning_rate": 8.597291315767808e-08, "loss": 0.37, "step": 383 }, { "epoch": 0.9458128078817734, "grad_norm": 0.709429144859314, "learning_rate": 7.897629115802553e-08, "loss": 0.3152, "step": 384 }, { "epoch": 0.9482758620689655, "grad_norm": 9.246243476867676, "learning_rate": 7.227431544266194e-08, "loss": 0.6465, "step": 385 }, { "epoch": 0.9507389162561576, "grad_norm": 1.1263999938964844, "learning_rate": 6.58673872923693e-08, "loss": 0.33, "step": 386 }, { "epoch": 0.9532019704433498, "grad_norm": 0.830968976020813, "learning_rate": 5.97558903219575e-08, "loss": 0.369, "step": 387 }, { "epoch": 0.9556650246305419, "grad_norm": 0.7675350904464722, "learning_rate": 5.3940190457294486e-08, "loss": 0.3185, "step": 388 }, { "epoch": 0.958128078817734, "grad_norm": 1.0136815309524536, "learning_rate": 4.842063591339763e-08, "loss": 0.4651, "step": 389 }, { "epoch": 0.9605911330049262, "grad_norm": 0.9319408535957336, "learning_rate": 4.3197557173584317e-08, "loss": 0.3876, "step": 390 }, { "epoch": 0.9630541871921182, "grad_norm": 0.8962835669517517, "learning_rate": 3.82712669696822e-08, "loss": 0.4232, "step": 391 }, { "epoch": 0.9655172413793104, "grad_norm": 0.8527126312255859, "learning_rate": 3.364206026330752e-08, "loss": 0.3564, "step": 392 }, { "epoch": 0.9679802955665024, "grad_norm": 0.8836209177970886, "learning_rate": 2.9310214228202016e-08, "loss": 0.3616, "step": 393 }, { "epoch": 0.9704433497536946, "grad_norm": 0.8913058638572693, "learning_rate": 2.527598823363786e-08, "loss": 0.4859, "step": 394 }, { "epoch": 0.9729064039408867, "grad_norm": 0.8344419598579407, "learning_rate": 2.153962382888841e-08, "loss": 0.3994, "step": 395 }, { "epoch": 0.9753694581280788, "grad_norm": 0.9367709755897522, "learning_rate": 1.8101344728764236e-08, "loss": 0.321, "step": 396 }, { "epoch": 0.9778325123152709, "grad_norm": 0.862615704536438, "learning_rate": 1.496135680021993e-08, "loss": 0.3368, "step": 397 }, { "epoch": 0.9802955665024631, "grad_norm": 0.8697728514671326, "learning_rate": 1.2119848050025084e-08, "loss": 0.3877, "step": 398 }, { "epoch": 0.9827586206896551, "grad_norm": 0.985674262046814, "learning_rate": 9.576988613511084e-09, "loss": 0.3868, "step": 399 }, { "epoch": 0.9852216748768473, "grad_norm": 0.7607033252716064, "learning_rate": 7.332930744380906e-09, "loss": 0.4217, "step": 400 }, { "epoch": 0.9876847290640394, "grad_norm": 0.9515877962112427, "learning_rate": 5.387808805594752e-09, "loss": 0.417, "step": 401 }, { "epoch": 0.9901477832512315, "grad_norm": 1.0010173320770264, "learning_rate": 3.741739261324817e-09, "loss": 0.4093, "step": 402 }, { "epoch": 0.9926108374384236, "grad_norm": 0.6813908219337463, "learning_rate": 2.3948206699819787e-09, "loss": 0.3174, "step": 403 }, { "epoch": 0.9950738916256158, "grad_norm": 0.9744001626968384, "learning_rate": 1.347133678313295e-09, "loss": 0.3682, "step": 404 }, { "epoch": 0.9975369458128078, "grad_norm": 0.7549907565116882, "learning_rate": 5.987410165758656e-10, "loss": 0.4171, "step": 405 }, { "epoch": 1.0, "grad_norm": 0.7962020635604858, "learning_rate": 1.4968749477872746e-10, "loss": 0.359, "step": 406 } ], "logging_steps": 1.0, "max_steps": 406, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.842746096485663e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }