{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.8571428571428573e-06, "loss": 0.1731, "step": 1 }, { "epoch": 0.04, "learning_rate": 5.7142857142857145e-06, "loss": 0.1107, "step": 2 }, { "epoch": 0.07, "learning_rate": 8.571428571428571e-06, "loss": 0.1074, "step": 3 }, { "epoch": 0.09, "learning_rate": 1.1428571428571429e-05, "loss": 0.1454, "step": 4 }, { "epoch": 0.11, "learning_rate": 1.4285714285714287e-05, "loss": 0.1185, "step": 5 }, { "epoch": 0.13, "learning_rate": 1.7142857142857142e-05, "loss": 0.2155, "step": 6 }, { "epoch": 0.15, "learning_rate": 2e-05, "loss": 0.1626, "step": 7 }, { "epoch": 0.17, "learning_rate": 1.9999007677495127e-05, "loss": 0.1486, "step": 8 }, { "epoch": 0.2, "learning_rate": 1.9996030906921302e-05, "loss": 0.1611, "step": 9 }, { "epoch": 0.22, "learning_rate": 1.9991070279061808e-05, "loss": 0.1354, "step": 10 }, { "epoch": 0.24, "learning_rate": 1.9984126778425178e-05, "loss": 0.1309, "step": 11 }, { "epoch": 0.26, "learning_rate": 1.9975201783049804e-05, "loss": 0.1228, "step": 12 }, { "epoch": 0.28, "learning_rate": 1.9964297064230437e-05, "loss": 0.1096, "step": 13 }, { "epoch": 0.3, "learning_rate": 1.9951414786166656e-05, "loss": 0.1152, "step": 14 }, { "epoch": 0.33, "learning_rate": 1.9936557505533346e-05, "loss": 0.1058, "step": 15 }, { "epoch": 0.35, "learning_rate": 1.9919728170973297e-05, "loss": 0.0853, "step": 16 }, { "epoch": 0.37, "learning_rate": 1.9900930122511993e-05, "loss": 0.1111, "step": 17 }, { "epoch": 0.39, "learning_rate": 1.988016709089474e-05, "loss": 1.8086, "step": 18 }, { "epoch": 0.41, "learning_rate": 1.985744319684625e-05, "loss": 0.6606, "step": 19 }, { "epoch": 0.43, "learning_rate": 1.9832762950252813e-05, "loss": 0.2228, "step": 20 }, { "epoch": 0.46, "learning_rate": 1.9806131249267256e-05, "loss": 0.1629, "step": 21 }, { "epoch": 0.48, "learning_rate": 1.977755337933682e-05, "loss": 0.1505, "step": 22 }, { "epoch": 0.5, "learning_rate": 1.9747035012154203e-05, "loss": 0.1273, "step": 23 }, { "epoch": 0.52, "learning_rate": 1.971458220453192e-05, "loss": 0.159, "step": 24 }, { "epoch": 0.54, "learning_rate": 1.968020139720024e-05, "loss": 0.1383, "step": 25 }, { "epoch": 0.57, "learning_rate": 1.9643899413528926e-05, "loss": 0.1456, "step": 26 }, { "epoch": 0.59, "learning_rate": 1.960568345817306e-05, "loss": 0.1609, "step": 27 }, { "epoch": 0.61, "learning_rate": 1.9565561115643153e-05, "loss": 0.1258, "step": 28 }, { "epoch": 0.63, "learning_rate": 1.9523540348799887e-05, "loss": 0.1219, "step": 29 }, { "epoch": 0.65, "learning_rate": 1.9479629497273783e-05, "loss": 0.122, "step": 30 }, { "epoch": 0.67, "learning_rate": 1.9433837275810084e-05, "loss": 0.1164, "step": 31 }, { "epoch": 0.7, "learning_rate": 1.9386172772539162e-05, "loss": 0.1234, "step": 32 }, { "epoch": 0.72, "learning_rate": 1.933664544717288e-05, "loss": 0.1189, "step": 33 }, { "epoch": 0.74, "learning_rate": 1.928526512912715e-05, "loss": 0.1278, "step": 34 }, { "epoch": 0.76, "learning_rate": 1.9232042015571152e-05, "loss": 0.1117, "step": 35 }, { "epoch": 0.78, "learning_rate": 1.9176986669403556e-05, "loss": 0.1157, "step": 36 }, { "epoch": 0.8, "learning_rate": 1.9120110017156172e-05, "loss": 0.1032, "step": 37 }, { "epoch": 0.83, "learning_rate": 1.9061423346825395e-05, "loss": 0.1035, "step": 38 }, { "epoch": 0.85, "learning_rate": 1.9000938305631975e-05, "loss": 0.1029, "step": 39 }, { "epoch": 0.87, "learning_rate": 1.8938666897709427e-05, "loss": 0.1058, "step": 40 }, { "epoch": 0.89, "learning_rate": 1.8874621481721645e-05, "loss": 0.1039, "step": 41 }, { "epoch": 0.91, "learning_rate": 1.8808814768410157e-05, "loss": 0.114, "step": 42 }, { "epoch": 0.93, "learning_rate": 1.874125981807148e-05, "loss": 0.1076, "step": 43 }, { "epoch": 0.96, "learning_rate": 1.867197003796512e-05, "loss": 0.0863, "step": 44 }, { "epoch": 0.98, "learning_rate": 1.8600959179652708e-05, "loss": 0.0893, "step": 45 }, { "epoch": 1.0, "learning_rate": 1.852824133626881e-05, "loss": 0.0946, "step": 46 }, { "epoch": 1.02, "learning_rate": 1.8453830939723913e-05, "loss": 0.0782, "step": 47 }, { "epoch": 1.04, "learning_rate": 1.8377742757840246e-05, "loss": 0.0882, "step": 48 }, { "epoch": 1.07, "learning_rate": 1.8299991891420848e-05, "loss": 0.0871, "step": 49 }, { "epoch": 1.09, "learning_rate": 1.822059377125263e-05, "loss": 0.073, "step": 50 }, { "epoch": 1.11, "learning_rate": 1.8139564155043885e-05, "loss": 0.0777, "step": 51 }, { "epoch": 1.13, "learning_rate": 1.8056919124296957e-05, "loss": 0.0825, "step": 52 }, { "epoch": 1.15, "learning_rate": 1.797267508111664e-05, "loss": 0.0732, "step": 53 }, { "epoch": 1.17, "learning_rate": 1.788684874495491e-05, "loss": 0.079, "step": 54 }, { "epoch": 1.2, "learning_rate": 1.7799457149292752e-05, "loss": 0.0681, "step": 55 }, { "epoch": 1.22, "learning_rate": 1.7710517638259593e-05, "loss": 0.0683, "step": 56 }, { "epoch": 1.24, "learning_rate": 1.76200478631911e-05, "loss": 0.0743, "step": 57 }, { "epoch": 1.26, "learning_rate": 1.7528065779126035e-05, "loss": 0.0763, "step": 58 }, { "epoch": 1.28, "learning_rate": 1.7434589641242814e-05, "loss": 0.067, "step": 59 }, { "epoch": 1.3, "learning_rate": 1.7339638001236495e-05, "loss": 0.0667, "step": 60 }, { "epoch": 1.33, "learning_rate": 1.7243229703636924e-05, "loss": 0.0703, "step": 61 }, { "epoch": 1.35, "learning_rate": 1.714538388206878e-05, "loss": 0.0716, "step": 62 }, { "epoch": 1.37, "learning_rate": 1.704611995545421e-05, "loss": 0.0513, "step": 63 }, { "epoch": 1.39, "learning_rate": 1.694545762415887e-05, "loss": 0.0663, "step": 64 }, { "epoch": 1.41, "learning_rate": 1.6843416866082118e-05, "loss": 0.0827, "step": 65 }, { "epoch": 1.43, "learning_rate": 1.6740017932692073e-05, "loss": 0.0845, "step": 66 }, { "epoch": 1.46, "learning_rate": 1.663528134500646e-05, "loss": 0.0791, "step": 67 }, { "epoch": 1.48, "learning_rate": 1.6529227889519884e-05, "loss": 0.0596, "step": 68 }, { "epoch": 1.5, "learning_rate": 1.642187861407847e-05, "loss": 0.0678, "step": 69 }, { "epoch": 1.52, "learning_rate": 1.631325482370259e-05, "loss": 0.0822, "step": 70 }, { "epoch": 1.54, "learning_rate": 1.6203378076358602e-05, "loss": 0.0595, "step": 71 }, { "epoch": 1.57, "learning_rate": 1.609227017868033e-05, "loss": 0.0673, "step": 72 }, { "epoch": 1.59, "learning_rate": 1.5979953181641246e-05, "loss": 0.0865, "step": 73 }, { "epoch": 1.61, "learning_rate": 1.5866449376178118e-05, "loss": 0.0639, "step": 74 }, { "epoch": 1.63, "learning_rate": 1.5751781288767052e-05, "loss": 0.0703, "step": 75 }, { "epoch": 1.65, "learning_rate": 1.56359716769528e-05, "loss": 0.0707, "step": 76 }, { "epoch": 1.67, "learning_rate": 1.551904352483217e-05, "loss": 0.0527, "step": 77 }, { "epoch": 1.7, "learning_rate": 1.540102003849253e-05, "loss": 0.0503, "step": 78 }, { "epoch": 1.72, "learning_rate": 1.52819246414062e-05, "loss": 0.0625, "step": 79 }, { "epoch": 1.74, "learning_rate": 1.5161780969781728e-05, "loss": 0.0628, "step": 80 }, { "epoch": 1.76, "learning_rate": 1.5040612867872945e-05, "loss": 0.0685, "step": 81 }, { "epoch": 1.78, "learning_rate": 1.4918444383246738e-05, "loss": 0.0839, "step": 82 }, { "epoch": 1.8, "learning_rate": 1.479529976201044e-05, "loss": 0.0643, "step": 83 }, { "epoch": 1.83, "learning_rate": 1.4671203443999847e-05, "loss": 0.068, "step": 84 }, { "epoch": 1.85, "learning_rate": 1.4546180057928792e-05, "loss": 0.0494, "step": 85 }, { "epoch": 1.87, "learning_rate": 1.4420254416501198e-05, "loss": 0.0942, "step": 86 }, { "epoch": 1.89, "learning_rate": 1.4293451511486658e-05, "loss": 0.0798, "step": 87 }, { "epoch": 1.91, "learning_rate": 1.416579650876043e-05, "loss": 0.0801, "step": 88 }, { "epoch": 1.93, "learning_rate": 1.403731474330893e-05, "loss": 0.0556, "step": 89 }, { "epoch": 1.96, "learning_rate": 1.3908031714201621e-05, "loss": 0.0655, "step": 90 }, { "epoch": 1.98, "learning_rate": 1.3777973079530362e-05, "loss": 0.0596, "step": 91 }, { "epoch": 2.0, "learning_rate": 1.3647164651317178e-05, "loss": 0.0463, "step": 92 }, { "epoch": 2.02, "learning_rate": 1.35156323903915e-05, "loss": 0.0551, "step": 93 }, { "epoch": 2.04, "learning_rate": 1.338340240123785e-05, "loss": 0.0386, "step": 94 }, { "epoch": 2.07, "learning_rate": 1.3250500926815046e-05, "loss": 0.047, "step": 95 }, { "epoch": 2.09, "learning_rate": 1.3116954343347882e-05, "loss": 0.0612, "step": 96 }, { "epoch": 2.11, "learning_rate": 1.2982789155092407e-05, "loss": 0.0409, "step": 97 }, { "epoch": 2.13, "learning_rate": 1.2848031989075754e-05, "loss": 0.0528, "step": 98 }, { "epoch": 2.15, "learning_rate": 1.2712709589811629e-05, "loss": 0.0525, "step": 99 }, { "epoch": 2.17, "learning_rate": 1.2576848813992475e-05, "loss": 0.0381, "step": 100 }, { "epoch": 2.2, "learning_rate": 1.2440476625159363e-05, "loss": 0.0542, "step": 101 }, { "epoch": 2.22, "learning_rate": 1.23036200883507e-05, "loss": 0.0405, "step": 102 }, { "epoch": 2.24, "learning_rate": 1.2166306364730766e-05, "loss": 0.055, "step": 103 }, { "epoch": 2.26, "learning_rate": 1.2028562706199201e-05, "loss": 0.0436, "step": 104 }, { "epoch": 2.28, "learning_rate": 1.1890416449982451e-05, "loss": 0.0467, "step": 105 }, { "epoch": 2.3, "learning_rate": 1.1751895013208325e-05, "loss": 0.0693, "step": 106 }, { "epoch": 2.33, "learning_rate": 1.1613025887464642e-05, "loss": 0.0495, "step": 107 }, { "epoch": 2.35, "learning_rate": 1.1473836633343145e-05, "loss": 0.0571, "step": 108 }, { "epoch": 2.37, "learning_rate": 1.133435487496969e-05, "loss": 0.0331, "step": 109 }, { "epoch": 2.39, "learning_rate": 1.1194608294521853e-05, "loss": 0.0342, "step": 110 }, { "epoch": 2.41, "learning_rate": 1.1054624626734985e-05, "loss": 0.0679, "step": 111 }, { "epoch": 2.43, "learning_rate": 1.0914431653397856e-05, "loss": 0.0509, "step": 112 }, { "epoch": 2.46, "learning_rate": 1.0774057197838963e-05, "loss": 0.0543, "step": 113 }, { "epoch": 2.48, "learning_rate": 1.0633529119404571e-05, "loss": 0.0893, "step": 114 }, { "epoch": 2.5, "learning_rate": 1.0492875307929643e-05, "loss": 0.0386, "step": 115 }, { "epoch": 2.52, "learning_rate": 1.0352123678202686e-05, "loss": 0.0581, "step": 116 }, { "epoch": 2.54, "learning_rate": 1.0211302164425657e-05, "loss": 0.0599, "step": 117 }, { "epoch": 2.57, "learning_rate": 1.0070438714670004e-05, "loss": 0.0426, "step": 118 }, { "epoch": 2.59, "learning_rate": 9.929561285329998e-06, "loss": 0.0425, "step": 119 }, { "epoch": 2.61, "learning_rate": 9.788697835574348e-06, "loss": 0.0571, "step": 120 }, { "epoch": 2.63, "learning_rate": 9.647876321797314e-06, "loss": 0.0478, "step": 121 }, { "epoch": 2.65, "learning_rate": 9.507124692070356e-06, "loss": 0.0626, "step": 122 }, { "epoch": 2.67, "learning_rate": 9.366470880595434e-06, "loss": 0.0438, "step": 123 }, { "epoch": 2.7, "learning_rate": 9.225942802161042e-06, "loss": 0.0385, "step": 124 }, { "epoch": 2.72, "learning_rate": 9.085568346602146e-06, "loss": 0.0533, "step": 125 }, { "epoch": 2.74, "learning_rate": 8.945375373265017e-06, "loss": 0.0568, "step": 126 }, { "epoch": 2.76, "learning_rate": 8.805391705478149e-06, "loss": 0.0449, "step": 127 }, { "epoch": 2.78, "learning_rate": 8.665645125030312e-06, "loss": 0.0579, "step": 128 }, { "epoch": 2.8, "learning_rate": 8.526163366656858e-06, "loss": 0.0386, "step": 129 }, { "epoch": 2.83, "learning_rate": 8.38697411253536e-06, "loss": 0.0435, "step": 130 }, { "epoch": 2.85, "learning_rate": 8.248104986791677e-06, "loss": 0.0453, "step": 131 }, { "epoch": 2.87, "learning_rate": 8.10958355001755e-06, "loss": 0.0455, "step": 132 }, { "epoch": 2.89, "learning_rate": 7.971437293800804e-06, "loss": 0.0434, "step": 133 }, { "epoch": 2.91, "learning_rate": 7.833693635269235e-06, "loss": 0.0447, "step": 134 }, { "epoch": 2.93, "learning_rate": 7.696379911649303e-06, "loss": 0.0376, "step": 135 }, { "epoch": 2.96, "learning_rate": 7.559523374840639e-06, "loss": 0.0608, "step": 136 }, { "epoch": 2.98, "learning_rate": 7.423151186007527e-06, "loss": 0.0406, "step": 137 }, { "epoch": 3.0, "learning_rate": 7.287290410188374e-06, "loss": 0.0457, "step": 138 }, { "epoch": 3.02, "learning_rate": 7.1519680109242486e-06, "loss": 0.0456, "step": 139 }, { "epoch": 3.04, "learning_rate": 7.017210844907598e-06, "loss": 0.0251, "step": 140 }, { "epoch": 3.07, "learning_rate": 6.883045656652122e-06, "loss": 0.0294, "step": 141 }, { "epoch": 3.09, "learning_rate": 6.749499073184957e-06, "loss": 0.0357, "step": 142 }, { "epoch": 3.11, "learning_rate": 6.616597598762151e-06, "loss": 0.036, "step": 143 }, { "epoch": 3.13, "learning_rate": 6.484367609608503e-06, "loss": 0.0287, "step": 144 }, { "epoch": 3.15, "learning_rate": 6.352835348682824e-06, "loss": 0.0295, "step": 145 }, { "epoch": 3.17, "learning_rate": 6.22202692046964e-06, "loss": 0.0306, "step": 146 }, { "epoch": 3.2, "learning_rate": 6.09196828579838e-06, "loss": 0.0247, "step": 147 }, { "epoch": 3.22, "learning_rate": 5.962685256691071e-06, "loss": 0.0372, "step": 148 }, { "epoch": 3.24, "learning_rate": 5.834203491239574e-06, "loss": 0.0227, "step": 149 }, { "epoch": 3.26, "learning_rate": 5.706548488513347e-06, "loss": 0.0257, "step": 150 }, { "epoch": 3.28, "learning_rate": 5.579745583498802e-06, "loss": 0.0251, "step": 151 }, { "epoch": 3.3, "learning_rate": 5.453819942071212e-06, "loss": 0.0247, "step": 152 }, { "epoch": 3.33, "learning_rate": 5.328796556000153e-06, "loss": 0.0356, "step": 153 }, { "epoch": 3.35, "learning_rate": 5.204700237989564e-06, "loss": 0.0303, "step": 154 }, { "epoch": 3.37, "learning_rate": 5.081555616753264e-06, "loss": 0.02, "step": 155 }, { "epoch": 3.39, "learning_rate": 4.959387132127054e-06, "loss": 0.0328, "step": 156 }, { "epoch": 3.41, "learning_rate": 4.838219030218274e-06, "loss": 0.0264, "step": 157 }, { "epoch": 3.43, "learning_rate": 4.718075358593802e-06, "loss": 0.0308, "step": 158 }, { "epoch": 3.46, "learning_rate": 4.598979961507472e-06, "loss": 0.0358, "step": 159 }, { "epoch": 3.48, "learning_rate": 4.48095647516783e-06, "loss": 0.0272, "step": 160 }, { "epoch": 3.5, "learning_rate": 4.364028323047205e-06, "loss": 0.0306, "step": 161 }, { "epoch": 3.52, "learning_rate": 4.248218711232952e-06, "loss": 0.0193, "step": 162 }, { "epoch": 3.54, "learning_rate": 4.133550623821884e-06, "loss": 0.0243, "step": 163 }, { "epoch": 3.57, "learning_rate": 4.0200468183587556e-06, "loss": 0.0359, "step": 164 }, { "epoch": 3.59, "learning_rate": 3.90772982131967e-06, "loss": 0.0305, "step": 165 }, { "epoch": 3.61, "learning_rate": 3.7966219236414036e-06, "loss": 0.0324, "step": 166 }, { "epoch": 3.63, "learning_rate": 3.6867451762974117e-06, "loss": 0.0355, "step": 167 }, { "epoch": 3.65, "learning_rate": 3.5781213859215334e-06, "loss": 0.0308, "step": 168 }, { "epoch": 3.67, "learning_rate": 3.4707721104801175e-06, "loss": 0.0487, "step": 169 }, { "epoch": 3.7, "learning_rate": 3.3647186549935407e-06, "loss": 0.0165, "step": 170 }, { "epoch": 3.72, "learning_rate": 3.2599820673079286e-06, "loss": 0.03, "step": 171 }, { "epoch": 3.74, "learning_rate": 3.1565831339178844e-06, "loss": 0.0225, "step": 172 }, { "epoch": 3.76, "learning_rate": 3.0545423758411298e-06, "loss": 0.0364, "step": 173 }, { "epoch": 3.78, "learning_rate": 2.953880044545795e-06, "loss": 0.0269, "step": 174 }, { "epoch": 3.8, "learning_rate": 2.8546161179312247e-06, "loss": 0.0142, "step": 175 }, { "epoch": 3.83, "learning_rate": 2.7567702963630805e-06, "loss": 0.0377, "step": 176 }, { "epoch": 3.85, "learning_rate": 2.6603619987635087e-06, "loss": 0.0295, "step": 177 }, { "epoch": 3.87, "learning_rate": 2.5654103587571887e-06, "loss": 0.031, "step": 178 }, { "epoch": 3.89, "learning_rate": 2.4719342208739695e-06, "loss": 0.0387, "step": 179 }, { "epoch": 3.91, "learning_rate": 2.379952136808903e-06, "loss": 0.0274, "step": 180 }, { "epoch": 3.93, "learning_rate": 2.2894823617404107e-06, "loss": 0.0341, "step": 181 }, { "epoch": 3.96, "learning_rate": 2.200542850707247e-06, "loss": 0.032, "step": 182 }, { "epoch": 3.98, "learning_rate": 2.113151255045095e-06, "loss": 0.0333, "step": 183 }, { "epoch": 4.0, "learning_rate": 2.0273249188833656e-06, "loss": 0.0298, "step": 184 }, { "epoch": 4.02, "learning_rate": 1.9430808757030452e-06, "loss": 0.0201, "step": 185 }, { "epoch": 4.04, "learning_rate": 1.860435844956121e-06, "loss": 0.0156, "step": 186 }, { "epoch": 4.07, "learning_rate": 1.7794062287473734e-06, "loss": 0.021, "step": 187 }, { "epoch": 4.09, "learning_rate": 1.7000081085791541e-06, "loss": 0.0197, "step": 188 }, { "epoch": 4.11, "learning_rate": 1.622257242159756e-06, "loss": 0.029, "step": 189 }, { "epoch": 4.13, "learning_rate": 1.5461690602760882e-06, "loss": 0.0249, "step": 190 }, { "epoch": 4.15, "learning_rate": 1.4717586637311943e-06, "loss": 0.0294, "step": 191 }, { "epoch": 4.17, "learning_rate": 1.3990408203472938e-06, "loss": 0.0112, "step": 192 }, { "epoch": 4.2, "learning_rate": 1.3280299620348847e-06, "loss": 0.0172, "step": 193 }, { "epoch": 4.22, "learning_rate": 1.258740181928524e-06, "loss": 0.0159, "step": 194 }, { "epoch": 4.24, "learning_rate": 1.1911852315898465e-06, "loss": 0.0241, "step": 195 }, { "epoch": 4.26, "learning_rate": 1.1253785182783571e-06, "loss": 0.0178, "step": 196 }, { "epoch": 4.28, "learning_rate": 1.061333102290576e-06, "loss": 0.0173, "step": 197 }, { "epoch": 4.3, "learning_rate": 9.990616943680266e-07, "loss": 0.0149, "step": 198 }, { "epoch": 4.33, "learning_rate": 9.385766531746055e-07, "loss": 0.0354, "step": 199 }, { "epoch": 4.35, "learning_rate": 8.798899828438334e-07, "loss": 0.0179, "step": 200 }, { "epoch": 4.37, "learning_rate": 8.23013330596445e-07, "loss": 0.0145, "step": 201 }, { "epoch": 4.39, "learning_rate": 7.679579844288509e-07, "loss": 0.0276, "step": 202 }, { "epoch": 4.41, "learning_rate": 7.147348708728508e-07, "loss": 0.0117, "step": 203 }, { "epoch": 4.43, "learning_rate": 6.633545528271213e-07, "loss": 0.0189, "step": 204 }, { "epoch": 4.46, "learning_rate": 6.138272274608404e-07, "loss": 0.0257, "step": 205 }, { "epoch": 4.48, "learning_rate": 5.661627241899193e-07, "loss": 0.0158, "step": 206 }, { "epoch": 4.5, "learning_rate": 5.203705027262185e-07, "loss": 0.013, "step": 207 }, { "epoch": 4.52, "learning_rate": 4.7645965120011627e-07, "loss": 0.0146, "step": 208 }, { "epoch": 4.54, "learning_rate": 4.344388843568503e-07, "loss": 0.0208, "step": 209 }, { "epoch": 4.57, "learning_rate": 3.943165418269401e-07, "loss": 0.0241, "step": 210 }, { "epoch": 4.59, "learning_rate": 3.561005864710754e-07, "loss": 0.016, "step": 211 }, { "epoch": 4.61, "learning_rate": 3.197986027997657e-07, "loss": 0.0222, "step": 212 }, { "epoch": 4.63, "learning_rate": 2.8541779546808255e-07, "loss": 0.0155, "step": 213 }, { "epoch": 4.65, "learning_rate": 2.529649878457985e-07, "loss": 0.0228, "step": 214 }, { "epoch": 4.67, "learning_rate": 2.2244662066318146e-07, "loss": 0.0134, "step": 215 }, { "epoch": 4.7, "learning_rate": 1.9386875073274636e-07, "loss": 0.0233, "step": 216 }, { "epoch": 4.72, "learning_rate": 1.6723704974718758e-07, "loss": 0.0204, "step": 217 }, { "epoch": 4.74, "learning_rate": 1.4255680315375164e-07, "loss": 0.0167, "step": 218 }, { "epoch": 4.76, "learning_rate": 1.198329091052608e-07, "loss": 0.0147, "step": 219 }, { "epoch": 4.78, "learning_rate": 9.906987748800945e-08, "loss": 0.0194, "step": 220 }, { "epoch": 4.8, "learning_rate": 8.027182902670571e-08, "loss": 0.0162, "step": 221 }, { "epoch": 4.83, "learning_rate": 6.344249446665673e-08, "loss": 0.0126, "step": 222 }, { "epoch": 4.85, "learning_rate": 4.8585213833348686e-08, "loss": 0.0132, "step": 223 }, { "epoch": 4.87, "learning_rate": 3.570293576956596e-08, "loss": 0.0175, "step": 224 }, { "epoch": 4.89, "learning_rate": 2.479821695019813e-08, "loss": 0.0136, "step": 225 }, { "epoch": 4.91, "learning_rate": 1.587322157482252e-08, "loss": 0.0182, "step": 226 }, { "epoch": 4.93, "learning_rate": 8.929720938193331e-09, "loss": 0.017, "step": 227 }, { "epoch": 4.96, "learning_rate": 3.9690930786995266e-09, "loss": 0.0134, "step": 228 }, { "epoch": 4.98, "learning_rate": 9.923225048724672e-10, "loss": 0.0196, "step": 229 }, { "epoch": 5.0, "learning_rate": 0.0, "loss": 0.0165, "step": 230 }, { "epoch": 5.0, "step": 230, "total_flos": 5455864135680.0, "train_loss": 0.06894507200821587, "train_runtime": 1542.9779, "train_samples_per_second": 9.482, "train_steps_per_second": 0.149 } ], "logging_steps": 1.0, "max_steps": 230, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 800, "total_flos": 5455864135680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }