{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 540.0, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003703703703703704, "grad_norm": 9.5, "learning_rate": 2.787878787878788e-07, "loss": 0.7464, "step": 2 }, { "epoch": 0.007407407407407408, "grad_norm": 9.6875, "learning_rate": 5.575757575757576e-07, "loss": 0.7497, "step": 4 }, { "epoch": 0.011111111111111112, "grad_norm": 9.6875, "learning_rate": 8.363636363636364e-07, "loss": 0.7544, "step": 6 }, { "epoch": 0.014814814814814815, "grad_norm": 9.375, "learning_rate": 1.1151515151515153e-06, "loss": 0.7549, "step": 8 }, { "epoch": 0.018518518518518517, "grad_norm": 9.6875, "learning_rate": 1.393939393939394e-06, "loss": 0.7816, "step": 10 }, { "epoch": 0.022222222222222223, "grad_norm": 9.5, "learning_rate": 1.6727272727272728e-06, "loss": 0.7629, "step": 12 }, { "epoch": 0.025925925925925925, "grad_norm": 9.375, "learning_rate": 1.9515151515151518e-06, "loss": 0.777, "step": 14 }, { "epoch": 0.02962962962962963, "grad_norm": 9.625, "learning_rate": 2.2303030303030305e-06, "loss": 0.7604, "step": 16 }, { "epoch": 0.03333333333333333, "grad_norm": 9.3125, "learning_rate": 2.509090909090909e-06, "loss": 0.7431, "step": 18 }, { "epoch": 0.037037037037037035, "grad_norm": 9.125, "learning_rate": 2.787878787878788e-06, "loss": 0.7333, "step": 20 }, { "epoch": 0.040740740740740744, "grad_norm": 9.875, "learning_rate": 3.0666666666666664e-06, "loss": 0.7854, "step": 22 }, { "epoch": 0.044444444444444446, "grad_norm": 9.5625, "learning_rate": 3.3454545454545456e-06, "loss": 0.7264, "step": 24 }, { "epoch": 0.04814814814814815, "grad_norm": 9.5625, "learning_rate": 3.624242424242424e-06, "loss": 0.7634, "step": 26 }, { "epoch": 0.05185185185185185, "grad_norm": 9.4375, "learning_rate": 3.9030303030303035e-06, "loss": 0.7677, "step": 28 }, { "epoch": 0.05555555555555555, "grad_norm": 9.5, "learning_rate": 4.1818181818181814e-06, "loss": 0.76, "step": 30 }, { "epoch": 0.05925925925925926, "grad_norm": 9.625, "learning_rate": 4.460606060606061e-06, "loss": 0.7615, "step": 32 }, { "epoch": 0.06296296296296296, "grad_norm": 9.5, "learning_rate": 4.59998964610166e-06, "loss": 0.7493, "step": 34 }, { "epoch": 0.06666666666666667, "grad_norm": 9.4375, "learning_rate": 4.599906815474259e-06, "loss": 0.7541, "step": 36 }, { "epoch": 0.07037037037037037, "grad_norm": 9.5625, "learning_rate": 4.599741157202478e-06, "loss": 0.7721, "step": 38 }, { "epoch": 0.07407407407407407, "grad_norm": 9.125, "learning_rate": 4.599492677252244e-06, "loss": 0.7385, "step": 40 }, { "epoch": 0.07777777777777778, "grad_norm": 9.375, "learning_rate": 4.599161384572187e-06, "loss": 0.7255, "step": 42 }, { "epoch": 0.08148148148148149, "grad_norm": 9.375, "learning_rate": 4.5987472910933085e-06, "loss": 0.7548, "step": 44 }, { "epoch": 0.08518518518518518, "grad_norm": 9.3125, "learning_rate": 4.598250411728554e-06, "loss": 0.747, "step": 46 }, { "epoch": 0.08888888888888889, "grad_norm": 9.3125, "learning_rate": 4.597670764372279e-06, "loss": 0.7368, "step": 48 }, { "epoch": 0.09259259259259259, "grad_norm": 9.25, "learning_rate": 4.5970083698996e-06, "loss": 0.7536, "step": 50 }, { "epoch": 0.0962962962962963, "grad_norm": 9.3125, "learning_rate": 4.596263252165647e-06, "loss": 0.7242, "step": 52 }, { "epoch": 0.1, "grad_norm": 9.375, "learning_rate": 4.595435438004701e-06, "loss": 0.7248, "step": 54 }, { "epoch": 0.1037037037037037, "grad_norm": 9.1875, "learning_rate": 4.594524957229229e-06, "loss": 0.7113, "step": 56 }, { "epoch": 0.10740740740740741, "grad_norm": 9.375, "learning_rate": 4.593531842628811e-06, "loss": 0.7352, "step": 58 }, { "epoch": 0.1111111111111111, "grad_norm": 8.875, "learning_rate": 4.592456129968958e-06, "loss": 0.6914, "step": 60 }, { "epoch": 0.11481481481481481, "grad_norm": 9.125, "learning_rate": 4.591297857989825e-06, "loss": 0.6962, "step": 62 }, { "epoch": 0.11851851851851852, "grad_norm": 8.9375, "learning_rate": 4.590057068404815e-06, "loss": 0.7002, "step": 64 }, { "epoch": 0.12222222222222222, "grad_norm": 9.1875, "learning_rate": 4.588733805899076e-06, "loss": 0.7196, "step": 66 }, { "epoch": 0.1259259259259259, "grad_norm": 9.5625, "learning_rate": 4.587328118127894e-06, "loss": 0.7366, "step": 68 }, { "epoch": 0.12962962962962962, "grad_norm": 9.0625, "learning_rate": 4.585840055714976e-06, "loss": 0.7042, "step": 70 }, { "epoch": 0.13333333333333333, "grad_norm": 9.1875, "learning_rate": 4.584269672250625e-06, "loss": 0.6914, "step": 72 }, { "epoch": 0.13703703703703704, "grad_norm": 9.1875, "learning_rate": 4.582617024289811e-06, "loss": 0.7045, "step": 74 }, { "epoch": 0.14074074074074075, "grad_norm": 9.1875, "learning_rate": 4.5808821713501374e-06, "loss": 0.7275, "step": 76 }, { "epoch": 0.14444444444444443, "grad_norm": 9.125, "learning_rate": 4.579065175909692e-06, "loss": 0.6943, "step": 78 }, { "epoch": 0.14814814814814814, "grad_norm": 8.9375, "learning_rate": 4.577166103404801e-06, "loss": 0.6792, "step": 80 }, { "epoch": 0.15185185185185185, "grad_norm": 9.1875, "learning_rate": 4.5751850222276705e-06, "loss": 0.6854, "step": 82 }, { "epoch": 0.15555555555555556, "grad_norm": 8.875, "learning_rate": 4.573122003723925e-06, "loss": 0.6675, "step": 84 }, { "epoch": 0.15925925925925927, "grad_norm": 9.625, "learning_rate": 4.5709771221900375e-06, "loss": 0.6882, "step": 86 }, { "epoch": 0.16296296296296298, "grad_norm": 8.875, "learning_rate": 4.568750454870651e-06, "loss": 0.675, "step": 88 }, { "epoch": 0.16666666666666666, "grad_norm": 9.0625, "learning_rate": 4.5664420819558035e-06, "loss": 0.6996, "step": 90 }, { "epoch": 0.17037037037037037, "grad_norm": 9.375, "learning_rate": 4.564052086578031e-06, "loss": 0.7039, "step": 92 }, { "epoch": 0.17407407407407408, "grad_norm": 8.8125, "learning_rate": 4.561580554809381e-06, "loss": 0.6587, "step": 94 }, { "epoch": 0.17777777777777778, "grad_norm": 9.0, "learning_rate": 4.559027575658309e-06, "loss": 0.6556, "step": 96 }, { "epoch": 0.1814814814814815, "grad_norm": 8.6875, "learning_rate": 4.556393241066476e-06, "loss": 0.6617, "step": 98 }, { "epoch": 0.18518518518518517, "grad_norm": 9.375, "learning_rate": 4.553677645905431e-06, "loss": 0.6828, "step": 100 }, { "epoch": 0.18888888888888888, "grad_norm": 8.625, "learning_rate": 4.550880887973204e-06, "loss": 0.6371, "step": 102 }, { "epoch": 0.1925925925925926, "grad_norm": 8.8125, "learning_rate": 4.548003067990776e-06, "loss": 0.65, "step": 104 }, { "epoch": 0.1962962962962963, "grad_norm": 8.875, "learning_rate": 4.545044289598454e-06, "loss": 0.6718, "step": 106 }, { "epoch": 0.2, "grad_norm": 9.0625, "learning_rate": 4.54200465935214e-06, "loss": 0.6834, "step": 108 }, { "epoch": 0.2037037037037037, "grad_norm": 8.9375, "learning_rate": 4.5388842867194925e-06, "loss": 0.6696, "step": 110 }, { "epoch": 0.2074074074074074, "grad_norm": 8.8125, "learning_rate": 4.535683284075984e-06, "loss": 0.6811, "step": 112 }, { "epoch": 0.2111111111111111, "grad_norm": 8.875, "learning_rate": 4.5324017667008535e-06, "loss": 0.666, "step": 114 }, { "epoch": 0.21481481481481482, "grad_norm": 9.125, "learning_rate": 4.529039852772956e-06, "loss": 0.6534, "step": 116 }, { "epoch": 0.21851851851851853, "grad_norm": 9.0, "learning_rate": 4.525597663366506e-06, "loss": 0.66, "step": 118 }, { "epoch": 0.2222222222222222, "grad_norm": 8.6875, "learning_rate": 4.522075322446718e-06, "loss": 0.6418, "step": 120 }, { "epoch": 0.22592592592592592, "grad_norm": 9.125, "learning_rate": 4.51847295686534e-06, "loss": 0.6477, "step": 122 }, { "epoch": 0.22962962962962963, "grad_norm": 8.75, "learning_rate": 4.514790696356086e-06, "loss": 0.6517, "step": 124 }, { "epoch": 0.23333333333333334, "grad_norm": 9.125, "learning_rate": 4.511028673529967e-06, "loss": 0.6605, "step": 126 }, { "epoch": 0.23703703703703705, "grad_norm": 8.75, "learning_rate": 4.507187023870511e-06, "loss": 0.6302, "step": 128 }, { "epoch": 0.24074074074074073, "grad_norm": 8.75, "learning_rate": 4.503265885728883e-06, "loss": 0.6274, "step": 130 }, { "epoch": 0.24444444444444444, "grad_norm": 8.625, "learning_rate": 4.499265400318908e-06, "loss": 0.6427, "step": 132 }, { "epoch": 0.24814814814814815, "grad_norm": 8.5, "learning_rate": 4.49518571171198e-06, "loss": 0.6336, "step": 134 }, { "epoch": 0.2518518518518518, "grad_norm": 9.0, "learning_rate": 4.4910269668318775e-06, "loss": 0.6704, "step": 136 }, { "epoch": 0.25555555555555554, "grad_norm": 8.8125, "learning_rate": 4.486789315449469e-06, "loss": 0.6276, "step": 138 }, { "epoch": 0.25925925925925924, "grad_norm": 8.875, "learning_rate": 4.4824729101773205e-06, "loss": 0.6456, "step": 140 }, { "epoch": 0.26296296296296295, "grad_norm": 8.875, "learning_rate": 4.4780779064642e-06, "loss": 0.6438, "step": 142 }, { "epoch": 0.26666666666666666, "grad_norm": 8.875, "learning_rate": 4.473604462589479e-06, "loss": 0.6319, "step": 144 }, { "epoch": 0.27037037037037037, "grad_norm": 9.5625, "learning_rate": 4.469052739657432e-06, "loss": 0.6766, "step": 146 }, { "epoch": 0.2740740740740741, "grad_norm": 8.875, "learning_rate": 4.464422901591434e-06, "loss": 0.6316, "step": 148 }, { "epoch": 0.2777777777777778, "grad_norm": 8.6875, "learning_rate": 4.459715115128058e-06, "loss": 0.629, "step": 150 }, { "epoch": 0.2814814814814815, "grad_norm": 8.9375, "learning_rate": 4.454929549811071e-06, "loss": 0.6362, "step": 152 }, { "epoch": 0.2851851851851852, "grad_norm": 8.75, "learning_rate": 4.450066377985326e-06, "loss": 0.6216, "step": 154 }, { "epoch": 0.28888888888888886, "grad_norm": 9.0625, "learning_rate": 4.445125774790555e-06, "loss": 0.6535, "step": 156 }, { "epoch": 0.29259259259259257, "grad_norm": 8.8125, "learning_rate": 4.440107918155065e-06, "loss": 0.6278, "step": 158 }, { "epoch": 0.2962962962962963, "grad_norm": 8.8125, "learning_rate": 4.435012988789327e-06, "loss": 0.6178, "step": 160 }, { "epoch": 0.3, "grad_norm": 8.4375, "learning_rate": 4.429841170179471e-06, "loss": 0.6243, "step": 162 }, { "epoch": 0.3037037037037037, "grad_norm": 8.875, "learning_rate": 4.4245926485806745e-06, "loss": 0.6201, "step": 164 }, { "epoch": 0.3074074074074074, "grad_norm": 8.75, "learning_rate": 4.419267613010454e-06, "loss": 0.6267, "step": 166 }, { "epoch": 0.3111111111111111, "grad_norm": 8.75, "learning_rate": 4.413866255241867e-06, "loss": 0.6474, "step": 168 }, { "epoch": 0.3148148148148148, "grad_norm": 8.4375, "learning_rate": 4.4083887697965915e-06, "loss": 0.6021, "step": 170 }, { "epoch": 0.31851851851851853, "grad_norm": 8.5625, "learning_rate": 4.402835353937933e-06, "loss": 0.6044, "step": 172 }, { "epoch": 0.32222222222222224, "grad_norm": 8.9375, "learning_rate": 4.397206207663713e-06, "loss": 0.6293, "step": 174 }, { "epoch": 0.32592592592592595, "grad_norm": 8.6875, "learning_rate": 4.391501533699068e-06, "loss": 0.6039, "step": 176 }, { "epoch": 0.3296296296296296, "grad_norm": 8.875, "learning_rate": 4.385721537489152e-06, "loss": 0.6116, "step": 178 }, { "epoch": 0.3333333333333333, "grad_norm": 8.4375, "learning_rate": 4.379866427191734e-06, "loss": 0.5855, "step": 180 }, { "epoch": 0.337037037037037, "grad_norm": 8.625, "learning_rate": 4.373936413669699e-06, "loss": 0.5902, "step": 182 }, { "epoch": 0.34074074074074073, "grad_norm": 8.25, "learning_rate": 4.367931710483465e-06, "loss": 0.6014, "step": 184 }, { "epoch": 0.34444444444444444, "grad_norm": 8.625, "learning_rate": 4.361852533883278e-06, "loss": 0.6035, "step": 186 }, { "epoch": 0.34814814814814815, "grad_norm": 8.3125, "learning_rate": 4.355699102801434e-06, "loss": 0.5791, "step": 188 }, { "epoch": 0.35185185185185186, "grad_norm": 8.6875, "learning_rate": 4.349471638844388e-06, "loss": 0.5952, "step": 190 }, { "epoch": 0.35555555555555557, "grad_norm": 8.9375, "learning_rate": 4.3431703662847814e-06, "loss": 0.6405, "step": 192 }, { "epoch": 0.3592592592592593, "grad_norm": 8.375, "learning_rate": 4.336795512053356e-06, "loss": 0.5953, "step": 194 }, { "epoch": 0.362962962962963, "grad_norm": 8.6875, "learning_rate": 4.330347305730786e-06, "loss": 0.6133, "step": 196 }, { "epoch": 0.36666666666666664, "grad_norm": 8.6875, "learning_rate": 4.323825979539413e-06, "loss": 0.6232, "step": 198 }, { "epoch": 0.37037037037037035, "grad_norm": 8.625, "learning_rate": 4.317231768334875e-06, "loss": 0.6017, "step": 200 }, { "epoch": 0.37407407407407406, "grad_norm": 8.4375, "learning_rate": 4.310564909597654e-06, "loss": 0.5907, "step": 202 }, { "epoch": 0.37777777777777777, "grad_norm": 8.4375, "learning_rate": 4.303825643424525e-06, "loss": 0.62, "step": 204 }, { "epoch": 0.3814814814814815, "grad_norm": 8.375, "learning_rate": 4.297014212519903e-06, "loss": 0.5838, "step": 206 }, { "epoch": 0.3851851851851852, "grad_norm": 8.25, "learning_rate": 4.290130862187108e-06, "loss": 0.5953, "step": 208 }, { "epoch": 0.3888888888888889, "grad_norm": 9.0, "learning_rate": 4.283175840319529e-06, "loss": 0.6231, "step": 210 }, { "epoch": 0.3925925925925926, "grad_norm": 8.375, "learning_rate": 4.276149397391696e-06, "loss": 0.6054, "step": 212 }, { "epoch": 0.3962962962962963, "grad_norm": 8.4375, "learning_rate": 4.269051786450258e-06, "loss": 0.5715, "step": 214 }, { "epoch": 0.4, "grad_norm": 8.1875, "learning_rate": 4.261883263104874e-06, "loss": 0.5643, "step": 216 }, { "epoch": 0.40370370370370373, "grad_norm": 8.375, "learning_rate": 4.2546440855190055e-06, "loss": 0.5686, "step": 218 }, { "epoch": 0.4074074074074074, "grad_norm": 8.875, "learning_rate": 4.2473345144006165e-06, "loss": 0.595, "step": 220 }, { "epoch": 0.4111111111111111, "grad_norm": 8.8125, "learning_rate": 4.239954812992789e-06, "loss": 0.6095, "step": 222 }, { "epoch": 0.4148148148148148, "grad_norm": 8.3125, "learning_rate": 4.23250524706424e-06, "loss": 0.554, "step": 224 }, { "epoch": 0.4185185185185185, "grad_norm": 8.875, "learning_rate": 4.224986084899751e-06, "loss": 0.6128, "step": 226 }, { "epoch": 0.4222222222222222, "grad_norm": 8.4375, "learning_rate": 4.217397597290506e-06, "loss": 0.5895, "step": 228 }, { "epoch": 0.42592592592592593, "grad_norm": 8.6875, "learning_rate": 4.20974005752434e-06, "loss": 0.613, "step": 230 }, { "epoch": 0.42962962962962964, "grad_norm": 8.625, "learning_rate": 4.202013741375896e-06, "loss": 0.5693, "step": 232 }, { "epoch": 0.43333333333333335, "grad_norm": 8.0625, "learning_rate": 4.194218927096692e-06, "loss": 0.5706, "step": 234 }, { "epoch": 0.43703703703703706, "grad_norm": 8.75, "learning_rate": 4.186355895405106e-06, "loss": 0.5816, "step": 236 }, { "epoch": 0.44074074074074077, "grad_norm": 8.6875, "learning_rate": 4.1784249294762585e-06, "loss": 0.5983, "step": 238 }, { "epoch": 0.4444444444444444, "grad_norm": 8.5625, "learning_rate": 4.170426314931819e-06, "loss": 0.5774, "step": 240 }, { "epoch": 0.44814814814814813, "grad_norm": 8.8125, "learning_rate": 4.16236033982972e-06, "loss": 0.5933, "step": 242 }, { "epoch": 0.45185185185185184, "grad_norm": 8.8125, "learning_rate": 4.154227294653782e-06, "loss": 0.5751, "step": 244 }, { "epoch": 0.45555555555555555, "grad_norm": 8.0625, "learning_rate": 4.146027472303251e-06, "loss": 0.5488, "step": 246 }, { "epoch": 0.45925925925925926, "grad_norm": 8.375, "learning_rate": 4.137761168082251e-06, "loss": 0.5881, "step": 248 }, { "epoch": 0.46296296296296297, "grad_norm": 8.5625, "learning_rate": 4.129428679689153e-06, "loss": 0.5714, "step": 250 }, { "epoch": 0.4666666666666667, "grad_norm": 8.4375, "learning_rate": 4.121030307205846e-06, "loss": 0.5852, "step": 252 }, { "epoch": 0.4703703703703704, "grad_norm": 8.1875, "learning_rate": 4.112566353086935e-06, "loss": 0.5773, "step": 254 }, { "epoch": 0.4740740740740741, "grad_norm": 8.3125, "learning_rate": 4.1040371221488506e-06, "loss": 0.5557, "step": 256 }, { "epoch": 0.4777777777777778, "grad_norm": 8.5, "learning_rate": 4.0954429215588655e-06, "loss": 0.5793, "step": 258 }, { "epoch": 0.48148148148148145, "grad_norm": 8.125, "learning_rate": 4.086784060824037e-06, "loss": 0.5691, "step": 260 }, { "epoch": 0.48518518518518516, "grad_norm": 8.5, "learning_rate": 4.078060851780059e-06, "loss": 0.569, "step": 262 }, { "epoch": 0.4888888888888889, "grad_norm": 8.5, "learning_rate": 4.069273608580033e-06, "loss": 0.5831, "step": 264 }, { "epoch": 0.4925925925925926, "grad_norm": 8.8125, "learning_rate": 4.060422647683151e-06, "loss": 0.5827, "step": 266 }, { "epoch": 0.4962962962962963, "grad_norm": 8.1875, "learning_rate": 4.051508287843302e-06, "loss": 0.5563, "step": 268 }, { "epoch": 0.5, "grad_norm": 8.75, "learning_rate": 4.042530850097591e-06, "loss": 0.5807, "step": 270 }, { "epoch": 0.5037037037037037, "grad_norm": 8.4375, "learning_rate": 4.033490657754778e-06, "loss": 0.5927, "step": 272 }, { "epoch": 0.5074074074074074, "grad_norm": 8.75, "learning_rate": 4.024388036383636e-06, "loss": 0.5615, "step": 274 }, { "epoch": 0.5111111111111111, "grad_norm": 8.625, "learning_rate": 4.015223313801222e-06, "loss": 0.5552, "step": 276 }, { "epoch": 0.5148148148148148, "grad_norm": 8.25, "learning_rate": 4.0059968200610755e-06, "loss": 0.5492, "step": 278 }, { "epoch": 0.5185185185185185, "grad_norm": 8.375, "learning_rate": 3.996708887441328e-06, "loss": 0.5802, "step": 280 }, { "epoch": 0.5222222222222223, "grad_norm": 8.375, "learning_rate": 3.98735985043274e-06, "loss": 0.5961, "step": 282 }, { "epoch": 0.5259259259259259, "grad_norm": 8.6875, "learning_rate": 3.977950045726656e-06, "loss": 0.587, "step": 284 }, { "epoch": 0.5296296296296297, "grad_norm": 8.125, "learning_rate": 3.968479812202871e-06, "loss": 0.5425, "step": 286 }, { "epoch": 0.5333333333333333, "grad_norm": 8.5625, "learning_rate": 3.9589494909174376e-06, "loss": 0.5566, "step": 288 }, { "epoch": 0.5370370370370371, "grad_norm": 8.5625, "learning_rate": 3.949359425090375e-06, "loss": 0.5746, "step": 290 }, { "epoch": 0.5407407407407407, "grad_norm": 8.5625, "learning_rate": 3.939709960093312e-06, "loss": 0.5679, "step": 292 }, { "epoch": 0.5444444444444444, "grad_norm": 8.125, "learning_rate": 3.930001443437046e-06, "loss": 0.5553, "step": 294 }, { "epoch": 0.5481481481481482, "grad_norm": 8.75, "learning_rate": 3.920234224759034e-06, "loss": 0.578, "step": 296 }, { "epoch": 0.5518518518518518, "grad_norm": 8.625, "learning_rate": 3.910408655810793e-06, "loss": 0.5866, "step": 298 }, { "epoch": 0.5555555555555556, "grad_norm": 8.625, "learning_rate": 3.900525090445238e-06, "loss": 0.5767, "step": 300 }, { "epoch": 0.5592592592592592, "grad_norm": 8.5, "learning_rate": 3.890583884603937e-06, "loss": 0.5557, "step": 302 }, { "epoch": 0.562962962962963, "grad_norm": 8.625, "learning_rate": 3.880585396304293e-06, "loss": 0.5618, "step": 304 }, { "epoch": 0.5666666666666667, "grad_norm": 8.0625, "learning_rate": 3.870529985626646e-06, "loss": 0.5323, "step": 306 }, { "epoch": 0.5703703703703704, "grad_norm": 8.8125, "learning_rate": 3.860418014701313e-06, "loss": 0.5737, "step": 308 }, { "epoch": 0.5740740740740741, "grad_norm": 8.125, "learning_rate": 3.850249847695538e-06, "loss": 0.5486, "step": 310 }, { "epoch": 0.5777777777777777, "grad_norm": 8.125, "learning_rate": 3.840025850800386e-06, "loss": 0.5394, "step": 312 }, { "epoch": 0.5814814814814815, "grad_norm": 8.125, "learning_rate": 3.8297463922175465e-06, "loss": 0.5573, "step": 314 }, { "epoch": 0.5851851851851851, "grad_norm": 8.5625, "learning_rate": 3.81941184214608e-06, "loss": 0.5522, "step": 316 }, { "epoch": 0.5888888888888889, "grad_norm": 8.625, "learning_rate": 3.8090225727690826e-06, "loss": 0.5727, "step": 318 }, { "epoch": 0.5925925925925926, "grad_norm": 8.125, "learning_rate": 3.798578958240281e-06, "loss": 0.5549, "step": 320 }, { "epoch": 0.5962962962962963, "grad_norm": 8.1875, "learning_rate": 3.7880813746705614e-06, "loss": 0.5452, "step": 322 }, { "epoch": 0.6, "grad_norm": 8.0625, "learning_rate": 3.7775302001144237e-06, "loss": 0.5246, "step": 324 }, { "epoch": 0.6037037037037037, "grad_norm": 8.6875, "learning_rate": 3.7669258145563636e-06, "loss": 0.5316, "step": 326 }, { "epoch": 0.6074074074074074, "grad_norm": 8.5625, "learning_rate": 3.756268599897193e-06, "loss": 0.5523, "step": 328 }, { "epoch": 0.6111111111111112, "grad_norm": 8.375, "learning_rate": 3.74555893994028e-06, "loss": 0.5412, "step": 330 }, { "epoch": 0.6148148148148148, "grad_norm": 8.4375, "learning_rate": 3.7347972203777317e-06, "loss": 0.5598, "step": 332 }, { "epoch": 0.6185185185185185, "grad_norm": 7.96875, "learning_rate": 3.7239838287765044e-06, "loss": 0.5334, "step": 334 }, { "epoch": 0.6222222222222222, "grad_norm": 8.1875, "learning_rate": 3.7131191545644415e-06, "loss": 0.5353, "step": 336 }, { "epoch": 0.6259259259259259, "grad_norm": 8.375, "learning_rate": 3.702203589016253e-06, "loss": 0.5547, "step": 338 }, { "epoch": 0.6296296296296297, "grad_norm": 8.625, "learning_rate": 3.691237525239424e-06, "loss": 0.5703, "step": 340 }, { "epoch": 0.6333333333333333, "grad_norm": 8.5, "learning_rate": 3.6802213581600538e-06, "loss": 0.5669, "step": 342 }, { "epoch": 0.6370370370370371, "grad_norm": 8.6875, "learning_rate": 3.669155484508639e-06, "loss": 0.5535, "step": 344 }, { "epoch": 0.6407407407407407, "grad_norm": 8.125, "learning_rate": 3.6580403028057785e-06, "loss": 0.544, "step": 346 }, { "epoch": 0.6444444444444445, "grad_norm": 8.375, "learning_rate": 3.6468762133478317e-06, "loss": 0.5295, "step": 348 }, { "epoch": 0.6481481481481481, "grad_norm": 8.625, "learning_rate": 3.6356636181924892e-06, "loss": 0.5698, "step": 350 }, { "epoch": 0.6518518518518519, "grad_norm": 7.9375, "learning_rate": 3.6244029211443076e-06, "loss": 0.5422, "step": 352 }, { "epoch": 0.6555555555555556, "grad_norm": 8.6875, "learning_rate": 3.613094527740155e-06, "loss": 0.5365, "step": 354 }, { "epoch": 0.6592592592592592, "grad_norm": 8.5625, "learning_rate": 3.601738845234613e-06, "loss": 0.5389, "step": 356 }, { "epoch": 0.662962962962963, "grad_norm": 8.625, "learning_rate": 3.5903362825853077e-06, "loss": 0.5571, "step": 358 }, { "epoch": 0.6666666666666666, "grad_norm": 8.0625, "learning_rate": 3.5788872504381836e-06, "loss": 0.5259, "step": 360 }, { "epoch": 0.6703703703703704, "grad_norm": 8.375, "learning_rate": 3.5673921611127115e-06, "loss": 0.557, "step": 362 }, { "epoch": 0.674074074074074, "grad_norm": 8.0625, "learning_rate": 3.5558514285870426e-06, "loss": 0.5442, "step": 364 }, { "epoch": 0.6777777777777778, "grad_norm": 8.0625, "learning_rate": 3.5442654684830982e-06, "loss": 0.5267, "step": 366 }, { "epoch": 0.6814814814814815, "grad_norm": 8.5, "learning_rate": 3.5326346980516022e-06, "loss": 0.5572, "step": 368 }, { "epoch": 0.6851851851851852, "grad_norm": 8.375, "learning_rate": 3.520959536157054e-06, "loss": 0.5482, "step": 370 }, { "epoch": 0.6888888888888889, "grad_norm": 8.4375, "learning_rate": 3.5092404032626437e-06, "loss": 0.5537, "step": 372 }, { "epoch": 0.6925925925925925, "grad_norm": 8.0625, "learning_rate": 3.4974777214151117e-06, "loss": 0.5388, "step": 374 }, { "epoch": 0.6962962962962963, "grad_norm": 8.5, "learning_rate": 3.4856719142295446e-06, "loss": 0.5352, "step": 376 }, { "epoch": 0.7, "grad_norm": 8.375, "learning_rate": 3.4738234068741254e-06, "loss": 0.5161, "step": 378 }, { "epoch": 0.7037037037037037, "grad_norm": 7.96875, "learning_rate": 3.4619326260548185e-06, "loss": 0.544, "step": 380 }, { "epoch": 0.7074074074074074, "grad_norm": 7.9375, "learning_rate": 3.45e-06, "loss": 0.5386, "step": 382 }, { "epoch": 0.7111111111111111, "grad_norm": 8.375, "learning_rate": 3.438025958445042e-06, "loss": 0.5254, "step": 384 }, { "epoch": 0.7148148148148148, "grad_norm": 8.375, "learning_rate": 3.4260109326168295e-06, "loss": 0.5373, "step": 386 }, { "epoch": 0.7185185185185186, "grad_norm": 8.0625, "learning_rate": 3.413955355218237e-06, "loss": 0.5188, "step": 388 }, { "epoch": 0.7222222222222222, "grad_norm": 8.0625, "learning_rate": 3.40185966041254e-06, "loss": 0.5286, "step": 390 }, { "epoch": 0.725925925925926, "grad_norm": 8.3125, "learning_rate": 3.3897242838077815e-06, "loss": 0.5281, "step": 392 }, { "epoch": 0.7296296296296296, "grad_norm": 8.5, "learning_rate": 3.3775496624410846e-06, "loss": 0.5568, "step": 394 }, { "epoch": 0.7333333333333333, "grad_norm": 7.90625, "learning_rate": 3.365336234762914e-06, "loss": 0.5025, "step": 396 }, { "epoch": 0.737037037037037, "grad_norm": 7.9375, "learning_rate": 3.3530844406212813e-06, "loss": 0.5306, "step": 398 }, { "epoch": 0.7407407407407407, "grad_norm": 8.125, "learning_rate": 3.340794721245911e-06, "loss": 0.5232, "step": 400 }, { "epoch": 0.7444444444444445, "grad_norm": 8.4375, "learning_rate": 3.3284675192323466e-06, "loss": 0.5402, "step": 402 }, { "epoch": 0.7481481481481481, "grad_norm": 8.625, "learning_rate": 3.3161032785260114e-06, "loss": 0.5587, "step": 404 }, { "epoch": 0.7518518518518519, "grad_norm": 7.875, "learning_rate": 3.30370244440622e-06, "loss": 0.5217, "step": 406 }, { "epoch": 0.7555555555555555, "grad_norm": 8.125, "learning_rate": 3.291265463470143e-06, "loss": 0.5097, "step": 408 }, { "epoch": 0.7592592592592593, "grad_norm": 7.90625, "learning_rate": 3.2787927836167273e-06, "loss": 0.5301, "step": 410 }, { "epoch": 0.762962962962963, "grad_norm": 7.875, "learning_rate": 3.2662848540305566e-06, "loss": 0.515, "step": 412 }, { "epoch": 0.7666666666666667, "grad_norm": 8.1875, "learning_rate": 3.253742125165684e-06, "loss": 0.5769, "step": 414 }, { "epoch": 0.7703703703703704, "grad_norm": 8.0625, "learning_rate": 3.241165048729404e-06, "loss": 0.5125, "step": 416 }, { "epoch": 0.774074074074074, "grad_norm": 7.96875, "learning_rate": 3.2285540776659865e-06, "loss": 0.5187, "step": 418 }, { "epoch": 0.7777777777777778, "grad_norm": 7.96875, "learning_rate": 3.215909666140367e-06, "loss": 0.5163, "step": 420 }, { "epoch": 0.7814814814814814, "grad_norm": 8.4375, "learning_rate": 3.2032322695217835e-06, "loss": 0.5378, "step": 422 }, { "epoch": 0.7851851851851852, "grad_norm": 8.1875, "learning_rate": 3.1905223443673878e-06, "loss": 0.5254, "step": 424 }, { "epoch": 0.7888888888888889, "grad_norm": 8.1875, "learning_rate": 3.1777803484057937e-06, "loss": 0.5277, "step": 426 }, { "epoch": 0.7925925925925926, "grad_norm": 7.90625, "learning_rate": 3.165006740520598e-06, "loss": 0.514, "step": 428 }, { "epoch": 0.7962962962962963, "grad_norm": 8.0625, "learning_rate": 3.1522019807338508e-06, "loss": 0.5231, "step": 430 }, { "epoch": 0.8, "grad_norm": 8.25, "learning_rate": 3.1393665301894926e-06, "loss": 0.5135, "step": 432 }, { "epoch": 0.8037037037037037, "grad_norm": 8.0, "learning_rate": 3.126500851136745e-06, "loss": 0.5319, "step": 434 }, { "epoch": 0.8074074074074075, "grad_norm": 8.0625, "learning_rate": 3.1136054069134607e-06, "loss": 0.53, "step": 436 }, { "epoch": 0.8111111111111111, "grad_norm": 8.5, "learning_rate": 3.1006806619294428e-06, "loss": 0.5229, "step": 438 }, { "epoch": 0.8148148148148148, "grad_norm": 8.5625, "learning_rate": 3.087727081649715e-06, "loss": 0.5429, "step": 440 }, { "epoch": 0.8185185185185185, "grad_norm": 8.0625, "learning_rate": 3.0747451325777605e-06, "loss": 0.5103, "step": 442 }, { "epoch": 0.8222222222222222, "grad_norm": 8.5625, "learning_rate": 3.061735282238722e-06, "loss": 0.5541, "step": 444 }, { "epoch": 0.825925925925926, "grad_norm": 7.90625, "learning_rate": 3.0486979991625627e-06, "loss": 0.5284, "step": 446 }, { "epoch": 0.8296296296296296, "grad_norm": 8.375, "learning_rate": 3.035633752867196e-06, "loss": 0.5209, "step": 448 }, { "epoch": 0.8333333333333334, "grad_norm": 8.375, "learning_rate": 3.022543013841572e-06, "loss": 0.53, "step": 450 }, { "epoch": 0.837037037037037, "grad_norm": 7.8125, "learning_rate": 3.0094262535287385e-06, "loss": 0.5104, "step": 452 }, { "epoch": 0.8407407407407408, "grad_norm": 7.78125, "learning_rate": 2.99628394430886e-06, "loss": 0.4987, "step": 454 }, { "epoch": 0.8444444444444444, "grad_norm": 7.875, "learning_rate": 2.9831165594822035e-06, "loss": 0.5181, "step": 456 }, { "epoch": 0.8481481481481481, "grad_norm": 7.90625, "learning_rate": 2.9699245732521005e-06, "loss": 0.5204, "step": 458 }, { "epoch": 0.8518518518518519, "grad_norm": 8.3125, "learning_rate": 2.95670846070786e-06, "loss": 0.5254, "step": 460 }, { "epoch": 0.8555555555555555, "grad_norm": 8.5, "learning_rate": 2.943468697807666e-06, "loss": 0.5233, "step": 462 }, { "epoch": 0.8592592592592593, "grad_norm": 8.25, "learning_rate": 2.930205761361434e-06, "loss": 0.5238, "step": 464 }, { "epoch": 0.8629629629629629, "grad_norm": 8.5, "learning_rate": 2.9169201290136377e-06, "loss": 0.5196, "step": 466 }, { "epoch": 0.8666666666666667, "grad_norm": 8.3125, "learning_rate": 2.903612279226112e-06, "loss": 0.5359, "step": 468 }, { "epoch": 0.8703703703703703, "grad_norm": 7.6875, "learning_rate": 2.8902826912608155e-06, "loss": 0.5168, "step": 470 }, { "epoch": 0.8740740740740741, "grad_norm": 8.0625, "learning_rate": 2.8769318451625792e-06, "loss": 0.5393, "step": 472 }, { "epoch": 0.8777777777777778, "grad_norm": 7.75, "learning_rate": 2.8635602217418073e-06, "loss": 0.5155, "step": 474 }, { "epoch": 0.8814814814814815, "grad_norm": 7.75, "learning_rate": 2.850168302557173e-06, "loss": 0.4892, "step": 476 }, { "epoch": 0.8851851851851852, "grad_norm": 7.8125, "learning_rate": 2.8367565698982674e-06, "loss": 0.5184, "step": 478 }, { "epoch": 0.8888888888888888, "grad_norm": 8.4375, "learning_rate": 2.8233255067682357e-06, "loss": 0.526, "step": 480 }, { "epoch": 0.8925925925925926, "grad_norm": 8.4375, "learning_rate": 2.8098755968663775e-06, "loss": 0.5202, "step": 482 }, { "epoch": 0.8962962962962963, "grad_norm": 8.25, "learning_rate": 2.7964073245707345e-06, "loss": 0.5124, "step": 484 }, { "epoch": 0.9, "grad_norm": 8.375, "learning_rate": 2.7829211749206393e-06, "loss": 0.5341, "step": 486 }, { "epoch": 0.9037037037037037, "grad_norm": 8.25, "learning_rate": 2.76941763359925e-06, "loss": 0.5118, "step": 488 }, { "epoch": 0.9074074074074074, "grad_norm": 8.3125, "learning_rate": 2.7558971869160605e-06, "loss": 0.5252, "step": 490 }, { "epoch": 0.9111111111111111, "grad_norm": 7.6875, "learning_rate": 2.7423603217893853e-06, "loss": 0.5061, "step": 492 }, { "epoch": 0.9148148148148149, "grad_norm": 8.375, "learning_rate": 2.7288075257288237e-06, "loss": 0.5002, "step": 494 }, { "epoch": 0.9185185185185185, "grad_norm": 8.5625, "learning_rate": 2.7152392868177043e-06, "loss": 0.542, "step": 496 }, { "epoch": 0.9222222222222223, "grad_norm": 7.78125, "learning_rate": 2.7016560936955053e-06, "loss": 0.5133, "step": 498 }, { "epoch": 0.9259259259259259, "grad_norm": 8.0, "learning_rate": 2.6880584355402586e-06, "loss": 0.524, "step": 500 }, { "epoch": 0.9296296296296296, "grad_norm": 7.8125, "learning_rate": 2.6744468020509324e-06, "loss": 0.5031, "step": 502 }, { "epoch": 0.9333333333333333, "grad_norm": 8.375, "learning_rate": 2.6608216834297947e-06, "loss": 0.5137, "step": 504 }, { "epoch": 0.937037037037037, "grad_norm": 7.8125, "learning_rate": 2.647183570364761e-06, "loss": 0.5095, "step": 506 }, { "epoch": 0.9407407407407408, "grad_norm": 8.25, "learning_rate": 2.633532954011721e-06, "loss": 0.5195, "step": 508 }, { "epoch": 0.9444444444444444, "grad_norm": 8.375, "learning_rate": 2.6198703259768517e-06, "loss": 0.5133, "step": 510 }, { "epoch": 0.9481481481481482, "grad_norm": 7.84375, "learning_rate": 2.606196178298913e-06, "loss": 0.4951, "step": 512 }, { "epoch": 0.9518518518518518, "grad_norm": 8.5, "learning_rate": 2.592511003431526e-06, "loss": 0.5214, "step": 514 }, { "epoch": 0.9555555555555556, "grad_norm": 8.5, "learning_rate": 2.5788152942254395e-06, "loss": 0.5309, "step": 516 }, { "epoch": 0.9592592592592593, "grad_norm": 7.78125, "learning_rate": 2.5651095439107826e-06, "loss": 0.5009, "step": 518 }, { "epoch": 0.9629629629629629, "grad_norm": 8.375, "learning_rate": 2.5513942460792966e-06, "loss": 0.503, "step": 520 }, { "epoch": 0.9666666666666667, "grad_norm": 8.25, "learning_rate": 2.5376698946665634e-06, "loss": 0.5223, "step": 522 }, { "epoch": 0.9703703703703703, "grad_norm": 8.4375, "learning_rate": 2.523936983934217e-06, "loss": 0.5341, "step": 524 }, { "epoch": 0.9740740740740741, "grad_norm": 8.375, "learning_rate": 2.5101960084521407e-06, "loss": 0.5507, "step": 526 }, { "epoch": 0.9777777777777777, "grad_norm": 8.3125, "learning_rate": 2.4964474630806573e-06, "loss": 0.5107, "step": 528 }, { "epoch": 0.9814814814814815, "grad_norm": 8.25, "learning_rate": 2.482691842952709e-06, "loss": 0.5035, "step": 530 }, { "epoch": 0.9851851851851852, "grad_norm": 8.5, "learning_rate": 2.468929643456024e-06, "loss": 0.5083, "step": 532 }, { "epoch": 0.9888888888888889, "grad_norm": 8.25, "learning_rate": 2.4551613602152758e-06, "loss": 0.5121, "step": 534 }, { "epoch": 0.9925925925925926, "grad_norm": 8.3125, "learning_rate": 2.4413874890742364e-06, "loss": 0.535, "step": 536 }, { "epoch": 0.9962962962962963, "grad_norm": 8.1875, "learning_rate": 2.427608526077915e-06, "loss": 0.5184, "step": 538 }, { "epoch": 1.0, "grad_norm": 7.6875, "learning_rate": 2.413824967454698e-06, "loss": 0.4873, "step": 540 } ], "logging_steps": 2, "max_steps": 1080, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 540, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.66563154509824e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }