|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 300, |
|
"global_step": 1089, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027548209366391185, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9168, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013774104683195593, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.999962411893365e-06, |
|
"loss": 1.9099, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.027548209366391185, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.9997327170302815e-06, |
|
"loss": 1.8629, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04132231404958678, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.99929425749243e-06, |
|
"loss": 1.8376, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05509641873278237, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.998647124839145e-06, |
|
"loss": 1.8137, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06887052341597796, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.997791454204984e-06, |
|
"loss": 1.8055, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08264462809917356, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.9967274242715065e-06, |
|
"loss": 1.7913, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09641873278236915, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.995455257229964e-06, |
|
"loss": 1.7897, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11019283746556474, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.9939752187349e-06, |
|
"loss": 1.7752, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12396694214876033, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9922876178486764e-06, |
|
"loss": 1.7802, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.13774104683195593, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.9903928069769356e-06, |
|
"loss": 1.7686, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.9882911817950105e-06, |
|
"loss": 1.7702, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1652892561983471, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.985983181165299e-06, |
|
"loss": 1.7618, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1790633608815427, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.98346928704562e-06, |
|
"loss": 1.7627, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1928374655647383, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.9807500243885744e-06, |
|
"loss": 1.7665, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2066115702479339, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.9778259610319187e-06, |
|
"loss": 1.755, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.22038567493112948, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.9746977075799933e-06, |
|
"loss": 1.7574, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23415977961432508, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.9713659172762126e-06, |
|
"loss": 1.7529, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.24793388429752067, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.9678312858666578e-06, |
|
"loss": 1.7417, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.26170798898071623, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.964094551454788e-06, |
|
"loss": 1.7509, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.27548209366391185, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.960156494347309e-06, |
|
"loss": 1.7486, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2892561983471074, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.9560179368912327e-06, |
|
"loss": 1.7531, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.95167974330215e-06, |
|
"loss": 1.7435, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3168044077134986, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.9471428194837667e-06, |
|
"loss": 1.7403, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3305785123966942, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.9424081128387337e-06, |
|
"loss": 1.7435, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3443526170798898, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.9374766120708077e-06, |
|
"loss": 1.75, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3581267217630854, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.932349346978389e-06, |
|
"loss": 1.7469, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.371900826446281, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.927027388239482e-06, |
|
"loss": 1.734, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3856749311294766, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.921511847188112e-06, |
|
"loss": 1.7385, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39944903581267216, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.915803875582259e-06, |
|
"loss": 1.736, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4132231404958678, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.9099046653633437e-06, |
|
"loss": 1.7336, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42699724517906334, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.9038154484073284e-06, |
|
"loss": 1.7324, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.44077134986225897, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.8975374962674753e-06, |
|
"loss": 1.7406, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.8910721199088195e-06, |
|
"loss": 1.7461, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.46831955922865015, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.8844206694344138e-06, |
|
"loss": 1.7296, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4820936639118457, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.877584533803398e-06, |
|
"loss": 1.7286, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.49586776859504134, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.8705651405409566e-06, |
|
"loss": 1.7225, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.509641873278237, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.8633639554402234e-06, |
|
"loss": 1.7366, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5234159779614325, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.8559824822561913e-06, |
|
"loss": 1.7353, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5371900826446281, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.8484222623917e-06, |
|
"loss": 1.7223, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5509641873278237, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.8406848745755578e-06, |
|
"loss": 1.7256, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5647382920110193, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.832771934532872e-06, |
|
"loss": 1.7288, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5785123966942148, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.8246850946476505e-06, |
|
"loss": 1.7247, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5922865013774105, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.8164260436177524e-06, |
|
"loss": 1.7221, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.8079965061022518e-06, |
|
"loss": 1.7224, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6198347107438017, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.7993982423612941e-06, |
|
"loss": 1.7206, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6336088154269972, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.7906330478885174e-06, |
|
"loss": 1.7252, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6473829201101928, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.7817027530361174e-06, |
|
"loss": 1.725, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6611570247933884, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.7726092226326315e-06, |
|
"loss": 1.7189, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6749311294765841, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.7633543555935245e-06, |
|
"loss": 1.719, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6887052341597796, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.7539400845246564e-06, |
|
"loss": 1.7121, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7024793388429752, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.744368375318715e-06, |
|
"loss": 1.7198, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7162534435261708, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.7346412267446958e-06, |
|
"loss": 1.7149, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7300275482093664, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.724760670030521e-06, |
|
"loss": 1.713, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.743801652892562, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.7147287684388738e-06, |
|
"loss": 1.7192, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.7045476168363498e-06, |
|
"loss": 1.721, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7713498622589532, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.6942193412560043e-06, |
|
"loss": 1.7102, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7851239669421488, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.6837460984533934e-06, |
|
"loss": 1.7218, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7988980716253443, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.6731300754562008e-06, |
|
"loss": 1.7107, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8126721763085399, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.6623734891075385e-06, |
|
"loss": 1.7138, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8264462809917356, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.6514785856030272e-06, |
|
"loss": 1.7112, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8264462809917356, |
|
"eval_loss": 1.7193405628204346, |
|
"eval_runtime": 8.403, |
|
"eval_samples_per_second": 83.779, |
|
"eval_steps_per_second": 2.618, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8402203856749312, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.640447640021744e-06, |
|
"loss": 1.7211, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8539944903581267, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.6292829558511376e-06, |
|
"loss": 1.7142, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8677685950413223, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.6179868645060162e-06, |
|
"loss": 1.7028, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8815426997245179, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.6065617248416967e-06, |
|
"loss": 1.7091, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8953168044077136, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.59500992266143e-06, |
|
"loss": 1.7143, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.5833338702181959e-06, |
|
"loss": 1.7105, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9228650137741047, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.5715360057109744e-06, |
|
"loss": 1.7146, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9366391184573003, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.5596187927755993e-06, |
|
"loss": 1.7145, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9504132231404959, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.5475847199703033e-06, |
|
"loss": 1.7099, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9641873278236914, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.535436300256053e-06, |
|
"loss": 1.7143, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.977961432506887, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.523176070471793e-06, |
|
"loss": 1.7131, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9917355371900827, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.5108065908047014e-06, |
|
"loss": 1.7248, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0055096418732783, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.4983304442555698e-06, |
|
"loss": 1.7079, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.019283746556474, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.4857502360994204e-06, |
|
"loss": 1.712, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0330578512396693, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.4730685933414714e-06, |
|
"loss": 1.703, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.046831955922865, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.4602881641685643e-06, |
|
"loss": 1.6944, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.4474116173961668e-06, |
|
"loss": 1.7032, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.0743801652892562, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.4344416419110728e-06, |
|
"loss": 1.7122, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0881542699724518, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.4213809461099033e-06, |
|
"loss": 1.703, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.1019283746556474, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.4082322573335422e-06, |
|
"loss": 1.703, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.115702479338843, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.394998321297608e-06, |
|
"loss": 1.7024, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.1294765840220387, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.3816819015190943e-06, |
|
"loss": 1.7058, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1432506887052343, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.3682857787392905e-06, |
|
"loss": 1.6987, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.1570247933884297, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.3548127503431038e-06, |
|
"loss": 1.7029, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1707988980716253, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.3412656297749135e-06, |
|
"loss": 1.6998, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.184573002754821, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.327647245951058e-06, |
|
"loss": 1.7051, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1983471074380165, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.3139604426691072e-06, |
|
"loss": 1.7065, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.300208078014014e-06, |
|
"loss": 1.7019, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2258953168044078, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.2863930237612896e-06, |
|
"loss": 1.6999, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.2396694214876034, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.2725181647773174e-06, |
|
"loss": 1.7046, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2534435261707988, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.2585863984169343e-06, |
|
"loss": 1.7069, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.2672176308539944, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.2446006339184035e-06, |
|
"loss": 1.7025, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.28099173553719, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.2305637917959058e-06, |
|
"loss": 1.7016, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.2947658402203857, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.2164788032296755e-06, |
|
"loss": 1.6997, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.3085399449035813, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.2023486094539124e-06, |
|
"loss": 1.7003, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.322314049586777, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.1881761611425888e-06, |
|
"loss": 1.6885, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3360881542699725, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.1739644177932907e-06, |
|
"loss": 1.7, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.3498622589531681, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.159716347109213e-06, |
|
"loss": 1.6989, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.1454349243794419e-06, |
|
"loss": 1.7009, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.3774104683195592, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.1311231318576545e-06, |
|
"loss": 1.7007, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3911845730027548, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.1167839581393628e-06, |
|
"loss": 1.6918, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.4049586776859504, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.1024203975378335e-06, |
|
"loss": 1.6975, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.418732782369146, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.0880354494588138e-06, |
|
"loss": 1.6998, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.4325068870523416, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.073632117774195e-06, |
|
"loss": 1.6973, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4462809917355373, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.0592134101947417e-06, |
|
"loss": 1.7023, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.4600550964187327, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.0447823376420206e-06, |
|
"loss": 1.707, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4738292011019283, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.0303419136196575e-06, |
|
"loss": 1.6916, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.487603305785124, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.0158951535840576e-06, |
|
"loss": 1.6977, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5013774104683195, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.0014450743147145e-06, |
|
"loss": 1.699, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.869946932842466e-07, |
|
"loss": 1.6907, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5289256198347108, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.725470280282855e-07, |
|
"loss": 1.7001, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.5426997245179064, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.581050955153545e-07, |
|
"loss": 1.7054, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.556473829201102, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 9.43671911516861e-07, |
|
"loss": 1.687, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.5702479338842976, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.292504899773453e-07, |
|
"loss": 1.7055, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5840220385674932, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 9.148438423851041e-07, |
|
"loss": 1.6975, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.5977961432506889, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.00454977143331e-07, |
|
"loss": 1.6997, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.6115702479338843, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 8.860868989419017e-07, |
|
"loss": 1.6983, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.6253443526170799, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 8.717426081299308e-07, |
|
"loss": 1.6995, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.6391184573002755, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 8.574251000892386e-07, |
|
"loss": 1.6948, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.6528925619834711, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 8.431373646088549e-07, |
|
"loss": 1.6961, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6528925619834711, |
|
"eval_loss": 1.7039618492126465, |
|
"eval_runtime": 8.3736, |
|
"eval_samples_per_second": 84.073, |
|
"eval_steps_per_second": 2.627, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 8.288823852606882e-07, |
|
"loss": 1.6915, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.6804407713498621, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 8.14663138776496e-07, |
|
"loss": 1.7006, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.6942148760330578, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 8.004825944262805e-07, |
|
"loss": 1.7029, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.7079889807162534, |
|
"grad_norm": 1.375, |
|
"learning_rate": 7.863437133982471e-07, |
|
"loss": 1.6942, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.721763085399449, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 7.722494481804445e-07, |
|
"loss": 1.6927, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.7355371900826446, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.582027419442268e-07, |
|
"loss": 1.6929, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7493112947658402, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.442065279296578e-07, |
|
"loss": 1.702, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.7630853994490359, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.302637288329915e-07, |
|
"loss": 1.6971, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7768595041322315, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.163772561963519e-07, |
|
"loss": 1.6923, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.790633608815427, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.02550009799745e-07, |
|
"loss": 1.6883, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.8044077134986227, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 6.887848770555234e-07, |
|
"loss": 1.6991, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 6.750847324054374e-07, |
|
"loss": 1.7069, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8319559228650137, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 6.614524367203906e-07, |
|
"loss": 1.6979, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.8457300275482094, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 6.478908367030338e-07, |
|
"loss": 1.6829, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.859504132231405, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 6.344027642933128e-07, |
|
"loss": 1.692, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.8732782369146006, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 6.209910360771033e-07, |
|
"loss": 1.703, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.887052341597796, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 6.076584526980484e-07, |
|
"loss": 1.706, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.9008264462809916, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 5.944077982727285e-07, |
|
"loss": 1.6906, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.9146005509641872, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 5.812418398092787e-07, |
|
"loss": 1.6982, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.9283746556473829, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.681633266295834e-07, |
|
"loss": 1.6917, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9421487603305785, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 5.551749897951582e-07, |
|
"loss": 1.7014, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.955922865013774, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 5.422795415368518e-07, |
|
"loss": 1.6861, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 5.294796746884745e-07, |
|
"loss": 1.6953, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.9834710743801653, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.167780621244801e-07, |
|
"loss": 1.6973, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.997245179063361, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 5.041773562018135e-07, |
|
"loss": 1.7019, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.0110192837465566, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.91680188206047e-07, |
|
"loss": 1.7011, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.024793388429752, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.792891678019115e-07, |
|
"loss": 1.7013, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.038567493112948, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.6700688248834664e-07, |
|
"loss": 1.6895, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.0523415977961434, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.548358970581757e-07, |
|
"loss": 1.7029, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.0661157024793386, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.427787530625278e-07, |
|
"loss": 1.6931, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0798898071625342, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.3083796828010675e-07, |
|
"loss": 1.6886, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.09366391184573, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.190160361914292e-07, |
|
"loss": 1.6907, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.1074380165289255, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.07315425458134e-07, |
|
"loss": 1.6924, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.9573857940747537e-07, |
|
"loss": 1.7019, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.1349862258953167, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.8428791552210594e-07, |
|
"loss": 1.6975, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.1487603305785123, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 3.729658249352563e-07, |
|
"loss": 1.6986, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.162534435261708, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.6177467193141886e-07, |
|
"loss": 1.6893, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.1763085399449036, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.5071679345263537e-07, |
|
"loss": 1.6833, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.190082644628099, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.397944986104968e-07, |
|
"loss": 1.693, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.203856749311295, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.290100682039516e-07, |
|
"loss": 1.6978, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.2176308539944904, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.1836575424303034e-07, |
|
"loss": 1.7019, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.231404958677686, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 3.078637794785791e-07, |
|
"loss": 1.6977, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2451790633608817, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.9750633693810224e-07, |
|
"loss": 1.6898, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.2589531680440773, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.872955894678153e-07, |
|
"loss": 1.6915, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.7723366928099754e-07, |
|
"loss": 1.6922, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.2865013774104685, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.673226775127422e-07, |
|
"loss": 1.6922, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.3002754820936637, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.5756468378119533e-07, |
|
"loss": 1.6873, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.3140495867768593, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.4796172575537934e-07, |
|
"loss": 1.7068, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.327823691460055, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 2.3851580872968435e-07, |
|
"loss": 1.6993, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.3415977961432506, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.292289052051224e-07, |
|
"loss": 1.6992, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.355371900826446, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.2010295447742743e-07, |
|
"loss": 1.6891, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.369146005509642, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.111398622320927e-07, |
|
"loss": 1.6968, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3829201101928374, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.0234150014642305e-07, |
|
"loss": 1.6946, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.396694214876033, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.937097054986915e-07, |
|
"loss": 1.6892, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.4104683195592287, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.8524628078447602e-07, |
|
"loss": 1.6915, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.769529933402637e-07, |
|
"loss": 1.6946, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.43801652892562, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.6883157497439349e-07, |
|
"loss": 1.6975, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.4517906336088156, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.6088372160541962e-07, |
|
"loss": 1.6871, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.465564738292011, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.531110929079681e-07, |
|
"loss": 1.6909, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 2.479338842975207, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.4551531196616396e-07, |
|
"loss": 1.6908, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.479338842975207, |
|
"eval_loss": 1.7025996446609497, |
|
"eval_runtime": 8.3873, |
|
"eval_samples_per_second": 83.936, |
|
"eval_steps_per_second": 2.623, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4931129476584024, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.3809796493469728e-07, |
|
"loss": 1.6981, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 2.5068870523415976, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.3086060070760196e-07, |
|
"loss": 1.6902, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.5206611570247937, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.23804730594814e-07, |
|
"loss": 1.6964, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.534435261707989, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.1693182800658042e-07, |
|
"loss": 1.6884, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.5482093663911844, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.102433281457802e-07, |
|
"loss": 1.6969, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.56198347107438, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.0374062770822411e-07, |
|
"loss": 1.7003, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 9.742508459099707e-08, |
|
"loss": 1.7095, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 2.5895316804407713, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.129801760890076e-08, |
|
"loss": 1.7026, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.603305785123967, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 8.536070621905811e-08, |
|
"loss": 1.6964, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 2.6170798898071626, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.961439025373617e-08, |
|
"loss": 1.6984, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.630853994490358, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.40602696614444e-08, |
|
"loss": 1.7022, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 2.644628099173554, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 6.869950425636095e-08, |
|
"loss": 1.6955, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.6584022038567494, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 6.353321347613815e-08, |
|
"loss": 1.6962, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.672176308539945, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 5.856247614814292e-08, |
|
"loss": 1.6914, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.6859504132231407, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 5.3788330264174506e-08, |
|
"loss": 1.6934, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.6997245179063363, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.921177276371069e-08, |
|
"loss": 1.6947, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.7134986225895315, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.483375932572597e-08, |
|
"loss": 1.6929, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.0655204169127156e-08, |
|
"loss": 1.6944, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.7410468319559227, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.667697986184526e-08, |
|
"loss": 1.6898, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.7548209366391183, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.2899917138625055e-08, |
|
"loss": 1.7061, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.768595041322314, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.9324804727551055e-08, |
|
"loss": 1.6974, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.7823691460055096, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.5952389185344925e-08, |
|
"loss": 1.6892, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.796143250688705, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.2783374741469186e-08, |
|
"loss": 1.696, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 2.809917355371901, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.9818423151069406e-08, |
|
"loss": 1.6879, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.8236914600550964, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.705815355678619e-08, |
|
"loss": 1.6943, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.837465564738292, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.4503142359465925e-08, |
|
"loss": 1.6919, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.8512396694214877, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.215392309779617e-08, |
|
"loss": 1.6907, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 2.8650137741046833, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.0010986336891458e-08, |
|
"loss": 1.704, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 8.074779565854117e-09, |
|
"loss": 1.691, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 2.8925619834710745, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 6.34570710432869e-09, |
|
"loss": 1.6975, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.90633608815427, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.824130018072026e-09, |
|
"loss": 1.6918, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 2.9201101928374653, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 3.5103660435551465e-09, |
|
"loss": 1.6933, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.9338842975206614, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.4046895216136563e-09, |
|
"loss": 1.6872, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 2.9476584022038566, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.5073313401594568e-09, |
|
"loss": 1.696, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.9614325068870526, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 8.184788859667557e-10, |
|
"loss": 1.6964, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.975206611570248, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.3827600554170444e-10, |
|
"loss": 1.6941, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.9889807162534434, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 6.682297508464608e-11, |
|
"loss": 1.6993, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1089, |
|
"total_flos": 6.427401199279931e+18, |
|
"train_loss": 1.711617823146263, |
|
"train_runtime": 5145.0339, |
|
"train_samples_per_second": 13.545, |
|
"train_steps_per_second": 0.212 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1089, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.427401199279931e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|