{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 100, |
|
"global_step": 1626, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018450184501845018, |
|
"grad_norm": 9.194052941983164, |
|
"learning_rate": 1.226993865030675e-07, |
|
"loss": 1.1392, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00922509225092251, |
|
"grad_norm": 8.728469464225432, |
|
"learning_rate": 6.134969325153375e-07, |
|
"loss": 1.1321, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01845018450184502, |
|
"grad_norm": 5.066035045474869, |
|
"learning_rate": 1.226993865030675e-06, |
|
"loss": 1.0802, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.027675276752767528, |
|
"grad_norm": 6.151048691792626, |
|
"learning_rate": 1.8404907975460124e-06, |
|
"loss": 1.0186, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03690036900369004, |
|
"grad_norm": 2.030218046940431, |
|
"learning_rate": 2.45398773006135e-06, |
|
"loss": 1.0181, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.046125461254612546, |
|
"grad_norm": 1.7169054577646434, |
|
"learning_rate": 3.0674846625766875e-06, |
|
"loss": 0.9867, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.055350553505535055, |
|
"grad_norm": 1.414702551784086, |
|
"learning_rate": 3.680981595092025e-06, |
|
"loss": 0.9848, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06457564575645756, |
|
"grad_norm": 1.471062511929668, |
|
"learning_rate": 4.294478527607362e-06, |
|
"loss": 0.975, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.07380073800738007, |
|
"grad_norm": 1.9876641303020315, |
|
"learning_rate": 4.9079754601227e-06, |
|
"loss": 0.9616, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08302583025830258, |
|
"grad_norm": 1.7086605102377759, |
|
"learning_rate": 5.521472392638038e-06, |
|
"loss": 0.9716, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09225092250922509, |
|
"grad_norm": 2.202769359683669, |
|
"learning_rate": 6.134969325153375e-06, |
|
"loss": 0.9766, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1014760147601476, |
|
"grad_norm": 1.6222357117334487, |
|
"learning_rate": 6.748466257668712e-06, |
|
"loss": 0.9929, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.11070110701107011, |
|
"grad_norm": 2.161648398755977, |
|
"learning_rate": 7.36196319018405e-06, |
|
"loss": 0.9774, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11992619926199262, |
|
"grad_norm": 1.7198404521131392, |
|
"learning_rate": 7.975460122699386e-06, |
|
"loss": 0.9743, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12915129151291513, |
|
"grad_norm": 2.5936580446065594, |
|
"learning_rate": 8.588957055214725e-06, |
|
"loss": 0.9878, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13837638376383765, |
|
"grad_norm": 2.188257188915145, |
|
"learning_rate": 9.202453987730062e-06, |
|
"loss": 0.9568, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.14760147601476015, |
|
"grad_norm": 1.7531151641523148, |
|
"learning_rate": 9.8159509202454e-06, |
|
"loss": 0.9789, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15682656826568267, |
|
"grad_norm": 1.8091240872427208, |
|
"learning_rate": 1.0429447852760737e-05, |
|
"loss": 0.9678, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.16605166051660517, |
|
"grad_norm": 2.098514635540621, |
|
"learning_rate": 1.1042944785276076e-05, |
|
"loss": 0.9617, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1752767527675277, |
|
"grad_norm": 2.4275494428488607, |
|
"learning_rate": 1.1656441717791411e-05, |
|
"loss": 0.9676, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"grad_norm": 2.0637923302738095, |
|
"learning_rate": 1.226993865030675e-05, |
|
"loss": 0.9681, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"eval_loss": 0.9788174629211426, |
|
"eval_runtime": 515.1712, |
|
"eval_samples_per_second": 29.796, |
|
"eval_steps_per_second": 0.116, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1937269372693727, |
|
"grad_norm": 2.069416549180579, |
|
"learning_rate": 1.2883435582822085e-05, |
|
"loss": 0.9528, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2029520295202952, |
|
"grad_norm": 2.2916715973700024, |
|
"learning_rate": 1.3496932515337424e-05, |
|
"loss": 0.9696, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21217712177121772, |
|
"grad_norm": 2.062468142825091, |
|
"learning_rate": 1.4110429447852763e-05, |
|
"loss": 0.9747, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.22140221402214022, |
|
"grad_norm": 1.7271367882138293, |
|
"learning_rate": 1.47239263803681e-05, |
|
"loss": 0.9786, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23062730627306274, |
|
"grad_norm": 1.9545058702706481, |
|
"learning_rate": 1.5337423312883436e-05, |
|
"loss": 0.9758, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.23985239852398524, |
|
"grad_norm": 1.9400595646067775, |
|
"learning_rate": 1.5950920245398772e-05, |
|
"loss": 0.9829, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.24907749077490776, |
|
"grad_norm": 1.865861850010034, |
|
"learning_rate": 1.656441717791411e-05, |
|
"loss": 0.9915, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.25830258302583026, |
|
"grad_norm": 1.9529698824708406, |
|
"learning_rate": 1.717791411042945e-05, |
|
"loss": 0.9831, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.26752767527675275, |
|
"grad_norm": 1.8749039852563243, |
|
"learning_rate": 1.7791411042944788e-05, |
|
"loss": 0.9842, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2767527675276753, |
|
"grad_norm": 1.4867806820095497, |
|
"learning_rate": 1.8404907975460123e-05, |
|
"loss": 0.9859, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2859778597785978, |
|
"grad_norm": 2.1169911338934644, |
|
"learning_rate": 1.9018404907975462e-05, |
|
"loss": 0.9771, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2952029520295203, |
|
"grad_norm": 1.5398155481235816, |
|
"learning_rate": 1.96319018404908e-05, |
|
"loss": 0.9817, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3044280442804428, |
|
"grad_norm": 1.7130250807487832, |
|
"learning_rate": 1.9999907776750355e-05, |
|
"loss": 0.9997, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.31365313653136534, |
|
"grad_norm": 2.1366171045520383, |
|
"learning_rate": 1.9998870284726968e-05, |
|
"loss": 1.0004, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32287822878228783, |
|
"grad_norm": 2.01400597362679, |
|
"learning_rate": 1.9996680141616956e-05, |
|
"loss": 0.9937, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.33210332103321033, |
|
"grad_norm": 1.925569839756876, |
|
"learning_rate": 1.9993337599895925e-05, |
|
"loss": 0.9939, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3413284132841328, |
|
"grad_norm": 1.4590251035585917, |
|
"learning_rate": 1.998884304488584e-05, |
|
"loss": 0.9982, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.3505535055350554, |
|
"grad_norm": 1.531094729781709, |
|
"learning_rate": 1.998319699471061e-05, |
|
"loss": 0.9925, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.35977859778597787, |
|
"grad_norm": 1.9624667257758441, |
|
"learning_rate": 1.997640010023634e-05, |
|
"loss": 0.9765, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"grad_norm": 1.6943282928766075, |
|
"learning_rate": 1.9968453144996345e-05, |
|
"loss": 0.9962, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"eval_loss": 1.0030262470245361, |
|
"eval_runtime": 518.1283, |
|
"eval_samples_per_second": 29.626, |
|
"eval_steps_per_second": 0.116, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37822878228782286, |
|
"grad_norm": 3.0469311990676857, |
|
"learning_rate": 1.9959357045100764e-05, |
|
"loss": 0.9947, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3874538745387454, |
|
"grad_norm": 2.2551326104892864, |
|
"learning_rate": 1.9949112849131005e-05, |
|
"loss": 1.0023, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3966789667896679, |
|
"grad_norm": 1.8771683509279502, |
|
"learning_rate": 1.993772173801884e-05, |
|
"loss": 0.9934, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.4059040590405904, |
|
"grad_norm": 1.8016877222967922, |
|
"learning_rate": 1.992518502491028e-05, |
|
"loss": 0.9807, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4151291512915129, |
|
"grad_norm": 1.4456497466009737, |
|
"learning_rate": 1.9911504155014187e-05, |
|
"loss": 0.9926, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.42435424354243545, |
|
"grad_norm": 1.5156073716841811, |
|
"learning_rate": 1.989668070543569e-05, |
|
"loss": 0.9766, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.43357933579335795, |
|
"grad_norm": 1.3959824735787207, |
|
"learning_rate": 1.9880716384994355e-05, |
|
"loss": 0.9964, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.44280442804428044, |
|
"grad_norm": 1.4724192694561282, |
|
"learning_rate": 1.9863613034027224e-05, |
|
"loss": 0.9942, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.45202952029520294, |
|
"grad_norm": 2.064409139190994, |
|
"learning_rate": 1.9845372624176646e-05, |
|
"loss": 1.0103, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4612546125461255, |
|
"grad_norm": 2.190902421105104, |
|
"learning_rate": 1.982599725816299e-05, |
|
"loss": 1.0075, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.470479704797048, |
|
"grad_norm": 1.9443583169417478, |
|
"learning_rate": 1.9805489169542245e-05, |
|
"loss": 0.9971, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.4797047970479705, |
|
"grad_norm": 1.553791831408308, |
|
"learning_rate": 1.978385072244857e-05, |
|
"loss": 0.9992, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.488929889298893, |
|
"grad_norm": 1.4174068635451451, |
|
"learning_rate": 1.9761084411321706e-05, |
|
"loss": 0.9793, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.4981549815498155, |
|
"grad_norm": 1.4969414214930414, |
|
"learning_rate": 1.9737192860619477e-05, |
|
"loss": 0.9791, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.507380073800738, |
|
"grad_norm": 1.4025421975340602, |
|
"learning_rate": 1.971217882451521e-05, |
|
"loss": 0.9796, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5166051660516605, |
|
"grad_norm": 1.4448369862138994, |
|
"learning_rate": 1.9686045186580258e-05, |
|
"loss": 0.9884, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.525830258302583, |
|
"grad_norm": 2.0639483249182464, |
|
"learning_rate": 1.9658794959451583e-05, |
|
"loss": 0.9831, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5350553505535055, |
|
"grad_norm": 1.6048970102781592, |
|
"learning_rate": 1.9630431284484447e-05, |
|
"loss": 0.9849, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.544280442804428, |
|
"grad_norm": 1.4540480684938577, |
|
"learning_rate": 1.960095743139033e-05, |
|
"loss": 0.9796, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"grad_norm": 1.424947900669971, |
|
"learning_rate": 1.957037679785994e-05, |
|
"loss": 0.9917, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"eval_loss": 1.0008341073989868, |
|
"eval_runtime": 513.1068, |
|
"eval_samples_per_second": 29.916, |
|
"eval_steps_per_second": 0.117, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5627306273062731, |
|
"grad_norm": 1.2480517696242786, |
|
"learning_rate": 1.953869290917158e-05, |
|
"loss": 0.9943, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5719557195571956, |
|
"grad_norm": 1.191133450390735, |
|
"learning_rate": 1.9505909417784758e-05, |
|
"loss": 0.9899, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5811808118081181, |
|
"grad_norm": 1.1766418475997753, |
|
"learning_rate": 1.9472030102919102e-05, |
|
"loss": 0.9883, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5904059040590406, |
|
"grad_norm": 1.2121897211885717, |
|
"learning_rate": 1.9437058870118745e-05, |
|
"loss": 1.0037, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5996309963099631, |
|
"grad_norm": 1.2903187102851559, |
|
"learning_rate": 1.940099975080207e-05, |
|
"loss": 0.9892, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6088560885608856, |
|
"grad_norm": 1.4260318993897811, |
|
"learning_rate": 1.9363856901796984e-05, |
|
"loss": 0.9896, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6180811808118081, |
|
"grad_norm": 1.324489901337969, |
|
"learning_rate": 1.9325634604861728e-05, |
|
"loss": 0.9978, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6273062730627307, |
|
"grad_norm": 1.275426852454915, |
|
"learning_rate": 1.9286337266191295e-05, |
|
"loss": 0.993, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6365313653136532, |
|
"grad_norm": 1.329213272796139, |
|
"learning_rate": 1.9245969415909464e-05, |
|
"loss": 0.9879, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6457564575645757, |
|
"grad_norm": 1.4085398606096227, |
|
"learning_rate": 1.9204535707546602e-05, |
|
"loss": 0.9869, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6549815498154982, |
|
"grad_norm": 1.1848936755869721, |
|
"learning_rate": 1.916204091750321e-05, |
|
"loss": 0.9726, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6642066420664207, |
|
"grad_norm": 1.2968309154541056, |
|
"learning_rate": 1.9118489944499287e-05, |
|
"loss": 0.9902, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6734317343173432, |
|
"grad_norm": 1.2286246913756114, |
|
"learning_rate": 1.907388780900964e-05, |
|
"loss": 0.9811, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6826568265682657, |
|
"grad_norm": 1.2591567733071325, |
|
"learning_rate": 1.902823965268513e-05, |
|
"loss": 0.9858, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6918819188191881, |
|
"grad_norm": 1.4378514619406175, |
|
"learning_rate": 1.8981550737759932e-05, |
|
"loss": 0.9828, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7011070110701108, |
|
"grad_norm": 1.497308547977116, |
|
"learning_rate": 1.8933826446444933e-05, |
|
"loss": 0.9892, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7103321033210332, |
|
"grad_norm": 1.1745393096620436, |
|
"learning_rate": 1.888507228030729e-05, |
|
"loss": 0.9859, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7195571955719557, |
|
"grad_norm": 1.2233160586824499, |
|
"learning_rate": 1.8835293859636177e-05, |
|
"loss": 0.9763, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7287822878228782, |
|
"grad_norm": 1.3127902536541989, |
|
"learning_rate": 1.8784496922794947e-05, |
|
"loss": 0.981, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"grad_norm": 1.3089866347753676, |
|
"learning_rate": 1.873268732555957e-05, |
|
"loss": 0.9652, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"eval_loss": 0.993894636631012, |
|
"eval_runtime": 513.7147, |
|
"eval_samples_per_second": 29.88, |
|
"eval_steps_per_second": 0.117, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7472324723247232, |
|
"grad_norm": 1.3088189271034285, |
|
"learning_rate": 1.8679871040443632e-05, |
|
"loss": 1.0048, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7564575645756457, |
|
"grad_norm": 1.2954577066196238, |
|
"learning_rate": 1.8626054156009807e-05, |
|
"loss": 0.9927, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7656826568265682, |
|
"grad_norm": 1.317981053662398, |
|
"learning_rate": 1.8571242876167995e-05, |
|
"loss": 0.9752, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7749077490774908, |
|
"grad_norm": 1.4156756831610378, |
|
"learning_rate": 1.851544351946014e-05, |
|
"loss": 0.9945, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7841328413284133, |
|
"grad_norm": 1.1285773664771428, |
|
"learning_rate": 1.845866251833183e-05, |
|
"loss": 0.9708, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.7933579335793358, |
|
"grad_norm": 1.2640468813011223, |
|
"learning_rate": 1.8400906418390808e-05, |
|
"loss": 0.9757, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8025830258302583, |
|
"grad_norm": 1.288546177133416, |
|
"learning_rate": 1.834218187765237e-05, |
|
"loss": 0.976, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8118081180811808, |
|
"grad_norm": 1.3086160465192265, |
|
"learning_rate": 1.8282495665771864e-05, |
|
"loss": 0.9761, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8210332103321033, |
|
"grad_norm": 1.1919282548241303, |
|
"learning_rate": 1.8221854663264294e-05, |
|
"loss": 0.9718, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.8302583025830258, |
|
"grad_norm": 1.2454331164701038, |
|
"learning_rate": 1.8160265860711134e-05, |
|
"loss": 0.9842, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8394833948339483, |
|
"grad_norm": 1.183454477783249, |
|
"learning_rate": 1.8097736357954487e-05, |
|
"loss": 0.9705, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8487084870848709, |
|
"grad_norm": 1.1394535207411802, |
|
"learning_rate": 1.8034273363278615e-05, |
|
"loss": 0.9751, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8579335793357934, |
|
"grad_norm": 1.1866949984179949, |
|
"learning_rate": 1.7969884192578977e-05, |
|
"loss": 0.9749, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8671586715867159, |
|
"grad_norm": 1.299660479182102, |
|
"learning_rate": 1.7904576268518886e-05, |
|
"loss": 0.9598, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8763837638376384, |
|
"grad_norm": 1.2221383874437446, |
|
"learning_rate": 1.783835711967382e-05, |
|
"loss": 0.9842, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.8856088560885609, |
|
"grad_norm": 1.2535423952991984, |
|
"learning_rate": 1.7771234379663545e-05, |
|
"loss": 0.9641, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8948339483394834, |
|
"grad_norm": 1.4654400132426395, |
|
"learning_rate": 1.770321578627213e-05, |
|
"loss": 0.9784, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9040590405904059, |
|
"grad_norm": 1.3747052246285973, |
|
"learning_rate": 1.763430918055595e-05, |
|
"loss": 0.9694, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9132841328413284, |
|
"grad_norm": 1.1551950486505687, |
|
"learning_rate": 1.756452250593979e-05, |
|
"loss": 0.9727, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.922509225092251, |
|
"grad_norm": 1.128236535385729, |
|
"learning_rate": 1.7493863807301116e-05, |
|
"loss": 0.9666, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.922509225092251, |
|
"eval_loss": 0.9816026091575623, |
|
"eval_runtime": 517.2137, |
|
"eval_samples_per_second": 29.678, |
|
"eval_steps_per_second": 0.116, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9317343173431735, |
|
"grad_norm": 1.230218009681161, |
|
"learning_rate": 1.74223412300427e-05, |
|
"loss": 0.9769, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.940959409594096, |
|
"grad_norm": 1.1847589898088133, |
|
"learning_rate": 1.7349963019153638e-05, |
|
"loss": 0.9628, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9501845018450185, |
|
"grad_norm": 1.2246308831747907, |
|
"learning_rate": 1.7276737518258865e-05, |
|
"loss": 0.9602, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.959409594095941, |
|
"grad_norm": 1.1390750572317663, |
|
"learning_rate": 1.7202673168657318e-05, |
|
"loss": 0.9627, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9686346863468634, |
|
"grad_norm": 1.1728205351456946, |
|
"learning_rate": 1.7127778508348858e-05, |
|
"loss": 0.9714, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.977859778597786, |
|
"grad_norm": 1.2796699310011739, |
|
"learning_rate": 1.7052062171050008e-05, |
|
"loss": 0.967, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9870848708487084, |
|
"grad_norm": 1.1205342517216532, |
|
"learning_rate": 1.6975532885198678e-05, |
|
"loss": 0.9663, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.996309963099631, |
|
"grad_norm": 1.185279277131673, |
|
"learning_rate": 1.6898199472947972e-05, |
|
"loss": 0.9581, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0055350553505535, |
|
"grad_norm": 3.007398366081561, |
|
"learning_rate": 1.6820070849149174e-05, |
|
"loss": 0.8519, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.014760147601476, |
|
"grad_norm": 2.1038299784593337, |
|
"learning_rate": 1.6741156020324086e-05, |
|
"loss": 0.7509, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0239852398523985, |
|
"grad_norm": 1.5701183943228265, |
|
"learning_rate": 1.6661464083626734e-05, |
|
"loss": 0.7453, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.033210332103321, |
|
"grad_norm": 1.2911074026361753, |
|
"learning_rate": 1.6581004225794715e-05, |
|
"loss": 0.7391, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0424354243542435, |
|
"grad_norm": 1.5938907876285198, |
|
"learning_rate": 1.649978572209012e-05, |
|
"loss": 0.7347, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.051660516605166, |
|
"grad_norm": 1.3495506131008623, |
|
"learning_rate": 1.6417817935230318e-05, |
|
"loss": 0.7396, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.0608856088560885, |
|
"grad_norm": 1.2781771587882627, |
|
"learning_rate": 1.6335110314308654e-05, |
|
"loss": 0.7305, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.070110701107011, |
|
"grad_norm": 1.5798733908227265, |
|
"learning_rate": 1.6251672393705155e-05, |
|
"loss": 0.7365, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0793357933579335, |
|
"grad_norm": 1.416304183876239, |
|
"learning_rate": 1.6167513791987423e-05, |
|
"loss": 0.7373, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.088560885608856, |
|
"grad_norm": 1.3677150489575043, |
|
"learning_rate": 1.6082644210801846e-05, |
|
"loss": 0.7299, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.0977859778597785, |
|
"grad_norm": 1.3506677105351055, |
|
"learning_rate": 1.5997073433755187e-05, |
|
"loss": 0.7426, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"grad_norm": 1.461155474048458, |
|
"learning_rate": 1.5910811325286768e-05, |
|
"loss": 0.7366, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"eval_loss": 0.9852360486984253, |
|
"eval_runtime": 516.2338, |
|
"eval_samples_per_second": 29.735, |
|
"eval_steps_per_second": 0.116, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1162361623616237, |
|
"grad_norm": 1.2999195127889172, |
|
"learning_rate": 1.582386782953129e-05, |
|
"loss": 0.7351, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.1254612546125462, |
|
"grad_norm": 1.5599221554130673, |
|
"learning_rate": 1.5736252969172522e-05, |
|
"loss": 0.7335, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.1346863468634687, |
|
"grad_norm": 1.30824219510555, |
|
"learning_rate": 1.5647976844287884e-05, |
|
"loss": 0.7321, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.1439114391143912, |
|
"grad_norm": 1.3590431139669035, |
|
"learning_rate": 1.5559049631184136e-05, |
|
"loss": 0.7294, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1531365313653137, |
|
"grad_norm": 1.5685872513743657, |
|
"learning_rate": 1.5469481581224274e-05, |
|
"loss": 0.7372, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.1623616236162362, |
|
"grad_norm": 1.4194329169102744, |
|
"learning_rate": 1.5379283019645757e-05, |
|
"loss": 0.7423, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1715867158671587, |
|
"grad_norm": 1.8516238628155155, |
|
"learning_rate": 1.5288464344370267e-05, |
|
"loss": 0.7389, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.1808118081180812, |
|
"grad_norm": 1.3787465939384576, |
|
"learning_rate": 1.5197036024805018e-05, |
|
"loss": 0.7277, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.1900369003690037, |
|
"grad_norm": 1.2679935699299498, |
|
"learning_rate": 1.5105008600635888e-05, |
|
"loss": 0.7251, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.1992619926199262, |
|
"grad_norm": 1.3661565990701046, |
|
"learning_rate": 1.5012392680612408e-05, |
|
"loss": 0.7348, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2084870848708487, |
|
"grad_norm": 1.380476117633752, |
|
"learning_rate": 1.4919198941324813e-05, |
|
"loss": 0.733, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.2177121771217712, |
|
"grad_norm": 1.301175007422796, |
|
"learning_rate": 1.4825438125973263e-05, |
|
"loss": 0.7331, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2269372693726937, |
|
"grad_norm": 1.3531205842843421, |
|
"learning_rate": 1.4731121043129392e-05, |
|
"loss": 0.7379, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.2361623616236161, |
|
"grad_norm": 1.444864127952419, |
|
"learning_rate": 1.4636258565490304e-05, |
|
"loss": 0.739, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2453874538745389, |
|
"grad_norm": 1.2863648775710423, |
|
"learning_rate": 1.4540861628625207e-05, |
|
"loss": 0.7368, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.2546125461254611, |
|
"grad_norm": 1.2200332099647682, |
|
"learning_rate": 1.444494122971476e-05, |
|
"loss": 0.7343, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.2638376383763839, |
|
"grad_norm": 1.3714375121406106, |
|
"learning_rate": 1.4348508426283342e-05, |
|
"loss": 0.7391, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.2730627306273063, |
|
"grad_norm": 1.2638691361743832, |
|
"learning_rate": 1.4251574334924395e-05, |
|
"loss": 0.7397, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2822878228782288, |
|
"grad_norm": 1.4011111864399106, |
|
"learning_rate": 1.4154150130018867e-05, |
|
"loss": 0.7374, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.2915129151291513, |
|
"grad_norm": 1.2912923761278596, |
|
"learning_rate": 1.4056247042447096e-05, |
|
"loss": 0.7228, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.2915129151291513, |
|
"eval_loss": 0.9835454225540161, |
|
"eval_runtime": 517.9285, |
|
"eval_samples_per_second": 29.637, |
|
"eval_steps_per_second": 0.116, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3007380073800738, |
|
"grad_norm": 1.5854901671726367, |
|
"learning_rate": 1.3957876358294115e-05, |
|
"loss": 0.7296, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.3099630996309963, |
|
"grad_norm": 1.38846996136312, |
|
"learning_rate": 1.385904941754862e-05, |
|
"loss": 0.7257, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3191881918819188, |
|
"grad_norm": 1.5297133474564781, |
|
"learning_rate": 1.375977761279571e-05, |
|
"loss": 0.7352, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.3284132841328413, |
|
"grad_norm": 1.287259224142701, |
|
"learning_rate": 1.366007238790358e-05, |
|
"loss": 0.7301, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3376383763837638, |
|
"grad_norm": 1.2884194224179173, |
|
"learning_rate": 1.3559945236704286e-05, |
|
"loss": 0.7383, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.3468634686346863, |
|
"grad_norm": 1.3779553004575515, |
|
"learning_rate": 1.3459407701668762e-05, |
|
"loss": 0.7313, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.3560885608856088, |
|
"grad_norm": 1.5349656095564503, |
|
"learning_rate": 1.3358471372576229e-05, |
|
"loss": 0.7334, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.3653136531365313, |
|
"grad_norm": 1.3570612666553503, |
|
"learning_rate": 1.3257147885178125e-05, |
|
"loss": 0.7253, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.3745387453874538, |
|
"grad_norm": 1.3514442377769267, |
|
"learning_rate": 1.3155448919856792e-05, |
|
"loss": 0.7375, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.3837638376383765, |
|
"grad_norm": 1.338752928401098, |
|
"learning_rate": 1.3053386200278963e-05, |
|
"loss": 0.7349, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3929889298892988, |
|
"grad_norm": 1.3943704063449442, |
|
"learning_rate": 1.2950971492044272e-05, |
|
"loss": 0.7338, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.4022140221402215, |
|
"grad_norm": 1.3567491078204894, |
|
"learning_rate": 1.2848216601328958e-05, |
|
"loss": 0.7385, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.4114391143911438, |
|
"grad_norm": 1.2556919848553412, |
|
"learning_rate": 1.2745133373524855e-05, |
|
"loss": 0.7457, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.4206642066420665, |
|
"grad_norm": 1.3027608934231716, |
|
"learning_rate": 1.2641733691873884e-05, |
|
"loss": 0.7342, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.429889298892989, |
|
"grad_norm": 1.2668132369825373, |
|
"learning_rate": 1.2538029476098175e-05, |
|
"loss": 0.7317, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.4391143911439115, |
|
"grad_norm": 1.2498842281077402, |
|
"learning_rate": 1.2434032681025986e-05, |
|
"loss": 0.732, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.448339483394834, |
|
"grad_norm": 1.221148464370588, |
|
"learning_rate": 1.2329755295213568e-05, |
|
"loss": 0.7168, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.4575645756457565, |
|
"grad_norm": 1.2029873246463332, |
|
"learning_rate": 1.2225209339563144e-05, |
|
"loss": 0.7299, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.466789667896679, |
|
"grad_norm": 1.2769506053242343, |
|
"learning_rate": 1.2120406865937174e-05, |
|
"loss": 0.7385, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"grad_norm": 1.5254063393209267, |
|
"learning_rate": 1.2015359955769021e-05, |
|
"loss": 0.7319, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"eval_loss": 0.9644125699996948, |
|
"eval_runtime": 512.8317, |
|
"eval_samples_per_second": 29.932, |
|
"eval_steps_per_second": 0.117, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.485239852398524, |
|
"grad_norm": 1.4657220418578245, |
|
"learning_rate": 1.1910080718670246e-05, |
|
"loss": 0.7234, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.4944649446494465, |
|
"grad_norm": 1.3333083489866098, |
|
"learning_rate": 1.1804581291034615e-05, |
|
"loss": 0.7314, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.503690036900369, |
|
"grad_norm": 1.3111534531304956, |
|
"learning_rate": 1.169887383463906e-05, |
|
"loss": 0.7212, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.5129151291512914, |
|
"grad_norm": 1.2536260067392955, |
|
"learning_rate": 1.1592970535241668e-05, |
|
"loss": 0.723, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.5221402214022142, |
|
"grad_norm": 1.239943596383526, |
|
"learning_rate": 1.1486883601176944e-05, |
|
"loss": 0.7315, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.5313653136531364, |
|
"grad_norm": 1.188861248391431, |
|
"learning_rate": 1.1380625261948458e-05, |
|
"loss": 0.7301, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.5405904059040592, |
|
"grad_norm": 1.247650108627454, |
|
"learning_rate": 1.127420776681905e-05, |
|
"loss": 0.7202, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.5498154981549814, |
|
"grad_norm": 1.4048683840262912, |
|
"learning_rate": 1.1167643383398746e-05, |
|
"loss": 0.7247, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5590405904059041, |
|
"grad_norm": 1.2897015340446114, |
|
"learning_rate": 1.1060944396230583e-05, |
|
"loss": 0.7311, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.5682656826568264, |
|
"grad_norm": 1.21939417183643, |
|
"learning_rate": 1.0954123105374468e-05, |
|
"loss": 0.7249, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.5774907749077491, |
|
"grad_norm": 1.2309319468475195, |
|
"learning_rate": 1.0847191824989252e-05, |
|
"loss": 0.7298, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.5867158671586716, |
|
"grad_norm": 1.2218109998078897, |
|
"learning_rate": 1.0740162881913165e-05, |
|
"loss": 0.7223, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.5959409594095941, |
|
"grad_norm": 1.4183791452745522, |
|
"learning_rate": 1.0633048614242817e-05, |
|
"loss": 0.7359, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.6051660516605166, |
|
"grad_norm": 1.2210289040303786, |
|
"learning_rate": 1.0525861369910877e-05, |
|
"loss": 0.7302, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6143911439114391, |
|
"grad_norm": 1.3175608261808258, |
|
"learning_rate": 1.0418613505262623e-05, |
|
"loss": 0.7226, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.6236162361623616, |
|
"grad_norm": 1.3018239201611663, |
|
"learning_rate": 1.0311317383631532e-05, |
|
"loss": 0.7227, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.632841328413284, |
|
"grad_norm": 1.1647552351758403, |
|
"learning_rate": 1.0203985373914056e-05, |
|
"loss": 0.7204, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.6420664206642066, |
|
"grad_norm": 1.210717925144679, |
|
"learning_rate": 1.0096629849143757e-05, |
|
"loss": 0.7115, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.651291512915129, |
|
"grad_norm": 1.1959081633999162, |
|
"learning_rate": 9.989263185064974e-06, |
|
"loss": 0.7164, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.6605166051660518, |
|
"grad_norm": 1.1679984043624778, |
|
"learning_rate": 9.881897758706155e-06, |
|
"loss": 0.7177, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6605166051660518, |
|
"eval_loss": 0.9529369473457336, |
|
"eval_runtime": 516.4151, |
|
"eval_samples_per_second": 29.724, |
|
"eval_steps_per_second": 0.116, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.669741697416974, |
|
"grad_norm": 1.1784785526719634, |
|
"learning_rate": 9.77454594695308e-06, |
|
"loss": 0.7274, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.6789667896678968, |
|
"grad_norm": 1.1964871209199903, |
|
"learning_rate": 9.667220125122044e-06, |
|
"loss": 0.7119, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.688191881918819, |
|
"grad_norm": 1.173031357661576, |
|
"learning_rate": 9.559932665533291e-06, |
|
"loss": 0.7134, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.6974169741697418, |
|
"grad_norm": 1.2312863536042935, |
|
"learning_rate": 9.452695936084728e-06, |
|
"loss": 0.7144, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.706642066420664, |
|
"grad_norm": 1.2013984113686338, |
|
"learning_rate": 9.345522298826177e-06, |
|
"loss": 0.7146, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.7158671586715868, |
|
"grad_norm": 1.1285995450468198, |
|
"learning_rate": 9.238424108534333e-06, |
|
"loss": 0.7126, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.725092250922509, |
|
"grad_norm": 1.1727971825533714, |
|
"learning_rate": 9.131413711288485e-06, |
|
"loss": 0.7173, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.7343173431734318, |
|
"grad_norm": 1.198238879588798, |
|
"learning_rate": 9.024503443047318e-06, |
|
"loss": 0.7186, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.7435424354243543, |
|
"grad_norm": 1.2092538734459182, |
|
"learning_rate": 8.917705628226823e-06, |
|
"loss": 0.7064, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.7527675276752768, |
|
"grad_norm": 1.1850959753551464, |
|
"learning_rate": 8.81103257827957e-06, |
|
"loss": 0.7196, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7619926199261993, |
|
"grad_norm": 1.1849846233150378, |
|
"learning_rate": 8.704496590275479e-06, |
|
"loss": 0.7181, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.7712177121771218, |
|
"grad_norm": 1.1192440025321218, |
|
"learning_rate": 8.598109945484208e-06, |
|
"loss": 0.7127, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7804428044280443, |
|
"grad_norm": 1.185810311236685, |
|
"learning_rate": 8.491884907959426e-06, |
|
"loss": 0.7092, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.7896678966789668, |
|
"grad_norm": 1.1653670987242044, |
|
"learning_rate": 8.385833723125006e-06, |
|
"loss": 0.7115, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.7988929889298892, |
|
"grad_norm": 1.2928934171032893, |
|
"learning_rate": 8.279968616363417e-06, |
|
"loss": 0.7116, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.8081180811808117, |
|
"grad_norm": 1.1749460908752425, |
|
"learning_rate": 8.174301791606384e-06, |
|
"loss": 0.7159, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8173431734317345, |
|
"grad_norm": 1.2968530721458553, |
|
"learning_rate": 8.06884542992806e-06, |
|
"loss": 0.7022, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.8265682656826567, |
|
"grad_norm": 1.214409149915767, |
|
"learning_rate": 7.963611688140814e-06, |
|
"loss": 0.705, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.8357933579335795, |
|
"grad_norm": 1.1751136227927774, |
|
"learning_rate": 7.858612697393792e-06, |
|
"loss": 0.7166, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"grad_norm": 1.2707314516132002, |
|
"learning_rate": 7.753860561774495e-06, |
|
"loss": 0.7095, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"eval_loss": 0.9393758773803711, |
|
"eval_runtime": 524.5955, |
|
"eval_samples_per_second": 29.261, |
|
"eval_steps_per_second": 0.114, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8542435424354244, |
|
"grad_norm": 1.2737022554438457, |
|
"learning_rate": 7.649367356913422e-06, |
|
"loss": 0.7133, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.8634686346863467, |
|
"grad_norm": 1.2146494230865963, |
|
"learning_rate": 7.545145128592009e-06, |
|
"loss": 0.7162, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8726937269372694, |
|
"grad_norm": 1.2563305762066708, |
|
"learning_rate": 7.441205891354037e-06, |
|
"loss": 0.7128, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.881918819188192, |
|
"grad_norm": 1.2400110075293442, |
|
"learning_rate": 7.337561627120591e-06, |
|
"loss": 0.7059, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.8911439114391144, |
|
"grad_norm": 1.2653437150866325, |
|
"learning_rate": 7.234224283808832e-06, |
|
"loss": 0.7058, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.900369003690037, |
|
"grad_norm": 1.1646651085367645, |
|
"learning_rate": 7.131205773954636e-06, |
|
"loss": 0.706, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.9095940959409594, |
|
"grad_norm": 1.1518551233990397, |
|
"learning_rate": 7.028517973339361e-06, |
|
"loss": 0.7138, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.918819188191882, |
|
"grad_norm": 1.223360815231687, |
|
"learning_rate": 6.926172719620827e-06, |
|
"loss": 0.697, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9280442804428044, |
|
"grad_norm": 1.2198079984824493, |
|
"learning_rate": 6.824181810968675e-06, |
|
"loss": 0.7004, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.937269372693727, |
|
"grad_norm": 1.176959664107674, |
|
"learning_rate": 6.722557004704322e-06, |
|
"loss": 0.7082, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.9464944649446494, |
|
"grad_norm": 1.1844320699248965, |
|
"learning_rate": 6.62131001594558e-06, |
|
"loss": 0.7043, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.9557195571955721, |
|
"grad_norm": 1.148753422424237, |
|
"learning_rate": 6.520452516256157e-06, |
|
"loss": 0.6949, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9649446494464944, |
|
"grad_norm": 1.1572577267352544, |
|
"learning_rate": 6.419996132300203e-06, |
|
"loss": 0.7071, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.974169741697417, |
|
"grad_norm": 1.2001014830908205, |
|
"learning_rate": 6.319952444501984e-06, |
|
"loss": 0.7103, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.9833948339483394, |
|
"grad_norm": 1.4841715888010063, |
|
"learning_rate": 6.220332985710936e-06, |
|
"loss": 0.694, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.992619926199262, |
|
"grad_norm": 1.4256755997357629, |
|
"learning_rate": 6.121149239872151e-06, |
|
"loss": 0.6964, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.0018450184501844, |
|
"grad_norm": 4.270149025567802, |
|
"learning_rate": 6.0224126407025616e-06, |
|
"loss": 0.6543, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.011070110701107, |
|
"grad_norm": 2.6490744221351044, |
|
"learning_rate": 5.924134570372863e-06, |
|
"loss": 0.4529, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.0202952029520294, |
|
"grad_norm": 2.2645999605838227, |
|
"learning_rate": 5.826326358195391e-06, |
|
"loss": 0.4559, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.029520295202952, |
|
"grad_norm": 1.5705400512864462, |
|
"learning_rate": 5.728999279318131e-06, |
|
"loss": 0.4465, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.029520295202952, |
|
"eval_loss": 0.9917108416557312, |
|
"eval_runtime": 517.7798, |
|
"eval_samples_per_second": 29.646, |
|
"eval_steps_per_second": 0.116, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.0387453874538743, |
|
"grad_norm": 1.6254518927847355, |
|
"learning_rate": 5.632164553424904e-06, |
|
"loss": 0.4353, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 2.047970479704797, |
|
"grad_norm": 14.583137561537578, |
|
"learning_rate": 5.5358333434420054e-06, |
|
"loss": 0.4424, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.0571955719557193, |
|
"grad_norm": 1.447005279720627, |
|
"learning_rate": 5.440016754251364e-06, |
|
"loss": 0.4423, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 2.066420664206642, |
|
"grad_norm": 1.4595204240426687, |
|
"learning_rate": 5.344725831410369e-06, |
|
"loss": 0.4384, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.0756457564575648, |
|
"grad_norm": 1.3190598016289843, |
|
"learning_rate": 5.24997155987859e-06, |
|
"loss": 0.4368, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 2.084870848708487, |
|
"grad_norm": 1.322338946677976, |
|
"learning_rate": 5.155764862751427e-06, |
|
"loss": 0.4392, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.0940959409594098, |
|
"grad_norm": 1.3472757392525208, |
|
"learning_rate": 5.062116600000933e-06, |
|
"loss": 0.4297, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 2.103321033210332, |
|
"grad_norm": 1.2895577097092337, |
|
"learning_rate": 4.969037567223881e-06, |
|
"loss": 0.4413, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.1125461254612548, |
|
"grad_norm": 1.3471090116973288, |
|
"learning_rate": 4.876538494397274e-06, |
|
"loss": 0.4317, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 2.121771217712177, |
|
"grad_norm": 1.3092628602239211, |
|
"learning_rate": 4.784630044641435e-06, |
|
"loss": 0.4509, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.1309963099630997, |
|
"grad_norm": 1.344809966917295, |
|
"learning_rate": 4.6933228129907395e-06, |
|
"loss": 0.4375, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 2.140221402214022, |
|
"grad_norm": 1.3014430618254322, |
|
"learning_rate": 4.602627325172279e-06, |
|
"loss": 0.4424, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.1494464944649447, |
|
"grad_norm": 1.3672933559982345, |
|
"learning_rate": 4.512554036392448e-06, |
|
"loss": 0.4419, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 2.158671586715867, |
|
"grad_norm": 1.3446667993737584, |
|
"learning_rate": 4.423113330131708e-06, |
|
"loss": 0.4303, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.1678966789667897, |
|
"grad_norm": 1.3257443131859206, |
|
"learning_rate": 4.33431551694758e-06, |
|
"loss": 0.4369, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.177121771217712, |
|
"grad_norm": 1.3655737456565726, |
|
"learning_rate": 4.246170833286075e-06, |
|
"loss": 0.4293, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.1863468634686347, |
|
"grad_norm": 1.3298593125645854, |
|
"learning_rate": 4.1586894403016576e-06, |
|
"loss": 0.439, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 2.195571955719557, |
|
"grad_norm": 1.32505780264794, |
|
"learning_rate": 4.071881422685877e-06, |
|
"loss": 0.4285, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.2047970479704797, |
|
"grad_norm": 1.3004312804341762, |
|
"learning_rate": 3.985756787504837e-06, |
|
"loss": 0.4353, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"grad_norm": 1.3177561620055287, |
|
"learning_rate": 3.9003254630455775e-06, |
|
"loss": 0.4341, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"eval_loss": 0.9978848695755005, |
|
"eval_runtime": 514.7843, |
|
"eval_samples_per_second": 29.818, |
|
"eval_steps_per_second": 0.117, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2232472324723247, |
|
"grad_norm": 1.3438896554856818, |
|
"learning_rate": 3.815597297671578e-06, |
|
"loss": 0.4336, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 2.2324723247232474, |
|
"grad_norm": 1.2896295540334282, |
|
"learning_rate": 3.731582058687462e-06, |
|
"loss": 0.435, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.2416974169741697, |
|
"grad_norm": 1.358035688644123, |
|
"learning_rate": 3.6482894312130146e-06, |
|
"loss": 0.4324, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.2509225092250924, |
|
"grad_norm": 1.312197292051631, |
|
"learning_rate": 3.565729017066729e-06, |
|
"loss": 0.4315, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.2601476014760147, |
|
"grad_norm": 1.3227121347141655, |
|
"learning_rate": 3.483910333658913e-06, |
|
"loss": 0.4364, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.2693726937269374, |
|
"grad_norm": 1.3256090212374516, |
|
"learning_rate": 3.402842812894529e-06, |
|
"loss": 0.4356, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.2785977859778597, |
|
"grad_norm": 1.317549750635349, |
|
"learning_rate": 3.3225358000859287e-06, |
|
"loss": 0.4349, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 2.2878228782287824, |
|
"grad_norm": 1.2612830347481554, |
|
"learning_rate": 3.2429985528755127e-06, |
|
"loss": 0.4306, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.2970479704797047, |
|
"grad_norm": 1.3450073317730427, |
|
"learning_rate": 3.1642402401685557e-06, |
|
"loss": 0.4361, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.3062730627306274, |
|
"grad_norm": 1.3431835139445107, |
|
"learning_rate": 3.0862699410762043e-06, |
|
"loss": 0.4393, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.3154981549815496, |
|
"grad_norm": 1.3379126436430948, |
|
"learning_rate": 3.0090966438688774e-06, |
|
"loss": 0.4306, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 2.3247232472324724, |
|
"grad_norm": 1.2809064467748859, |
|
"learning_rate": 2.9327292449401067e-06, |
|
"loss": 0.4416, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.3339483394833946, |
|
"grad_norm": 1.3548015164880183, |
|
"learning_rate": 2.8571765477809645e-06, |
|
"loss": 0.4338, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 2.3431734317343174, |
|
"grad_norm": 1.320665427008479, |
|
"learning_rate": 2.7824472619652386e-06, |
|
"loss": 0.4361, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.35239852398524, |
|
"grad_norm": 1.3096646770487193, |
|
"learning_rate": 2.7085500021453838e-06, |
|
"loss": 0.4294, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.3616236162361623, |
|
"grad_norm": 1.2800372167523524, |
|
"learning_rate": 2.635493287059464e-06, |
|
"loss": 0.4299, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.3708487084870846, |
|
"grad_norm": 1.303993086907089, |
|
"learning_rate": 2.563285538549104e-06, |
|
"loss": 0.4361, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 2.3800738007380073, |
|
"grad_norm": 1.2720280407092956, |
|
"learning_rate": 2.491935080588658e-06, |
|
"loss": 0.4384, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.38929889298893, |
|
"grad_norm": 1.2941980810201439, |
|
"learning_rate": 2.421450138325625e-06, |
|
"loss": 0.4306, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 2.3985239852398523, |
|
"grad_norm": 1.2949495993502738, |
|
"learning_rate": 2.351838837132464e-06, |
|
"loss": 0.432, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.3985239852398523, |
|
"eval_loss": 0.9954376816749573, |
|
"eval_runtime": 519.9495, |
|
"eval_samples_per_second": 29.522, |
|
"eval_steps_per_second": 0.115, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.407749077490775, |
|
"grad_norm": 1.3018815365771563, |
|
"learning_rate": 2.283109201669936e-06, |
|
"loss": 0.4357, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.4169741697416973, |
|
"grad_norm": 1.2956106687686837, |
|
"learning_rate": 2.2152691549620155e-06, |
|
"loss": 0.4283, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.42619926199262, |
|
"grad_norm": 1.287230882437174, |
|
"learning_rate": 2.148326517482543e-06, |
|
"loss": 0.4303, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 2.4354243542435423, |
|
"grad_norm": 1.2592322120333668, |
|
"learning_rate": 2.0822890062537106e-06, |
|
"loss": 0.4366, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.444649446494465, |
|
"grad_norm": 1.3039469988205457, |
|
"learning_rate": 2.01716423395644e-06, |
|
"loss": 0.4317, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.4538745387453873, |
|
"grad_norm": 1.282772824972497, |
|
"learning_rate": 1.9529597080528207e-06, |
|
"loss": 0.4272, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.46309963099631, |
|
"grad_norm": 1.3227463435260074, |
|
"learning_rate": 1.8896828299206494e-06, |
|
"loss": 0.4256, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 2.4723247232472323, |
|
"grad_norm": 1.3607936617452498, |
|
"learning_rate": 1.8273408940002202e-06, |
|
"loss": 0.4389, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.481549815498155, |
|
"grad_norm": 1.2740801988744865, |
|
"learning_rate": 1.7659410869534466e-06, |
|
"loss": 0.4247, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 2.4907749077490777, |
|
"grad_norm": 1.2544315701192987, |
|
"learning_rate": 1.7054904868353717e-06, |
|
"loss": 0.4256, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.31550558585801, |
|
"learning_rate": 1.6459960622782466e-06, |
|
"loss": 0.428, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 2.5092250922509223, |
|
"grad_norm": 1.3030144767834306, |
|
"learning_rate": 1.587464671688187e-06, |
|
"loss": 0.4217, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.518450184501845, |
|
"grad_norm": 1.261812680015863, |
|
"learning_rate": 1.5299030624545563e-06, |
|
"loss": 0.4381, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 2.5276752767527677, |
|
"grad_norm": 1.3015065571944802, |
|
"learning_rate": 1.4733178701721262e-06, |
|
"loss": 0.4337, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.53690036900369, |
|
"grad_norm": 1.2805139778312684, |
|
"learning_rate": 1.4177156178761508e-06, |
|
"loss": 0.4313, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 2.5461254612546127, |
|
"grad_norm": 1.3271791125805354, |
|
"learning_rate": 1.363102715290402e-06, |
|
"loss": 0.4314, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.555350553505535, |
|
"grad_norm": 1.3155240192251205, |
|
"learning_rate": 1.3094854580882599e-06, |
|
"loss": 0.4298, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 2.5645756457564577, |
|
"grad_norm": 1.2884517504542843, |
|
"learning_rate": 1.2568700271669676e-06, |
|
"loss": 0.4315, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.57380073800738, |
|
"grad_norm": 1.2601572769871257, |
|
"learning_rate": 1.2052624879351105e-06, |
|
"loss": 0.4341, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"grad_norm": 1.283042988722646, |
|
"learning_rate": 1.1546687896133924e-06, |
|
"loss": 0.4301, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"eval_loss": 0.9943162798881531, |
|
"eval_runtime": 513.9906, |
|
"eval_samples_per_second": 29.864, |
|
"eval_steps_per_second": 0.117, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.592250922509225, |
|
"grad_norm": 1.269448040169663, |
|
"learning_rate": 1.1050947645488419e-06, |
|
"loss": 0.424, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 2.6014760147601477, |
|
"grad_norm": 1.291108826010762, |
|
"learning_rate": 1.0565461275424504e-06, |
|
"loss": 0.4288, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.61070110701107, |
|
"grad_norm": 1.246075371329031, |
|
"learning_rate": 1.0090284751903989e-06, |
|
"loss": 0.4308, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 2.6199261992619927, |
|
"grad_norm": 1.268331381912208, |
|
"learning_rate": 9.625472852388739e-07, |
|
"loss": 0.4274, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.6291512915129154, |
|
"grad_norm": 1.2558980878489436, |
|
"learning_rate": 9.171079159526186e-07, |
|
"loss": 0.4263, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 2.6383763837638377, |
|
"grad_norm": 1.2507458001549574, |
|
"learning_rate": 8.727156054972374e-07, |
|
"loss": 0.4364, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.64760147601476, |
|
"grad_norm": 1.2344093421817917, |
|
"learning_rate": 8.29375471335343e-07, |
|
"loss": 0.43, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 2.6568265682656826, |
|
"grad_norm": 1.2520176453134155, |
|
"learning_rate": 7.870925096366366e-07, |
|
"loss": 0.4298, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.6660516605166054, |
|
"grad_norm": 1.2874930933327957, |
|
"learning_rate": 7.458715947019468e-07, |
|
"loss": 0.4262, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 2.6752767527675276, |
|
"grad_norm": 1.2682188739552445, |
|
"learning_rate": 7.057174784013432e-07, |
|
"loss": 0.4339, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.6845018450184504, |
|
"grad_norm": 1.2828645340804818, |
|
"learning_rate": 6.666347896263326e-07, |
|
"loss": 0.4274, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 2.6937269372693726, |
|
"grad_norm": 1.2595258026091076, |
|
"learning_rate": 6.286280337562656e-07, |
|
"loss": 0.4303, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.7029520295202953, |
|
"grad_norm": 1.24521822647123, |
|
"learning_rate": 5.917015921389569e-07, |
|
"loss": 0.4288, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 2.7121771217712176, |
|
"grad_norm": 1.232445478302712, |
|
"learning_rate": 5.558597215856065e-07, |
|
"loss": 0.4285, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.7214022140221403, |
|
"grad_norm": 1.216057817991593, |
|
"learning_rate": 5.211065538800952e-07, |
|
"loss": 0.4208, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 2.7306273062730626, |
|
"grad_norm": 1.288524367589534, |
|
"learning_rate": 4.874460953026705e-07, |
|
"loss": 0.4255, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.7398523985239853, |
|
"grad_norm": 1.2332155213343263, |
|
"learning_rate": 4.548822261681107e-07, |
|
"loss": 0.423, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 2.7490774907749076, |
|
"grad_norm": 1.2278878382563285, |
|
"learning_rate": 4.2341870037841516e-07, |
|
"loss": 0.4291, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.7583025830258303, |
|
"grad_norm": 1.262898121860552, |
|
"learning_rate": 3.930591449900578e-07, |
|
"loss": 0.4247, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 2.767527675276753, |
|
"grad_norm": 1.2437619506416164, |
|
"learning_rate": 3.638070597958665e-07, |
|
"loss": 0.4361, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.767527675276753, |
|
"eval_loss": 0.9930853247642517, |
|
"eval_runtime": 516.1928, |
|
"eval_samples_per_second": 29.737, |
|
"eval_steps_per_second": 0.116, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.7767527675276753, |
|
"grad_norm": 1.2468366513522777, |
|
"learning_rate": 3.356658169215743e-07, |
|
"loss": 0.4282, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 2.7859778597785976, |
|
"grad_norm": 1.2336029324910027, |
|
"learning_rate": 3.0863866043708393e-07, |
|
"loss": 0.4267, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.7952029520295203, |
|
"grad_norm": 1.3330748292636831, |
|
"learning_rate": 2.8272870598250677e-07, |
|
"loss": 0.4281, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 2.804428044280443, |
|
"grad_norm": 1.2486193575900169, |
|
"learning_rate": 2.5793894040898384e-07, |
|
"loss": 0.4224, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.8136531365313653, |
|
"grad_norm": 1.235394179484528, |
|
"learning_rate": 2.3427222143438065e-07, |
|
"loss": 0.4184, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 2.8228782287822876, |
|
"grad_norm": 1.2913244981868073, |
|
"learning_rate": 2.117312773138458e-07, |
|
"loss": 0.4238, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.8321033210332103, |
|
"grad_norm": 1.2580451640594703, |
|
"learning_rate": 1.903187065253076e-07, |
|
"loss": 0.4274, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 2.841328413284133, |
|
"grad_norm": 1.262849856657073, |
|
"learning_rate": 1.7003697746992398e-07, |
|
"loss": 0.4242, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.8505535055350553, |
|
"grad_norm": 1.2336423601103856, |
|
"learning_rate": 1.5088842818752892e-07, |
|
"loss": 0.4338, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 2.859778597785978, |
|
"grad_norm": 1.279029201429549, |
|
"learning_rate": 1.3287526608711132e-07, |
|
"loss": 0.4247, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.8690036900369003, |
|
"grad_norm": 1.2569044993333771, |
|
"learning_rate": 1.1599956769234533e-07, |
|
"loss": 0.4167, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 2.878228782287823, |
|
"grad_norm": 1.229520630461672, |
|
"learning_rate": 1.0026327840221728e-07, |
|
"loss": 0.4182, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.8874538745387452, |
|
"grad_norm": 1.255986608003343, |
|
"learning_rate": 8.566821226675514e-08, |
|
"loss": 0.4294, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 2.896678966789668, |
|
"grad_norm": 1.2895814979486142, |
|
"learning_rate": 7.22160517779169e-08, |
|
"loss": 0.429, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.9059040590405907, |
|
"grad_norm": 1.2790109206046127, |
|
"learning_rate": 5.99083476756357e-08, |
|
"loss": 0.4261, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 2.915129151291513, |
|
"grad_norm": 1.2194809596900478, |
|
"learning_rate": 4.87465187690439e-08, |
|
"loss": 0.4211, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.9243542435424352, |
|
"grad_norm": 1.2665552740156838, |
|
"learning_rate": 3.873185177292737e-08, |
|
"loss": 0.4251, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 2.933579335793358, |
|
"grad_norm": 1.2812371627035533, |
|
"learning_rate": 2.9865501159387355e-08, |
|
"loss": 0.4282, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.9428044280442807, |
|
"grad_norm": 1.2399165066075877, |
|
"learning_rate": 2.214848902475808e-08, |
|
"loss": 0.4341, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"grad_norm": 1.2154194504631015, |
|
"learning_rate": 1.558170497178213e-08, |
|
"loss": 0.4256, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"eval_loss": 0.9934021830558777, |
|
"eval_runtime": 525.4852, |
|
"eval_samples_per_second": 29.211, |
|
"eval_steps_per_second": 0.114, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.961254612546125, |
|
"grad_norm": 1.2717521820081574, |
|
"learning_rate": 1.0165906007056914e-08, |
|
"loss": 0.4323, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 2.970479704797048, |
|
"grad_norm": 1.2491830905746684, |
|
"learning_rate": 5.901716453770023e-09, |
|
"loss": 0.4271, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.9797047970479706, |
|
"grad_norm": 1.2521953436091506, |
|
"learning_rate": 2.7896278797256983e-09, |
|
"loss": 0.4256, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 2.988929889298893, |
|
"grad_norm": 1.2335508198968657, |
|
"learning_rate": 8.299990406823721e-10, |
|
"loss": 0.4342, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.9981549815498156, |
|
"grad_norm": 1.2480273735451688, |
|
"learning_rate": 2.3055838990204693e-11, |
|
"loss": 0.4266, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1626, |
|
"total_flos": 1361805280542720.0, |
|
"train_loss": 0.713569560815634, |
|
"train_runtime": 59769.2599, |
|
"train_samples_per_second": 6.961, |
|
"train_steps_per_second": 0.027 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1626, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1361805280542720.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |