|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.956521739130435, |
|
"eval_steps": 500, |
|
"global_step": 2180, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004576659038901602, |
|
"grad_norm": 0.795125424861908, |
|
"learning_rate": 9.174311926605506e-07, |
|
"loss": 2.0857, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02288329519450801, |
|
"grad_norm": 0.7591376900672913, |
|
"learning_rate": 4.587155963302753e-06, |
|
"loss": 2.0888, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04576659038901602, |
|
"grad_norm": 0.7638061046600342, |
|
"learning_rate": 9.174311926605506e-06, |
|
"loss": 2.0744, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06864988558352403, |
|
"grad_norm": 0.6205304265022278, |
|
"learning_rate": 1.3761467889908258e-05, |
|
"loss": 2.0578, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09153318077803203, |
|
"grad_norm": 0.40848588943481445, |
|
"learning_rate": 1.834862385321101e-05, |
|
"loss": 2.0541, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11441647597254005, |
|
"grad_norm": 0.35839515924453735, |
|
"learning_rate": 2.2935779816513765e-05, |
|
"loss": 2.0396, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13729977116704806, |
|
"grad_norm": 0.3498051166534424, |
|
"learning_rate": 2.7522935779816515e-05, |
|
"loss": 2.0186, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16018306636155608, |
|
"grad_norm": 0.34915778040885925, |
|
"learning_rate": 3.211009174311927e-05, |
|
"loss": 1.9692, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.18306636155606407, |
|
"grad_norm": 0.2829169034957886, |
|
"learning_rate": 3.669724770642202e-05, |
|
"loss": 1.9478, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.20594965675057209, |
|
"grad_norm": 0.29471325874328613, |
|
"learning_rate": 4.1284403669724776e-05, |
|
"loss": 1.9081, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2288329519450801, |
|
"grad_norm": 0.26286041736602783, |
|
"learning_rate": 4.587155963302753e-05, |
|
"loss": 1.8852, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2517162471395881, |
|
"grad_norm": 0.253055214881897, |
|
"learning_rate": 5.0458715596330276e-05, |
|
"loss": 1.8466, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2745995423340961, |
|
"grad_norm": 0.25780022144317627, |
|
"learning_rate": 5.504587155963303e-05, |
|
"loss": 1.7978, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2974828375286041, |
|
"grad_norm": 0.22376658022403717, |
|
"learning_rate": 5.9633027522935784e-05, |
|
"loss": 1.7513, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.32036613272311215, |
|
"grad_norm": 0.20868130028247833, |
|
"learning_rate": 6.422018348623854e-05, |
|
"loss": 1.7581, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.34324942791762014, |
|
"grad_norm": 0.20019210875034332, |
|
"learning_rate": 6.880733944954129e-05, |
|
"loss": 1.7225, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.36613272311212813, |
|
"grad_norm": 0.20799387991428375, |
|
"learning_rate": 7.339449541284404e-05, |
|
"loss": 1.7073, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3890160183066362, |
|
"grad_norm": 0.19921623170375824, |
|
"learning_rate": 7.79816513761468e-05, |
|
"loss": 1.698, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.41189931350114417, |
|
"grad_norm": 0.19697250425815582, |
|
"learning_rate": 8.256880733944955e-05, |
|
"loss": 1.6806, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 0.21829445660114288, |
|
"learning_rate": 8.715596330275229e-05, |
|
"loss": 1.661, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4576659038901602, |
|
"grad_norm": 0.20831452310085297, |
|
"learning_rate": 9.174311926605506e-05, |
|
"loss": 1.6499, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4805491990846682, |
|
"grad_norm": 0.2224440723657608, |
|
"learning_rate": 9.63302752293578e-05, |
|
"loss": 1.6504, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5034324942791762, |
|
"grad_norm": 0.22193986177444458, |
|
"learning_rate": 0.00010091743119266055, |
|
"loss": 1.6399, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 0.2367602288722992, |
|
"learning_rate": 0.00010550458715596329, |
|
"loss": 1.6236, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5491990846681922, |
|
"grad_norm": 0.2362162321805954, |
|
"learning_rate": 0.00011009174311926606, |
|
"loss": 1.6227, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5720823798627003, |
|
"grad_norm": 0.24902412295341492, |
|
"learning_rate": 0.00011467889908256881, |
|
"loss": 1.5984, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5949656750572082, |
|
"grad_norm": 0.2517242431640625, |
|
"learning_rate": 0.00011926605504587157, |
|
"loss": 1.6003, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6178489702517163, |
|
"grad_norm": 0.2526886463165283, |
|
"learning_rate": 0.00012385321100917432, |
|
"loss": 1.5961, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6407322654462243, |
|
"grad_norm": 0.2544921338558197, |
|
"learning_rate": 0.00012844036697247707, |
|
"loss": 1.5782, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6636155606407322, |
|
"grad_norm": 0.2370745986700058, |
|
"learning_rate": 0.00013302752293577983, |
|
"loss": 1.5817, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6864988558352403, |
|
"grad_norm": 0.2654547095298767, |
|
"learning_rate": 0.00013761467889908258, |
|
"loss": 1.5683, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7093821510297483, |
|
"grad_norm": 0.251737505197525, |
|
"learning_rate": 0.0001422018348623853, |
|
"loss": 1.5683, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7322654462242563, |
|
"grad_norm": 0.27123987674713135, |
|
"learning_rate": 0.0001467889908256881, |
|
"loss": 1.5492, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7551487414187643, |
|
"grad_norm": 0.2787420451641083, |
|
"learning_rate": 0.00015137614678899084, |
|
"loss": 1.5573, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.7780320366132724, |
|
"grad_norm": 0.3056250810623169, |
|
"learning_rate": 0.0001559633027522936, |
|
"loss": 1.5535, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8009153318077803, |
|
"grad_norm": 0.4223385751247406, |
|
"learning_rate": 0.00016055045871559632, |
|
"loss": 1.5624, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.8237986270022883, |
|
"grad_norm": 0.3226919174194336, |
|
"learning_rate": 0.0001651376146788991, |
|
"loss": 1.5548, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8466819221967964, |
|
"grad_norm": 0.42250385880470276, |
|
"learning_rate": 0.00016972477064220186, |
|
"loss": 1.5403, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.29832813143730164, |
|
"learning_rate": 0.00017431192660550458, |
|
"loss": 1.5406, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8924485125858124, |
|
"grad_norm": 0.318697988986969, |
|
"learning_rate": 0.00017889908256880734, |
|
"loss": 1.5423, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.9153318077803204, |
|
"grad_norm": 0.30288153886795044, |
|
"learning_rate": 0.00018348623853211012, |
|
"loss": 1.5432, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9382151029748284, |
|
"grad_norm": 0.26959261298179626, |
|
"learning_rate": 0.00018807339449541284, |
|
"loss": 1.5264, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9610983981693364, |
|
"grad_norm": 0.27854207158088684, |
|
"learning_rate": 0.0001926605504587156, |
|
"loss": 1.5328, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9839816933638444, |
|
"grad_norm": 0.26367124915122986, |
|
"learning_rate": 0.00019724770642201835, |
|
"loss": 1.5142, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.4457786083221436, |
|
"eval_runtime": 0.158, |
|
"eval_samples_per_second": 63.273, |
|
"eval_steps_per_second": 6.327, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.0045766590389016, |
|
"grad_norm": 0.24696815013885498, |
|
"learning_rate": 0.00019999948721966259, |
|
"loss": 1.5267, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0274599542334095, |
|
"grad_norm": 0.2515549063682556, |
|
"learning_rate": 0.0001999937185012612, |
|
"loss": 1.5197, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.0503432494279177, |
|
"grad_norm": 0.26250341534614563, |
|
"learning_rate": 0.00019998154046002822, |
|
"loss": 1.4932, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0732265446224256, |
|
"grad_norm": 0.2599635422229767, |
|
"learning_rate": 0.00019996295387654262, |
|
"loss": 1.5057, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0961098398169336, |
|
"grad_norm": 0.2875217795372009, |
|
"learning_rate": 0.0001999379599421534, |
|
"loss": 1.5123, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.1189931350114417, |
|
"grad_norm": 0.2571655511856079, |
|
"learning_rate": 0.00019990656025890315, |
|
"loss": 1.5125, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.1418764302059496, |
|
"grad_norm": 0.2501162588596344, |
|
"learning_rate": 0.00019986875683942535, |
|
"loss": 1.5125, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1647597254004576, |
|
"grad_norm": 0.2591252028942108, |
|
"learning_rate": 0.00019982455210681537, |
|
"loss": 1.491, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1876430205949657, |
|
"grad_norm": 0.2666991055011749, |
|
"learning_rate": 0.00019977394889447524, |
|
"loss": 1.4931, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2105263157894737, |
|
"grad_norm": 0.3265208601951599, |
|
"learning_rate": 0.00019971695044593196, |
|
"loss": 1.5088, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.2334096109839816, |
|
"grad_norm": 0.3080615699291229, |
|
"learning_rate": 0.00019965356041462955, |
|
"loss": 1.4943, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2562929061784898, |
|
"grad_norm": 0.2847588062286377, |
|
"learning_rate": 0.00019958378286369502, |
|
"loss": 1.4829, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.2791762013729977, |
|
"grad_norm": 0.2870901823043823, |
|
"learning_rate": 0.00019950762226567781, |
|
"loss": 1.4841, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.3020594965675056, |
|
"grad_norm": 0.27676528692245483, |
|
"learning_rate": 0.00019942508350226314, |
|
"loss": 1.4901, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.3249427917620138, |
|
"grad_norm": 0.26728203892707825, |
|
"learning_rate": 0.00019933617186395917, |
|
"loss": 1.4743, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.3478260869565217, |
|
"grad_norm": 0.2857504189014435, |
|
"learning_rate": 0.0001992408930497578, |
|
"loss": 1.4919, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.3707093821510297, |
|
"grad_norm": 0.36729517579078674, |
|
"learning_rate": 0.00019913925316676945, |
|
"loss": 1.4872, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3935926773455378, |
|
"grad_norm": 0.2691424489021301, |
|
"learning_rate": 0.0001990312587298316, |
|
"loss": 1.4785, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.4164759725400458, |
|
"grad_norm": 0.3448796570301056, |
|
"learning_rate": 0.00019891691666109113, |
|
"loss": 1.4857, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.4393592677345537, |
|
"grad_norm": 0.29720258712768555, |
|
"learning_rate": 0.0001987962342895607, |
|
"loss": 1.4844, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.4622425629290619, |
|
"grad_norm": 0.25346481800079346, |
|
"learning_rate": 0.00019866921935064906, |
|
"loss": 1.4822, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4851258581235698, |
|
"grad_norm": 0.2484666258096695, |
|
"learning_rate": 0.0001985358799856651, |
|
"loss": 1.4804, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.5080091533180777, |
|
"grad_norm": 0.2337537705898285, |
|
"learning_rate": 0.00019839622474129596, |
|
"loss": 1.4762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.5308924485125859, |
|
"grad_norm": 0.26846134662628174, |
|
"learning_rate": 0.00019825026256905948, |
|
"loss": 1.4656, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.5537757437070938, |
|
"grad_norm": 0.239125594496727, |
|
"learning_rate": 0.00019809800282473013, |
|
"loss": 1.4802, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5766590389016018, |
|
"grad_norm": 0.2293742597103119, |
|
"learning_rate": 0.00019793945526773947, |
|
"loss": 1.4603, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.59954233409611, |
|
"grad_norm": 0.2655136287212372, |
|
"learning_rate": 0.0001977746300605507, |
|
"loss": 1.4783, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.6224256292906178, |
|
"grad_norm": 0.25485122203826904, |
|
"learning_rate": 0.00019760353776800704, |
|
"loss": 1.4755, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.6453089244851258, |
|
"grad_norm": 0.2485138326883316, |
|
"learning_rate": 0.00019742618935665476, |
|
"loss": 1.464, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.668192219679634, |
|
"grad_norm": 0.23952895402908325, |
|
"learning_rate": 0.0001972425961940401, |
|
"loss": 1.4652, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.6910755148741419, |
|
"grad_norm": 0.22710928320884705, |
|
"learning_rate": 0.00019705277004798073, |
|
"loss": 1.4585, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.7139588100686498, |
|
"grad_norm": 0.2363155037164688, |
|
"learning_rate": 0.00019685672308581152, |
|
"loss": 1.4725, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 0.23678353428840637, |
|
"learning_rate": 0.0001966544678736044, |
|
"loss": 1.4654, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.759725400457666, |
|
"grad_norm": 0.24644772708415985, |
|
"learning_rate": 0.00019644601737536338, |
|
"loss": 1.4473, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.7826086956521738, |
|
"grad_norm": 0.24988573789596558, |
|
"learning_rate": 0.00019623138495219292, |
|
"loss": 1.4658, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.805491990846682, |
|
"grad_norm": 0.2501707971096039, |
|
"learning_rate": 0.00019601058436144225, |
|
"loss": 1.4631, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.82837528604119, |
|
"grad_norm": 0.22899797558784485, |
|
"learning_rate": 0.00019578362975582292, |
|
"loss": 1.4663, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.8512585812356979, |
|
"grad_norm": 0.2957789897918701, |
|
"learning_rate": 0.0001955505356825021, |
|
"loss": 1.4649, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.874141876430206, |
|
"grad_norm": 0.26199233531951904, |
|
"learning_rate": 0.00019531131708217005, |
|
"loss": 1.4568, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.897025171624714, |
|
"grad_norm": 0.2635655403137207, |
|
"learning_rate": 0.00019506598928808216, |
|
"loss": 1.4748, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.919908466819222, |
|
"grad_norm": 0.23314692080020905, |
|
"learning_rate": 0.0001948145680250766, |
|
"loss": 1.4615, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.94279176201373, |
|
"grad_norm": 0.24421729147434235, |
|
"learning_rate": 0.000194557069408566, |
|
"loss": 1.463, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.965675057208238, |
|
"grad_norm": 0.2970636188983917, |
|
"learning_rate": 0.00019429350994350483, |
|
"loss": 1.4702, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.988558352402746, |
|
"grad_norm": 0.2254825085401535, |
|
"learning_rate": 0.0001940239065233311, |
|
"loss": 1.4574, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.456493854522705, |
|
"eval_runtime": 0.1591, |
|
"eval_samples_per_second": 62.851, |
|
"eval_steps_per_second": 6.285, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.009153318077803, |
|
"grad_norm": 0.2410830557346344, |
|
"learning_rate": 0.00019374827642888398, |
|
"loss": 1.4467, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.0320366132723113, |
|
"grad_norm": 0.3021443784236908, |
|
"learning_rate": 0.00019346663732729572, |
|
"loss": 1.4415, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.054919908466819, |
|
"grad_norm": 0.2840650975704193, |
|
"learning_rate": 0.0001931790072708596, |
|
"loss": 1.4438, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.077803203661327, |
|
"grad_norm": 0.25511375069618225, |
|
"learning_rate": 0.0001928854046958725, |
|
"loss": 1.4412, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.1006864988558354, |
|
"grad_norm": 0.25664883852005005, |
|
"learning_rate": 0.00019258584842145343, |
|
"loss": 1.4476, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.123569794050343, |
|
"grad_norm": 0.29023540019989014, |
|
"learning_rate": 0.00019228035764833718, |
|
"loss": 1.4393, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.1464530892448512, |
|
"grad_norm": 0.23540236055850983, |
|
"learning_rate": 0.00019196895195764362, |
|
"loss": 1.45, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.1693363844393594, |
|
"grad_norm": 0.23571573197841644, |
|
"learning_rate": 0.0001916516513096226, |
|
"loss": 1.4292, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.192219679633867, |
|
"grad_norm": 0.22221675515174866, |
|
"learning_rate": 0.0001913284760423745, |
|
"loss": 1.4385, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.2151029748283753, |
|
"grad_norm": 0.24515533447265625, |
|
"learning_rate": 0.00019099944687054672, |
|
"loss": 1.438, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.2379862700228834, |
|
"grad_norm": 0.22849489748477936, |
|
"learning_rate": 0.00019066458488400584, |
|
"loss": 1.4365, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 0.23650361597537994, |
|
"learning_rate": 0.0001903239115464859, |
|
"loss": 1.4421, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.2837528604118993, |
|
"grad_norm": 0.2418275624513626, |
|
"learning_rate": 0.00018997744869421246, |
|
"loss": 1.4275, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.3066361556064074, |
|
"grad_norm": 0.2576926350593567, |
|
"learning_rate": 0.00018962521853450323, |
|
"loss": 1.4315, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.329519450800915, |
|
"grad_norm": 0.26270532608032227, |
|
"learning_rate": 0.00018926724364434446, |
|
"loss": 1.4386, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.3524027459954233, |
|
"grad_norm": 0.2584366798400879, |
|
"learning_rate": 0.00018890354696894375, |
|
"loss": 1.4288, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.3752860411899315, |
|
"grad_norm": 0.2276887744665146, |
|
"learning_rate": 0.0001885341518202595, |
|
"loss": 1.4331, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.398169336384439, |
|
"grad_norm": 0.25574877858161926, |
|
"learning_rate": 0.00018815908187550667, |
|
"loss": 1.4343, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 0.2539375424385071, |
|
"learning_rate": 0.00018777836117563892, |
|
"loss": 1.4283, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.4439359267734555, |
|
"grad_norm": 0.23258014023303986, |
|
"learning_rate": 0.0001873920141238079, |
|
"loss": 1.4333, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.466819221967963, |
|
"grad_norm": 0.24933487176895142, |
|
"learning_rate": 0.00018700006548379898, |
|
"loss": 1.4245, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.4897025171624714, |
|
"grad_norm": 0.2398696094751358, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 1.4373, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.5125858123569795, |
|
"grad_norm": 0.2470606118440628, |
|
"learning_rate": 0.0001861994642880105, |
|
"loss": 1.4316, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.5354691075514877, |
|
"grad_norm": 0.2763492166996002, |
|
"learning_rate": 0.0001857908630485696, |
|
"loss": 1.4229, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.5583524027459954, |
|
"grad_norm": 0.25183382630348206, |
|
"learning_rate": 0.00018537676285033887, |
|
"loss": 1.4235, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.5812356979405036, |
|
"grad_norm": 0.25710180401802063, |
|
"learning_rate": 0.00018495719023600414, |
|
"loss": 1.4203, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.6041189931350113, |
|
"grad_norm": 0.22346384823322296, |
|
"learning_rate": 0.0001845321720990181, |
|
"loss": 1.4274, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.6270022883295194, |
|
"grad_norm": 0.2290782779455185, |
|
"learning_rate": 0.00018410173568187647, |
|
"loss": 1.4304, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.6498855835240276, |
|
"grad_norm": 0.25173699855804443, |
|
"learning_rate": 0.00018366590857437184, |
|
"loss": 1.4331, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.6727688787185357, |
|
"grad_norm": 0.2373346984386444, |
|
"learning_rate": 0.00018322471871182528, |
|
"loss": 1.4257, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.6956521739130435, |
|
"grad_norm": 0.2577199339866638, |
|
"learning_rate": 0.00018277819437329576, |
|
"loss": 1.4382, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.7185354691075516, |
|
"grad_norm": 0.23805540800094604, |
|
"learning_rate": 0.00018232636417976744, |
|
"loss": 1.4365, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.7414187643020593, |
|
"grad_norm": 0.25108346343040466, |
|
"learning_rate": 0.00018186925709231532, |
|
"loss": 1.424, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.7643020594965675, |
|
"grad_norm": 0.25178954005241394, |
|
"learning_rate": 0.00018140690241024872, |
|
"loss": 1.429, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.7871853546910756, |
|
"grad_norm": 0.2157055288553238, |
|
"learning_rate": 0.0001809393297692334, |
|
"loss": 1.4257, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.8100686498855834, |
|
"grad_norm": 0.22406814992427826, |
|
"learning_rate": 0.00018046656913939195, |
|
"loss": 1.4248, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.8329519450800915, |
|
"grad_norm": 0.22442927956581116, |
|
"learning_rate": 0.0001799886508233829, |
|
"loss": 1.4292, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.8558352402745997, |
|
"grad_norm": 0.22204960882663727, |
|
"learning_rate": 0.00017950560545445813, |
|
"loss": 1.4399, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.8787185354691074, |
|
"grad_norm": 0.2269710898399353, |
|
"learning_rate": 0.0001790174639944997, |
|
"loss": 1.4248, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.9016018306636155, |
|
"grad_norm": 0.23222693800926208, |
|
"learning_rate": 0.000178524257732035, |
|
"loss": 1.4261, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.9244851258581237, |
|
"grad_norm": 0.21854890882968903, |
|
"learning_rate": 0.00017802601828023138, |
|
"loss": 1.4172, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 0.2316950112581253, |
|
"learning_rate": 0.0001775227775748699, |
|
"loss": 1.4156, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.9702517162471396, |
|
"grad_norm": 0.21298670768737793, |
|
"learning_rate": 0.00017701456787229804, |
|
"loss": 1.4288, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.9931350114416477, |
|
"grad_norm": 0.21937227249145508, |
|
"learning_rate": 0.00017650142174736262, |
|
"loss": 1.4298, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.4537367820739746, |
|
"eval_runtime": 0.1693, |
|
"eval_samples_per_second": 59.08, |
|
"eval_steps_per_second": 5.908, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 3.013729977116705, |
|
"grad_norm": 0.22709442675113678, |
|
"learning_rate": 0.0001759833720913214, |
|
"loss": 1.4058, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.0366132723112127, |
|
"grad_norm": 0.21663448214530945, |
|
"learning_rate": 0.00017546045210973507, |
|
"loss": 1.4108, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.059496567505721, |
|
"grad_norm": 0.21900029480457306, |
|
"learning_rate": 0.00017493269532033883, |
|
"loss": 1.41, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.082379862700229, |
|
"grad_norm": 0.23556417226791382, |
|
"learning_rate": 0.00017440013555089393, |
|
"loss": 1.4084, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.1052631578947367, |
|
"grad_norm": 0.26198992133140564, |
|
"learning_rate": 0.0001738628069370195, |
|
"loss": 1.4108, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.128146453089245, |
|
"grad_norm": 0.2543962299823761, |
|
"learning_rate": 0.00017332074392000454, |
|
"loss": 1.4124, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.151029748283753, |
|
"grad_norm": 0.23739871382713318, |
|
"learning_rate": 0.00017277398124460023, |
|
"loss": 1.4048, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.1739130434782608, |
|
"grad_norm": 0.23524770140647888, |
|
"learning_rate": 0.00017222255395679296, |
|
"loss": 1.4184, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.196796338672769, |
|
"grad_norm": 0.2273031771183014, |
|
"learning_rate": 0.000171666497401558, |
|
"loss": 1.4122, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.219679633867277, |
|
"grad_norm": 0.23595724999904633, |
|
"learning_rate": 0.00017110584722059393, |
|
"loss": 1.407, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.242562929061785, |
|
"grad_norm": 0.21827439963817596, |
|
"learning_rate": 0.0001705406393500381, |
|
"loss": 1.4027, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.265446224256293, |
|
"grad_norm": 0.22116319835186005, |
|
"learning_rate": 0.00016997091001816336, |
|
"loss": 1.4049, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.288329519450801, |
|
"grad_norm": 0.23044751584529877, |
|
"learning_rate": 0.00016939669574305566, |
|
"loss": 1.4095, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.311212814645309, |
|
"grad_norm": 0.22559754550457, |
|
"learning_rate": 0.00016881803333027362, |
|
"loss": 1.3991, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.334096109839817, |
|
"grad_norm": 0.2260756939649582, |
|
"learning_rate": 0.0001682349598704892, |
|
"loss": 1.4123, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.356979405034325, |
|
"grad_norm": 0.2145521193742752, |
|
"learning_rate": 0.00016764751273711044, |
|
"loss": 1.4119, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.379862700228833, |
|
"grad_norm": 0.23654837906360626, |
|
"learning_rate": 0.00016705572958388576, |
|
"loss": 1.4024, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.402745995423341, |
|
"grad_norm": 0.21862386167049408, |
|
"learning_rate": 0.0001664596483424906, |
|
"loss": 1.4104, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.425629290617849, |
|
"grad_norm": 0.2240535169839859, |
|
"learning_rate": 0.00016585930722009601, |
|
"loss": 1.4012, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.448512585812357, |
|
"grad_norm": 0.23204627633094788, |
|
"learning_rate": 0.00016525474469691984, |
|
"loss": 1.4044, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.471395881006865, |
|
"grad_norm": 0.23645585775375366, |
|
"learning_rate": 0.00016464599952375998, |
|
"loss": 1.4135, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.494279176201373, |
|
"grad_norm": 0.2199520468711853, |
|
"learning_rate": 0.00016403311071951082, |
|
"loss": 1.413, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.517162471395881, |
|
"grad_norm": 0.23638251423835754, |
|
"learning_rate": 0.000163416117568662, |
|
"loss": 1.4101, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.540045766590389, |
|
"grad_norm": 0.2419108748435974, |
|
"learning_rate": 0.00016279505961878064, |
|
"loss": 1.399, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.5629290617848968, |
|
"grad_norm": 0.23601311445236206, |
|
"learning_rate": 0.0001621699766779763, |
|
"loss": 1.4084, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.585812356979405, |
|
"grad_norm": 0.24460086226463318, |
|
"learning_rate": 0.0001615409088123493, |
|
"loss": 1.4144, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.608695652173913, |
|
"grad_norm": 0.21848763525485992, |
|
"learning_rate": 0.00016090789634342278, |
|
"loss": 1.3929, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.6315789473684212, |
|
"grad_norm": 0.22614765167236328, |
|
"learning_rate": 0.00016027097984555816, |
|
"loss": 1.399, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.654462242562929, |
|
"grad_norm": 0.22731591761112213, |
|
"learning_rate": 0.00015963020014335438, |
|
"loss": 1.3998, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.677345537757437, |
|
"grad_norm": 0.21770906448364258, |
|
"learning_rate": 0.00015898559830903106, |
|
"loss": 1.4064, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.700228832951945, |
|
"grad_norm": 0.25158295035362244, |
|
"learning_rate": 0.0001583372156597961, |
|
"loss": 1.4046, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.723112128146453, |
|
"grad_norm": 0.23650844395160675, |
|
"learning_rate": 0.00015768509375519726, |
|
"loss": 1.3942, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.745995423340961, |
|
"grad_norm": 0.22743116319179535, |
|
"learning_rate": 0.00015702927439445826, |
|
"loss": 1.4022, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.7688787185354693, |
|
"grad_norm": 0.23518233001232147, |
|
"learning_rate": 0.0001563697996137997, |
|
"loss": 1.3986, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.791762013729977, |
|
"grad_norm": 0.22372908890247345, |
|
"learning_rate": 0.00015570671168374438, |
|
"loss": 1.3937, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.814645308924485, |
|
"grad_norm": 0.2474096119403839, |
|
"learning_rate": 0.00015504005310640822, |
|
"loss": 1.3984, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.837528604118993, |
|
"grad_norm": 0.23158776760101318, |
|
"learning_rate": 0.00015436986661277577, |
|
"loss": 1.3972, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.860411899313501, |
|
"grad_norm": 0.239366814494133, |
|
"learning_rate": 0.0001536961951599613, |
|
"loss": 1.4023, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.883295194508009, |
|
"grad_norm": 0.26158976554870605, |
|
"learning_rate": 0.0001530190819284555, |
|
"loss": 1.3918, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.9061784897025174, |
|
"grad_norm": 0.2554950416088104, |
|
"learning_rate": 0.00015233857031935749, |
|
"loss": 1.3927, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.929061784897025, |
|
"grad_norm": 0.23338022828102112, |
|
"learning_rate": 0.00015165470395159313, |
|
"loss": 1.3929, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.9519450800915332, |
|
"grad_norm": 0.2260856032371521, |
|
"learning_rate": 0.00015096752665911913, |
|
"loss": 1.4055, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.974828375286041, |
|
"grad_norm": 0.2310597449541092, |
|
"learning_rate": 0.0001502770824881133, |
|
"loss": 1.3984, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.997711670480549, |
|
"grad_norm": 0.22370661795139313, |
|
"learning_rate": 0.00014958341569415147, |
|
"loss": 1.3972, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.462189197540283, |
|
"eval_runtime": 0.1584, |
|
"eval_samples_per_second": 63.121, |
|
"eval_steps_per_second": 6.312, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 4.018306636155606, |
|
"grad_norm": 0.24240370094776154, |
|
"learning_rate": 0.00014888657073937076, |
|
"loss": 1.3961, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.0411899313501145, |
|
"grad_norm": 0.22996193170547485, |
|
"learning_rate": 0.0001481865922896196, |
|
"loss": 1.3808, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 4.064073226544623, |
|
"grad_norm": 0.21983887255191803, |
|
"learning_rate": 0.00014748352521159493, |
|
"loss": 1.3764, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.086956521739131, |
|
"grad_norm": 0.23078066110610962, |
|
"learning_rate": 0.00014677741456996617, |
|
"loss": 1.3826, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.109839816933638, |
|
"grad_norm": 0.2580978274345398, |
|
"learning_rate": 0.0001460683056244869, |
|
"loss": 1.3816, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.132723112128146, |
|
"grad_norm": 0.22216792404651642, |
|
"learning_rate": 0.00014535624382709382, |
|
"loss": 1.3767, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.155606407322654, |
|
"grad_norm": 0.21901686489582062, |
|
"learning_rate": 0.00014464127481899312, |
|
"loss": 1.3951, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.178489702517163, |
|
"grad_norm": 0.22333645820617676, |
|
"learning_rate": 0.0001439234444277354, |
|
"loss": 1.3946, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.201372997711671, |
|
"grad_norm": 0.2606089413166046, |
|
"learning_rate": 0.00014320279866427796, |
|
"loss": 1.3836, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.224256292906179, |
|
"grad_norm": 0.2596084475517273, |
|
"learning_rate": 0.00014247938372003582, |
|
"loss": 1.3849, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.247139588100686, |
|
"grad_norm": 0.2835715115070343, |
|
"learning_rate": 0.00014175324596392075, |
|
"loss": 1.386, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.270022883295194, |
|
"grad_norm": 0.2396639585494995, |
|
"learning_rate": 0.0001410244319393694, |
|
"loss": 1.3848, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.2929061784897025, |
|
"grad_norm": 0.22530311346054077, |
|
"learning_rate": 0.00014029298836135988, |
|
"loss": 1.3839, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.315789473684211, |
|
"grad_norm": 0.23546090722084045, |
|
"learning_rate": 0.0001395589621134174, |
|
"loss": 1.3948, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.338672768878719, |
|
"grad_norm": 0.24042978882789612, |
|
"learning_rate": 0.00013882240024460927, |
|
"loss": 1.3891, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.361556064073227, |
|
"grad_norm": 0.26460978388786316, |
|
"learning_rate": 0.00013808334996652904, |
|
"loss": 1.3821, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.384439359267734, |
|
"grad_norm": 0.2298058718442917, |
|
"learning_rate": 0.0001373418586502706, |
|
"loss": 1.3796, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.407322654462242, |
|
"grad_norm": 0.21995796263217926, |
|
"learning_rate": 0.0001365979738233916, |
|
"loss": 1.3892, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.4302059496567505, |
|
"grad_norm": 0.22304576635360718, |
|
"learning_rate": 0.0001358517431668672, |
|
"loss": 1.3731, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.453089244851259, |
|
"grad_norm": 0.2750610411167145, |
|
"learning_rate": 0.0001351032145120337, |
|
"loss": 1.3732, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.475972540045767, |
|
"grad_norm": 0.24216437339782715, |
|
"learning_rate": 0.00013435243583752294, |
|
"loss": 1.3955, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.498855835240275, |
|
"grad_norm": 0.23672589659690857, |
|
"learning_rate": 0.00013359945526618668, |
|
"loss": 1.3859, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.521739130434782, |
|
"grad_norm": 0.21664075553417206, |
|
"learning_rate": 0.00013284432106201233, |
|
"loss": 1.3885, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.54462242562929, |
|
"grad_norm": 0.22267000377178192, |
|
"learning_rate": 0.00013208708162702922, |
|
"loss": 1.3811, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.567505720823799, |
|
"grad_norm": 0.23190075159072876, |
|
"learning_rate": 0.00013132778549820618, |
|
"loss": 1.3813, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.590389016018307, |
|
"grad_norm": 0.21617767214775085, |
|
"learning_rate": 0.0001305664813443405, |
|
"loss": 1.3821, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.613272311212815, |
|
"grad_norm": 0.22316230833530426, |
|
"learning_rate": 0.00012980321796293836, |
|
"loss": 1.3854, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.636155606407323, |
|
"grad_norm": 0.22770841419696808, |
|
"learning_rate": 0.00012903804427708704, |
|
"loss": 1.392, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 4.65903890160183, |
|
"grad_norm": 0.25319135189056396, |
|
"learning_rate": 0.00012827100933231905, |
|
"loss": 1.384, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.6819221967963385, |
|
"grad_norm": 0.24790321290493011, |
|
"learning_rate": 0.0001275021622934685, |
|
"loss": 1.3817, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.704805491990847, |
|
"grad_norm": 0.24224399030208588, |
|
"learning_rate": 0.00012673155244151985, |
|
"loss": 1.3869, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.727688787185355, |
|
"grad_norm": 0.2306058555841446, |
|
"learning_rate": 0.0001259592291704489, |
|
"loss": 1.3816, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 4.750572082379863, |
|
"grad_norm": 0.2296520620584488, |
|
"learning_rate": 0.000125185241984057, |
|
"loss": 1.3855, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.77345537757437, |
|
"grad_norm": 0.22120504081249237, |
|
"learning_rate": 0.00012440964049279787, |
|
"loss": 1.3839, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 4.796338672768878, |
|
"grad_norm": 0.23121312260627747, |
|
"learning_rate": 0.00012363247441059776, |
|
"loss": 1.3816, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.8192219679633865, |
|
"grad_norm": 0.22182002663612366, |
|
"learning_rate": 0.00012285379355166893, |
|
"loss": 1.3808, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 4.842105263157895, |
|
"grad_norm": 0.22699835896492004, |
|
"learning_rate": 0.00012207364782731655, |
|
"loss": 1.3829, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.864988558352403, |
|
"grad_norm": 0.24495604634284973, |
|
"learning_rate": 0.00012129208724273984, |
|
"loss": 1.3827, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 4.887871853546911, |
|
"grad_norm": 0.235788956284523, |
|
"learning_rate": 0.00012050916189382646, |
|
"loss": 1.3898, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.910755148741419, |
|
"grad_norm": 0.24433469772338867, |
|
"learning_rate": 0.00011972492196394187, |
|
"loss": 1.3913, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.933638443935926, |
|
"grad_norm": 0.2331000417470932, |
|
"learning_rate": 0.00011893941772071249, |
|
"loss": 1.3906, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.956521739130435, |
|
"grad_norm": 0.22595056891441345, |
|
"learning_rate": 0.0001181526995128038, |
|
"loss": 1.3981, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 4.979405034324943, |
|
"grad_norm": 0.22172114253044128, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.3895, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.3044658601284027, |
|
"learning_rate": 0.0001165758229834371, |
|
"loss": 1.3868, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.481019973754883, |
|
"eval_runtime": 0.1574, |
|
"eval_samples_per_second": 63.534, |
|
"eval_steps_per_second": 6.353, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 5.022883295194508, |
|
"grad_norm": 0.24489176273345947, |
|
"learning_rate": 0.0001157857657354354, |
|
"loss": 1.376, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.045766590389016, |
|
"grad_norm": 0.2253871113061905, |
|
"learning_rate": 0.00011499469666318858, |
|
"loss": 1.3759, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 5.068649885583524, |
|
"grad_norm": 0.235123872756958, |
|
"learning_rate": 0.00011420266647205231, |
|
"loss": 1.3774, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 5.091533180778032, |
|
"grad_norm": 0.2346729189157486, |
|
"learning_rate": 0.00011340972592898744, |
|
"loss": 1.3556, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 5.11441647597254, |
|
"grad_norm": 0.2369905263185501, |
|
"learning_rate": 0.00011261592585930576, |
|
"loss": 1.3714, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.137299771167048, |
|
"grad_norm": 0.2265693098306656, |
|
"learning_rate": 0.00011182131714341247, |
|
"loss": 1.37, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 5.160183066361556, |
|
"grad_norm": 0.21262961626052856, |
|
"learning_rate": 0.00011102595071354472, |
|
"loss": 1.3639, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 5.183066361556064, |
|
"grad_norm": 0.2247275561094284, |
|
"learning_rate": 0.00011022987755050704, |
|
"loss": 1.3649, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 5.2059496567505725, |
|
"grad_norm": 0.22549380362033844, |
|
"learning_rate": 0.00010943314868040364, |
|
"loss": 1.3542, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.22883295194508, |
|
"grad_norm": 0.2336389124393463, |
|
"learning_rate": 0.00010863581517136776, |
|
"loss": 1.3659, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 5.251716247139588, |
|
"grad_norm": 0.24654635787010193, |
|
"learning_rate": 0.00010783792813028827, |
|
"loss": 1.3742, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.274599542334096, |
|
"grad_norm": 0.22848239541053772, |
|
"learning_rate": 0.00010703953869953403, |
|
"loss": 1.3783, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 5.297482837528604, |
|
"grad_norm": 0.241807758808136, |
|
"learning_rate": 0.00010624069805367559, |
|
"loss": 1.3731, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.320366132723112, |
|
"grad_norm": 0.23657450079917908, |
|
"learning_rate": 0.00010544145739620519, |
|
"loss": 1.3674, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 5.34324942791762, |
|
"grad_norm": 0.2408100813627243, |
|
"learning_rate": 0.00010464186795625482, |
|
"loss": 1.3732, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.366132723112128, |
|
"grad_norm": 0.23080427944660187, |
|
"learning_rate": 0.00010384198098531225, |
|
"loss": 1.3912, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.389016018306636, |
|
"grad_norm": 0.23397450149059296, |
|
"learning_rate": 0.00010304184775393642, |
|
"loss": 1.381, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.411899313501144, |
|
"grad_norm": 0.2350955307483673, |
|
"learning_rate": 0.00010224151954847064, |
|
"loss": 1.3768, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 5.434782608695652, |
|
"grad_norm": 0.23939336836338043, |
|
"learning_rate": 0.00010144104766775572, |
|
"loss": 1.3731, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 5.4576659038901605, |
|
"grad_norm": 0.23009169101715088, |
|
"learning_rate": 0.0001006404834198416, |
|
"loss": 1.3715, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 5.480549199084669, |
|
"grad_norm": 0.2236054390668869, |
|
"learning_rate": 9.983987811869862e-05, |
|
"loss": 1.3737, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.503432494279176, |
|
"grad_norm": 0.23314499855041504, |
|
"learning_rate": 9.903928308092865e-05, |
|
"loss": 1.369, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 5.526315789473684, |
|
"grad_norm": 0.22338473796844482, |
|
"learning_rate": 9.823874962247564e-05, |
|
"loss": 1.3717, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 5.549199084668192, |
|
"grad_norm": 0.22037141025066376, |
|
"learning_rate": 9.743832905533644e-05, |
|
"loss": 1.3789, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 5.5720823798627, |
|
"grad_norm": 0.2261509895324707, |
|
"learning_rate": 9.663807268427198e-05, |
|
"loss": 1.3741, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 5.5949656750572085, |
|
"grad_norm": 0.2503112256526947, |
|
"learning_rate": 9.583803180351852e-05, |
|
"loss": 1.3805, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 5.617848970251716, |
|
"grad_norm": 0.22896412014961243, |
|
"learning_rate": 9.503825769350017e-05, |
|
"loss": 1.3583, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 5.640732265446224, |
|
"grad_norm": 0.2552075982093811, |
|
"learning_rate": 9.423880161754158e-05, |
|
"loss": 1.3767, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 5.663615560640732, |
|
"grad_norm": 0.23098088800907135, |
|
"learning_rate": 9.343971481858246e-05, |
|
"loss": 1.3719, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 5.68649885583524, |
|
"grad_norm": 0.23425135016441345, |
|
"learning_rate": 9.264104851589273e-05, |
|
"loss": 1.3678, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 5.709382151029748, |
|
"grad_norm": 0.229757159948349, |
|
"learning_rate": 9.184285390178978e-05, |
|
"loss": 1.3652, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.732265446224257, |
|
"grad_norm": 0.22069285809993744, |
|
"learning_rate": 9.104518213835692e-05, |
|
"loss": 1.3628, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 5.755148741418765, |
|
"grad_norm": 0.23451228439807892, |
|
"learning_rate": 9.024808435416434e-05, |
|
"loss": 1.3658, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 5.778032036613272, |
|
"grad_norm": 0.2394520491361618, |
|
"learning_rate": 8.945161164099157e-05, |
|
"loss": 1.3775, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 5.80091533180778, |
|
"grad_norm": 0.23245683312416077, |
|
"learning_rate": 8.865581505055291e-05, |
|
"loss": 1.372, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 5.823798627002288, |
|
"grad_norm": 0.23066289722919464, |
|
"learning_rate": 8.7860745591225e-05, |
|
"loss": 1.3739, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 5.8466819221967965, |
|
"grad_norm": 0.23424118757247925, |
|
"learning_rate": 8.706645422477739e-05, |
|
"loss": 1.3597, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 5.869565217391305, |
|
"grad_norm": 0.2321302741765976, |
|
"learning_rate": 8.627299186310603e-05, |
|
"loss": 1.3732, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 5.892448512585812, |
|
"grad_norm": 0.2273801863193512, |
|
"learning_rate": 8.548040936496989e-05, |
|
"loss": 1.3685, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.91533180778032, |
|
"grad_norm": 0.2422792911529541, |
|
"learning_rate": 8.468875753273115e-05, |
|
"loss": 1.372, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 5.938215102974828, |
|
"grad_norm": 0.22604131698608398, |
|
"learning_rate": 8.389808710909881e-05, |
|
"loss": 1.3676, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.961098398169336, |
|
"grad_norm": 0.24549026787281036, |
|
"learning_rate": 8.310844877387637e-05, |
|
"loss": 1.358, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 5.983981693363845, |
|
"grad_norm": 0.23445729911327362, |
|
"learning_rate": 8.231989314071317e-05, |
|
"loss": 1.3692, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.5001282691955566, |
|
"eval_runtime": 0.1605, |
|
"eval_samples_per_second": 62.3, |
|
"eval_steps_per_second": 6.23, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 6.004576659038902, |
|
"grad_norm": 0.2388521283864975, |
|
"learning_rate": 8.153247075386043e-05, |
|
"loss": 1.3541, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 6.02745995423341, |
|
"grad_norm": 0.23299774527549744, |
|
"learning_rate": 8.07462320849313e-05, |
|
"loss": 1.3572, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 6.050343249427917, |
|
"grad_norm": 0.23086461424827576, |
|
"learning_rate": 7.996122752966595e-05, |
|
"loss": 1.3568, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 6.073226544622425, |
|
"grad_norm": 0.23091675341129303, |
|
"learning_rate": 7.917750740470117e-05, |
|
"loss": 1.3618, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 6.0961098398169336, |
|
"grad_norm": 0.23161986470222473, |
|
"learning_rate": 7.839512194434531e-05, |
|
"loss": 1.363, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 6.118993135011442, |
|
"grad_norm": 0.23545809090137482, |
|
"learning_rate": 7.761412129735852e-05, |
|
"loss": 1.3609, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 6.14187643020595, |
|
"grad_norm": 0.23319639265537262, |
|
"learning_rate": 7.683455552373799e-05, |
|
"loss": 1.365, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 6.164759725400458, |
|
"grad_norm": 0.23536434769630432, |
|
"learning_rate": 7.605647459150961e-05, |
|
"loss": 1.3673, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.187643020594965, |
|
"grad_norm": 0.2269584983587265, |
|
"learning_rate": 7.527992837352501e-05, |
|
"loss": 1.3635, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 6.2105263157894735, |
|
"grad_norm": 0.22587326169013977, |
|
"learning_rate": 7.450496664426477e-05, |
|
"loss": 1.3595, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 6.233409610983982, |
|
"grad_norm": 0.22287671267986298, |
|
"learning_rate": 7.37316390766482e-05, |
|
"loss": 1.3752, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 6.25629290617849, |
|
"grad_norm": 0.23099535703659058, |
|
"learning_rate": 7.295999523884921e-05, |
|
"loss": 1.3613, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.279176201372998, |
|
"grad_norm": 0.22423085570335388, |
|
"learning_rate": 7.219008459111937e-05, |
|
"loss": 1.3658, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 6.302059496567506, |
|
"grad_norm": 0.23212574422359467, |
|
"learning_rate": 7.142195648261747e-05, |
|
"loss": 1.361, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.324942791762013, |
|
"grad_norm": 0.22750739753246307, |
|
"learning_rate": 7.065566014824643e-05, |
|
"loss": 1.3602, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 6.3478260869565215, |
|
"grad_norm": 0.2309405356645584, |
|
"learning_rate": 6.989124470549745e-05, |
|
"loss": 1.366, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 6.37070938215103, |
|
"grad_norm": 0.24688778817653656, |
|
"learning_rate": 6.912875915130183e-05, |
|
"loss": 1.3645, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 6.393592677345538, |
|
"grad_norm": 0.23304879665374756, |
|
"learning_rate": 6.83682523588902e-05, |
|
"loss": 1.3519, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.416475972540046, |
|
"grad_norm": 0.2324698269367218, |
|
"learning_rate": 6.760977307466008e-05, |
|
"loss": 1.358, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 6.439359267734554, |
|
"grad_norm": 0.23698380589485168, |
|
"learning_rate": 6.685336991505122e-05, |
|
"loss": 1.3569, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 6.462242562929061, |
|
"grad_norm": 0.2366679459810257, |
|
"learning_rate": 6.609909136342955e-05, |
|
"loss": 1.3538, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 6.48512585812357, |
|
"grad_norm": 0.2314102202653885, |
|
"learning_rate": 6.534698576697939e-05, |
|
"loss": 1.3607, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 6.508009153318078, |
|
"grad_norm": 0.2393520474433899, |
|
"learning_rate": 6.459710133360464e-05, |
|
"loss": 1.3649, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 6.530892448512586, |
|
"grad_norm": 0.23024214804172516, |
|
"learning_rate": 6.384948612883873e-05, |
|
"loss": 1.367, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 6.553775743707094, |
|
"grad_norm": 0.23126175999641418, |
|
"learning_rate": 6.310418807276375e-05, |
|
"loss": 1.3623, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 6.576659038901602, |
|
"grad_norm": 0.2327370047569275, |
|
"learning_rate": 6.2361254936939e-05, |
|
"loss": 1.3532, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 6.5995423340961095, |
|
"grad_norm": 0.22811928391456604, |
|
"learning_rate": 6.162073434133876e-05, |
|
"loss": 1.3662, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 6.622425629290618, |
|
"grad_norm": 0.25274351239204407, |
|
"learning_rate": 6.088267375130023e-05, |
|
"loss": 1.3537, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 6.645308924485126, |
|
"grad_norm": 0.24889418482780457, |
|
"learning_rate": 6.01471204744809e-05, |
|
"loss": 1.3598, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 6.668192219679634, |
|
"grad_norm": 0.23805195093154907, |
|
"learning_rate": 5.941412165782645e-05, |
|
"loss": 1.3525, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 6.691075514874142, |
|
"grad_norm": 0.2376497983932495, |
|
"learning_rate": 5.868372428454861e-05, |
|
"loss": 1.3662, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 6.71395881006865, |
|
"grad_norm": 0.24028287827968597, |
|
"learning_rate": 5.79559751711138e-05, |
|
"loss": 1.3524, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 6.7368421052631575, |
|
"grad_norm": 0.23552508652210236, |
|
"learning_rate": 5.72309209642422e-05, |
|
"loss": 1.3435, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 6.759725400457666, |
|
"grad_norm": 0.23250797390937805, |
|
"learning_rate": 5.650860813791785e-05, |
|
"loss": 1.3576, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 6.782608695652174, |
|
"grad_norm": 0.23260493576526642, |
|
"learning_rate": 5.5789082990409945e-05, |
|
"loss": 1.3648, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 6.805491990846682, |
|
"grad_norm": 0.23065823316574097, |
|
"learning_rate": 5.507239164130501e-05, |
|
"loss": 1.3553, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 6.82837528604119, |
|
"grad_norm": 0.22898992896080017, |
|
"learning_rate": 5.4358580028550896e-05, |
|
"loss": 1.357, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 6.851258581235698, |
|
"grad_norm": 0.24019262194633484, |
|
"learning_rate": 5.364769390551225e-05, |
|
"loss": 1.3561, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.874141876430206, |
|
"grad_norm": 0.24061718583106995, |
|
"learning_rate": 5.293977883803797e-05, |
|
"loss": 1.3654, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 6.897025171624714, |
|
"grad_norm": 0.2238311618566513, |
|
"learning_rate": 5.2234880201540284e-05, |
|
"loss": 1.3542, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 6.919908466819222, |
|
"grad_norm": 0.22695277631282806, |
|
"learning_rate": 5.1533043178086536e-05, |
|
"loss": 1.354, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 6.94279176201373, |
|
"grad_norm": 0.22949448227882385, |
|
"learning_rate": 5.0834312753503124e-05, |
|
"loss": 1.3613, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 6.965675057208238, |
|
"grad_norm": 0.23366492986679077, |
|
"learning_rate": 5.0138733714492e-05, |
|
"loss": 1.3497, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 6.988558352402746, |
|
"grad_norm": 0.2318800687789917, |
|
"learning_rate": 4.9446350645759885e-05, |
|
"loss": 1.3509, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.50384783744812, |
|
"eval_runtime": 0.1625, |
|
"eval_samples_per_second": 61.536, |
|
"eval_steps_per_second": 6.154, |
|
"step": 1533 |
|
}, |
|
{ |
|
"epoch": 7.009153318077804, |
|
"grad_norm": 0.24519094824790955, |
|
"learning_rate": 4.8757207927160584e-05, |
|
"loss": 1.3595, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 7.032036613272311, |
|
"grad_norm": 0.23380842804908752, |
|
"learning_rate": 4.807134973085036e-05, |
|
"loss": 1.3402, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 7.054919908466819, |
|
"grad_norm": 0.23818600177764893, |
|
"learning_rate": 4.738882001845668e-05, |
|
"loss": 1.3542, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 7.077803203661327, |
|
"grad_norm": 0.2337319701910019, |
|
"learning_rate": 4.6709662538260267e-05, |
|
"loss": 1.3603, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 7.100686498855835, |
|
"grad_norm": 0.23182658851146698, |
|
"learning_rate": 4.603392082239102e-05, |
|
"loss": 1.3513, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 7.1235697940503435, |
|
"grad_norm": 0.22734318673610687, |
|
"learning_rate": 4.53616381840377e-05, |
|
"loss": 1.3515, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 7.146453089244852, |
|
"grad_norm": 0.2282598614692688, |
|
"learning_rate": 4.469285771467181e-05, |
|
"loss": 1.3529, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 7.169336384439359, |
|
"grad_norm": 0.23265138268470764, |
|
"learning_rate": 4.402762228128531e-05, |
|
"loss": 1.3513, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 7.192219679633867, |
|
"grad_norm": 0.23814916610717773, |
|
"learning_rate": 4.336597452364309e-05, |
|
"loss": 1.3459, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 7.215102974828375, |
|
"grad_norm": 0.23314997553825378, |
|
"learning_rate": 4.2707956851550016e-05, |
|
"loss": 1.351, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 7.237986270022883, |
|
"grad_norm": 0.2370002567768097, |
|
"learning_rate": 4.205361144213227e-05, |
|
"loss": 1.3552, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 7.260869565217392, |
|
"grad_norm": 0.2338656485080719, |
|
"learning_rate": 4.140298023713416e-05, |
|
"loss": 1.3598, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 7.2837528604119, |
|
"grad_norm": 0.2293270081281662, |
|
"learning_rate": 4.075610494022964e-05, |
|
"loss": 1.3565, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 7.306636155606407, |
|
"grad_norm": 0.23523476719856262, |
|
"learning_rate": 4.011302701434937e-05, |
|
"loss": 1.3545, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 7.329519450800915, |
|
"grad_norm": 0.22826853394508362, |
|
"learning_rate": 3.947378767902284e-05, |
|
"loss": 1.3411, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 7.352402745995423, |
|
"grad_norm": 0.24477452039718628, |
|
"learning_rate": 3.8838427907736476e-05, |
|
"loss": 1.3482, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 7.3752860411899315, |
|
"grad_norm": 0.23173397779464722, |
|
"learning_rate": 3.8206988425307246e-05, |
|
"loss": 1.3602, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 7.39816933638444, |
|
"grad_norm": 0.2321847528219223, |
|
"learning_rate": 3.757950970527249e-05, |
|
"loss": 1.3572, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 7.421052631578947, |
|
"grad_norm": 0.2273811548948288, |
|
"learning_rate": 3.695603196729543e-05, |
|
"loss": 1.3575, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 7.443935926773455, |
|
"grad_norm": 0.22913090884685516, |
|
"learning_rate": 3.633659517458736e-05, |
|
"loss": 1.3566, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 7.466819221967963, |
|
"grad_norm": 0.23222309350967407, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 1.3461, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 7.489702517162471, |
|
"grad_norm": 0.2386443018913269, |
|
"learning_rate": 3.5110002980210975e-05, |
|
"loss": 1.3335, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 7.5125858123569795, |
|
"grad_norm": 0.23243063688278198, |
|
"learning_rate": 3.450292619973483e-05, |
|
"loss": 1.3497, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 7.535469107551488, |
|
"grad_norm": 0.22777137160301208, |
|
"learning_rate": 3.3900047601872596e-05, |
|
"loss": 1.3526, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 7.558352402745996, |
|
"grad_norm": 0.23459511995315552, |
|
"learning_rate": 3.3301405829487195e-05, |
|
"loss": 1.3484, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 7.581235697940503, |
|
"grad_norm": 0.22907888889312744, |
|
"learning_rate": 3.270703925387279e-05, |
|
"loss": 1.3529, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 7.604118993135011, |
|
"grad_norm": 0.22607311606407166, |
|
"learning_rate": 3.2116985972295076e-05, |
|
"loss": 1.3557, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 7.627002288329519, |
|
"grad_norm": 0.2280961275100708, |
|
"learning_rate": 3.153128380554941e-05, |
|
"loss": 1.3485, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 7.649885583524028, |
|
"grad_norm": 0.22796593606472015, |
|
"learning_rate": 3.094997029553673e-05, |
|
"loss": 1.3524, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 7.672768878718536, |
|
"grad_norm": 0.22574880719184875, |
|
"learning_rate": 3.037308270285709e-05, |
|
"loss": 1.3415, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 7.695652173913043, |
|
"grad_norm": 0.22538483142852783, |
|
"learning_rate": 2.9800658004421366e-05, |
|
"loss": 1.3584, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 7.718535469107551, |
|
"grad_norm": 0.22863738238811493, |
|
"learning_rate": 2.923273289108115e-05, |
|
"loss": 1.3555, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 7.741418764302059, |
|
"grad_norm": 0.23403213918209076, |
|
"learning_rate": 2.8669343765277078e-05, |
|
"loss": 1.3455, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 7.7643020594965675, |
|
"grad_norm": 0.23343510925769806, |
|
"learning_rate": 2.8110526738705344e-05, |
|
"loss": 1.3472, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.787185354691076, |
|
"grad_norm": 0.2283450961112976, |
|
"learning_rate": 2.755631763000318e-05, |
|
"loss": 1.3594, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 7.810068649885584, |
|
"grad_norm": 0.2360517978668213, |
|
"learning_rate": 2.7006751962452882e-05, |
|
"loss": 1.347, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 7.832951945080092, |
|
"grad_norm": 0.2376154214143753, |
|
"learning_rate": 2.6461864961704975e-05, |
|
"loss": 1.3532, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 7.855835240274599, |
|
"grad_norm": 0.23671028017997742, |
|
"learning_rate": 2.592169155352031e-05, |
|
"loss": 1.3589, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 7.878718535469107, |
|
"grad_norm": 0.23020578920841217, |
|
"learning_rate": 2.538626636153131e-05, |
|
"loss": 1.3527, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 7.9016018306636155, |
|
"grad_norm": 0.2411489188671112, |
|
"learning_rate": 2.485562370502279e-05, |
|
"loss": 1.3508, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 7.924485125858124, |
|
"grad_norm": 0.23260930180549622, |
|
"learning_rate": 2.4329797596732252e-05, |
|
"loss": 1.3446, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 7.947368421052632, |
|
"grad_norm": 0.23165710270404816, |
|
"learning_rate": 2.3808821740669606e-05, |
|
"loss": 1.357, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 7.970251716247139, |
|
"grad_norm": 0.240895077586174, |
|
"learning_rate": 2.3292729529956935e-05, |
|
"loss": 1.3631, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 7.993135011441647, |
|
"grad_norm": 0.2322818636894226, |
|
"learning_rate": 2.2781554044688015e-05, |
|
"loss": 1.3413, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.5123343467712402, |
|
"eval_runtime": 0.1598, |
|
"eval_samples_per_second": 62.575, |
|
"eval_steps_per_second": 6.258, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 8.013729977116705, |
|
"grad_norm": 0.22951894998550415, |
|
"learning_rate": 2.227532804980813e-05, |
|
"loss": 1.3476, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 8.036613272311213, |
|
"grad_norm": 0.23239345848560333, |
|
"learning_rate": 2.1774083993013718e-05, |
|
"loss": 1.344, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 8.05949656750572, |
|
"grad_norm": 0.23161360621452332, |
|
"learning_rate": 2.1277854002672683e-05, |
|
"loss": 1.3465, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 8.082379862700229, |
|
"grad_norm": 0.24183310568332672, |
|
"learning_rate": 2.078666988576504e-05, |
|
"loss": 1.3395, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 8.105263157894736, |
|
"grad_norm": 0.23045741021633148, |
|
"learning_rate": 2.030056312584424e-05, |
|
"loss": 1.3489, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 8.128146453089245, |
|
"grad_norm": 0.23021478950977325, |
|
"learning_rate": 1.9819564881018983e-05, |
|
"loss": 1.3461, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 8.151029748283753, |
|
"grad_norm": 0.22722554206848145, |
|
"learning_rate": 1.934370598195622e-05, |
|
"loss": 1.344, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 8.173913043478262, |
|
"grad_norm": 0.23223134875297546, |
|
"learning_rate": 1.887301692990494e-05, |
|
"loss": 1.3585, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 8.196796338672769, |
|
"grad_norm": 0.2324499934911728, |
|
"learning_rate": 1.8407527894741184e-05, |
|
"loss": 1.3466, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 8.219679633867276, |
|
"grad_norm": 0.23666158318519592, |
|
"learning_rate": 1.7947268713034127e-05, |
|
"loss": 1.3353, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 8.242562929061785, |
|
"grad_norm": 0.2395636886358261, |
|
"learning_rate": 1.7492268886133676e-05, |
|
"loss": 1.3312, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 8.265446224256292, |
|
"grad_norm": 0.23455668985843658, |
|
"learning_rate": 1.7042557578279626e-05, |
|
"loss": 1.3536, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 8.288329519450802, |
|
"grad_norm": 0.24188482761383057, |
|
"learning_rate": 1.6598163614732154e-05, |
|
"loss": 1.3592, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 8.311212814645309, |
|
"grad_norm": 0.23694472014904022, |
|
"learning_rate": 1.6159115479924257e-05, |
|
"loss": 1.3485, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 8.334096109839816, |
|
"grad_norm": 0.2400536835193634, |
|
"learning_rate": 1.5725441315636002e-05, |
|
"loss": 1.3387, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 8.356979405034325, |
|
"grad_norm": 0.2341165691614151, |
|
"learning_rate": 1.529716891919074e-05, |
|
"loss": 1.3429, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 8.379862700228832, |
|
"grad_norm": 0.23209024965763092, |
|
"learning_rate": 1.4874325741673278e-05, |
|
"loss": 1.3537, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 8.402745995423341, |
|
"grad_norm": 0.2267337441444397, |
|
"learning_rate": 1.4456938886170412e-05, |
|
"loss": 1.3432, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 8.425629290617849, |
|
"grad_norm": 0.22982369363307953, |
|
"learning_rate": 1.4045035106033655e-05, |
|
"loss": 1.3401, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 8.448512585812358, |
|
"grad_norm": 0.23104478418827057, |
|
"learning_rate": 1.3638640803164516e-05, |
|
"loss": 1.3481, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 8.471395881006865, |
|
"grad_norm": 0.22922466695308685, |
|
"learning_rate": 1.3237782026322055e-05, |
|
"loss": 1.3548, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 8.494279176201372, |
|
"grad_norm": 0.2303161472082138, |
|
"learning_rate": 1.2842484469453365e-05, |
|
"loss": 1.3461, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 8.517162471395881, |
|
"grad_norm": 0.23580402135849, |
|
"learning_rate": 1.2452773470046541e-05, |
|
"loss": 1.3517, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 8.540045766590389, |
|
"grad_norm": 0.2284560203552246, |
|
"learning_rate": 1.2068674007506786e-05, |
|
"loss": 1.34, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 8.562929061784898, |
|
"grad_norm": 0.23408174514770508, |
|
"learning_rate": 1.1690210701555104e-05, |
|
"loss": 1.3382, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 8.585812356979405, |
|
"grad_norm": 0.22775082290172577, |
|
"learning_rate": 1.1317407810650372e-05, |
|
"loss": 1.3533, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 8.608695652173914, |
|
"grad_norm": 0.23187576234340668, |
|
"learning_rate": 1.0950289230434374e-05, |
|
"loss": 1.342, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 8.631578947368421, |
|
"grad_norm": 0.23138730227947235, |
|
"learning_rate": 1.058887849220026e-05, |
|
"loss": 1.3571, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 8.654462242562929, |
|
"grad_norm": 0.2307848185300827, |
|
"learning_rate": 1.02331987613841e-05, |
|
"loss": 1.3525, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 8.677345537757438, |
|
"grad_norm": 0.2283322662115097, |
|
"learning_rate": 9.883272836080116e-06, |
|
"loss": 1.3472, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.700228832951945, |
|
"grad_norm": 0.22654347121715546, |
|
"learning_rate": 9.539123145579476e-06, |
|
"loss": 1.3477, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 8.723112128146454, |
|
"grad_norm": 0.23034285008907318, |
|
"learning_rate": 9.200771748932513e-06, |
|
"loss": 1.345, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 8.745995423340961, |
|
"grad_norm": 0.23074457049369812, |
|
"learning_rate": 8.868240333534815e-06, |
|
"loss": 1.3389, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 8.768878718535468, |
|
"grad_norm": 0.22868825495243073, |
|
"learning_rate": 8.541550213737171e-06, |
|
"loss": 1.3417, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 8.791762013729977, |
|
"grad_norm": 0.22654058039188385, |
|
"learning_rate": 8.220722329479346e-06, |
|
"loss": 1.3458, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 8.814645308924485, |
|
"grad_norm": 0.22552727162837982, |
|
"learning_rate": 7.905777244947954e-06, |
|
"loss": 1.3444, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 8.837528604118994, |
|
"grad_norm": 0.22772538661956787, |
|
"learning_rate": 7.5967351472582275e-06, |
|
"loss": 1.355, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 8.860411899313501, |
|
"grad_norm": 0.2261328399181366, |
|
"learning_rate": 7.293615845160196e-06, |
|
"loss": 1.3314, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 8.883295194508008, |
|
"grad_norm": 0.23299558460712433, |
|
"learning_rate": 6.99643876776891e-06, |
|
"loss": 1.3461, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 8.906178489702517, |
|
"grad_norm": 0.22399510443210602, |
|
"learning_rate": 6.705222963319191e-06, |
|
"loss": 1.3412, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 8.929061784897025, |
|
"grad_norm": 0.22913098335266113, |
|
"learning_rate": 6.419987097944579e-06, |
|
"loss": 1.3519, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 8.951945080091534, |
|
"grad_norm": 0.22438156604766846, |
|
"learning_rate": 6.140749454480932e-06, |
|
"loss": 1.3513, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 8.974828375286041, |
|
"grad_norm": 0.23290272057056427, |
|
"learning_rate": 5.867527931294614e-06, |
|
"loss": 1.3495, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 8.99771167048055, |
|
"grad_norm": 0.2285740226507187, |
|
"learning_rate": 5.6003400411351325e-06, |
|
"loss": 1.3562, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.517554998397827, |
|
"eval_runtime": 0.1572, |
|
"eval_samples_per_second": 63.605, |
|
"eval_steps_per_second": 6.36, |
|
"step": 1971 |
|
}, |
|
{ |
|
"epoch": 9.018306636155607, |
|
"grad_norm": 0.22652767598628998, |
|
"learning_rate": 5.339202910012708e-06, |
|
"loss": 1.3499, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 9.041189931350115, |
|
"grad_norm": 0.2265496850013733, |
|
"learning_rate": 5.0841332761005e-06, |
|
"loss": 1.3514, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 9.064073226544622, |
|
"grad_norm": 0.22489045560359955, |
|
"learning_rate": 4.835147488661795e-06, |
|
"loss": 1.3402, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 9.08695652173913, |
|
"grad_norm": 0.23145288228988647, |
|
"learning_rate": 4.592261507001993e-06, |
|
"loss": 1.343, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 9.109839816933638, |
|
"grad_norm": 0.228094220161438, |
|
"learning_rate": 4.355490899445691e-06, |
|
"loss": 1.3422, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 9.132723112128147, |
|
"grad_norm": 0.23417501151561737, |
|
"learning_rate": 4.124850842338779e-06, |
|
"loss": 1.3512, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 9.155606407322654, |
|
"grad_norm": 0.2259424477815628, |
|
"learning_rate": 3.900356119075743e-06, |
|
"loss": 1.3428, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 9.178489702517162, |
|
"grad_norm": 0.22936026751995087, |
|
"learning_rate": 3.6820211191520125e-06, |
|
"loss": 1.343, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 9.20137299771167, |
|
"grad_norm": 0.2265051156282425, |
|
"learning_rate": 3.469859837241651e-06, |
|
"loss": 1.3517, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 9.224256292906178, |
|
"grad_norm": 0.22797267138957977, |
|
"learning_rate": 3.263885872300343e-06, |
|
"loss": 1.3497, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 9.247139588100687, |
|
"grad_norm": 0.2263391762971878, |
|
"learning_rate": 3.064112426693799e-06, |
|
"loss": 1.3381, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 9.270022883295194, |
|
"grad_norm": 0.23144488036632538, |
|
"learning_rate": 2.8705523053513816e-06, |
|
"loss": 1.3453, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 9.292906178489703, |
|
"grad_norm": 0.2310468852519989, |
|
"learning_rate": 2.6832179149454793e-06, |
|
"loss": 1.338, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 9.31578947368421, |
|
"grad_norm": 0.2297661006450653, |
|
"learning_rate": 2.502121263096224e-06, |
|
"loss": 1.3424, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 9.338672768878718, |
|
"grad_norm": 0.23045673966407776, |
|
"learning_rate": 2.3272739576017945e-06, |
|
"loss": 1.3525, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 9.361556064073227, |
|
"grad_norm": 0.22856095433235168, |
|
"learning_rate": 2.1586872056944428e-06, |
|
"loss": 1.3441, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 9.384439359267734, |
|
"grad_norm": 0.23274967074394226, |
|
"learning_rate": 1.996371813322129e-06, |
|
"loss": 1.355, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 9.407322654462243, |
|
"grad_norm": 0.22686432301998138, |
|
"learning_rate": 1.840338184455881e-06, |
|
"loss": 1.341, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 9.43020594965675, |
|
"grad_norm": 0.22302532196044922, |
|
"learning_rate": 1.6905963204229436e-06, |
|
"loss": 1.3573, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 9.453089244851258, |
|
"grad_norm": 0.2239583283662796, |
|
"learning_rate": 1.5471558192656777e-06, |
|
"loss": 1.3357, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 9.475972540045767, |
|
"grad_norm": 0.22839844226837158, |
|
"learning_rate": 1.4100258751264195e-06, |
|
"loss": 1.3387, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 9.498855835240274, |
|
"grad_norm": 0.22954653203487396, |
|
"learning_rate": 1.2792152776580968e-06, |
|
"loss": 1.3428, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 9.521739130434783, |
|
"grad_norm": 0.22747966647148132, |
|
"learning_rate": 1.1547324114608904e-06, |
|
"loss": 1.3396, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 9.54462242562929, |
|
"grad_norm": 0.2284754067659378, |
|
"learning_rate": 1.036585255544764e-06, |
|
"loss": 1.3472, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 9.5675057208238, |
|
"grad_norm": 0.22196947038173676, |
|
"learning_rate": 9.247813828180407e-07, |
|
"loss": 1.3454, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 9.590389016018307, |
|
"grad_norm": 0.22999419271945953, |
|
"learning_rate": 8.193279596020121e-07, |
|
"loss": 1.3431, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 9.613272311212814, |
|
"grad_norm": 0.23028169572353363, |
|
"learning_rate": 7.202317451716067e-07, |
|
"loss": 1.3277, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 9.636155606407323, |
|
"grad_norm": 0.22792372107505798, |
|
"learning_rate": 6.274990913221035e-07, |
|
"loss": 1.3374, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 9.65903890160183, |
|
"grad_norm": 0.22512663900852203, |
|
"learning_rate": 5.411359419620232e-07, |
|
"loss": 1.3503, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 9.68192219679634, |
|
"grad_norm": 0.22875413298606873, |
|
"learning_rate": 4.6114783273213393e-07, |
|
"loss": 1.3451, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 9.704805491990847, |
|
"grad_norm": 0.22629615664482117, |
|
"learning_rate": 3.8753989065064557e-07, |
|
"loss": 1.3315, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 9.727688787185354, |
|
"grad_norm": 0.22211916744709015, |
|
"learning_rate": 3.203168337845508e-07, |
|
"loss": 1.35, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 9.750572082379863, |
|
"grad_norm": 0.22795149683952332, |
|
"learning_rate": 2.594829709472446e-07, |
|
"loss": 1.3314, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 9.77345537757437, |
|
"grad_norm": 0.231648787856102, |
|
"learning_rate": 2.05042201422323e-07, |
|
"loss": 1.3568, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 9.79633867276888, |
|
"grad_norm": 0.22703665494918823, |
|
"learning_rate": 1.5699801471364962e-07, |
|
"loss": 1.3411, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 9.819221967963387, |
|
"grad_norm": 0.2311718612909317, |
|
"learning_rate": 1.1535349032167908e-07, |
|
"loss": 1.3456, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 9.842105263157894, |
|
"grad_norm": 0.2319943606853485, |
|
"learning_rate": 8.011129754611491e-08, |
|
"loss": 1.3507, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 9.864988558352403, |
|
"grad_norm": 0.2376013547182083, |
|
"learning_rate": 5.127369531473525e-08, |
|
"loss": 1.3372, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 9.88787185354691, |
|
"grad_norm": 0.2285187989473343, |
|
"learning_rate": 2.884253203869758e-08, |
|
"loss": 1.3469, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 9.91075514874142, |
|
"grad_norm": 0.22958822548389435, |
|
"learning_rate": 1.2819245493955744e-08, |
|
"loss": 1.3467, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 9.933638443935926, |
|
"grad_norm": 0.22622478008270264, |
|
"learning_rate": 3.2048627292113887e-09, |
|
"loss": 1.3452, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 9.956521739130435, |
|
"grad_norm": 0.2237871289253235, |
|
"learning_rate": 0.0, |
|
"loss": 1.3538, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 9.956521739130435, |
|
"eval_loss": 2.5189335346221924, |
|
"eval_runtime": 0.1831, |
|
"eval_samples_per_second": 54.615, |
|
"eval_steps_per_second": 5.461, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 9.956521739130435, |
|
"step": 2180, |
|
"total_flos": 2.4191218891641324e+18, |
|
"train_loss": 1.418152675716155, |
|
"train_runtime": 3547.1741, |
|
"train_samples_per_second": 39.395, |
|
"train_steps_per_second": 0.615 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2180, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.4191218891641324e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|