pszemraj commited on
Commit
b9aabb1
·
verified ·
1 Parent(s): d85a90d

Upload folder using huggingface_hub

Browse files
checkpoints/checkpoint-pt-15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46e9faee945f5542200559bcaccc7a2e3b11b0f078df0428d616c77f253e2040
3
+ size 1202681712
checkpoints/checkpoint-pt-15000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/checkpoint-pt-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5dd0359555b88eb0765a95933341da04a05fdb83ad333e6c793bb2a3bdaa022
3
+ size 1202681712
checkpoints/checkpoint-pt-20000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/checkpoint-pt-25000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4217365d5d5a1057269178d91516ab1fe8df45f10eada348f61bd8c0248ffcb
3
+ size 1202681712
checkpoints/checkpoint-pt-25000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/main.log CHANGED
@@ -244,3 +244,333 @@ Mixed precision type: bf16
244
  [2024-08-09 22:44:33,158][Main][INFO] - [train] Step 10950 out of 80000 | Loss --> 2.581 | Grad_l2 --> 0.595 | Weights_l2 --> 8703.360 | Lr --> 0.008 | Seconds_per_step --> 4.271 |
245
  [2024-08-09 22:48:11,589][Main][INFO] - [train] Step 11000 out of 80000 | Loss --> 2.580 | Grad_l2 --> 0.601 | Weights_l2 --> 8704.410 | Lr --> 0.008 | Seconds_per_step --> 4.369 |
246
  [2024-08-09 22:51:45,840][Main][INFO] - [train] Step 11050 out of 80000 | Loss --> 2.578 | Grad_l2 --> 0.587 | Weights_l2 --> 8705.448 | Lr --> 0.008 | Seconds_per_step --> 4.285 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  [2024-08-09 22:44:33,158][Main][INFO] - [train] Step 10950 out of 80000 | Loss --> 2.581 | Grad_l2 --> 0.595 | Weights_l2 --> 8703.360 | Lr --> 0.008 | Seconds_per_step --> 4.271 |
245
  [2024-08-09 22:48:11,589][Main][INFO] - [train] Step 11000 out of 80000 | Loss --> 2.580 | Grad_l2 --> 0.601 | Weights_l2 --> 8704.410 | Lr --> 0.008 | Seconds_per_step --> 4.369 |
246
  [2024-08-09 22:51:45,840][Main][INFO] - [train] Step 11050 out of 80000 | Loss --> 2.578 | Grad_l2 --> 0.587 | Weights_l2 --> 8705.448 | Lr --> 0.008 | Seconds_per_step --> 4.285 |
247
+ [2024-08-09 22:55:25,388][Main][INFO] - [train] Step 11100 out of 80000 | Loss --> 2.574 | Grad_l2 --> 0.599 | Weights_l2 --> 8706.475 | Lr --> 0.008 | Seconds_per_step --> 4.391 |
248
+ [2024-08-09 22:58:56,339][Main][INFO] - [train] Step 11150 out of 80000 | Loss --> 2.574 | Grad_l2 --> 0.599 | Weights_l2 --> 8707.487 | Lr --> 0.008 | Seconds_per_step --> 4.219 |
249
+ [2024-08-09 23:02:28,434][Main][INFO] - [train] Step 11200 out of 80000 | Loss --> 2.577 | Grad_l2 --> 0.600 | Weights_l2 --> 8708.529 | Lr --> 0.008 | Seconds_per_step --> 4.242 |
250
+ [2024-08-09 23:06:01,747][Main][INFO] - [train] Step 11250 out of 80000 | Loss --> 2.563 | Grad_l2 --> 0.582 | Weights_l2 --> 8709.582 | Lr --> 0.008 | Seconds_per_step --> 4.266 |
251
+ [2024-08-09 23:09:36,821][Main][INFO] - [train] Step 11300 out of 80000 | Loss --> 2.567 | Grad_l2 --> 0.559 | Weights_l2 --> 8710.620 | Lr --> 0.008 | Seconds_per_step --> 4.301 |
252
+ [2024-08-09 23:13:05,158][Main][INFO] - [train] Step 11350 out of 80000 | Loss --> 2.561 | Grad_l2 --> 0.598 | Weights_l2 --> 8711.669 | Lr --> 0.008 | Seconds_per_step --> 4.167 |
253
+ [2024-08-09 23:16:34,505][Main][INFO] - [train] Step 11400 out of 80000 | Loss --> 2.555 | Grad_l2 --> 0.588 | Weights_l2 --> 8712.697 | Lr --> 0.008 | Seconds_per_step --> 4.187 |
254
+ [2024-08-09 23:20:05,626][Main][INFO] - [train] Step 11450 out of 80000 | Loss --> 2.546 | Grad_l2 --> 0.582 | Weights_l2 --> 8713.753 | Lr --> 0.008 | Seconds_per_step --> 4.222 |
255
+ [2024-08-09 23:23:40,137][Main][INFO] - [train] Step 11500 out of 80000 | Loss --> 2.549 | Grad_l2 --> 0.583 | Weights_l2 --> 8714.804 | Lr --> 0.008 | Seconds_per_step --> 4.290 |
256
+ [2024-08-09 23:27:11,574][Main][INFO] - [train] Step 11550 out of 80000 | Loss --> 2.536 | Grad_l2 --> 0.582 | Weights_l2 --> 8715.826 | Lr --> 0.008 | Seconds_per_step --> 4.229 |
257
+ [2024-08-09 23:30:49,636][Main][INFO] - [train] Step 11600 out of 80000 | Loss --> 2.538 | Grad_l2 --> 0.576 | Weights_l2 --> 8716.881 | Lr --> 0.008 | Seconds_per_step --> 4.361 |
258
+ [2024-08-09 23:34:19,586][Main][INFO] - [train] Step 11650 out of 80000 | Loss --> 2.539 | Grad_l2 --> 0.580 | Weights_l2 --> 8717.926 | Lr --> 0.008 | Seconds_per_step --> 4.199 |
259
+ [2024-08-09 23:37:49,139][Main][INFO] - [train] Step 11700 out of 80000 | Loss --> 2.524 | Grad_l2 --> 0.585 | Weights_l2 --> 8718.968 | Lr --> 0.008 | Seconds_per_step --> 4.191 |
260
+ [2024-08-09 23:41:15,748][Main][INFO] - [train] Step 11750 out of 80000 | Loss --> 2.531 | Grad_l2 --> 0.601 | Weights_l2 --> 8720.024 | Lr --> 0.008 | Seconds_per_step --> 4.132 |
261
+ [2024-08-09 23:44:46,392][Main][INFO] - [train] Step 11800 out of 80000 | Loss --> 2.519 | Grad_l2 --> 0.586 | Weights_l2 --> 8721.064 | Lr --> 0.008 | Seconds_per_step --> 4.213 |
262
+ [2024-08-09 23:48:21,044][Main][INFO] - [train] Step 11850 out of 80000 | Loss --> 2.516 | Grad_l2 --> 0.576 | Weights_l2 --> 8722.098 | Lr --> 0.008 | Seconds_per_step --> 4.293 |
263
+ [2024-08-09 23:51:45,233][Main][INFO] - [train] Step 11900 out of 80000 | Loss --> 2.509 | Grad_l2 --> 0.566 | Weights_l2 --> 8723.110 | Lr --> 0.008 | Seconds_per_step --> 4.084 |
264
+ [2024-08-09 23:55:18,373][Main][INFO] - [train] Step 11950 out of 80000 | Loss --> 2.508 | Grad_l2 --> 0.605 | Weights_l2 --> 8724.151 | Lr --> 0.008 | Seconds_per_step --> 4.263 |
265
+ [2024-08-09 23:58:51,710][Main][INFO] - [train] Step 12000 out of 80000 | Loss --> 2.510 | Grad_l2 --> 0.587 | Weights_l2 --> 8725.199 | Lr --> 0.008 | Seconds_per_step --> 4.267 |
266
+ [2024-08-10 00:02:27,062][Main][INFO] - [train] Step 12050 out of 80000 | Loss --> 2.502 | Grad_l2 --> 0.573 | Weights_l2 --> 8726.242 | Lr --> 0.008 | Seconds_per_step --> 4.307 |
267
+ [2024-08-10 00:05:54,127][Main][INFO] - [train] Step 12100 out of 80000 | Loss --> 2.496 | Grad_l2 --> 0.583 | Weights_l2 --> 8727.250 | Lr --> 0.008 | Seconds_per_step --> 4.141 |
268
+ [2024-08-10 00:09:20,349][Main][INFO] - [train] Step 12150 out of 80000 | Loss --> 2.499 | Grad_l2 --> 0.553 | Weights_l2 --> 8728.275 | Lr --> 0.008 | Seconds_per_step --> 4.124 |
269
+ [2024-08-10 00:12:28,941][Main][INFO] - [train] Step 12200 out of 80000 | Loss --> 2.503 | Grad_l2 --> 0.561 | Weights_l2 --> 8729.279 | Lr --> 0.008 | Seconds_per_step --> 3.772 |
270
+ [2024-08-10 00:15:19,261][Main][INFO] - [train] Step 12250 out of 80000 | Loss --> 2.494 | Grad_l2 --> 0.590 | Weights_l2 --> 8730.313 | Lr --> 0.008 | Seconds_per_step --> 3.406 |
271
+ [2024-08-10 00:18:09,129][Main][INFO] - [train] Step 12300 out of 80000 | Loss --> 2.490 | Grad_l2 --> 0.552 | Weights_l2 --> 8731.341 | Lr --> 0.008 | Seconds_per_step --> 3.397 |
272
+ [2024-08-10 00:20:58,085][Main][INFO] - [train] Step 12350 out of 80000 | Loss --> 2.487 | Grad_l2 --> 0.548 | Weights_l2 --> 8732.401 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
273
+ [2024-08-10 00:23:47,642][Main][INFO] - [train] Step 12400 out of 80000 | Loss --> 2.480 | Grad_l2 --> 0.542 | Weights_l2 --> 8733.439 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
274
+ [2024-08-10 00:26:37,898][Main][INFO] - [train] Step 12450 out of 80000 | Loss --> 2.481 | Grad_l2 --> 0.551 | Weights_l2 --> 8734.469 | Lr --> 0.008 | Seconds_per_step --> 3.405 |
275
+ [2024-08-10 00:29:27,451][Main][INFO] - [train] Step 12500 out of 80000 | Loss --> 2.477 | Grad_l2 --> 0.558 | Weights_l2 --> 8735.510 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
276
+ [2024-08-10 00:32:17,116][Main][INFO] - [train] Step 12550 out of 80000 | Loss --> 2.478 | Grad_l2 --> 0.549 | Weights_l2 --> 8736.541 | Lr --> 0.008 | Seconds_per_step --> 3.393 |
277
+ [2024-08-10 00:35:06,730][Main][INFO] - [train] Step 12600 out of 80000 | Loss --> 2.470 | Grad_l2 --> 0.545 | Weights_l2 --> 8737.575 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
278
+ [2024-08-10 00:37:58,202][Main][INFO] - [train] Step 12650 out of 80000 | Loss --> 2.471 | Grad_l2 --> 0.547 | Weights_l2 --> 8738.595 | Lr --> 0.008 | Seconds_per_step --> 3.429 |
279
+ [2024-08-10 00:40:47,794][Main][INFO] - [train] Step 12700 out of 80000 | Loss --> 2.462 | Grad_l2 --> 0.528 | Weights_l2 --> 8739.622 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
280
+ [2024-08-10 00:43:37,447][Main][INFO] - [train] Step 12750 out of 80000 | Loss --> 2.457 | Grad_l2 --> 0.533 | Weights_l2 --> 8740.657 | Lr --> 0.008 | Seconds_per_step --> 3.393 |
281
+ [2024-08-10 00:46:25,836][Main][INFO] - [train] Step 12800 out of 80000 | Loss --> 2.461 | Grad_l2 --> 0.549 | Weights_l2 --> 8741.689 | Lr --> 0.008 | Seconds_per_step --> 3.368 |
282
+ [2024-08-10 00:49:16,460][Main][INFO] - [train] Step 12850 out of 80000 | Loss --> 2.451 | Grad_l2 --> 0.531 | Weights_l2 --> 8742.743 | Lr --> 0.008 | Seconds_per_step --> 3.412 |
283
+ [2024-08-10 00:52:06,465][Main][INFO] - [train] Step 12900 out of 80000 | Loss --> 2.453 | Grad_l2 --> 0.527 | Weights_l2 --> 8743.761 | Lr --> 0.008 | Seconds_per_step --> 3.400 |
284
+ [2024-08-10 00:54:55,879][Main][INFO] - [train] Step 12950 out of 80000 | Loss --> 2.447 | Grad_l2 --> 0.520 | Weights_l2 --> 8744.791 | Lr --> 0.008 | Seconds_per_step --> 3.388 |
285
+ [2024-08-10 00:57:44,034][Main][INFO] - [train] Step 13000 out of 80000 | Loss --> 2.448 | Grad_l2 --> 0.539 | Weights_l2 --> 8745.805 | Lr --> 0.008 | Seconds_per_step --> 3.363 |
286
+ [2024-08-10 01:00:33,641][Main][INFO] - [train] Step 13050 out of 80000 | Loss --> 2.439 | Grad_l2 --> 0.511 | Weights_l2 --> 8746.858 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
287
+ [2024-08-10 01:03:22,747][Main][INFO] - [train] Step 13100 out of 80000 | Loss --> 2.436 | Grad_l2 --> 0.524 | Weights_l2 --> 8747.888 | Lr --> 0.008 | Seconds_per_step --> 3.382 |
288
+ [2024-08-10 01:06:11,723][Main][INFO] - [train] Step 13150 out of 80000 | Loss --> 2.438 | Grad_l2 --> 0.525 | Weights_l2 --> 8748.918 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
289
+ [2024-08-10 01:09:00,218][Main][INFO] - [train] Step 13200 out of 80000 | Loss --> 2.436 | Grad_l2 --> 0.519 | Weights_l2 --> 8749.966 | Lr --> 0.008 | Seconds_per_step --> 3.370 |
290
+ [2024-08-10 01:11:49,462][Main][INFO] - [train] Step 13250 out of 80000 | Loss --> 2.437 | Grad_l2 --> 0.511 | Weights_l2 --> 8751.004 | Lr --> 0.008 | Seconds_per_step --> 3.385 |
291
+ [2024-08-10 01:14:38,139][Main][INFO] - [train] Step 13300 out of 80000 | Loss --> 2.428 | Grad_l2 --> 0.512 | Weights_l2 --> 8752.040 | Lr --> 0.008 | Seconds_per_step --> 3.374 |
292
+ [2024-08-10 01:17:26,878][Main][INFO] - [train] Step 13350 out of 80000 | Loss --> 2.426 | Grad_l2 --> 0.514 | Weights_l2 --> 8753.062 | Lr --> 0.008 | Seconds_per_step --> 3.375 |
293
+ [2024-08-10 01:20:15,034][Main][INFO] - [train] Step 13400 out of 80000 | Loss --> 2.429 | Grad_l2 --> 0.509 | Weights_l2 --> 8754.109 | Lr --> 0.008 | Seconds_per_step --> 3.363 |
294
+ [2024-08-10 01:23:03,363][Main][INFO] - [train] Step 13450 out of 80000 | Loss --> 2.423 | Grad_l2 --> 0.511 | Weights_l2 --> 8755.150 | Lr --> 0.008 | Seconds_per_step --> 3.367 |
295
+ [2024-08-10 01:25:52,469][Main][INFO] - [train] Step 13500 out of 80000 | Loss --> 2.413 | Grad_l2 --> 0.502 | Weights_l2 --> 8756.209 | Lr --> 0.008 | Seconds_per_step --> 3.382 |
296
+ [2024-08-10 01:28:40,740][Main][INFO] - [train] Step 13550 out of 80000 | Loss --> 2.422 | Grad_l2 --> 0.504 | Weights_l2 --> 8757.222 | Lr --> 0.008 | Seconds_per_step --> 3.365 |
297
+ [2024-08-10 01:31:29,023][Main][INFO] - [train] Step 13600 out of 80000 | Loss --> 2.415 | Grad_l2 --> 0.495 | Weights_l2 --> 8758.279 | Lr --> 0.008 | Seconds_per_step --> 3.366 |
298
+ [2024-08-10 01:34:18,913][Main][INFO] - [train] Step 13650 out of 80000 | Loss --> 2.420 | Grad_l2 --> 0.505 | Weights_l2 --> 8759.320 | Lr --> 0.008 | Seconds_per_step --> 3.398 |
299
+ [2024-08-10 01:37:08,507][Main][INFO] - [train] Step 13700 out of 80000 | Loss --> 2.417 | Grad_l2 --> 0.500 | Weights_l2 --> 8760.334 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
300
+ [2024-08-10 01:39:56,547][Main][INFO] - [train] Step 13750 out of 80000 | Loss --> 2.406 | Grad_l2 --> 0.495 | Weights_l2 --> 8761.384 | Lr --> 0.008 | Seconds_per_step --> 3.361 |
301
+ [2024-08-10 01:42:44,437][Main][INFO] - [train] Step 13800 out of 80000 | Loss --> 2.404 | Grad_l2 --> 0.501 | Weights_l2 --> 8762.410 | Lr --> 0.008 | Seconds_per_step --> 3.358 |
302
+ [2024-08-10 01:45:33,358][Main][INFO] - [train] Step 13850 out of 80000 | Loss --> 2.397 | Grad_l2 --> 0.502 | Weights_l2 --> 8763.443 | Lr --> 0.008 | Seconds_per_step --> 3.378 |
303
+ [2024-08-10 01:48:22,144][Main][INFO] - [train] Step 13900 out of 80000 | Loss --> 2.389 | Grad_l2 --> 0.492 | Weights_l2 --> 8764.465 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
304
+ [2024-08-10 01:51:09,910][Main][INFO] - [train] Step 13950 out of 80000 | Loss --> 2.391 | Grad_l2 --> 0.502 | Weights_l2 --> 8765.511 | Lr --> 0.008 | Seconds_per_step --> 3.355 |
305
+ [2024-08-10 01:53:58,677][Main][INFO] - [train] Step 14000 out of 80000 | Loss --> 2.388 | Grad_l2 --> 0.497 | Weights_l2 --> 8766.530 | Lr --> 0.008 | Seconds_per_step --> 3.375 |
306
+ [2024-08-10 01:56:48,499][Main][INFO] - [train] Step 14050 out of 80000 | Loss --> 2.373 | Grad_l2 --> 0.491 | Weights_l2 --> 8767.573 | Lr --> 0.008 | Seconds_per_step --> 3.396 |
307
+ [2024-08-10 01:59:38,047][Main][INFO] - [train] Step 14100 out of 80000 | Loss --> 2.377 | Grad_l2 --> 0.503 | Weights_l2 --> 8768.588 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
308
+ [2024-08-10 02:02:27,734][Main][INFO] - [train] Step 14150 out of 80000 | Loss --> 2.378 | Grad_l2 --> 0.488 | Weights_l2 --> 8769.605 | Lr --> 0.008 | Seconds_per_step --> 3.394 |
309
+ [2024-08-10 02:05:16,770][Main][INFO] - [train] Step 14200 out of 80000 | Loss --> 2.368 | Grad_l2 --> 0.496 | Weights_l2 --> 8770.616 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
310
+ [2024-08-10 02:08:05,603][Main][INFO] - [train] Step 14250 out of 80000 | Loss --> 2.373 | Grad_l2 --> 0.488 | Weights_l2 --> 8771.662 | Lr --> 0.008 | Seconds_per_step --> 3.377 |
311
+ [2024-08-10 02:10:54,613][Main][INFO] - [train] Step 14300 out of 80000 | Loss --> 2.381 | Grad_l2 --> 0.490 | Weights_l2 --> 8772.676 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
312
+ [2024-08-10 02:13:44,831][Main][INFO] - [train] Step 14350 out of 80000 | Loss --> 2.371 | Grad_l2 --> 0.483 | Weights_l2 --> 8773.704 | Lr --> 0.008 | Seconds_per_step --> 3.404 |
313
+ [2024-08-10 02:16:33,316][Main][INFO] - [train] Step 14400 out of 80000 | Loss --> 2.377 | Grad_l2 --> 0.487 | Weights_l2 --> 8774.735 | Lr --> 0.008 | Seconds_per_step --> 3.370 |
314
+ [2024-08-10 02:19:21,805][Main][INFO] - [train] Step 14450 out of 80000 | Loss --> 2.373 | Grad_l2 --> 0.482 | Weights_l2 --> 8775.731 | Lr --> 0.008 | Seconds_per_step --> 3.370 |
315
+ [2024-08-10 02:22:11,796][Main][INFO] - [train] Step 14500 out of 80000 | Loss --> 2.369 | Grad_l2 --> 0.499 | Weights_l2 --> 8776.744 | Lr --> 0.008 | Seconds_per_step --> 3.400 |
316
+ [2024-08-10 02:25:01,398][Main][INFO] - [train] Step 14550 out of 80000 | Loss --> 2.364 | Grad_l2 --> 0.485 | Weights_l2 --> 8777.791 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
317
+ [2024-08-10 02:27:51,146][Main][INFO] - [train] Step 14600 out of 80000 | Loss --> 2.369 | Grad_l2 --> 0.481 | Weights_l2 --> 8778.816 | Lr --> 0.008 | Seconds_per_step --> 3.395 |
318
+ [2024-08-10 02:30:40,279][Main][INFO] - [train] Step 14650 out of 80000 | Loss --> 2.373 | Grad_l2 --> 0.486 | Weights_l2 --> 8779.856 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
319
+ [2024-08-10 02:33:30,596][Main][INFO] - [train] Step 14700 out of 80000 | Loss --> 2.368 | Grad_l2 --> 0.488 | Weights_l2 --> 8780.880 | Lr --> 0.008 | Seconds_per_step --> 3.406 |
320
+ [2024-08-10 02:36:19,985][Main][INFO] - [train] Step 14750 out of 80000 | Loss --> 2.364 | Grad_l2 --> 0.480 | Weights_l2 --> 8781.909 | Lr --> 0.008 | Seconds_per_step --> 3.388 |
321
+ [2024-08-10 02:39:09,113][Main][INFO] - [train] Step 14800 out of 80000 | Loss --> 2.355 | Grad_l2 --> 0.490 | Weights_l2 --> 8782.954 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
322
+ [2024-08-10 02:41:58,266][Main][INFO] - [train] Step 14850 out of 80000 | Loss --> 2.363 | Grad_l2 --> 0.486 | Weights_l2 --> 8783.980 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
323
+ [2024-08-10 02:44:47,795][Main][INFO] - [train] Step 14900 out of 80000 | Loss --> 2.364 | Grad_l2 --> 0.479 | Weights_l2 --> 8784.994 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
324
+ [2024-08-10 02:47:37,307][Main][INFO] - [train] Step 14950 out of 80000 | Loss --> 2.362 | Grad_l2 --> 0.477 | Weights_l2 --> 8786.006 | Lr --> 0.008 | Seconds_per_step --> 3.390 |
325
+ [2024-08-10 02:50:26,779][Main][INFO] - [train] Step 15000 out of 80000 | Loss --> 2.358 | Grad_l2 --> 0.483 | Weights_l2 --> 8787.026 | Lr --> 0.008 | Seconds_per_step --> 3.389 |
326
+ [2024-08-10 02:50:26,780][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-15000
327
+ [2024-08-10 02:50:26,783][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
328
+ [2024-08-10 02:50:28,800][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-15000/model.safetensors
329
+ [2024-08-10 02:50:31,549][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-15000/optimizer.bin
330
+ [2024-08-10 02:50:31,550][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-15000/scheduler.bin
331
+ [2024-08-10 02:50:31,550][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-15000/sampler.bin
332
+ [2024-08-10 02:50:31,550][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-15000/sampler_1.bin
333
+ [2024-08-10 02:50:31,551][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-15000/random_states_0.pkl
334
+ [2024-08-10 02:53:20,359][Main][INFO] - [train] Step 15050 out of 80000 | Loss --> 2.367 | Grad_l2 --> 0.472 | Weights_l2 --> 8788.057 | Lr --> 0.008 | Seconds_per_step --> 3.472 |
335
+ [2024-08-10 02:56:09,018][Main][INFO] - [train] Step 15100 out of 80000 | Loss --> 2.358 | Grad_l2 --> 0.482 | Weights_l2 --> 8789.093 | Lr --> 0.008 | Seconds_per_step --> 3.373 |
336
+ [2024-08-10 02:58:59,528][Main][INFO] - [train] Step 15150 out of 80000 | Loss --> 2.357 | Grad_l2 --> 0.474 | Weights_l2 --> 8790.099 | Lr --> 0.008 | Seconds_per_step --> 3.410 |
337
+ [2024-08-10 03:01:49,372][Main][INFO] - [train] Step 15200 out of 80000 | Loss --> 2.361 | Grad_l2 --> 0.472 | Weights_l2 --> 8791.113 | Lr --> 0.008 | Seconds_per_step --> 3.397 |
338
+ [2024-08-10 03:04:37,075][Main][INFO] - [train] Step 15250 out of 80000 | Loss --> 2.350 | Grad_l2 --> 0.478 | Weights_l2 --> 8792.148 | Lr --> 0.008 | Seconds_per_step --> 3.354 |
339
+ [2024-08-10 03:07:24,386][Main][INFO] - [train] Step 15300 out of 80000 | Loss --> 2.356 | Grad_l2 --> 0.479 | Weights_l2 --> 8793.148 | Lr --> 0.008 | Seconds_per_step --> 3.346 |
340
+ [2024-08-10 03:10:13,428][Main][INFO] - [train] Step 15350 out of 80000 | Loss --> 2.350 | Grad_l2 --> 0.479 | Weights_l2 --> 8794.155 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
341
+ [2024-08-10 03:13:02,393][Main][INFO] - [train] Step 15400 out of 80000 | Loss --> 2.347 | Grad_l2 --> 0.469 | Weights_l2 --> 8795.175 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
342
+ [2024-08-10 03:15:51,829][Main][INFO] - [train] Step 15450 out of 80000 | Loss --> 2.347 | Grad_l2 --> 0.461 | Weights_l2 --> 8796.188 | Lr --> 0.008 | Seconds_per_step --> 3.389 |
343
+ [2024-08-10 03:18:41,239][Main][INFO] - [train] Step 15500 out of 80000 | Loss --> 2.349 | Grad_l2 --> 0.468 | Weights_l2 --> 8797.217 | Lr --> 0.008 | Seconds_per_step --> 3.388 |
344
+ [2024-08-10 03:21:30,852][Main][INFO] - [train] Step 15550 out of 80000 | Loss --> 2.341 | Grad_l2 --> 0.466 | Weights_l2 --> 8798.212 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
345
+ [2024-08-10 03:24:19,122][Main][INFO] - [train] Step 15600 out of 80000 | Loss --> 2.345 | Grad_l2 --> 0.472 | Weights_l2 --> 8799.202 | Lr --> 0.008 | Seconds_per_step --> 3.365 |
346
+ [2024-08-10 03:27:08,990][Main][INFO] - [train] Step 15650 out of 80000 | Loss --> 2.350 | Grad_l2 --> 0.470 | Weights_l2 --> 8800.214 | Lr --> 0.008 | Seconds_per_step --> 3.397 |
347
+ [2024-08-10 03:29:58,136][Main][INFO] - [train] Step 15700 out of 80000 | Loss --> 2.338 | Grad_l2 --> 0.473 | Weights_l2 --> 8801.228 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
348
+ [2024-08-10 03:32:47,841][Main][INFO] - [train] Step 15750 out of 80000 | Loss --> 2.335 | Grad_l2 --> 0.456 | Weights_l2 --> 8802.245 | Lr --> 0.008 | Seconds_per_step --> 3.394 |
349
+ [2024-08-10 03:35:36,029][Main][INFO] - [train] Step 15800 out of 80000 | Loss --> 2.332 | Grad_l2 --> 0.454 | Weights_l2 --> 8803.247 | Lr --> 0.008 | Seconds_per_step --> 3.364 |
350
+ [2024-08-10 03:38:25,696][Main][INFO] - [train] Step 15850 out of 80000 | Loss --> 2.329 | Grad_l2 --> 0.468 | Weights_l2 --> 8804.255 | Lr --> 0.008 | Seconds_per_step --> 3.393 |
351
+ [2024-08-10 03:41:14,705][Main][INFO] - [train] Step 15900 out of 80000 | Loss --> 2.344 | Grad_l2 --> 0.771 | Weights_l2 --> 8805.210 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
352
+ [2024-08-10 03:44:05,016][Main][INFO] - [train] Step 15950 out of 80000 | Loss --> 2.336 | Grad_l2 --> 0.468 | Weights_l2 --> 8806.198 | Lr --> 0.008 | Seconds_per_step --> 3.406 |
353
+ [2024-08-10 03:46:54,039][Main][INFO] - [train] Step 16000 out of 80000 | Loss --> 2.322 | Grad_l2 --> 0.466 | Weights_l2 --> 8807.208 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
354
+ [2024-08-10 03:49:43,020][Main][INFO] - [train] Step 16050 out of 80000 | Loss --> 2.327 | Grad_l2 --> 0.461 | Weights_l2 --> 8808.179 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
355
+ [2024-08-10 03:52:31,751][Main][INFO] - [train] Step 16100 out of 80000 | Loss --> 2.335 | Grad_l2 --> 0.464 | Weights_l2 --> 8809.180 | Lr --> 0.008 | Seconds_per_step --> 3.375 |
356
+ [2024-08-10 03:55:21,300][Main][INFO] - [train] Step 16150 out of 80000 | Loss --> 2.332 | Grad_l2 --> 0.459 | Weights_l2 --> 8810.175 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
357
+ [2024-08-10 03:58:10,233][Main][INFO] - [train] Step 16200 out of 80000 | Loss --> 2.330 | Grad_l2 --> 0.459 | Weights_l2 --> 8811.153 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
358
+ [2024-08-10 04:00:58,805][Main][INFO] - [train] Step 16250 out of 80000 | Loss --> 2.328 | Grad_l2 --> 0.452 | Weights_l2 --> 8812.148 | Lr --> 0.008 | Seconds_per_step --> 3.371 |
359
+ [2024-08-10 04:03:47,753][Main][INFO] - [train] Step 16300 out of 80000 | Loss --> 2.327 | Grad_l2 --> 0.457 | Weights_l2 --> 8813.155 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
360
+ [2024-08-10 04:06:37,968][Main][INFO] - [train] Step 16350 out of 80000 | Loss --> 2.315 | Grad_l2 --> 0.457 | Weights_l2 --> 8814.120 | Lr --> 0.008 | Seconds_per_step --> 3.404 |
361
+ [2024-08-10 04:09:27,345][Main][INFO] - [train] Step 16400 out of 80000 | Loss --> 2.323 | Grad_l2 --> 0.451 | Weights_l2 --> 8815.101 | Lr --> 0.008 | Seconds_per_step --> 3.388 |
362
+ [2024-08-10 04:12:16,639][Main][INFO] - [train] Step 16450 out of 80000 | Loss --> 2.323 | Grad_l2 --> 0.454 | Weights_l2 --> 8816.109 | Lr --> 0.008 | Seconds_per_step --> 3.386 |
363
+ [2024-08-10 04:15:06,000][Main][INFO] - [train] Step 16500 out of 80000 | Loss --> 2.316 | Grad_l2 --> 0.461 | Weights_l2 --> 8817.094 | Lr --> 0.008 | Seconds_per_step --> 3.387 |
364
+ [2024-08-10 04:18:00,644][Main][INFO] - [train] Step 16550 out of 80000 | Loss --> 2.316 | Grad_l2 --> 0.454 | Weights_l2 --> 8818.060 | Lr --> 0.008 | Seconds_per_step --> 3.493 |
365
+ [2024-08-10 04:20:52,878][Main][INFO] - [train] Step 16600 out of 80000 | Loss --> 2.325 | Grad_l2 --> 0.447 | Weights_l2 --> 8819.026 | Lr --> 0.008 | Seconds_per_step --> 3.445 |
366
+ [2024-08-10 04:23:41,401][Main][INFO] - [train] Step 16650 out of 80000 | Loss --> 2.310 | Grad_l2 --> 0.456 | Weights_l2 --> 8820.003 | Lr --> 0.008 | Seconds_per_step --> 3.370 |
367
+ [2024-08-10 04:26:30,469][Main][INFO] - [train] Step 16700 out of 80000 | Loss --> 2.312 | Grad_l2 --> 0.451 | Weights_l2 --> 8821.005 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
368
+ [2024-08-10 04:29:19,628][Main][INFO] - [train] Step 16750 out of 80000 | Loss --> 2.324 | Grad_l2 --> 0.451 | Weights_l2 --> 8821.988 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
369
+ [2024-08-10 04:32:09,203][Main][INFO] - [train] Step 16800 out of 80000 | Loss --> 2.308 | Grad_l2 --> 0.450 | Weights_l2 --> 8822.952 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
370
+ [2024-08-10 04:35:05,100][Main][INFO] - [train] Step 16850 out of 80000 | Loss --> 2.294 | Grad_l2 --> 0.446 | Weights_l2 --> 8823.904 | Lr --> 0.008 | Seconds_per_step --> 3.518 |
371
+ [2024-08-10 04:37:58,342][Main][INFO] - [train] Step 16900 out of 80000 | Loss --> 2.310 | Grad_l2 --> 0.454 | Weights_l2 --> 8824.866 | Lr --> 0.008 | Seconds_per_step --> 3.465 |
372
+ [2024-08-10 04:40:55,063][Main][INFO] - [train] Step 16950 out of 80000 | Loss --> 2.294 | Grad_l2 --> 0.449 | Weights_l2 --> 8825.837 | Lr --> 0.008 | Seconds_per_step --> 3.534 |
373
+ [2024-08-10 04:44:25,073][Main][INFO] - [train] Step 17000 out of 80000 | Loss --> 2.298 | Grad_l2 --> 0.448 | Weights_l2 --> 8826.792 | Lr --> 0.008 | Seconds_per_step --> 4.200 |
374
+ [2024-08-10 04:47:13,391][Main][INFO] - [train] Step 17050 out of 80000 | Loss --> 2.300 | Grad_l2 --> 0.441 | Weights_l2 --> 8827.769 | Lr --> 0.008 | Seconds_per_step --> 3.366 |
375
+ [2024-08-10 04:50:17,595][Main][INFO] - [train] Step 17100 out of 80000 | Loss --> 2.300 | Grad_l2 --> 0.439 | Weights_l2 --> 8828.744 | Lr --> 0.008 | Seconds_per_step --> 3.684 |
376
+ [2024-08-10 04:53:21,981][Main][INFO] - [train] Step 17150 out of 80000 | Loss --> 2.300 | Grad_l2 --> 0.443 | Weights_l2 --> 8829.696 | Lr --> 0.008 | Seconds_per_step --> 3.688 |
377
+ [2024-08-10 04:56:15,559][Main][INFO] - [train] Step 17200 out of 80000 | Loss --> 2.301 | Grad_l2 --> 0.447 | Weights_l2 --> 8830.652 | Lr --> 0.008 | Seconds_per_step --> 3.472 |
378
+ [2024-08-10 04:59:19,644][Main][INFO] - [train] Step 17250 out of 80000 | Loss --> 2.299 | Grad_l2 --> 0.441 | Weights_l2 --> 8831.603 | Lr --> 0.008 | Seconds_per_step --> 3.682 |
379
+ [2024-08-10 05:03:08,540][Main][INFO] - [train] Step 17300 out of 80000 | Loss --> 2.298 | Grad_l2 --> 0.441 | Weights_l2 --> 8832.566 | Lr --> 0.008 | Seconds_per_step --> 4.578 |
380
+ [2024-08-10 05:06:12,612][Main][INFO] - [train] Step 17350 out of 80000 | Loss --> 2.292 | Grad_l2 --> 0.442 | Weights_l2 --> 8833.511 | Lr --> 0.008 | Seconds_per_step --> 3.681 |
381
+ [2024-08-10 05:09:09,744][Main][INFO] - [train] Step 17400 out of 80000 | Loss --> 2.295 | Grad_l2 --> 0.436 | Weights_l2 --> 8834.479 | Lr --> 0.008 | Seconds_per_step --> 3.543 |
382
+ [2024-08-10 05:12:04,488][Main][INFO] - [train] Step 17450 out of 80000 | Loss --> 2.292 | Grad_l2 --> 0.441 | Weights_l2 --> 8835.436 | Lr --> 0.008 | Seconds_per_step --> 3.495 |
383
+ [2024-08-10 05:15:05,017][Main][INFO] - [train] Step 17500 out of 80000 | Loss --> 2.293 | Grad_l2 --> 0.446 | Weights_l2 --> 8836.425 | Lr --> 0.008 | Seconds_per_step --> 3.611 |
384
+ [2024-08-10 05:17:58,854][Main][INFO] - [train] Step 17550 out of 80000 | Loss --> 2.286 | Grad_l2 --> 0.437 | Weights_l2 --> 8837.398 | Lr --> 0.008 | Seconds_per_step --> 3.477 |
385
+ [2024-08-10 05:21:01,251][Main][INFO] - [train] Step 17600 out of 80000 | Loss --> 2.293 | Grad_l2 --> 0.438 | Weights_l2 --> 8838.359 | Lr --> 0.008 | Seconds_per_step --> 3.648 |
386
+ [2024-08-10 05:23:50,306][Main][INFO] - [train] Step 17650 out of 80000 | Loss --> 2.290 | Grad_l2 --> 0.440 | Weights_l2 --> 8839.301 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
387
+ [2024-08-10 05:26:39,934][Main][INFO] - [train] Step 17700 out of 80000 | Loss --> 2.279 | Grad_l2 --> 0.437 | Weights_l2 --> 8840.279 | Lr --> 0.008 | Seconds_per_step --> 3.393 |
388
+ [2024-08-10 05:29:31,132][Main][INFO] - [train] Step 17750 out of 80000 | Loss --> 2.295 | Grad_l2 --> 0.435 | Weights_l2 --> 8841.235 | Lr --> 0.008 | Seconds_per_step --> 3.424 |
389
+ [2024-08-10 05:32:28,592][Main][INFO] - [train] Step 17800 out of 80000 | Loss --> 2.285 | Grad_l2 --> 0.439 | Weights_l2 --> 8842.177 | Lr --> 0.008 | Seconds_per_step --> 3.549 |
390
+ [2024-08-10 05:35:29,530][Main][INFO] - [train] Step 17850 out of 80000 | Loss --> 2.278 | Grad_l2 --> 0.438 | Weights_l2 --> 8843.147 | Lr --> 0.008 | Seconds_per_step --> 3.619 |
391
+ [2024-08-10 05:38:26,746][Main][INFO] - [train] Step 17900 out of 80000 | Loss --> 2.280 | Grad_l2 --> 0.433 | Weights_l2 --> 8844.071 | Lr --> 0.008 | Seconds_per_step --> 3.544 |
392
+ [2024-08-10 05:41:16,386][Main][INFO] - [train] Step 17950 out of 80000 | Loss --> 2.279 | Grad_l2 --> 0.429 | Weights_l2 --> 8845.032 | Lr --> 0.008 | Seconds_per_step --> 3.393 |
393
+ [2024-08-10 05:44:05,564][Main][INFO] - [train] Step 18000 out of 80000 | Loss --> 2.274 | Grad_l2 --> 0.439 | Weights_l2 --> 8845.972 | Lr --> 0.008 | Seconds_per_step --> 3.384 |
394
+ [2024-08-10 05:46:54,634][Main][INFO] - [train] Step 18050 out of 80000 | Loss --> 2.272 | Grad_l2 --> 0.434 | Weights_l2 --> 8846.896 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
395
+ [2024-08-10 05:49:43,407][Main][INFO] - [train] Step 18100 out of 80000 | Loss --> 2.269 | Grad_l2 --> 0.430 | Weights_l2 --> 8847.847 | Lr --> 0.008 | Seconds_per_step --> 3.375 |
396
+ [2024-08-10 05:52:32,975][Main][INFO] - [train] Step 18150 out of 80000 | Loss --> 2.268 | Grad_l2 --> 0.433 | Weights_l2 --> 8848.785 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
397
+ [2024-08-10 05:55:22,235][Main][INFO] - [train] Step 18200 out of 80000 | Loss --> 2.274 | Grad_l2 --> 0.428 | Weights_l2 --> 8849.744 | Lr --> 0.008 | Seconds_per_step --> 3.385 |
398
+ [2024-08-10 05:58:12,437][Main][INFO] - [train] Step 18250 out of 80000 | Loss --> 2.274 | Grad_l2 --> 0.438 | Weights_l2 --> 8850.687 | Lr --> 0.008 | Seconds_per_step --> 3.404 |
399
+ [2024-08-10 06:01:01,584][Main][INFO] - [train] Step 18300 out of 80000 | Loss --> 2.272 | Grad_l2 --> 0.437 | Weights_l2 --> 8851.617 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
400
+ [2024-08-10 06:03:50,792][Main][INFO] - [train] Step 18350 out of 80000 | Loss --> 2.262 | Grad_l2 --> 0.425 | Weights_l2 --> 8852.557 | Lr --> 0.008 | Seconds_per_step --> 3.384 |
401
+ [2024-08-10 06:06:40,061][Main][INFO] - [train] Step 18400 out of 80000 | Loss --> 2.265 | Grad_l2 --> 0.427 | Weights_l2 --> 8853.478 | Lr --> 0.008 | Seconds_per_step --> 3.385 |
402
+ [2024-08-10 06:09:28,691][Main][INFO] - [train] Step 18450 out of 80000 | Loss --> 2.250 | Grad_l2 --> 0.427 | Weights_l2 --> 8854.413 | Lr --> 0.008 | Seconds_per_step --> 3.373 |
403
+ [2024-08-10 06:12:17,730][Main][INFO] - [train] Step 18500 out of 80000 | Loss --> 2.258 | Grad_l2 --> 0.432 | Weights_l2 --> 8855.354 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
404
+ [2024-08-10 06:15:06,731][Main][INFO] - [train] Step 18550 out of 80000 | Loss --> 2.264 | Grad_l2 --> 0.428 | Weights_l2 --> 8856.260 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
405
+ [2024-08-10 06:17:55,946][Main][INFO] - [train] Step 18600 out of 80000 | Loss --> 2.257 | Grad_l2 --> 0.429 | Weights_l2 --> 8857.174 | Lr --> 0.008 | Seconds_per_step --> 3.384 |
406
+ [2024-08-10 06:20:45,707][Main][INFO] - [train] Step 18650 out of 80000 | Loss --> 2.251 | Grad_l2 --> 0.427 | Weights_l2 --> 8858.085 | Lr --> 0.008 | Seconds_per_step --> 3.395 |
407
+ [2024-08-10 06:23:34,836][Main][INFO] - [train] Step 18700 out of 80000 | Loss --> 2.262 | Grad_l2 --> 0.427 | Weights_l2 --> 8859.009 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
408
+ [2024-08-10 06:26:23,643][Main][INFO] - [train] Step 18750 out of 80000 | Loss --> 2.253 | Grad_l2 --> 0.421 | Weights_l2 --> 8859.915 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
409
+ [2024-08-10 06:29:11,354][Main][INFO] - [train] Step 18800 out of 80000 | Loss --> 2.250 | Grad_l2 --> 0.424 | Weights_l2 --> 8860.852 | Lr --> 0.008 | Seconds_per_step --> 3.354 |
410
+ [2024-08-10 06:32:00,884][Main][INFO] - [train] Step 18850 out of 80000 | Loss --> 2.242 | Grad_l2 --> 0.422 | Weights_l2 --> 8861.763 | Lr --> 0.008 | Seconds_per_step --> 3.391 |
411
+ [2024-08-10 06:34:48,915][Main][INFO] - [train] Step 18900 out of 80000 | Loss --> 2.253 | Grad_l2 --> 0.419 | Weights_l2 --> 8862.654 | Lr --> 0.008 | Seconds_per_step --> 3.361 |
412
+ [2024-08-10 06:37:37,887][Main][INFO] - [train] Step 18950 out of 80000 | Loss --> 2.241 | Grad_l2 --> 0.420 | Weights_l2 --> 8863.549 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
413
+ [2024-08-10 06:40:26,873][Main][INFO] - [train] Step 19000 out of 80000 | Loss --> 2.256 | Grad_l2 --> 0.419 | Weights_l2 --> 8864.466 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
414
+ [2024-08-10 06:43:16,993][Main][INFO] - [train] Step 19050 out of 80000 | Loss --> 2.244 | Grad_l2 --> 0.415 | Weights_l2 --> 8865.372 | Lr --> 0.008 | Seconds_per_step --> 3.402 |
415
+ [2024-08-10 06:46:06,020][Main][INFO] - [train] Step 19100 out of 80000 | Loss --> 2.242 | Grad_l2 --> 0.415 | Weights_l2 --> 8866.273 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
416
+ [2024-08-10 06:48:54,927][Main][INFO] - [train] Step 19150 out of 80000 | Loss --> 2.241 | Grad_l2 --> 0.419 | Weights_l2 --> 8867.179 | Lr --> 0.008 | Seconds_per_step --> 3.378 |
417
+ [2024-08-10 06:51:43,818][Main][INFO] - [train] Step 19200 out of 80000 | Loss --> 2.252 | Grad_l2 --> 0.418 | Weights_l2 --> 8868.077 | Lr --> 0.008 | Seconds_per_step --> 3.378 |
418
+ [2024-08-10 06:54:33,553][Main][INFO] - [train] Step 19250 out of 80000 | Loss --> 2.245 | Grad_l2 --> 0.416 | Weights_l2 --> 8868.959 | Lr --> 0.008 | Seconds_per_step --> 3.395 |
419
+ [2024-08-10 06:57:22,776][Main][INFO] - [train] Step 19300 out of 80000 | Loss --> 2.239 | Grad_l2 --> 0.419 | Weights_l2 --> 8869.832 | Lr --> 0.008 | Seconds_per_step --> 3.384 |
420
+ [2024-08-10 07:00:11,595][Main][INFO] - [train] Step 19350 out of 80000 | Loss --> 2.237 | Grad_l2 --> 0.414 | Weights_l2 --> 8870.736 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
421
+ [2024-08-10 07:03:00,088][Main][INFO] - [train] Step 19400 out of 80000 | Loss --> 2.224 | Grad_l2 --> 0.416 | Weights_l2 --> 8871.632 | Lr --> 0.008 | Seconds_per_step --> 3.370 |
422
+ [2024-08-10 07:05:49,818][Main][INFO] - [train] Step 19450 out of 80000 | Loss --> 2.228 | Grad_l2 --> 0.416 | Weights_l2 --> 8872.502 | Lr --> 0.008 | Seconds_per_step --> 3.395 |
423
+ [2024-08-10 07:08:38,161][Main][INFO] - [train] Step 19500 out of 80000 | Loss --> 2.229 | Grad_l2 --> 0.412 | Weights_l2 --> 8873.395 | Lr --> 0.008 | Seconds_per_step --> 3.367 |
424
+ [2024-08-10 07:11:26,283][Main][INFO] - [train] Step 19550 out of 80000 | Loss --> 2.228 | Grad_l2 --> 0.418 | Weights_l2 --> 8874.282 | Lr --> 0.008 | Seconds_per_step --> 3.362 |
425
+ [2024-08-10 07:14:14,560][Main][INFO] - [train] Step 19600 out of 80000 | Loss --> 2.228 | Grad_l2 --> 0.409 | Weights_l2 --> 8875.190 | Lr --> 0.008 | Seconds_per_step --> 3.366 |
426
+ [2024-08-10 07:17:03,585][Main][INFO] - [train] Step 19650 out of 80000 | Loss --> 2.224 | Grad_l2 --> 0.408 | Weights_l2 --> 8876.067 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
427
+ [2024-08-10 07:19:52,539][Main][INFO] - [train] Step 19700 out of 80000 | Loss --> 2.223 | Grad_l2 --> 0.411 | Weights_l2 --> 8876.951 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
428
+ [2024-08-10 07:22:41,624][Main][INFO] - [train] Step 19750 out of 80000 | Loss --> 2.204 | Grad_l2 --> 0.407 | Weights_l2 --> 8877.825 | Lr --> 0.008 | Seconds_per_step --> 3.382 |
429
+ [2024-08-10 07:25:30,616][Main][INFO] - [train] Step 19800 out of 80000 | Loss --> 2.228 | Grad_l2 --> 0.412 | Weights_l2 --> 8878.734 | Lr --> 0.008 | Seconds_per_step --> 3.380 |
430
+ [2024-08-10 07:28:19,413][Main][INFO] - [train] Step 19850 out of 80000 | Loss --> 2.219 | Grad_l2 --> 0.407 | Weights_l2 --> 8879.608 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
431
+ [2024-08-10 07:31:08,190][Main][INFO] - [train] Step 19900 out of 80000 | Loss --> 2.221 | Grad_l2 --> 0.405 | Weights_l2 --> 8880.499 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
432
+ [2024-08-10 07:33:58,992][Main][INFO] - [train] Step 19950 out of 80000 | Loss --> 2.209 | Grad_l2 --> 0.411 | Weights_l2 --> 8881.370 | Lr --> 0.008 | Seconds_per_step --> 3.416 |
433
+ [2024-08-10 07:36:48,090][Main][INFO] - [train] Step 20000 out of 80000 | Loss --> 2.216 | Grad_l2 --> 0.408 | Weights_l2 --> 8882.234 | Lr --> 0.008 | Seconds_per_step --> 3.382 |
434
+ [2024-08-10 07:36:48,091][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-20000
435
+ [2024-08-10 07:36:48,094][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
436
+ [2024-08-10 07:36:50,075][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-20000/model.safetensors
437
+ [2024-08-10 07:36:52,974][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-20000/optimizer.bin
438
+ [2024-08-10 07:36:52,974][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-20000/scheduler.bin
439
+ [2024-08-10 07:36:52,975][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-20000/sampler.bin
440
+ [2024-08-10 07:36:52,975][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-20000/sampler_1.bin
441
+ [2024-08-10 07:36:52,975][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-20000/random_states_0.pkl
442
+ [2024-08-10 07:39:42,190][Main][INFO] - [train] Step 20050 out of 80000 | Loss --> 2.200 | Grad_l2 --> 0.407 | Weights_l2 --> 8883.086 | Lr --> 0.008 | Seconds_per_step --> 3.482 |
443
+ [2024-08-10 07:42:32,658][Main][INFO] - [train] Step 20100 out of 80000 | Loss --> 2.198 | Grad_l2 --> 0.404 | Weights_l2 --> 8883.945 | Lr --> 0.008 | Seconds_per_step --> 3.409 |
444
+ [2024-08-10 07:45:21,533][Main][INFO] - [train] Step 20150 out of 80000 | Loss --> 2.202 | Grad_l2 --> 0.408 | Weights_l2 --> 8884.806 | Lr --> 0.008 | Seconds_per_step --> 3.377 |
445
+ [2024-08-10 07:48:10,447][Main][INFO] - [train] Step 20200 out of 80000 | Loss --> 2.203 | Grad_l2 --> 0.407 | Weights_l2 --> 8885.699 | Lr --> 0.008 | Seconds_per_step --> 3.378 |
446
+ [2024-08-10 07:50:58,905][Main][INFO] - [train] Step 20250 out of 80000 | Loss --> 2.196 | Grad_l2 --> 0.404 | Weights_l2 --> 8886.567 | Lr --> 0.008 | Seconds_per_step --> 3.369 |
447
+ [2024-08-10 07:53:48,181][Main][INFO] - [train] Step 20300 out of 80000 | Loss --> 2.197 | Grad_l2 --> 0.406 | Weights_l2 --> 8887.444 | Lr --> 0.008 | Seconds_per_step --> 3.386 |
448
+ [2024-08-10 07:56:36,986][Main][INFO] - [train] Step 20350 out of 80000 | Loss --> 2.196 | Grad_l2 --> 0.403 | Weights_l2 --> 8888.293 | Lr --> 0.008 | Seconds_per_step --> 3.376 |
449
+ [2024-08-10 07:59:25,941][Main][INFO] - [train] Step 20400 out of 80000 | Loss --> 2.193 | Grad_l2 --> 0.406 | Weights_l2 --> 8889.139 | Lr --> 0.008 | Seconds_per_step --> 3.379 |
450
+ [2024-08-10 08:02:15,456][Main][INFO] - [train] Step 20450 out of 80000 | Loss --> 2.192 | Grad_l2 --> 0.407 | Weights_l2 --> 8889.993 | Lr --> 0.008 | Seconds_per_step --> 3.390 |
451
+ [2024-08-10 08:05:04,967][Main][INFO] - [train] Step 20500 out of 80000 | Loss --> 2.198 | Grad_l2 --> 0.399 | Weights_l2 --> 8890.854 | Lr --> 0.008 | Seconds_per_step --> 3.390 |
452
+ [2024-08-10 08:07:54,096][Main][INFO] - [train] Step 20550 out of 80000 | Loss --> 2.189 | Grad_l2 --> 0.403 | Weights_l2 --> 8891.715 | Lr --> 0.008 | Seconds_per_step --> 3.383 |
453
+ [2024-08-10 08:10:43,683][Main][INFO] - [train] Step 20600 out of 80000 | Loss --> 2.193 | Grad_l2 --> 0.398 | Weights_l2 --> 8892.575 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
454
+ [2024-08-10 08:13:31,893][Main][INFO] - [train] Step 20650 out of 80000 | Loss --> 2.186 | Grad_l2 --> 0.403 | Weights_l2 --> 8893.421 | Lr --> 0.008 | Seconds_per_step --> 3.364 |
455
+ [2024-08-10 08:16:21,983][Main][INFO] - [train] Step 20700 out of 80000 | Loss --> 2.183 | Grad_l2 --> 0.399 | Weights_l2 --> 8894.287 | Lr --> 0.008 | Seconds_per_step --> 3.402 |
456
+ [2024-08-10 08:19:11,096][Main][INFO] - [train] Step 20750 out of 80000 | Loss --> 2.183 | Grad_l2 --> 0.397 | Weights_l2 --> 8895.130 | Lr --> 0.008 | Seconds_per_step --> 3.382 |
457
+ [2024-08-10 08:21:59,564][Main][INFO] - [train] Step 20800 out of 80000 | Loss --> 2.176 | Grad_l2 --> 0.404 | Weights_l2 --> 8895.973 | Lr --> 0.008 | Seconds_per_step --> 3.369 |
458
+ [2024-08-10 08:24:48,772][Main][INFO] - [train] Step 20850 out of 80000 | Loss --> 2.183 | Grad_l2 --> 0.399 | Weights_l2 --> 8896.827 | Lr --> 0.008 | Seconds_per_step --> 3.384 |
459
+ [2024-08-10 08:27:39,040][Main][INFO] - [train] Step 20900 out of 80000 | Loss --> 2.181 | Grad_l2 --> 0.398 | Weights_l2 --> 8897.692 | Lr --> 0.008 | Seconds_per_step --> 3.405 |
460
+ [2024-08-10 08:30:28,088][Main][INFO] - [train] Step 20950 out of 80000 | Loss --> 2.169 | Grad_l2 --> 0.396 | Weights_l2 --> 8898.537 | Lr --> 0.008 | Seconds_per_step --> 3.381 |
461
+ [2024-08-10 08:33:15,709][Main][INFO] - [train] Step 21000 out of 80000 | Loss --> 2.174 | Grad_l2 --> 0.403 | Weights_l2 --> 8899.403 | Lr --> 0.008 | Seconds_per_step --> 3.352 |
462
+ [2024-08-10 08:36:05,184][Main][INFO] - [train] Step 21050 out of 80000 | Loss --> 2.173 | Grad_l2 --> 0.395 | Weights_l2 --> 8900.227 | Lr --> 0.008 | Seconds_per_step --> 3.389 |
463
+ [2024-08-10 08:38:54,935][Main][INFO] - [train] Step 21100 out of 80000 | Loss --> 2.164 | Grad_l2 --> 0.394 | Weights_l2 --> 8901.058 | Lr --> 0.008 | Seconds_per_step --> 3.395 |
464
+ [2024-08-10 08:41:44,183][Main][INFO] - [train] Step 21150 out of 80000 | Loss --> 2.172 | Grad_l2 --> 0.396 | Weights_l2 --> 8901.899 | Lr --> 0.008 | Seconds_per_step --> 3.385 |
465
+ [2024-08-10 08:44:34,115][Main][INFO] - [train] Step 21200 out of 80000 | Loss --> 2.164 | Grad_l2 --> 0.399 | Weights_l2 --> 8902.719 | Lr --> 0.008 | Seconds_per_step --> 3.399 |
466
+ [2024-08-10 08:47:23,707][Main][INFO] - [train] Step 21250 out of 80000 | Loss --> 2.162 | Grad_l2 --> 0.399 | Weights_l2 --> 8903.545 | Lr --> 0.008 | Seconds_per_step --> 3.392 |
467
+ [2024-08-10 08:50:13,259][Main][INFO] - [train] Step 21300 out of 80000 | Loss --> 2.152 | Grad_l2 --> 0.394 | Weights_l2 --> 8904.388 | Lr --> 0.007 | Seconds_per_step --> 3.391 |
468
+ [2024-08-10 08:53:03,350][Main][INFO] - [train] Step 21350 out of 80000 | Loss --> 2.161 | Grad_l2 --> 0.397 | Weights_l2 --> 8905.227 | Lr --> 0.007 | Seconds_per_step --> 3.402 |
469
+ [2024-08-10 08:55:51,861][Main][INFO] - [train] Step 21400 out of 80000 | Loss --> 2.153 | Grad_l2 --> 0.396 | Weights_l2 --> 8906.073 | Lr --> 0.007 | Seconds_per_step --> 3.370 |
470
+ [2024-08-10 08:58:41,307][Main][INFO] - [train] Step 21450 out of 80000 | Loss --> 2.151 | Grad_l2 --> 0.390 | Weights_l2 --> 8906.885 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
471
+ [2024-08-10 09:01:31,004][Main][INFO] - [train] Step 21500 out of 80000 | Loss --> 2.152 | Grad_l2 --> 0.390 | Weights_l2 --> 8907.704 | Lr --> 0.007 | Seconds_per_step --> 3.394 |
472
+ [2024-08-10 09:04:20,626][Main][INFO] - [train] Step 21550 out of 80000 | Loss --> 2.140 | Grad_l2 --> 0.392 | Weights_l2 --> 8908.519 | Lr --> 0.007 | Seconds_per_step --> 3.392 |
473
+ [2024-08-10 09:07:09,329][Main][INFO] - [train] Step 21600 out of 80000 | Loss --> 2.142 | Grad_l2 --> 0.392 | Weights_l2 --> 8909.337 | Lr --> 0.007 | Seconds_per_step --> 3.374 |
474
+ [2024-08-10 09:09:58,896][Main][INFO] - [train] Step 21650 out of 80000 | Loss --> 2.142 | Grad_l2 --> 0.396 | Weights_l2 --> 8910.161 | Lr --> 0.007 | Seconds_per_step --> 3.391 |
475
+ [2024-08-10 09:12:47,641][Main][INFO] - [train] Step 21700 out of 80000 | Loss --> 2.138 | Grad_l2 --> 0.393 | Weights_l2 --> 8910.985 | Lr --> 0.007 | Seconds_per_step --> 3.375 |
476
+ [2024-08-10 09:15:36,715][Main][INFO] - [train] Step 21750 out of 80000 | Loss --> 2.137 | Grad_l2 --> 0.389 | Weights_l2 --> 8911.803 | Lr --> 0.007 | Seconds_per_step --> 3.381 |
477
+ [2024-08-10 09:18:25,804][Main][INFO] - [train] Step 21800 out of 80000 | Loss --> 2.124 | Grad_l2 --> 0.388 | Weights_l2 --> 8912.593 | Lr --> 0.007 | Seconds_per_step --> 3.382 |
478
+ [2024-08-10 09:21:14,976][Main][INFO] - [train] Step 21850 out of 80000 | Loss --> 2.127 | Grad_l2 --> 0.388 | Weights_l2 --> 8913.424 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
479
+ [2024-08-10 09:24:04,906][Main][INFO] - [train] Step 21900 out of 80000 | Loss --> 2.127 | Grad_l2 --> 0.391 | Weights_l2 --> 8914.230 | Lr --> 0.007 | Seconds_per_step --> 3.399 |
480
+ [2024-08-10 09:26:55,122][Main][INFO] - [train] Step 21950 out of 80000 | Loss --> 2.129 | Grad_l2 --> 0.389 | Weights_l2 --> 8915.052 | Lr --> 0.007 | Seconds_per_step --> 3.404 |
481
+ [2024-08-10 09:29:44,540][Main][INFO] - [train] Step 22000 out of 80000 | Loss --> 2.125 | Grad_l2 --> 0.389 | Weights_l2 --> 8915.853 | Lr --> 0.007 | Seconds_per_step --> 3.388 |
482
+ [2024-08-10 09:32:34,046][Main][INFO] - [train] Step 22050 out of 80000 | Loss --> 2.122 | Grad_l2 --> 0.395 | Weights_l2 --> 8916.661 | Lr --> 0.007 | Seconds_per_step --> 3.390 |
483
+ [2024-08-10 09:35:21,952][Main][INFO] - [train] Step 22100 out of 80000 | Loss --> 2.119 | Grad_l2 --> 0.385 | Weights_l2 --> 8917.495 | Lr --> 0.007 | Seconds_per_step --> 3.358 |
484
+ [2024-08-10 09:38:11,003][Main][INFO] - [train] Step 22150 out of 80000 | Loss --> 2.122 | Grad_l2 --> 0.391 | Weights_l2 --> 8918.291 | Lr --> 0.007 | Seconds_per_step --> 3.381 |
485
+ [2024-08-10 09:40:58,413][Main][INFO] - [train] Step 22200 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.387 | Weights_l2 --> 8919.096 | Lr --> 0.007 | Seconds_per_step --> 3.348 |
486
+ [2024-08-10 09:43:47,159][Main][INFO] - [train] Step 22250 out of 80000 | Loss --> 2.121 | Grad_l2 --> 0.384 | Weights_l2 --> 8919.898 | Lr --> 0.007 | Seconds_per_step --> 3.375 |
487
+ [2024-08-10 09:46:36,610][Main][INFO] - [train] Step 22300 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.388 | Weights_l2 --> 8920.716 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
488
+ [2024-08-10 09:49:27,851][Main][INFO] - [train] Step 22350 out of 80000 | Loss --> 2.115 | Grad_l2 --> 0.384 | Weights_l2 --> 8921.523 | Lr --> 0.007 | Seconds_per_step --> 3.425 |
489
+ [2024-08-10 09:52:17,105][Main][INFO] - [train] Step 22400 out of 80000 | Loss --> 2.114 | Grad_l2 --> 0.388 | Weights_l2 --> 8922.301 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
490
+ [2024-08-10 09:55:06,547][Main][INFO] - [train] Step 22450 out of 80000 | Loss --> 2.123 | Grad_l2 --> 0.384 | Weights_l2 --> 8923.100 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
491
+ [2024-08-10 09:57:55,448][Main][INFO] - [train] Step 22500 out of 80000 | Loss --> 2.118 | Grad_l2 --> 0.386 | Weights_l2 --> 8923.925 | Lr --> 0.007 | Seconds_per_step --> 3.378 |
492
+ [2024-08-10 10:00:44,643][Main][INFO] - [train] Step 22550 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.383 | Weights_l2 --> 8924.712 | Lr --> 0.007 | Seconds_per_step --> 3.384 |
493
+ [2024-08-10 10:03:33,602][Main][INFO] - [train] Step 22600 out of 80000 | Loss --> 2.121 | Grad_l2 --> 0.386 | Weights_l2 --> 8925.491 | Lr --> 0.007 | Seconds_per_step --> 3.379 |
494
+ [2024-08-10 10:06:22,538][Main][INFO] - [train] Step 22650 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.384 | Weights_l2 --> 8926.277 | Lr --> 0.007 | Seconds_per_step --> 3.379 |
495
+ [2024-08-10 10:09:11,569][Main][INFO] - [train] Step 22700 out of 80000 | Loss --> 2.107 | Grad_l2 --> 0.385 | Weights_l2 --> 8927.058 | Lr --> 0.007 | Seconds_per_step --> 3.381 |
496
+ [2024-08-10 10:12:00,266][Main][INFO] - [train] Step 22750 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.386 | Weights_l2 --> 8927.846 | Lr --> 0.007 | Seconds_per_step --> 3.374 |
497
+ [2024-08-10 10:14:49,150][Main][INFO] - [train] Step 22800 out of 80000 | Loss --> 2.119 | Grad_l2 --> 0.382 | Weights_l2 --> 8928.630 | Lr --> 0.007 | Seconds_per_step --> 3.378 |
498
+ [2024-08-10 10:17:37,676][Main][INFO] - [train] Step 22850 out of 80000 | Loss --> 2.111 | Grad_l2 --> 0.383 | Weights_l2 --> 8929.421 | Lr --> 0.007 | Seconds_per_step --> 3.371 |
499
+ [2024-08-10 10:20:27,046][Main][INFO] - [train] Step 22900 out of 80000 | Loss --> 2.111 | Grad_l2 --> 0.380 | Weights_l2 --> 8930.220 | Lr --> 0.007 | Seconds_per_step --> 3.387 |
500
+ [2024-08-10 10:23:16,675][Main][INFO] - [train] Step 22950 out of 80000 | Loss --> 2.115 | Grad_l2 --> 0.383 | Weights_l2 --> 8931.007 | Lr --> 0.007 | Seconds_per_step --> 3.393 |
501
+ [2024-08-10 10:26:05,843][Main][INFO] - [train] Step 23000 out of 80000 | Loss --> 2.120 | Grad_l2 --> 0.381 | Weights_l2 --> 8931.786 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
502
+ [2024-08-10 10:28:54,942][Main][INFO] - [train] Step 23050 out of 80000 | Loss --> 2.116 | Grad_l2 --> 0.386 | Weights_l2 --> 8932.580 | Lr --> 0.007 | Seconds_per_step --> 3.382 |
503
+ [2024-08-10 10:31:44,834][Main][INFO] - [train] Step 23100 out of 80000 | Loss --> 2.115 | Grad_l2 --> 0.380 | Weights_l2 --> 8933.380 | Lr --> 0.007 | Seconds_per_step --> 3.398 |
504
+ [2024-08-10 10:34:34,786][Main][INFO] - [train] Step 23150 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.377 | Weights_l2 --> 8934.167 | Lr --> 0.007 | Seconds_per_step --> 3.399 |
505
+ [2024-08-10 10:37:24,223][Main][INFO] - [train] Step 23200 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.377 | Weights_l2 --> 8934.945 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
506
+ [2024-08-10 10:40:13,606][Main][INFO] - [train] Step 23250 out of 80000 | Loss --> 2.109 | Grad_l2 --> 0.381 | Weights_l2 --> 8935.739 | Lr --> 0.007 | Seconds_per_step --> 3.388 |
507
+ [2024-08-10 10:43:03,034][Main][INFO] - [train] Step 23300 out of 80000 | Loss --> 2.105 | Grad_l2 --> 0.380 | Weights_l2 --> 8936.510 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
508
+ [2024-08-10 10:45:52,352][Main][INFO] - [train] Step 23350 out of 80000 | Loss --> 2.115 | Grad_l2 --> 0.381 | Weights_l2 --> 8937.305 | Lr --> 0.007 | Seconds_per_step --> 3.386 |
509
+ [2024-08-10 10:48:42,504][Main][INFO] - [train] Step 23400 out of 80000 | Loss --> 2.108 | Grad_l2 --> 0.381 | Weights_l2 --> 8938.113 | Lr --> 0.007 | Seconds_per_step --> 3.403 |
510
+ [2024-08-10 10:51:32,165][Main][INFO] - [train] Step 23450 out of 80000 | Loss --> 2.107 | Grad_l2 --> 0.375 | Weights_l2 --> 8938.876 | Lr --> 0.007 | Seconds_per_step --> 3.393 |
511
+ [2024-08-10 10:54:21,677][Main][INFO] - [train] Step 23500 out of 80000 | Loss --> 2.100 | Grad_l2 --> 0.380 | Weights_l2 --> 8939.651 | Lr --> 0.007 | Seconds_per_step --> 3.390 |
512
+ [2024-08-10 10:57:10,744][Main][INFO] - [train] Step 23550 out of 80000 | Loss --> 2.101 | Grad_l2 --> 0.381 | Weights_l2 --> 8940.430 | Lr --> 0.007 | Seconds_per_step --> 3.381 |
513
+ [2024-08-10 11:00:00,750][Main][INFO] - [train] Step 23600 out of 80000 | Loss --> 2.105 | Grad_l2 --> 0.376 | Weights_l2 --> 8941.201 | Lr --> 0.007 | Seconds_per_step --> 3.400 |
514
+ [2024-08-10 11:02:49,980][Main][INFO] - [train] Step 23650 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.377 | Weights_l2 --> 8941.992 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
515
+ [2024-08-10 11:05:38,767][Main][INFO] - [train] Step 23700 out of 80000 | Loss --> 2.096 | Grad_l2 --> 0.380 | Weights_l2 --> 8942.754 | Lr --> 0.007 | Seconds_per_step --> 3.376 |
516
+ [2024-08-10 11:08:28,650][Main][INFO] - [train] Step 23750 out of 80000 | Loss --> 2.096 | Grad_l2 --> 0.380 | Weights_l2 --> 8943.525 | Lr --> 0.007 | Seconds_per_step --> 3.398 |
517
+ [2024-08-10 11:11:17,799][Main][INFO] - [train] Step 23800 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.378 | Weights_l2 --> 8944.294 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
518
+ [2024-08-10 11:14:06,944][Main][INFO] - [train] Step 23850 out of 80000 | Loss --> 2.095 | Grad_l2 --> 0.373 | Weights_l2 --> 8945.061 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
519
+ [2024-08-10 11:16:56,683][Main][INFO] - [train] Step 23900 out of 80000 | Loss --> 2.101 | Grad_l2 --> 0.376 | Weights_l2 --> 8945.835 | Lr --> 0.007 | Seconds_per_step --> 3.395 |
520
+ [2024-08-10 11:19:45,844][Main][INFO] - [train] Step 23950 out of 80000 | Loss --> 2.092 | Grad_l2 --> 0.375 | Weights_l2 --> 8946.629 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
521
+ [2024-08-10 11:22:35,661][Main][INFO] - [train] Step 24000 out of 80000 | Loss --> 2.096 | Grad_l2 --> 0.377 | Weights_l2 --> 8947.382 | Lr --> 0.007 | Seconds_per_step --> 3.396 |
522
+ [2024-08-10 11:25:23,611][Main][INFO] - [train] Step 24050 out of 80000 | Loss --> 2.094 | Grad_l2 --> 0.374 | Weights_l2 --> 8948.130 | Lr --> 0.007 | Seconds_per_step --> 3.359 |
523
+ [2024-08-10 11:28:12,984][Main][INFO] - [train] Step 24100 out of 80000 | Loss --> 2.095 | Grad_l2 --> 0.373 | Weights_l2 --> 8948.867 | Lr --> 0.007 | Seconds_per_step --> 3.387 |
524
+ [2024-08-10 11:31:01,571][Main][INFO] - [train] Step 24150 out of 80000 | Loss --> 2.095 | Grad_l2 --> 0.374 | Weights_l2 --> 8949.631 | Lr --> 0.007 | Seconds_per_step --> 3.372 |
525
+ [2024-08-10 11:33:50,863][Main][INFO] - [train] Step 24200 out of 80000 | Loss --> 2.097 | Grad_l2 --> 0.376 | Weights_l2 --> 8950.388 | Lr --> 0.007 | Seconds_per_step --> 3.386 |
526
+ [2024-08-10 11:36:40,686][Main][INFO] - [train] Step 24250 out of 80000 | Loss --> 2.096 | Grad_l2 --> 0.374 | Weights_l2 --> 8951.146 | Lr --> 0.007 | Seconds_per_step --> 3.396 |
527
+ [2024-08-10 11:39:29,849][Main][INFO] - [train] Step 24300 out of 80000 | Loss --> 2.090 | Grad_l2 --> 0.373 | Weights_l2 --> 8951.859 | Lr --> 0.007 | Seconds_per_step --> 3.383 |
528
+ [2024-08-10 11:42:19,157][Main][INFO] - [train] Step 24350 out of 80000 | Loss --> 2.097 | Grad_l2 --> 0.371 | Weights_l2 --> 8952.607 | Lr --> 0.007 | Seconds_per_step --> 3.386 |
529
+ [2024-08-10 11:45:08,412][Main][INFO] - [train] Step 24400 out of 80000 | Loss --> 2.094 | Grad_l2 --> 0.372 | Weights_l2 --> 8953.362 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
530
+ [2024-08-10 11:47:57,713][Main][INFO] - [train] Step 24450 out of 80000 | Loss --> 2.091 | Grad_l2 --> 0.375 | Weights_l2 --> 8954.094 | Lr --> 0.007 | Seconds_per_step --> 3.386 |
531
+ [2024-08-10 11:50:46,406][Main][INFO] - [train] Step 24500 out of 80000 | Loss --> 2.100 | Grad_l2 --> 0.369 | Weights_l2 --> 8954.854 | Lr --> 0.007 | Seconds_per_step --> 3.374 |
532
+ [2024-08-10 11:53:35,339][Main][INFO] - [train] Step 24550 out of 80000 | Loss --> 2.110 | Grad_l2 --> 0.374 | Weights_l2 --> 8955.580 | Lr --> 0.007 | Seconds_per_step --> 3.379 |
533
+ [2024-08-10 11:56:24,268][Main][INFO] - [train] Step 24600 out of 80000 | Loss --> 2.104 | Grad_l2 --> 0.375 | Weights_l2 --> 8956.344 | Lr --> 0.007 | Seconds_per_step --> 3.379 |
534
+ [2024-08-10 11:59:13,863][Main][INFO] - [train] Step 24650 out of 80000 | Loss --> 2.103 | Grad_l2 --> 0.376 | Weights_l2 --> 8957.068 | Lr --> 0.007 | Seconds_per_step --> 3.392 |
535
+ [2024-08-10 12:02:03,598][Main][INFO] - [train] Step 24700 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.370 | Weights_l2 --> 8957.814 | Lr --> 0.007 | Seconds_per_step --> 3.395 |
536
+ [2024-08-10 12:04:52,269][Main][INFO] - [train] Step 24750 out of 80000 | Loss --> 2.107 | Grad_l2 --> 0.365 | Weights_l2 --> 8958.570 | Lr --> 0.007 | Seconds_per_step --> 3.373 |
537
+ [2024-08-10 12:07:41,278][Main][INFO] - [train] Step 24800 out of 80000 | Loss --> 2.114 | Grad_l2 --> 0.373 | Weights_l2 --> 8959.279 | Lr --> 0.007 | Seconds_per_step --> 3.380 |
538
+ [2024-08-10 12:10:31,555][Main][INFO] - [train] Step 24850 out of 80000 | Loss --> 2.110 | Grad_l2 --> 0.369 | Weights_l2 --> 8960.027 | Lr --> 0.007 | Seconds_per_step --> 3.406 |
539
+ [2024-08-10 12:13:21,204][Main][INFO] - [train] Step 24900 out of 80000 | Loss --> 2.102 | Grad_l2 --> 0.372 | Weights_l2 --> 8960.746 | Lr --> 0.007 | Seconds_per_step --> 3.393 |
540
+ [2024-08-10 12:16:10,885][Main][INFO] - [train] Step 24950 out of 80000 | Loss --> 2.114 | Grad_l2 --> 0.370 | Weights_l2 --> 8961.486 | Lr --> 0.007 | Seconds_per_step --> 3.394 |
541
+ [2024-08-10 12:19:00,451][Main][INFO] - [train] Step 25000 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.372 | Weights_l2 --> 8962.205 | Lr --> 0.007 | Seconds_per_step --> 3.391 |
542
+ [2024-08-10 12:19:00,451][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-25000
543
+ [2024-08-10 12:19:00,454][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
544
+ [2024-08-10 12:19:02,584][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-25000/model.safetensors
545
+ [2024-08-10 12:19:05,471][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-25000/optimizer.bin
546
+ [2024-08-10 12:19:05,472][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-25000/scheduler.bin
547
+ [2024-08-10 12:19:05,472][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-25000/sampler.bin
548
+ [2024-08-10 12:19:05,472][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-25000/sampler_1.bin
549
+ [2024-08-10 12:19:05,473][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-25000/random_states_0.pkl
550
+ [2024-08-10 12:21:55,414][Main][INFO] - [train] Step 25050 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.368 | Weights_l2 --> 8962.926 | Lr --> 0.007 | Seconds_per_step --> 3.499 |
551
+ [2024-08-10 12:24:44,641][Main][INFO] - [train] Step 25100 out of 80000 | Loss --> 2.108 | Grad_l2 --> 0.368 | Weights_l2 --> 8963.658 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
552
+ [2024-08-10 12:27:33,678][Main][INFO] - [train] Step 25150 out of 80000 | Loss --> 2.104 | Grad_l2 --> 0.370 | Weights_l2 --> 8964.369 | Lr --> 0.007 | Seconds_per_step --> 3.381 |
553
+ [2024-08-10 12:30:22,703][Main][INFO] - [train] Step 25200 out of 80000 | Loss --> 2.102 | Grad_l2 --> 0.367 | Weights_l2 --> 8965.077 | Lr --> 0.007 | Seconds_per_step --> 3.380 |
554
+ [2024-08-10 12:33:12,286][Main][INFO] - [train] Step 25250 out of 80000 | Loss --> 2.108 | Grad_l2 --> 0.367 | Weights_l2 --> 8965.794 | Lr --> 0.007 | Seconds_per_step --> 3.392 |
555
+ [2024-08-10 12:36:00,779][Main][INFO] - [train] Step 25300 out of 80000 | Loss --> 2.107 | Grad_l2 --> 0.367 | Weights_l2 --> 8966.528 | Lr --> 0.007 | Seconds_per_step --> 3.370 |
556
+ [2024-08-10 12:38:48,971][Main][INFO] - [train] Step 25350 out of 80000 | Loss --> 2.107 | Grad_l2 --> 0.364 | Weights_l2 --> 8967.235 | Lr --> 0.007 | Seconds_per_step --> 3.364 |
557
+ [2024-08-10 12:41:37,429][Main][INFO] - [train] Step 25400 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.363 | Weights_l2 --> 8967.925 | Lr --> 0.007 | Seconds_per_step --> 3.369 |
558
+ [2024-08-10 12:44:26,521][Main][INFO] - [train] Step 25450 out of 80000 | Loss --> 2.110 | Grad_l2 --> 0.371 | Weights_l2 --> 8968.626 | Lr --> 0.007 | Seconds_per_step --> 3.382 |
559
+ [2024-08-10 12:47:15,850][Main][INFO] - [train] Step 25500 out of 80000 | Loss --> 2.113 | Grad_l2 --> 0.368 | Weights_l2 --> 8969.323 | Lr --> 0.007 | Seconds_per_step --> 3.387 |
560
+ [2024-08-10 12:50:05,229][Main][INFO] - [train] Step 25550 out of 80000 | Loss --> 2.106 | Grad_l2 --> 0.362 | Weights_l2 --> 8970.029 | Lr --> 0.007 | Seconds_per_step --> 3.388 |
561
+ [2024-08-10 12:52:54,821][Main][INFO] - [train] Step 25600 out of 80000 | Loss --> 2.112 | Grad_l2 --> 0.365 | Weights_l2 --> 8970.711 | Lr --> 0.007 | Seconds_per_step --> 3.392 |
562
+ [2024-08-10 12:55:44,920][Main][INFO] - [train] Step 25650 out of 80000 | Loss --> 2.116 | Grad_l2 --> 0.366 | Weights_l2 --> 8971.399 | Lr --> 0.007 | Seconds_per_step --> 3.402 |
563
+ [2024-08-10 12:58:32,938][Main][INFO] - [train] Step 25700 out of 80000 | Loss --> 2.114 | Grad_l2 --> 0.364 | Weights_l2 --> 8972.067 | Lr --> 0.007 | Seconds_per_step --> 3.360 |
564
+ [2024-08-10 13:01:22,907][Main][INFO] - [train] Step 25750 out of 80000 | Loss --> 2.124 | Grad_l2 --> 0.365 | Weights_l2 --> 8972.769 | Lr --> 0.007 | Seconds_per_step --> 3.399 |
565
+ [2024-08-10 13:04:12,153][Main][INFO] - [train] Step 25800 out of 80000 | Loss --> 2.116 | Grad_l2 --> 0.365 | Weights_l2 --> 8973.450 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
566
+ [2024-08-10 13:07:02,172][Main][INFO] - [train] Step 25850 out of 80000 | Loss --> 2.118 | Grad_l2 --> 0.367 | Weights_l2 --> 8974.127 | Lr --> 0.007 | Seconds_per_step --> 3.400 |
567
+ [2024-08-10 13:09:51,422][Main][INFO] - [train] Step 25900 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.365 | Weights_l2 --> 8974.808 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
568
+ [2024-08-10 13:12:40,893][Main][INFO] - [train] Step 25950 out of 80000 | Loss --> 2.119 | Grad_l2 --> 0.367 | Weights_l2 --> 8975.499 | Lr --> 0.007 | Seconds_per_step --> 3.389 |
569
+ [2024-08-10 13:15:30,193][Main][INFO] - [train] Step 26000 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.365 | Weights_l2 --> 8976.191 | Lr --> 0.007 | Seconds_per_step --> 3.386 |
570
+ [2024-08-10 13:18:19,215][Main][INFO] - [train] Step 26050 out of 80000 | Loss --> 2.105 | Grad_l2 --> 0.366 | Weights_l2 --> 8976.887 | Lr --> 0.007 | Seconds_per_step --> 3.380 |
571
+ [2024-08-10 13:21:09,000][Main][INFO] - [train] Step 26100 out of 80000 | Loss --> 2.127 | Grad_l2 --> 0.367 | Weights_l2 --> 8977.570 | Lr --> 0.007 | Seconds_per_step --> 3.396 |
572
+ [2024-08-10 13:23:57,054][Main][INFO] - [train] Step 26150 out of 80000 | Loss --> 2.112 | Grad_l2 --> 0.365 | Weights_l2 --> 8978.248 | Lr --> 0.007 | Seconds_per_step --> 3.361 |
573
+ [2024-08-10 13:26:46,324][Main][INFO] - [train] Step 26200 out of 80000 | Loss --> 2.119 | Grad_l2 --> 0.362 | Weights_l2 --> 8978.920 | Lr --> 0.007 | Seconds_per_step --> 3.385 |
574
+ [2024-08-10 13:29:35,823][Main][INFO] - [train] Step 26250 out of 80000 | Loss --> 2.121 | Grad_l2 --> 0.360 | Weights_l2 --> 8979.595 | Lr --> 0.007 | Seconds_per_step --> 3.390 |
575
+ [2024-08-10 13:32:25,900][Main][INFO] - [train] Step 26300 out of 80000 | Loss --> 2.117 | Grad_l2 --> 0.360 | Weights_l2 --> 8980.264 | Lr --> 0.007 | Seconds_per_step --> 3.402 |
576
+ [2024-08-10 13:35:14,993][Main][INFO] - [train] Step 26350 out of 80000 | Loss --> 2.111 | Grad_l2 --> 0.364 | Weights_l2 --> 8980.936 | Lr --> 0.007 | Seconds_per_step --> 3.382 |