g4rg committed
Commit fc9d9d7 · verified · 1 parent: e473290

Training in progress, step 264, checkpoint

Files changed (28)
  1. last-checkpoint/adapter_model.safetensors +1 -1
  2. last-checkpoint/global_step264/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. last-checkpoint/global_step264/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  4. last-checkpoint/global_step264/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  5. last-checkpoint/global_step264/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  6. last-checkpoint/global_step264/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  7. last-checkpoint/global_step264/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  8. last-checkpoint/global_step264/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  9. last-checkpoint/global_step264/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  10. last-checkpoint/global_step264/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  11. last-checkpoint/global_step264/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  12. last-checkpoint/global_step264/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  13. last-checkpoint/global_step264/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  14. last-checkpoint/global_step264/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  15. last-checkpoint/global_step264/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  16. last-checkpoint/global_step264/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  17. last-checkpoint/global_step264/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  18. last-checkpoint/latest +1 -1
  19. last-checkpoint/rng_state_0.pth +1 -1
  20. last-checkpoint/rng_state_1.pth +1 -1
  21. last-checkpoint/rng_state_2.pth +1 -1
  22. last-checkpoint/rng_state_3.pth +1 -1
  23. last-checkpoint/rng_state_4.pth +1 -1
  24. last-checkpoint/rng_state_5.pth +1 -1
  25. last-checkpoint/rng_state_6.pth +1 -1
  26. last-checkpoint/rng_state_7.pth +1 -1
  27. last-checkpoint/scheduler.pt +1 -1
  28. last-checkpoint/trainer_state.json +473 -3
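
The updated checkpoint follows the usual DeepSpeed ZeRO layout: per-rank bf16 optimizer-state and model-state shards under global_step264, Trainer bookkeeping files (rng_state_*.pth, scheduler.pt, trainer_state.json), the `latest` tag, and the LoRA adapter weights. As a minimal sketch (not part of this commit), such sharded ZeRO states can typically be consolidated with DeepSpeed's zero_to_fp32 utility; only the paths and the tag below come from this repo's layout, everything else is an assumption:

# Sketch: consolidate the per-rank ZeRO shards in last-checkpoint/global_step264
# into a single fp32 state dict. Assumes `deepspeed` is installed and the
# checkpoint directory has been downloaded locally.
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "last-checkpoint",     # directory containing global_step264/ and the `latest` file
    tag="global_step264",  # matches the tag written to last-checkpoint/latest in this commit
)
print(f"consolidated {len(state_dict)} tensors")
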
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:92aa718963ff1a158ab63158709261c7329af2ae34ebe9805357cdb7a33e38de
+ oid sha256:b6b7fbaf2d6a6e1654728bf2b64ff7a097f615d5247c146dd31d3eccfa8fc30f
  size 763470136
last-checkpoint/global_step264/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ca144ba18752225282c0f4544d978254c2ae4f35d68cae745a609c918846e1f
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:614e9ee88e9bcb3a45429c59b2bf2769ba433567f51363c7a4253b475b21e6bc
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d655ebb2146d9487a8d09cd971ceb207ee79d6d7c42ab932971bb75ba93e940c
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89eae13e8c0ba60e40a9c3d3f11b76646b83ba197f6abf0aba73b689661edd6c
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0557f6a14c69ef21c2ed3a5c48be9f7a3a47f102c4b16e6db9df1e8036a73a5
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d0052e580fe2d1f70992b2fcc233627549397786188980f2ec6a856d84d45f9
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67db40a4c668372a5d51fa8aad0bf99b67dda6f1acdcfc128369378fd063a309
+ size 289065424
last-checkpoint/global_step264/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca273d1927a1a34aa37630afe3fae7e5b729674eeb60cf491b164d2415105d78
+ size 289065424
last-checkpoint/global_step264/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef56386f8cf47768c633eb8b8ab10076925c3b5e457f1548072d50346f8b468d
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5bfd7e5ba0884a15f626b0c26345601fddc4900a760abdb231c701069e165d2
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d112660240266bc61d3417f0745f1145af0c63963c37f516b89c36b4985ece
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36d966bcd023af6eda0fb4a38472fa53c83aa9a27937caf21dd846c3fd6f9274
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c28b986c53d2c4dc45b78474b2e13f40f55e7da315b0b108f035a72e86cd9308
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e59b35e6b3d5165fdaae19137b371d5a6458b844c4a65494b02eba621e681844
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:881ffb5534b69c57a2e00bbfd8c40881c2f08386b7a27b1f3c4179356f366e18
+ size 348711830
last-checkpoint/global_step264/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63a9082d03b443436f3f7c312f3eae46a679986eef5d0a24630c726e3b8afa34
+ size 348711830
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step198
+ global_step264
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b659790db5a549bc94a52bd0661c6c5e6c19beea5b259996f6ed9fe2149516f2
+ oid sha256:756188867614fe144ce7bb4100b8fdc4a53793718efdbfd597ab9a7af1127cb3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4fedd424305bc76c60abafd8b0806d3107fa0fd9dcab69abdd8a175961c5d292
+ oid sha256:9326dda8ccb88256fea16bdb08bf3d8ee2d7890d74941621ea0ae79baad53127
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:018b23a19c9fcba72d4cdfa2c9fc3962ed3bc3bd0e06e1ebeb979a60bbcca587
+ oid sha256:dc7ea8107c02800ceda5d3219d8139cc0c46423c770369f8d482750d2ee66b59
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:655e1d8eba47928d19c120d020c1358f82da6b7b643dec3c9fd55e5052edd4fe
+ oid sha256:5100775819feb4598b355aaf5ae7a2d05f1e6c33d82585848692501430716b79
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0051f5950d3497fd49cd25af996fe01c32a4128ba6dc3623a168e00768ef4bd5
+ oid sha256:375d7beb01cab64b2715fb3d805593967127e2433072776577d1a22535bc71f6
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:81af3423c5f1a9239eebc7b36cb6e6db3f9862f7b90cd7560fef2590ee1d68d0
+ oid sha256:be4bc162636adeba1331e40da73f3fb1fde2fb44472545ff46bc3e2a6588d115
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:493f5e097b662c4de6f929779988d574e0855983f464da2bbac2cf6d59691a7a
+ oid sha256:9e4ec9613f9c318e718457c34ba482fb1b487745cd80d6e26c4479f47030f964
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:47313163e11ebbda29b8bf91fd61cb4b29fc84b8ec482325f230809ff25c6426
+ oid sha256:e6c5785e3656da35a0034b82ee38c2b260ac87d57dc93498957445739f27c017
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ced86f3a1c08cecda79ca695145fe007ebcfd4f2f8962847f6a9d9d58b4b557b
+ oid sha256:26eca587873b25805521ebb406b132a4ba3e54d5f099d35d9e497769da91dcd6
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.6073619631901841,
+ "epoch": 0.8098159509202454,
  "eval_steps": 66,
- "global_step": 198,
+ "global_step": 264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1425,6 +1425,476 @@
  "eval_samples_per_second": 1.793,
  "eval_steps_per_second": 0.126,
  "step": 198
+ },
+ {
+ "epoch": 0.6104294478527608,
+ "grad_norm": 0.31755022515076264,
+ "learning_rate": 4.313016905898286e-05,
+ "loss": 1.8861,
+ "step": 199
+ },
+ {
+ "epoch": 0.6134969325153374,
+ "grad_norm": 0.37514333178831394,
+ "learning_rate": 4.268516544675628e-05,
+ "loss": 1.9366,
+ "step": 200
+ },
+ {
+ "epoch": 0.6165644171779141,
+ "grad_norm": 0.2768732078613857,
+ "learning_rate": 4.224145985535202e-05,
+ "loss": 1.8781,
+ "step": 201
+ },
+ {
+ "epoch": 0.6196319018404908,
+ "grad_norm": 0.385983235346578,
+ "learning_rate": 4.1799099052681934e-05,
+ "loss": 1.9089,
+ "step": 202
+ },
+ {
+ "epoch": 0.6226993865030674,
+ "grad_norm": 0.34929147929166254,
+ "learning_rate": 4.135812966491305e-05,
+ "loss": 1.9409,
+ "step": 203
+ },
+ {
+ "epoch": 0.6257668711656442,
+ "grad_norm": 0.3448745967701562,
+ "learning_rate": 4.091859817155307e-05,
+ "loss": 1.8935,
+ "step": 204
+ },
+ {
+ "epoch": 0.6288343558282209,
+ "grad_norm": 0.24777573443198542,
+ "learning_rate": 4.048055090055125e-05,
+ "loss": 1.9007,
+ "step": 205
+ },
+ {
+ "epoch": 0.6319018404907976,
+ "grad_norm": 0.33163324355956286,
+ "learning_rate": 4.004403402341532e-05,
+ "loss": 1.8816,
+ "step": 206
+ },
+ {
+ "epoch": 0.6349693251533742,
+ "grad_norm": 0.6161345209342699,
+ "learning_rate": 3.960909355034491e-05,
+ "loss": 1.8952,
+ "step": 207
+ },
+ {
+ "epoch": 0.6380368098159509,
+ "grad_norm": 0.29863513222265725,
+ "learning_rate": 3.917577532538185e-05,
+ "loss": 1.8622,
+ "step": 208
+ },
+ {
+ "epoch": 0.6411042944785276,
+ "grad_norm": 0.23544641297651625,
+ "learning_rate": 3.8744125021578126e-05,
+ "loss": 1.9098,
+ "step": 209
+ },
+ {
+ "epoch": 0.6441717791411042,
+ "grad_norm": 0.29701664972205183,
+ "learning_rate": 3.831418813618177e-05,
+ "loss": 1.8963,
+ "step": 210
+ },
+ {
+ "epoch": 0.647239263803681,
+ "grad_norm": 0.2608462550147094,
+ "learning_rate": 3.788600998584135e-05,
+ "loss": 1.9425,
+ "step": 211
+ },
+ {
+ "epoch": 0.6503067484662577,
+ "grad_norm": 0.2753794235571961,
+ "learning_rate": 3.7459635701829435e-05,
+ "loss": 1.9312,
+ "step": 212
+ },
+ {
+ "epoch": 0.6533742331288344,
+ "grad_norm": 0.40974803557689143,
+ "learning_rate": 3.703511022528562e-05,
+ "loss": 1.8992,
+ "step": 213
+ },
+ {
+ "epoch": 0.656441717791411,
+ "grad_norm": 0.24030236007607908,
+ "learning_rate": 3.6612478302479594e-05,
+ "loss": 1.9326,
+ "step": 214
+ },
+ {
+ "epoch": 0.6595092024539877,
+ "grad_norm": 0.4383608820045659,
+ "learning_rate": 3.619178448009477e-05,
+ "loss": 1.932,
+ "step": 215
+ },
+ {
+ "epoch": 0.6625766871165644,
+ "grad_norm": 0.5102560092350799,
+ "learning_rate": 3.5773073100532874e-05,
+ "loss": 1.8956,
+ "step": 216
+ },
+ {
+ "epoch": 0.6656441717791411,
+ "grad_norm": 0.36274812580727284,
+ "learning_rate": 3.535638829724019e-05,
+ "loss": 1.8919,
+ "step": 217
+ },
+ {
+ "epoch": 0.6687116564417178,
+ "grad_norm": 0.23488730500365318,
+ "learning_rate": 3.494177399005578e-05,
+ "loss": 1.9158,
+ "step": 218
+ },
+ {
+ "epoch": 0.6717791411042945,
+ "grad_norm": 0.3741689726256645,
+ "learning_rate": 3.452927388058206e-05,
+ "loss": 1.9423,
+ "step": 219
+ },
+ {
+ "epoch": 0.6748466257668712,
+ "grad_norm": 0.25651456348082824,
+ "learning_rate": 3.411893144757866e-05,
+ "loss": 1.8415,
+ "step": 220
+ },
+ {
+ "epoch": 0.6779141104294478,
+ "grad_norm": 0.2612828905023135,
+ "learning_rate": 3.3710789942379556e-05,
+ "loss": 1.9472,
+ "step": 221
+ },
+ {
+ "epoch": 0.6809815950920245,
+ "grad_norm": 0.25469935789428655,
+ "learning_rate": 3.33048923843343e-05,
+ "loss": 1.949,
+ "step": 222
+ },
+ {
+ "epoch": 0.6840490797546013,
+ "grad_norm": 0.23410106434735667,
+ "learning_rate": 3.2901281556273646e-05,
+ "loss": 1.8963,
+ "step": 223
+ },
+ {
+ "epoch": 0.6871165644171779,
+ "grad_norm": 0.25811790889112224,
+ "learning_rate": 3.250000000000001e-05,
+ "loss": 1.8488,
+ "step": 224
+ },
+ {
+ "epoch": 0.6901840490797546,
+ "grad_norm": 0.2701258126507899,
+ "learning_rate": 3.210109001180358e-05,
+ "loss": 1.9429,
+ "step": 225
+ },
+ {
+ "epoch": 0.6932515337423313,
+ "grad_norm": 0.27336206551312103,
+ "learning_rate": 3.170459363800409e-05,
+ "loss": 1.9063,
+ "step": 226
+ },
+ {
+ "epoch": 0.696319018404908,
+ "grad_norm": 0.30139160569284024,
+ "learning_rate": 3.1310552670518986e-05,
+ "loss": 1.9182,
+ "step": 227
+ },
+ {
+ "epoch": 0.6993865030674846,
+ "grad_norm": 0.23370917590561624,
+ "learning_rate": 3.0919008642458494e-05,
+ "loss": 1.9541,
+ "step": 228
+ },
+ {
+ "epoch": 0.7024539877300614,
+ "grad_norm": 0.22222235132591592,
+ "learning_rate": 3.053000282374781e-05,
+ "loss": 1.8864,
+ "step": 229
+ },
+ {
+ "epoch": 0.7055214723926381,
+ "grad_norm": 0.27873390973935386,
+ "learning_rate": 3.014357621677724e-05,
+ "loss": 1.8852,
+ "step": 230
+ },
+ {
+ "epoch": 0.7085889570552147,
+ "grad_norm": 0.3108583744507131,
+ "learning_rate": 2.9759769552080376e-05,
+ "loss": 1.8663,
+ "step": 231
+ },
+ {
+ "epoch": 0.7116564417177914,
+ "grad_norm": 0.30913975922284836,
+ "learning_rate": 2.93786232840409e-05,
+ "loss": 1.9404,
+ "step": 232
+ },
+ {
+ "epoch": 0.7147239263803681,
+ "grad_norm": 0.28385532690084997,
+ "learning_rate": 2.90001775866287e-05,
+ "loss": 1.9023,
+ "step": 233
+ },
+ {
+ "epoch": 0.7177914110429447,
+ "grad_norm": 0.2808200803737186,
+ "learning_rate": 2.8624472349165355e-05,
+ "loss": 1.9192,
+ "step": 234
+ },
+ {
+ "epoch": 0.7208588957055214,
+ "grad_norm": 0.23648694756886077,
+ "learning_rate": 2.8251547172119603e-05,
+ "loss": 2.0132,
+ "step": 235
+ },
+ {
+ "epoch": 0.7239263803680982,
+ "grad_norm": 0.6069490067148141,
+ "learning_rate": 2.7881441362933468e-05,
+ "loss": 1.8395,
+ "step": 236
+ },
+ {
+ "epoch": 0.7269938650306749,
+ "grad_norm": 0.3350257794257116,
+ "learning_rate": 2.751419393187905e-05,
+ "loss": 1.8667,
+ "step": 237
+ },
+ {
+ "epoch": 0.7300613496932515,
+ "grad_norm": 0.232164276820369,
+ "learning_rate": 2.7149843587946744e-05,
+ "loss": 1.8656,
+ "step": 238
+ },
+ {
+ "epoch": 0.7331288343558282,
+ "grad_norm": 0.38356734047420593,
+ "learning_rate": 2.6788428734765224e-05,
+ "loss": 1.9048,
+ "step": 239
+ },
+ {
+ "epoch": 0.7361963190184049,
+ "grad_norm": 0.2618731826165273,
+ "learning_rate": 2.642998746655348e-05,
+ "loss": 1.9783,
+ "step": 240
+ },
+ {
+ "epoch": 0.7392638036809815,
+ "grad_norm": 0.6648822511934657,
+ "learning_rate": 2.6074557564105727e-05,
+ "loss": 1.9043,
+ "step": 241
+ },
+ {
+ "epoch": 0.7423312883435583,
+ "grad_norm": 0.27175163581016115,
+ "learning_rate": 2.5722176490809118e-05,
+ "loss": 1.9585,
+ "step": 242
+ },
+ {
+ "epoch": 0.745398773006135,
+ "grad_norm": 0.3925966681047075,
+ "learning_rate": 2.5372881388694912e-05,
+ "loss": 1.8515,
+ "step": 243
+ },
+ {
+ "epoch": 0.7484662576687117,
+ "grad_norm": 0.37190935188206453,
+ "learning_rate": 2.5026709074523748e-05,
+ "loss": 1.9688,
+ "step": 244
+ },
+ {
+ "epoch": 0.7515337423312883,
+ "grad_norm": 0.2257138379202953,
+ "learning_rate": 2.4683696035904928e-05,
+ "loss": 1.9486,
+ "step": 245
+ },
+ {
+ "epoch": 0.754601226993865,
+ "grad_norm": 0.2274145468605237,
+ "learning_rate": 2.434387842745056e-05,
+ "loss": 1.9302,
+ "step": 246
+ },
+ {
+ "epoch": 0.7576687116564417,
+ "grad_norm": 0.5126959359452324,
+ "learning_rate": 2.400729206696477e-05,
+ "loss": 1.9443,
+ "step": 247
+ },
+ {
+ "epoch": 0.7607361963190185,
+ "grad_norm": 0.2551304692334095,
+ "learning_rate": 2.3673972431668306e-05,
+ "loss": 2.009,
+ "step": 248
+ },
+ {
+ "epoch": 0.7638036809815951,
+ "grad_norm": 0.4447523876477682,
+ "learning_rate": 2.334395465445926e-05,
+ "loss": 1.8468,
+ "step": 249
+ },
+ {
+ "epoch": 0.7668711656441718,
+ "grad_norm": 0.2657558360669318,
+ "learning_rate": 2.3017273520209882e-05,
+ "loss": 1.8886,
+ "step": 250
+ },
+ {
+ "epoch": 0.7699386503067485,
+ "grad_norm": 0.37573420755761094,
+ "learning_rate": 2.2693963462100117e-05,
+ "loss": 1.8663,
+ "step": 251
+ },
+ {
+ "epoch": 0.7730061349693251,
+ "grad_norm": 0.26075506564879214,
+ "learning_rate": 2.2374058557988336e-05,
+ "loss": 1.909,
+ "step": 252
+ },
+ {
+ "epoch": 0.7760736196319018,
+ "grad_norm": 0.2951446457265513,
+ "learning_rate": 2.2057592526819353e-05,
+ "loss": 1.9362,
+ "step": 253
+ },
+ {
+ "epoch": 0.7791411042944786,
+ "grad_norm": 0.24420003456766767,
+ "learning_rate": 2.1744598725070347e-05,
+ "loss": 1.9134,
+ "step": 254
+ },
+ {
+ "epoch": 0.7822085889570553,
+ "grad_norm": 0.2563261666147908,
+ "learning_rate": 2.143511014323506e-05,
+ "loss": 1.9569,
+ "step": 255
+ },
+ {
+ "epoch": 0.7852760736196319,
+ "grad_norm": 0.27427716272900493,
+ "learning_rate": 2.11291594023464e-05,
+ "loss": 1.8982,
+ "step": 256
+ },
+ {
+ "epoch": 0.7883435582822086,
+ "grad_norm": 0.4685271777395839,
+ "learning_rate": 2.082677875053818e-05,
+ "loss": 1.9256,
+ "step": 257
+ },
+ {
+ "epoch": 0.7914110429447853,
+ "grad_norm": 0.3080424306042412,
+ "learning_rate": 2.0528000059645997e-05,
+ "loss": 1.9154,
+ "step": 258
+ },
+ {
+ "epoch": 0.7944785276073619,
+ "grad_norm": 0.2672783439075976,
+ "learning_rate": 2.023285482184785e-05,
+ "loss": 1.9574,
+ "step": 259
+ },
+ {
+ "epoch": 0.7975460122699386,
+ "grad_norm": 0.3311914465278651,
+ "learning_rate": 1.994137414634483e-05,
+ "loss": 1.9133,
+ "step": 260
+ },
+ {
+ "epoch": 0.8006134969325154,
+ "grad_norm": 0.7675438620825049,
+ "learning_rate": 1.9653588756082064e-05,
+ "loss": 1.892,
+ "step": 261
+ },
+ {
+ "epoch": 0.803680981595092,
+ "grad_norm": 0.2757310062776552,
+ "learning_rate": 1.9369528984510394e-05,
+ "loss": 1.9087,
+ "step": 262
+ },
+ {
+ "epoch": 0.8067484662576687,
+ "grad_norm": 0.24797296946202665,
+ "learning_rate": 1.9089224772389225e-05,
+ "loss": 1.8836,
+ "step": 263
+ },
+ {
+ "epoch": 0.8098159509202454,
+ "grad_norm": 0.41244928985184576,
+ "learning_rate": 1.881270566463062e-05,
+ "loss": 1.9094,
+ "step": 264
+ },
+ {
+ "epoch": 0.8098159509202454,
+ "eval_loss": 2.593792200088501,
+ "eval_runtime": 55.7303,
+ "eval_samples_per_second": 1.794,
+ "eval_steps_per_second": 0.126,
+ "step": 264
  }
  ],
  "logging_steps": 1,
@@ -1444,7 +1914,7 @@
  "attributes": {}
  }
  },
- "total_flos": 216215096131584.0,
+ "total_flos": 288286794842112.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null