prxy5606 committed on
Commit
5f8287b
·
verified ·
1 Parent(s): 95d25ac

Training in progress, step 250, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b2de8b9c49da0702145eca4649d5e27fb8b37259f1fbf501749e7780ae33249
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e5e70313e832a97b4a5f909837ad9943129a92cc5051d6a727def5672f1a649
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b85797c129ff1060ce0dc1d96ef77a9b4c61c1d637d5f0c81561a97d564ac164
3
  size 1342555602
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d53e875526cc7f45b9ee008df430f17f426411218e934f51ade9ef8b575fcb
3
  size 1342555602
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:579394abb2c244ad26cc7a4fe6ac0f2c5b0a50affcc1f3a70e165a1601351152
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3055d414afa45ec35d956573da41804fc6e7dcd64cd4972d9d8a03f439aed732
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:721fbab3fb8bec5f97cdbed7b104a2d545aa7387136ca9cabeed23ace817187d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cc9840a370c6adb27f4a6f97aa77bff7e496e9a0cca1d81b87512770179ee98
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.62479567527771,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.04571689810846334,
5
  "eval_steps": 50,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1447,6 +1447,364 @@
1447
  "eval_samples_per_second": 5.533,
1448
  "eval_steps_per_second": 2.767,
1449
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  }
1451
  ],
1452
  "logging_steps": 1,
@@ -1475,7 +1833,7 @@
1475
  "attributes": {}
1476
  }
1477
  },
1478
- "total_flos": 4.4575587979965235e+17,
1479
  "train_batch_size": 8,
1480
  "trial_name": null,
1481
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5557529926300049,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-250",
4
+ "epoch": 0.05714612263557917,
5
  "eval_steps": 50,
6
+ "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1447
  "eval_samples_per_second": 5.533,
1448
  "eval_steps_per_second": 2.767,
1449
  "step": 200
1450
+ },
1451
+ {
1452
+ "epoch": 0.04594548259900566,
1453
+ "grad_norm": 10.6249418258667,
1454
+ "learning_rate": 2.9663167846209998e-05,
1455
+ "loss": 3.5265,
1456
+ "step": 201
1457
+ },
1458
+ {
1459
+ "epoch": 0.04617406708954797,
1460
+ "grad_norm": 8.47287368774414,
1461
+ "learning_rate": 2.9133077275909108e-05,
1462
+ "loss": 3.4879,
1463
+ "step": 202
1464
+ },
1465
+ {
1466
+ "epoch": 0.04640265158009029,
1467
+ "grad_norm": 7.781945705413818,
1468
+ "learning_rate": 2.86058117529173e-05,
1469
+ "loss": 3.0601,
1470
+ "step": 203
1471
+ },
1472
+ {
1473
+ "epoch": 0.04663123607063261,
1474
+ "grad_norm": 7.176991939544678,
1475
+ "learning_rate": 2.8081442660546125e-05,
1476
+ "loss": 2.6794,
1477
+ "step": 204
1478
+ },
1479
+ {
1480
+ "epoch": 0.04685982056117492,
1481
+ "grad_norm": 6.956686496734619,
1482
+ "learning_rate": 2.7560040989976892e-05,
1483
+ "loss": 2.3741,
1484
+ "step": 205
1485
+ },
1486
+ {
1487
+ "epoch": 0.04708840505171724,
1488
+ "grad_norm": 7.541036605834961,
1489
+ "learning_rate": 2.7041677330649407e-05,
1490
+ "loss": 2.4602,
1491
+ "step": 206
1492
+ },
1493
+ {
1494
+ "epoch": 0.04731698954225956,
1495
+ "grad_norm": 26.17896270751953,
1496
+ "learning_rate": 2.6526421860705473e-05,
1497
+ "loss": 1.7172,
1498
+ "step": 207
1499
+ },
1500
+ {
1501
+ "epoch": 0.04754557403280187,
1502
+ "grad_norm": 44.718421936035156,
1503
+ "learning_rate": 2.6014344337487707e-05,
1504
+ "loss": 1.8271,
1505
+ "step": 208
1506
+ },
1507
+ {
1508
+ "epoch": 0.04777415852334419,
1509
+ "grad_norm": 23.585582733154297,
1510
+ "learning_rate": 2.5505514088095655e-05,
1511
+ "loss": 1.1668,
1512
+ "step": 209
1513
+ },
1514
+ {
1515
+ "epoch": 0.04800274301388651,
1516
+ "grad_norm": 18.206127166748047,
1517
+ "learning_rate": 2.500000000000001e-05,
1518
+ "loss": 0.7469,
1519
+ "step": 210
1520
+ },
1521
+ {
1522
+ "epoch": 0.04823132750442882,
1523
+ "grad_norm": 4.393265724182129,
1524
+ "learning_rate": 2.4497870511716235e-05,
1525
+ "loss": 0.1077,
1526
+ "step": 211
1527
+ },
1528
+ {
1529
+ "epoch": 0.04845991199497114,
1530
+ "grad_norm": 4.106040954589844,
1531
+ "learning_rate": 2.399919360353923e-05,
1532
+ "loss": 0.2907,
1533
+ "step": 212
1534
+ },
1535
+ {
1536
+ "epoch": 0.04868849648551346,
1537
+ "grad_norm": 13.828383445739746,
1538
+ "learning_rate": 2.350403678833976e-05,
1539
+ "loss": 0.8737,
1540
+ "step": 213
1541
+ },
1542
+ {
1543
+ "epoch": 0.04891708097605577,
1544
+ "grad_norm": 59.581298828125,
1545
+ "learning_rate": 2.3012467102424373e-05,
1546
+ "loss": 4.8994,
1547
+ "step": 214
1548
+ },
1549
+ {
1550
+ "epoch": 0.04914566546659809,
1551
+ "grad_norm": 52.219642639160156,
1552
+ "learning_rate": 2.25245510964597e-05,
1553
+ "loss": 4.3086,
1554
+ "step": 215
1555
+ },
1556
+ {
1557
+ "epoch": 0.04937424995714041,
1558
+ "grad_norm": 57.01164627075195,
1559
+ "learning_rate": 2.2040354826462668e-05,
1560
+ "loss": 3.0226,
1561
+ "step": 216
1562
+ },
1563
+ {
1564
+ "epoch": 0.04960283444768272,
1565
+ "grad_norm": 35.1441764831543,
1566
+ "learning_rate": 2.1559943844857422e-05,
1567
+ "loss": 1.8326,
1568
+ "step": 217
1569
+ },
1570
+ {
1571
+ "epoch": 0.04983141893822504,
1572
+ "grad_norm": 23.194089889526367,
1573
+ "learning_rate": 2.1083383191600674e-05,
1574
+ "loss": 0.9515,
1575
+ "step": 218
1576
+ },
1577
+ {
1578
+ "epoch": 0.05006000342876736,
1579
+ "grad_norm": 20.370725631713867,
1580
+ "learning_rate": 2.061073738537635e-05,
1581
+ "loss": 2.2338,
1582
+ "step": 219
1583
+ },
1584
+ {
1585
+ "epoch": 0.05028858791930967,
1586
+ "grad_norm": 31.7033634185791,
1587
+ "learning_rate": 2.0142070414860704e-05,
1588
+ "loss": 2.838,
1589
+ "step": 220
1590
+ },
1591
+ {
1592
+ "epoch": 0.05051717240985199,
1593
+ "grad_norm": 30.212299346923828,
1594
+ "learning_rate": 1.9677445730059346e-05,
1595
+ "loss": 3.5437,
1596
+ "step": 221
1597
+ },
1598
+ {
1599
+ "epoch": 0.05074575690039431,
1600
+ "grad_norm": 24.48371124267578,
1601
+ "learning_rate": 1.9216926233717085e-05,
1602
+ "loss": 2.1516,
1603
+ "step": 222
1604
+ },
1605
+ {
1606
+ "epoch": 0.05097434139093662,
1607
+ "grad_norm": 26.28635597229004,
1608
+ "learning_rate": 1.8760574272802e-05,
1609
+ "loss": 1.8107,
1610
+ "step": 223
1611
+ },
1612
+ {
1613
+ "epoch": 0.05120292588147894,
1614
+ "grad_norm": 64.8018569946289,
1615
+ "learning_rate": 1.8308451630064484e-05,
1616
+ "loss": 3.8766,
1617
+ "step": 224
1618
+ },
1619
+ {
1620
+ "epoch": 0.05143151037202126,
1621
+ "grad_norm": 79.22872161865234,
1622
+ "learning_rate": 1.7860619515673033e-05,
1623
+ "loss": 5.5396,
1624
+ "step": 225
1625
+ },
1626
+ {
1627
+ "epoch": 0.05166009486256357,
1628
+ "grad_norm": 81.44451904296875,
1629
+ "learning_rate": 1.7417138558927244e-05,
1630
+ "loss": 5.3564,
1631
+ "step": 226
1632
+ },
1633
+ {
1634
+ "epoch": 0.05188867935310589,
1635
+ "grad_norm": 74.99747467041016,
1636
+ "learning_rate": 1.6978068800049624e-05,
1637
+ "loss": 4.2999,
1638
+ "step": 227
1639
+ },
1640
+ {
1641
+ "epoch": 0.05211726384364821,
1642
+ "grad_norm": 30.039758682250977,
1643
+ "learning_rate": 1.6543469682057106e-05,
1644
+ "loss": 4.5715,
1645
+ "step": 228
1646
+ },
1647
+ {
1648
+ "epoch": 0.05234584833419052,
1649
+ "grad_norm": 25.890417098999023,
1650
+ "learning_rate": 1.611340004271339e-05,
1651
+ "loss": 4.2147,
1652
+ "step": 229
1653
+ },
1654
+ {
1655
+ "epoch": 0.05257443282473284,
1656
+ "grad_norm": 24.455820083618164,
1657
+ "learning_rate": 1.5687918106563326e-05,
1658
+ "loss": 3.6963,
1659
+ "step": 230
1660
+ },
1661
+ {
1662
+ "epoch": 0.05280301731527516,
1663
+ "grad_norm": 13.776949882507324,
1664
+ "learning_rate": 1.526708147705013e-05,
1665
+ "loss": 2.5523,
1666
+ "step": 231
1667
+ },
1668
+ {
1669
+ "epoch": 0.05303160180581747,
1670
+ "grad_norm": 14.489748001098633,
1671
+ "learning_rate": 1.4850947128716913e-05,
1672
+ "loss": 2.8481,
1673
+ "step": 232
1674
+ },
1675
+ {
1676
+ "epoch": 0.05326018629635979,
1677
+ "grad_norm": 14.851402282714844,
1678
+ "learning_rate": 1.4439571399493146e-05,
1679
+ "loss": 2.9429,
1680
+ "step": 233
1681
+ },
1682
+ {
1683
+ "epoch": 0.05348877078690211,
1684
+ "grad_norm": 14.959819793701172,
1685
+ "learning_rate": 1.4033009983067452e-05,
1686
+ "loss": 2.7784,
1687
+ "step": 234
1688
+ },
1689
+ {
1690
+ "epoch": 0.05371735527744442,
1691
+ "grad_norm": 10.266538619995117,
1692
+ "learning_rate": 1.3631317921347563e-05,
1693
+ "loss": 2.1112,
1694
+ "step": 235
1695
+ },
1696
+ {
1697
+ "epoch": 0.05394593976798674,
1698
+ "grad_norm": 13.498571395874023,
1699
+ "learning_rate": 1.3234549597008571e-05,
1700
+ "loss": 2.61,
1701
+ "step": 236
1702
+ },
1703
+ {
1704
+ "epoch": 0.05417452425852906,
1705
+ "grad_norm": 13.794647216796875,
1706
+ "learning_rate": 1.2842758726130283e-05,
1707
+ "loss": 2.655,
1708
+ "step": 237
1709
+ },
1710
+ {
1711
+ "epoch": 0.05440310874907137,
1712
+ "grad_norm": 12.700047492980957,
1713
+ "learning_rate": 1.245599835092504e-05,
1714
+ "loss": 2.1982,
1715
+ "step": 238
1716
+ },
1717
+ {
1718
+ "epoch": 0.05463169323961369,
1719
+ "grad_norm": 11.33856201171875,
1720
+ "learning_rate": 1.2074320832556556e-05,
1721
+ "loss": 1.8852,
1722
+ "step": 239
1723
+ },
1724
+ {
1725
+ "epoch": 0.05486027773015601,
1726
+ "grad_norm": 12.628767013549805,
1727
+ "learning_rate": 1.1697777844051105e-05,
1728
+ "loss": 2.6147,
1729
+ "step": 240
1730
+ },
1731
+ {
1732
+ "epoch": 0.05508886222069832,
1733
+ "grad_norm": 12.457136154174805,
1734
+ "learning_rate": 1.132642036330181e-05,
1735
+ "loss": 2.2797,
1736
+ "step": 241
1737
+ },
1738
+ {
1739
+ "epoch": 0.05531744671124064,
1740
+ "grad_norm": 16.78705406188965,
1741
+ "learning_rate": 1.096029866616704e-05,
1742
+ "loss": 2.1772,
1743
+ "step": 242
1744
+ },
1745
+ {
1746
+ "epoch": 0.05554603120178296,
1747
+ "grad_norm": 20.40379524230957,
1748
+ "learning_rate": 1.0599462319663905e-05,
1749
+ "loss": 4.4865,
1750
+ "step": 243
1751
+ },
1752
+ {
1753
+ "epoch": 0.05577461569232527,
1754
+ "grad_norm": 47.940460205078125,
1755
+ "learning_rate": 1.0243960175257606e-05,
1756
+ "loss": 4.1248,
1757
+ "step": 244
1758
+ },
1759
+ {
1760
+ "epoch": 0.05600320018286759,
1761
+ "grad_norm": 20.962236404418945,
1762
+ "learning_rate": 9.893840362247809e-06,
1763
+ "loss": 5.0609,
1764
+ "step": 245
1765
+ },
1766
+ {
1767
+ "epoch": 0.05623178467340991,
1768
+ "grad_norm": 17.944730758666992,
1769
+ "learning_rate": 9.549150281252633e-06,
1770
+ "loss": 5.8087,
1771
+ "step": 246
1772
+ },
1773
+ {
1774
+ "epoch": 0.05646036916395222,
1775
+ "grad_norm": 14.370197296142578,
1776
+ "learning_rate": 9.209936597791407e-06,
1777
+ "loss": 4.9449,
1778
+ "step": 247
1779
+ },
1780
+ {
1781
+ "epoch": 0.05668895365449454,
1782
+ "grad_norm": 13.268957138061523,
1783
+ "learning_rate": 8.876245235966885e-06,
1784
+ "loss": 4.4975,
1785
+ "step": 248
1786
+ },
1787
+ {
1788
+ "epoch": 0.05691753814503686,
1789
+ "grad_norm": 13.039083480834961,
1790
+ "learning_rate": 8.548121372247918e-06,
1791
+ "loss": 3.907,
1792
+ "step": 249
1793
+ },
1794
+ {
1795
+ "epoch": 0.05714612263557917,
1796
+ "grad_norm": 14.2072172164917,
1797
+ "learning_rate": 8.225609429353187e-06,
1798
+ "loss": 3.7406,
1799
+ "step": 250
1800
+ },
1801
+ {
1802
+ "epoch": 0.05714612263557917,
1803
+ "eval_loss": 0.5557529926300049,
1804
+ "eval_runtime": 1331.7964,
1805
+ "eval_samples_per_second": 5.532,
1806
+ "eval_steps_per_second": 2.766,
1807
+ "step": 250
1808
  }
1809
  ],
1810
  "logging_steps": 1,
 
1833
  "attributes": {}
1834
  }
1835
  },
1836
+ "total_flos": 5.566582254961951e+17,
1837
  "train_batch_size": 8,
1838
  "trial_name": null,
1839
  "trial_params": null