musabg commited on
Commit
b25c2a4
1 Parent(s): 6414281

Upload folder using huggingface_hub

Browse files
pytorch_model-00001-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0109702a17586a6669f6211be7906e986e5292f56b1e53682a9decc684dff422
3
- size 9877989650
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aaa6098082d792fee0be032e0dd660213189af96f0213a828d4cc55d626d222
3
+ size 9877988050
pytorch_model-00002-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cec7a4785a52bef4efdb94137e6c54a479ed55762251e2ecc1658ee1bf38ff38
3
- size 9894801206
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:372de6c40b11b32b0b29f6304b8a22f2c81e56f9516f7ea134115145bb5525be
3
+ size 9894799542
pytorch_model-00003-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:519cc27e43a806ed30c155517838a9b34b0e96de21ac23267a2e2fdfd4c4c2e8
3
- size 7180990841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5049d5c4b55dada711ff08977660e67d02d7a9741e09b2539913368037efaca
3
+ size 7180989689
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.5174825174825175,
5
- "global_step": 720,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -4342,11 +4342,856 @@
4342
  "learning_rate": 1.3271869055905495e-06,
4343
  "loss": 0.1769,
4344
  "step": 720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4345
  }
4346
  ],
4347
  "max_steps": 858,
4348
  "num_train_epochs": 3,
4349
- "total_flos": 1.8324177213299098e+18,
4350
  "trial_name": null,
4351
  "trial_params": null
4352
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "global_step": 858,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
4342
  "learning_rate": 1.3271869055905495e-06,
4343
  "loss": 0.1769,
4344
  "step": 720
4345
+ },
4346
+ {
4347
+ "epoch": 2.52,
4348
+ "learning_rate": 1.3084513916284913e-06,
4349
+ "loss": 0.1691,
4350
+ "step": 721
4351
+ },
4352
+ {
4353
+ "epoch": 2.52,
4354
+ "learning_rate": 1.2898398000448441e-06,
4355
+ "loss": 0.1835,
4356
+ "step": 722
4357
+ },
4358
+ {
4359
+ "epoch": 2.53,
4360
+ "learning_rate": 1.2713523961999996e-06,
4361
+ "loss": 0.1878,
4362
+ "step": 723
4363
+ },
4364
+ {
4365
+ "epoch": 2.53,
4366
+ "learning_rate": 1.2529894436836965e-06,
4367
+ "loss": 0.178,
4368
+ "step": 724
4369
+ },
4370
+ {
4371
+ "epoch": 2.53,
4372
+ "learning_rate": 1.2347512043112753e-06,
4373
+ "loss": 0.1772,
4374
+ "step": 725
4375
+ },
4376
+ {
4377
+ "epoch": 2.54,
4378
+ "learning_rate": 1.2166379381199423e-06,
4379
+ "loss": 0.1823,
4380
+ "step": 726
4381
+ },
4382
+ {
4383
+ "epoch": 2.54,
4384
+ "learning_rate": 1.1986499033650557e-06,
4385
+ "loss": 0.176,
4386
+ "step": 727
4387
+ },
4388
+ {
4389
+ "epoch": 2.55,
4390
+ "learning_rate": 1.1807873565164507e-06,
4391
+ "loss": 0.1685,
4392
+ "step": 728
4393
+ },
4394
+ {
4395
+ "epoch": 2.55,
4396
+ "learning_rate": 1.1630505522547853e-06,
4397
+ "loss": 0.1721,
4398
+ "step": 729
4399
+ },
4400
+ {
4401
+ "epoch": 2.55,
4402
+ "learning_rate": 1.1454397434679022e-06,
4403
+ "loss": 0.1601,
4404
+ "step": 730
4405
+ },
4406
+ {
4407
+ "epoch": 2.56,
4408
+ "learning_rate": 1.12795518124722e-06,
4409
+ "loss": 0.1755,
4410
+ "step": 731
4411
+ },
4412
+ {
4413
+ "epoch": 2.56,
4414
+ "learning_rate": 1.11059711488417e-06,
4415
+ "loss": 0.1735,
4416
+ "step": 732
4417
+ },
4418
+ {
4419
+ "epoch": 2.56,
4420
+ "learning_rate": 1.0933657918666175e-06,
4421
+ "loss": 0.1627,
4422
+ "step": 733
4423
+ },
4424
+ {
4425
+ "epoch": 2.57,
4426
+ "learning_rate": 1.0762614578753571e-06,
4427
+ "loss": 0.1694,
4428
+ "step": 734
4429
+ },
4430
+ {
4431
+ "epoch": 2.57,
4432
+ "learning_rate": 1.0592843567805944e-06,
4433
+ "loss": 0.1933,
4434
+ "step": 735
4435
+ },
4436
+ {
4437
+ "epoch": 2.57,
4438
+ "learning_rate": 1.042434730638473e-06,
4439
+ "loss": 0.1702,
4440
+ "step": 736
4441
+ },
4442
+ {
4443
+ "epoch": 2.58,
4444
+ "learning_rate": 1.0257128196876233e-06,
4445
+ "loss": 0.1776,
4446
+ "step": 737
4447
+ },
4448
+ {
4449
+ "epoch": 2.58,
4450
+ "learning_rate": 1.0091188623457415e-06,
4451
+ "loss": 0.1745,
4452
+ "step": 738
4453
+ },
4454
+ {
4455
+ "epoch": 2.58,
4456
+ "learning_rate": 9.926530952061831e-07,
4457
+ "loss": 0.1746,
4458
+ "step": 739
4459
+ },
4460
+ {
4461
+ "epoch": 2.59,
4462
+ "learning_rate": 9.763157530345957e-07,
4463
+ "loss": 0.1832,
4464
+ "step": 740
4465
+ },
4466
+ {
4467
+ "epoch": 2.59,
4468
+ "learning_rate": 9.601070687655667e-07,
4469
+ "loss": 0.1721,
4470
+ "step": 741
4471
+ },
4472
+ {
4473
+ "epoch": 2.59,
4474
+ "learning_rate": 9.440272734993072e-07,
4475
+ "loss": 0.1767,
4476
+ "step": 742
4477
+ },
4478
+ {
4479
+ "epoch": 2.6,
4480
+ "learning_rate": 9.280765964983529e-07,
4481
+ "loss": 0.1743,
4482
+ "step": 743
4483
+ },
4484
+ {
4485
+ "epoch": 2.6,
4486
+ "learning_rate": 9.122552651842931e-07,
4487
+ "loss": 0.1906,
4488
+ "step": 744
4489
+ },
4490
+ {
4491
+ "epoch": 2.6,
4492
+ "learning_rate": 8.965635051345411e-07,
4493
+ "loss": 0.1703,
4494
+ "step": 745
4495
+ },
4496
+ {
4497
+ "epoch": 2.61,
4498
+ "learning_rate": 8.810015400790994e-07,
4499
+ "loss": 0.1847,
4500
+ "step": 746
4501
+ },
4502
+ {
4503
+ "epoch": 2.61,
4504
+ "learning_rate": 8.655695918973861e-07,
4505
+ "loss": 0.178,
4506
+ "step": 747
4507
+ },
4508
+ {
4509
+ "epoch": 2.62,
4510
+ "learning_rate": 8.502678806150588e-07,
4511
+ "loss": 0.1808,
4512
+ "step": 748
4513
+ },
4514
+ {
4515
+ "epoch": 2.62,
4516
+ "learning_rate": 8.350966244008896e-07,
4517
+ "loss": 0.1768,
4518
+ "step": 749
4519
+ },
4520
+ {
4521
+ "epoch": 2.62,
4522
+ "learning_rate": 8.200560395636414e-07,
4523
+ "loss": 0.1871,
4524
+ "step": 750
4525
+ },
4526
+ {
4527
+ "epoch": 2.63,
4528
+ "learning_rate": 8.051463405489956e-07,
4529
+ "loss": 0.176,
4530
+ "step": 751
4531
+ },
4532
+ {
4533
+ "epoch": 2.63,
4534
+ "learning_rate": 7.903677399364839e-07,
4535
+ "loss": 0.1715,
4536
+ "step": 752
4537
+ },
4538
+ {
4539
+ "epoch": 2.63,
4540
+ "learning_rate": 7.757204484364699e-07,
4541
+ "loss": 0.1839,
4542
+ "step": 753
4543
+ },
4544
+ {
4545
+ "epoch": 2.64,
4546
+ "learning_rate": 7.612046748871327e-07,
4547
+ "loss": 0.1799,
4548
+ "step": 754
4549
+ },
4550
+ {
4551
+ "epoch": 2.64,
4552
+ "learning_rate": 7.468206262514965e-07,
4553
+ "loss": 0.1664,
4554
+ "step": 755
4555
+ },
4556
+ {
4557
+ "epoch": 2.64,
4558
+ "learning_rate": 7.325685076144795e-07,
4559
+ "loss": 0.1742,
4560
+ "step": 756
4561
+ },
4562
+ {
4563
+ "epoch": 2.65,
4564
+ "learning_rate": 7.184485221799631e-07,
4565
+ "loss": 0.1811,
4566
+ "step": 757
4567
+ },
4568
+ {
4569
+ "epoch": 2.65,
4570
+ "learning_rate": 7.044608712679058e-07,
4571
+ "loss": 0.1805,
4572
+ "step": 758
4573
+ },
4574
+ {
4575
+ "epoch": 2.65,
4576
+ "learning_rate": 6.90605754311462e-07,
4577
+ "loss": 0.1733,
4578
+ "step": 759
4579
+ },
4580
+ {
4581
+ "epoch": 2.66,
4582
+ "learning_rate": 6.768833688541443e-07,
4583
+ "loss": 0.1746,
4584
+ "step": 760
4585
+ },
4586
+ {
4587
+ "epoch": 2.66,
4588
+ "learning_rate": 6.632939105470049e-07,
4589
+ "loss": 0.1745,
4590
+ "step": 761
4591
+ },
4592
+ {
4593
+ "epoch": 2.66,
4594
+ "learning_rate": 6.498375731458529e-07,
4595
+ "loss": 0.176,
4596
+ "step": 762
4597
+ },
4598
+ {
4599
+ "epoch": 2.67,
4600
+ "learning_rate": 6.365145485084767e-07,
4601
+ "loss": 0.1773,
4602
+ "step": 763
4603
+ },
4604
+ {
4605
+ "epoch": 2.67,
4606
+ "learning_rate": 6.233250265919266e-07,
4607
+ "loss": 0.1801,
4608
+ "step": 764
4609
+ },
4610
+ {
4611
+ "epoch": 2.67,
4612
+ "learning_rate": 6.102691954497908e-07,
4613
+ "loss": 0.177,
4614
+ "step": 765
4615
+ },
4616
+ {
4617
+ "epoch": 2.68,
4618
+ "learning_rate": 5.973472412295256e-07,
4619
+ "loss": 0.1753,
4620
+ "step": 766
4621
+ },
4622
+ {
4623
+ "epoch": 2.68,
4624
+ "learning_rate": 5.845593481697931e-07,
4625
+ "loss": 0.1721,
4626
+ "step": 767
4627
+ },
4628
+ {
4629
+ "epoch": 2.69,
4630
+ "learning_rate": 5.719056985978388e-07,
4631
+ "loss": 0.1906,
4632
+ "step": 768
4633
+ },
4634
+ {
4635
+ "epoch": 2.69,
4636
+ "learning_rate": 5.59386472926895e-07,
4637
+ "loss": 0.192,
4638
+ "step": 769
4639
+ },
4640
+ {
4641
+ "epoch": 2.69,
4642
+ "learning_rate": 5.470018496535967e-07,
4643
+ "loss": 0.1909,
4644
+ "step": 770
4645
+ },
4646
+ {
4647
+ "epoch": 2.7,
4648
+ "learning_rate": 5.347520053554544e-07,
4649
+ "loss": 0.1836,
4650
+ "step": 771
4651
+ },
4652
+ {
4653
+ "epoch": 2.7,
4654
+ "learning_rate": 5.22637114688318e-07,
4655
+ "loss": 0.1815,
4656
+ "step": 772
4657
+ },
4658
+ {
4659
+ "epoch": 2.7,
4660
+ "learning_rate": 5.106573503839018e-07,
4661
+ "loss": 0.1717,
4662
+ "step": 773
4663
+ },
4664
+ {
4665
+ "epoch": 2.71,
4666
+ "learning_rate": 4.988128832473105e-07,
4667
+ "loss": 0.1843,
4668
+ "step": 774
4669
+ },
4670
+ {
4671
+ "epoch": 2.71,
4672
+ "learning_rate": 4.871038821546104e-07,
4673
+ "loss": 0.1721,
4674
+ "step": 775
4675
+ },
4676
+ {
4677
+ "epoch": 2.71,
4678
+ "learning_rate": 4.755305140504185e-07,
4679
+ "loss": 0.1919,
4680
+ "step": 776
4681
+ },
4682
+ {
4683
+ "epoch": 2.72,
4684
+ "learning_rate": 4.6409294394552774e-07,
4685
+ "loss": 0.1768,
4686
+ "step": 777
4687
+ },
4688
+ {
4689
+ "epoch": 2.72,
4690
+ "learning_rate": 4.5279133491454406e-07,
4691
+ "loss": 0.1926,
4692
+ "step": 778
4693
+ },
4694
+ {
4695
+ "epoch": 2.72,
4696
+ "learning_rate": 4.416258480935731e-07,
4697
+ "loss": 0.1749,
4698
+ "step": 779
4699
+ },
4700
+ {
4701
+ "epoch": 2.73,
4702
+ "learning_rate": 4.305966426779118e-07,
4703
+ "loss": 0.176,
4704
+ "step": 780
4705
+ },
4706
+ {
4707
+ "epoch": 2.73,
4708
+ "learning_rate": 4.197038759197869e-07,
4709
+ "loss": 0.1823,
4710
+ "step": 781
4711
+ },
4712
+ {
4713
+ "epoch": 2.73,
4714
+ "learning_rate": 4.089477031261113e-07,
4715
+ "loss": 0.1808,
4716
+ "step": 782
4717
+ },
4718
+ {
4719
+ "epoch": 2.74,
4720
+ "learning_rate": 3.983282776562647e-07,
4721
+ "loss": 0.1722,
4722
+ "step": 783
4723
+ },
4724
+ {
4725
+ "epoch": 2.74,
4726
+ "learning_rate": 3.878457509199107e-07,
4727
+ "loss": 0.174,
4728
+ "step": 784
4729
+ },
4730
+ {
4731
+ "epoch": 2.74,
4732
+ "learning_rate": 3.7750027237484e-07,
4733
+ "loss": 0.1742,
4734
+ "step": 785
4735
+ },
4736
+ {
4737
+ "epoch": 2.75,
4738
+ "learning_rate": 3.6729198952483725e-07,
4739
+ "loss": 0.178,
4740
+ "step": 786
4741
+ },
4742
+ {
4743
+ "epoch": 2.75,
4744
+ "learning_rate": 3.572210479175753e-07,
4745
+ "loss": 0.1695,
4746
+ "step": 787
4747
+ },
4748
+ {
4749
+ "epoch": 2.76,
4750
+ "learning_rate": 3.4728759114254774e-07,
4751
+ "loss": 0.1676,
4752
+ "step": 788
4753
+ },
4754
+ {
4755
+ "epoch": 2.76,
4756
+ "learning_rate": 3.374917608290107e-07,
4757
+ "loss": 0.1848,
4758
+ "step": 789
4759
+ },
4760
+ {
4761
+ "epoch": 2.76,
4762
+ "learning_rate": 3.278336966439744e-07,
4763
+ "loss": 0.187,
4764
+ "step": 790
4765
+ },
4766
+ {
4767
+ "epoch": 2.77,
4768
+ "learning_rate": 3.1831353629020345e-07,
4769
+ "loss": 0.1757,
4770
+ "step": 791
4771
+ },
4772
+ {
4773
+ "epoch": 2.77,
4774
+ "learning_rate": 3.089314155042589e-07,
4775
+ "loss": 0.1715,
4776
+ "step": 792
4777
+ },
4778
+ {
4779
+ "epoch": 2.77,
4780
+ "learning_rate": 2.996874680545603e-07,
4781
+ "loss": 0.1661,
4782
+ "step": 793
4783
+ },
4784
+ {
4785
+ "epoch": 2.78,
4786
+ "learning_rate": 2.905818257394799e-07,
4787
+ "loss": 0.184,
4788
+ "step": 794
4789
+ },
4790
+ {
4791
+ "epoch": 2.78,
4792
+ "learning_rate": 2.816146183854618e-07,
4793
+ "loss": 0.1794,
4794
+ "step": 795
4795
+ },
4796
+ {
4797
+ "epoch": 2.78,
4798
+ "learning_rate": 2.727859738451721e-07,
4799
+ "loss": 0.1744,
4800
+ "step": 796
4801
+ },
4802
+ {
4803
+ "epoch": 2.79,
4804
+ "learning_rate": 2.640960179956764e-07,
4805
+ "loss": 0.1644,
4806
+ "step": 797
4807
+ },
4808
+ {
4809
+ "epoch": 2.79,
4810
+ "learning_rate": 2.5554487473664404e-07,
4811
+ "loss": 0.1781,
4812
+ "step": 798
4813
+ },
4814
+ {
4815
+ "epoch": 2.79,
4816
+ "learning_rate": 2.471326659885809e-07,
4817
+ "loss": 0.1831,
4818
+ "step": 799
4819
+ },
4820
+ {
4821
+ "epoch": 2.8,
4822
+ "learning_rate": 2.388595116910919e-07,
4823
+ "loss": 0.1818,
4824
+ "step": 800
4825
+ },
4826
+ {
4827
+ "epoch": 2.8,
4828
+ "learning_rate": 2.3072552980117568e-07,
4829
+ "loss": 0.191,
4830
+ "step": 801
4831
+ },
4832
+ {
4833
+ "epoch": 2.8,
4834
+ "learning_rate": 2.2273083629153148e-07,
4835
+ "loss": 0.1834,
4836
+ "step": 802
4837
+ },
4838
+ {
4839
+ "epoch": 2.81,
4840
+ "learning_rate": 2.1487554514891706e-07,
4841
+ "loss": 0.1709,
4842
+ "step": 803
4843
+ },
4844
+ {
4845
+ "epoch": 2.81,
4846
+ "learning_rate": 2.0715976837251793e-07,
4847
+ "loss": 0.1698,
4848
+ "step": 804
4849
+ },
4850
+ {
4851
+ "epoch": 2.81,
4852
+ "learning_rate": 1.9958361597235076e-07,
4853
+ "loss": 0.1833,
4854
+ "step": 805
4855
+ },
4856
+ {
4857
+ "epoch": 2.82,
4858
+ "learning_rate": 1.921471959676957e-07,
4859
+ "loss": 0.1738,
4860
+ "step": 806
4861
+ },
4862
+ {
4863
+ "epoch": 2.82,
4864
+ "learning_rate": 1.8485061438555552e-07,
4865
+ "loss": 0.168,
4866
+ "step": 807
4867
+ },
4868
+ {
4869
+ "epoch": 2.83,
4870
+ "learning_rate": 1.7769397525914668e-07,
4871
+ "loss": 0.1849,
4872
+ "step": 808
4873
+ },
4874
+ {
4875
+ "epoch": 2.83,
4876
+ "learning_rate": 1.706773806264106e-07,
4877
+ "loss": 0.1741,
4878
+ "step": 809
4879
+ },
4880
+ {
4881
+ "epoch": 2.83,
4882
+ "learning_rate": 1.6380093052856482e-07,
4883
+ "loss": 0.1777,
4884
+ "step": 810
4885
+ },
4886
+ {
4887
+ "epoch": 2.84,
4888
+ "learning_rate": 1.5706472300867082e-07,
4889
+ "loss": 0.2177,
4890
+ "step": 811
4891
+ },
4892
+ {
4893
+ "epoch": 2.84,
4894
+ "learning_rate": 1.5046885411024393e-07,
4895
+ "loss": 0.175,
4896
+ "step": 812
4897
+ },
4898
+ {
4899
+ "epoch": 2.84,
4900
+ "learning_rate": 1.4401341787587454e-07,
4901
+ "loss": 0.1731,
4902
+ "step": 813
4903
+ },
4904
+ {
4905
+ "epoch": 2.85,
4906
+ "learning_rate": 1.3769850634589356e-07,
4907
+ "loss": 0.1778,
4908
+ "step": 814
4909
+ },
4910
+ {
4911
+ "epoch": 2.85,
4912
+ "learning_rate": 1.3152420955706014e-07,
4913
+ "loss": 0.1704,
4914
+ "step": 815
4915
+ },
4916
+ {
4917
+ "epoch": 2.85,
4918
+ "learning_rate": 1.2549061554127494e-07,
4919
+ "loss": 0.1729,
4920
+ "step": 816
4921
+ },
4922
+ {
4923
+ "epoch": 2.86,
4924
+ "learning_rate": 1.195978103243234e-07,
4925
+ "loss": 0.1831,
4926
+ "step": 817
4927
+ },
4928
+ {
4929
+ "epoch": 2.86,
4930
+ "learning_rate": 1.1384587792465873e-07,
4931
+ "loss": 0.166,
4932
+ "step": 818
4933
+ },
4934
+ {
4935
+ "epoch": 2.86,
4936
+ "learning_rate": 1.0823490035218986e-07,
4937
+ "loss": 0.1784,
4938
+ "step": 819
4939
+ },
4940
+ {
4941
+ "epoch": 2.87,
4942
+ "learning_rate": 1.0276495760712768e-07,
4943
+ "loss": 0.1825,
4944
+ "step": 820
4945
+ },
4946
+ {
4947
+ "epoch": 2.87,
4948
+ "learning_rate": 9.743612767882937e-08,
4949
+ "loss": 0.1838,
4950
+ "step": 821
4951
+ },
4952
+ {
4953
+ "epoch": 2.87,
4954
+ "learning_rate": 9.224848654469932e-08,
4955
+ "loss": 0.1793,
4956
+ "step": 822
4957
+ },
4958
+ {
4959
+ "epoch": 2.88,
4960
+ "learning_rate": 8.720210816909436e-08,
4961
+ "loss": 0.1735,
4962
+ "step": 823
4963
+ },
4964
+ {
4965
+ "epoch": 2.88,
4966
+ "learning_rate": 8.229706450227804e-08,
4967
+ "loss": 0.1773,
4968
+ "step": 824
4969
+ },
4970
+ {
4971
+ "epoch": 2.88,
4972
+ "learning_rate": 7.753342547939357e-08,
4973
+ "loss": 0.1783,
4974
+ "step": 825
4975
+ },
4976
+ {
4977
+ "epoch": 2.89,
4978
+ "learning_rate": 7.291125901946027e-08,
4979
+ "loss": 0.1779,
4980
+ "step": 826
4981
+ },
4982
+ {
4983
+ "epoch": 2.89,
4984
+ "learning_rate": 6.843063102441317e-08,
4985
+ "loss": 0.1824,
4986
+ "step": 827
4987
+ },
4988
+ {
4989
+ "epoch": 2.9,
4990
+ "learning_rate": 6.409160537815818e-08,
4991
+ "loss": 0.1727,
4992
+ "step": 828
4993
+ },
4994
+ {
4995
+ "epoch": 2.9,
4996
+ "learning_rate": 5.9894243945664e-08,
4997
+ "loss": 0.1756,
4998
+ "step": 829
4999
+ },
5000
+ {
5001
+ "epoch": 2.9,
5002
+ "learning_rate": 5.5838606572078404e-08,
5003
+ "loss": 0.1828,
5004
+ "step": 830
5005
+ },
5006
+ {
5007
+ "epoch": 2.91,
5008
+ "learning_rate": 5.192475108187545e-08,
5009
+ "loss": 0.1693,
5010
+ "step": 831
5011
+ },
5012
+ {
5013
+ "epoch": 2.91,
5014
+ "learning_rate": 4.815273327803183e-08,
5015
+ "loss": 0.1911,
5016
+ "step": 832
5017
+ },
5018
+ {
5019
+ "epoch": 2.91,
5020
+ "learning_rate": 4.4522606941228564e-08,
5021
+ "loss": 0.1885,
5022
+ "step": 833
5023
+ },
5024
+ {
5025
+ "epoch": 2.92,
5026
+ "learning_rate": 4.103442382909051e-08,
5027
+ "loss": 0.1746,
5028
+ "step": 834
5029
+ },
5030
+ {
5031
+ "epoch": 2.92,
5032
+ "learning_rate": 3.7688233675439164e-08,
5033
+ "loss": 0.1931,
5034
+ "step": 835
5035
+ },
5036
+ {
5037
+ "epoch": 2.92,
5038
+ "learning_rate": 3.448408418959326e-08,
5039
+ "loss": 0.1707,
5040
+ "step": 836
5041
+ },
5042
+ {
5043
+ "epoch": 2.93,
5044
+ "learning_rate": 3.1422021055679266e-08,
5045
+ "loss": 0.1713,
5046
+ "step": 837
5047
+ },
5048
+ {
5049
+ "epoch": 2.93,
5050
+ "learning_rate": 2.850208793198861e-08,
5051
+ "loss": 0.1746,
5052
+ "step": 838
5053
+ },
5054
+ {
5055
+ "epoch": 2.93,
5056
+ "learning_rate": 2.572432645034817e-08,
5057
+ "loss": 0.1714,
5058
+ "step": 839
5059
+ },
5060
+ {
5061
+ "epoch": 2.94,
5062
+ "learning_rate": 2.308877621553185e-08,
5063
+ "loss": 0.1758,
5064
+ "step": 840
5065
+ },
5066
+ {
5067
+ "epoch": 2.94,
5068
+ "learning_rate": 2.059547480469104e-08,
5069
+ "loss": 0.1741,
5070
+ "step": 841
5071
+ },
5072
+ {
5073
+ "epoch": 2.94,
5074
+ "learning_rate": 1.824445776682504e-08,
5075
+ "loss": 0.179,
5076
+ "step": 842
5077
+ },
5078
+ {
5079
+ "epoch": 2.95,
5080
+ "learning_rate": 1.603575862226925e-08,
5081
+ "loss": 0.1797,
5082
+ "step": 843
5083
+ },
5084
+ {
5085
+ "epoch": 2.95,
5086
+ "learning_rate": 1.396940886221776e-08,
5087
+ "loss": 0.1738,
5088
+ "step": 844
5089
+ },
5090
+ {
5091
+ "epoch": 2.95,
5092
+ "learning_rate": 1.2045437948275952e-08,
5093
+ "loss": 0.1819,
5094
+ "step": 845
5095
+ },
5096
+ {
5097
+ "epoch": 2.96,
5098
+ "learning_rate": 1.0263873312040818e-08,
5099
+ "loss": 0.1698,
5100
+ "step": 846
5101
+ },
5102
+ {
5103
+ "epoch": 2.96,
5104
+ "learning_rate": 8.62474035470795e-09,
5105
+ "loss": 0.182,
5106
+ "step": 847
5107
+ },
5108
+ {
5109
+ "epoch": 2.97,
5110
+ "learning_rate": 7.128062446709605e-09,
5111
+ "loss": 0.1692,
5112
+ "step": 848
5113
+ },
5114
+ {
5115
+ "epoch": 2.97,
5116
+ "learning_rate": 5.773860927383856e-09,
5117
+ "loss": 0.1778,
5118
+ "step": 849
5119
+ },
5120
+ {
5121
+ "epoch": 2.97,
5122
+ "learning_rate": 4.562155104665955e-09,
5123
+ "loss": 0.1786,
5124
+ "step": 850
5125
+ },
5126
+ {
5127
+ "epoch": 2.98,
5128
+ "learning_rate": 3.492962254819654e-09,
5129
+ "loss": 0.1657,
5130
+ "step": 851
5131
+ },
5132
+ {
5133
+ "epoch": 2.98,
5134
+ "learning_rate": 2.5662976221840772e-09,
5135
+ "loss": 0.1741,
5136
+ "step": 852
5137
+ },
5138
+ {
5139
+ "epoch": 2.98,
5140
+ "learning_rate": 1.7821744189605583e-09,
5141
+ "loss": 0.1843,
5142
+ "step": 853
5143
+ },
5144
+ {
5145
+ "epoch": 2.99,
5146
+ "learning_rate": 1.1406038250205699e-09,
5147
+ "loss": 0.1725,
5148
+ "step": 854
5149
+ },
5150
+ {
5151
+ "epoch": 2.99,
5152
+ "learning_rate": 6.41594987752514e-10,
5153
+ "loss": 0.1748,
5154
+ "step": 855
5155
+ },
5156
+ {
5157
+ "epoch": 2.99,
5158
+ "learning_rate": 2.851550219240551e-10,
5159
+ "loss": 0.1902,
5160
+ "step": 856
5161
+ },
5162
+ {
5163
+ "epoch": 3.0,
5164
+ "learning_rate": 7.128900958774942e-11,
5165
+ "loss": 0.1918,
5166
+ "step": 857
5167
+ },
5168
+ {
5169
+ "epoch": 3.0,
5170
+ "learning_rate": 0.0,
5171
+ "loss": 0.1749,
5172
+ "step": 858
5173
+ },
5174
+ {
5175
+ "epoch": 3.0,
5176
+ "eval_loss": 0.322337806224823,
5177
+ "eval_runtime": 42.6722,
5178
+ "eval_samples_per_second": 17.506,
5179
+ "eval_steps_per_second": 0.562,
5180
+ "step": 858
5181
+ },
5182
+ {
5183
+ "epoch": 3.0,
5184
+ "step": 858,
5185
+ "total_flos": 2.1832875959648256e+18,
5186
+ "train_loss": 0.43623367181191075,
5187
+ "train_runtime": 26982.3823,
5188
+ "train_samples_per_second": 4.067,
5189
+ "train_steps_per_second": 0.032
5190
  }
5191
  ],
5192
  "max_steps": 858,
5193
  "num_train_epochs": 3,
5194
+ "total_flos": 2.1832875959648256e+18,
5195
  "trial_name": null,
5196
  "trial_params": null
5197
  }