dejanseo commited on
Commit
2578f5b
·
verified ·
1 Parent(s): 2f9f6d1

Upload 10 files

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +703 -3
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c47551a7eac9f3af5a099fb43cf3bd746b619e755908d045751053dd700cfe7f
3
  size 46336400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791f2cffcb51b68cbe5c687b26ed2fb3125a22df4a3816d6142f70f95daa8487
3
  size 46336400
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7510c1cb86d985e9e92a09b36832075e632129bedcb1fb0e53b6050a109acde
3
  size 92717818
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e11e19418c088b769e31dcba9668f5f4a8b189d0e7cf1ae8ec5ecd4ee5c38f7
3
  size 92717818
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:991302b98f71fe6753a827cc719ac45bb4d09fa954ba7556a8dab29f93de4b22
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcc8ea042fd13c8710d3d4d32a52ce258428fe889ea6ff7a6130da33817b281a
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f296705e88ed1818756382e8a193360d069ba057ae92c40d6ecf3cb059cf7fd3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34801bece81353859ae8fa21fd3347f3337ea795cb2f8c7b0326371ad8b075d8
3
  size 1064
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 9.901641036226648,
6
  "eval_steps": 500,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8408,6 +8408,706 @@
8408
  "learning_rate": 0.0008909074226993302,
8409
  "loss": 2.8483,
8410
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8411
  }
8412
  ],
8413
  "logging_steps": 10,
@@ -8427,7 +9127,7 @@
8427
  "attributes": {}
8428
  }
8429
  },
8430
- "total_flos": 6.143573864610202e+16,
8431
  "train_batch_size": 64,
8432
  "trial_name": null,
8433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 10.73485395809681,
6
  "eval_steps": 500,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8408
  "learning_rate": 0.0008909074226993302,
8409
  "loss": 2.8483,
8410
  "step": 12000
8411
+ },
8412
+ {
8413
+ "epoch": 9.91732892971411,
8414
+ "grad_norm": 0.5881712436676025,
8415
+ "learning_rate": 0.0008907239196256538,
8416
+ "loss": 2.8515,
8417
+ "step": 12010
8418
+ },
8419
+ {
8420
+ "epoch": 9.92558571576014,
8421
+ "grad_norm": 0.597673773765564,
8422
+ "learning_rate": 0.0008905404165519773,
8423
+ "loss": 2.843,
8424
+ "step": 12020
8425
+ },
8426
+ {
8427
+ "epoch": 9.933842501806172,
8428
+ "grad_norm": 0.5990006923675537,
8429
+ "learning_rate": 0.0008903569134783008,
8430
+ "loss": 2.848,
8431
+ "step": 12030
8432
+ },
8433
+ {
8434
+ "epoch": 9.942099287852203,
8435
+ "grad_norm": 0.6145173907279968,
8436
+ "learning_rate": 0.0008901734104046243,
8437
+ "loss": 2.8411,
8438
+ "step": 12040
8439
+ },
8440
+ {
8441
+ "epoch": 9.950356073898234,
8442
+ "grad_norm": 0.5862278938293457,
8443
+ "learning_rate": 0.0008899899073309478,
8444
+ "loss": 2.8561,
8445
+ "step": 12050
8446
+ },
8447
+ {
8448
+ "epoch": 9.958612859944267,
8449
+ "grad_norm": 0.5999264717102051,
8450
+ "learning_rate": 0.0008898064042572714,
8451
+ "loss": 2.8487,
8452
+ "step": 12060
8453
+ },
8454
+ {
8455
+ "epoch": 9.966869645990299,
8456
+ "grad_norm": 0.5286862850189209,
8457
+ "learning_rate": 0.0008896229011835948,
8458
+ "loss": 2.851,
8459
+ "step": 12070
8460
+ },
8461
+ {
8462
+ "epoch": 9.97512643203633,
8463
+ "grad_norm": 0.5677134394645691,
8464
+ "learning_rate": 0.0008894393981099184,
8465
+ "loss": 2.8516,
8466
+ "step": 12080
8467
+ },
8468
+ {
8469
+ "epoch": 9.983383218082361,
8470
+ "grad_norm": 0.5856079459190369,
8471
+ "learning_rate": 0.0008892558950362418,
8472
+ "loss": 2.8545,
8473
+ "step": 12090
8474
+ },
8475
+ {
8476
+ "epoch": 9.991640004128392,
8477
+ "grad_norm": 0.6451898813247681,
8478
+ "learning_rate": 0.0008890723919625654,
8479
+ "loss": 2.8358,
8480
+ "step": 12100
8481
+ },
8482
+ {
8483
+ "epoch": 9.999896790174425,
8484
+ "grad_norm": 0.6016899347305298,
8485
+ "learning_rate": 0.0008888888888888888,
8486
+ "loss": 2.8434,
8487
+ "step": 12110
8488
+ },
8489
+ {
8490
+ "epoch": 10.008256786046031,
8491
+ "grad_norm": 0.5568205714225769,
8492
+ "learning_rate": 0.0008887053858152124,
8493
+ "loss": 2.8738,
8494
+ "step": 12120
8495
+ },
8496
+ {
8497
+ "epoch": 10.016513572092062,
8498
+ "grad_norm": 0.544348955154419,
8499
+ "learning_rate": 0.0008885218827415359,
8500
+ "loss": 2.8332,
8501
+ "step": 12130
8502
+ },
8503
+ {
8504
+ "epoch": 10.024770358138095,
8505
+ "grad_norm": 0.6535346508026123,
8506
+ "learning_rate": 0.0008883383796678595,
8507
+ "loss": 2.8394,
8508
+ "step": 12140
8509
+ },
8510
+ {
8511
+ "epoch": 10.033027144184127,
8512
+ "grad_norm": 0.5878455638885498,
8513
+ "learning_rate": 0.0008881548765941829,
8514
+ "loss": 2.8475,
8515
+ "step": 12150
8516
+ },
8517
+ {
8518
+ "epoch": 10.041283930230158,
8519
+ "grad_norm": 0.5842605829238892,
8520
+ "learning_rate": 0.0008879713735205065,
8521
+ "loss": 2.8403,
8522
+ "step": 12160
8523
+ },
8524
+ {
8525
+ "epoch": 10.049540716276189,
8526
+ "grad_norm": 0.6385082006454468,
8527
+ "learning_rate": 0.00088778787044683,
8528
+ "loss": 2.8376,
8529
+ "step": 12170
8530
+ },
8531
+ {
8532
+ "epoch": 10.057797502322222,
8533
+ "grad_norm": 0.6178941130638123,
8534
+ "learning_rate": 0.0008876043673731536,
8535
+ "loss": 2.8441,
8536
+ "step": 12180
8537
+ },
8538
+ {
8539
+ "epoch": 10.066054288368253,
8540
+ "grad_norm": 0.5717580318450928,
8541
+ "learning_rate": 0.000887420864299477,
8542
+ "loss": 2.8356,
8543
+ "step": 12190
8544
+ },
8545
+ {
8546
+ "epoch": 10.074311074414284,
8547
+ "grad_norm": 0.5871554613113403,
8548
+ "learning_rate": 0.0008872373612258006,
8549
+ "loss": 2.8412,
8550
+ "step": 12200
8551
+ },
8552
+ {
8553
+ "epoch": 10.082567860460316,
8554
+ "grad_norm": 0.6004984974861145,
8555
+ "learning_rate": 0.0008870538581521241,
8556
+ "loss": 2.8338,
8557
+ "step": 12210
8558
+ },
8559
+ {
8560
+ "epoch": 10.090824646506347,
8561
+ "grad_norm": 0.6046565175056458,
8562
+ "learning_rate": 0.0008868703550784475,
8563
+ "loss": 2.8372,
8564
+ "step": 12220
8565
+ },
8566
+ {
8567
+ "epoch": 10.09908143255238,
8568
+ "grad_norm": 0.5893774032592773,
8569
+ "learning_rate": 0.000886686852004771,
8570
+ "loss": 2.8295,
8571
+ "step": 12230
8572
+ },
8573
+ {
8574
+ "epoch": 10.10733821859841,
8575
+ "grad_norm": 0.5833553671836853,
8576
+ "learning_rate": 0.0008865033489310946,
8577
+ "loss": 2.8371,
8578
+ "step": 12240
8579
+ },
8580
+ {
8581
+ "epoch": 10.115595004644442,
8582
+ "grad_norm": 0.6019455194473267,
8583
+ "learning_rate": 0.0008863198458574181,
8584
+ "loss": 2.8246,
8585
+ "step": 12250
8586
+ },
8587
+ {
8588
+ "epoch": 10.123851790690473,
8589
+ "grad_norm": 0.6151683926582336,
8590
+ "learning_rate": 0.0008861363427837416,
8591
+ "loss": 2.841,
8592
+ "step": 12260
8593
+ },
8594
+ {
8595
+ "epoch": 10.132108576736504,
8596
+ "grad_norm": 0.6026824116706848,
8597
+ "learning_rate": 0.0008859528397100651,
8598
+ "loss": 2.8392,
8599
+ "step": 12270
8600
+ },
8601
+ {
8602
+ "epoch": 10.140365362782537,
8603
+ "grad_norm": 0.5783131718635559,
8604
+ "learning_rate": 0.0008857693366363887,
8605
+ "loss": 2.8479,
8606
+ "step": 12280
8607
+ },
8608
+ {
8609
+ "epoch": 10.148622148828569,
8610
+ "grad_norm": 0.6481205821037292,
8611
+ "learning_rate": 0.0008855858335627122,
8612
+ "loss": 2.8423,
8613
+ "step": 12290
8614
+ },
8615
+ {
8616
+ "epoch": 10.1568789348746,
8617
+ "grad_norm": 0.5748919248580933,
8618
+ "learning_rate": 0.0008854023304890357,
8619
+ "loss": 2.8349,
8620
+ "step": 12300
8621
+ },
8622
+ {
8623
+ "epoch": 10.165135720920631,
8624
+ "grad_norm": 0.5705230832099915,
8625
+ "learning_rate": 0.0008852188274153592,
8626
+ "loss": 2.8463,
8627
+ "step": 12310
8628
+ },
8629
+ {
8630
+ "epoch": 10.173392506966664,
8631
+ "grad_norm": 0.5699977278709412,
8632
+ "learning_rate": 0.0008850353243416828,
8633
+ "loss": 2.8438,
8634
+ "step": 12320
8635
+ },
8636
+ {
8637
+ "epoch": 10.181649293012695,
8638
+ "grad_norm": 0.544175386428833,
8639
+ "learning_rate": 0.0008848518212680063,
8640
+ "loss": 2.8363,
8641
+ "step": 12330
8642
+ },
8643
+ {
8644
+ "epoch": 10.189906079058726,
8645
+ "grad_norm": 0.568715512752533,
8646
+ "learning_rate": 0.0008846683181943298,
8647
+ "loss": 2.8362,
8648
+ "step": 12340
8649
+ },
8650
+ {
8651
+ "epoch": 10.198162865104758,
8652
+ "grad_norm": 0.5720770955085754,
8653
+ "learning_rate": 0.0008844848151206532,
8654
+ "loss": 2.8284,
8655
+ "step": 12350
8656
+ },
8657
+ {
8658
+ "epoch": 10.206419651150789,
8659
+ "grad_norm": 0.626235842704773,
8660
+ "learning_rate": 0.0008843013120469768,
8661
+ "loss": 2.8393,
8662
+ "step": 12360
8663
+ },
8664
+ {
8665
+ "epoch": 10.214676437196822,
8666
+ "grad_norm": 0.5661699175834656,
8667
+ "learning_rate": 0.0008841178089733003,
8668
+ "loss": 2.8333,
8669
+ "step": 12370
8670
+ },
8671
+ {
8672
+ "epoch": 10.222933223242853,
8673
+ "grad_norm": 0.6092801094055176,
8674
+ "learning_rate": 0.0008839343058996238,
8675
+ "loss": 2.8513,
8676
+ "step": 12380
8677
+ },
8678
+ {
8679
+ "epoch": 10.231190009288884,
8680
+ "grad_norm": 0.6037712097167969,
8681
+ "learning_rate": 0.0008837508028259473,
8682
+ "loss": 2.8328,
8683
+ "step": 12390
8684
+ },
8685
+ {
8686
+ "epoch": 10.239446795334915,
8687
+ "grad_norm": 0.5994784832000732,
8688
+ "learning_rate": 0.0008835672997522709,
8689
+ "loss": 2.8268,
8690
+ "step": 12400
8691
+ },
8692
+ {
8693
+ "epoch": 10.247703581380948,
8694
+ "grad_norm": 0.5821447968482971,
8695
+ "learning_rate": 0.0008833837966785944,
8696
+ "loss": 2.8376,
8697
+ "step": 12410
8698
+ },
8699
+ {
8700
+ "epoch": 10.25596036742698,
8701
+ "grad_norm": 0.6151066422462463,
8702
+ "learning_rate": 0.0008832002936049179,
8703
+ "loss": 2.8338,
8704
+ "step": 12420
8705
+ },
8706
+ {
8707
+ "epoch": 10.26421715347301,
8708
+ "grad_norm": 0.6016796231269836,
8709
+ "learning_rate": 0.0008830167905312414,
8710
+ "loss": 2.8295,
8711
+ "step": 12430
8712
+ },
8713
+ {
8714
+ "epoch": 10.272473939519042,
8715
+ "grad_norm": 0.5741587281227112,
8716
+ "learning_rate": 0.000882833287457565,
8717
+ "loss": 2.8283,
8718
+ "step": 12440
8719
+ },
8720
+ {
8721
+ "epoch": 10.280730725565073,
8722
+ "grad_norm": 0.5840280055999756,
8723
+ "learning_rate": 0.0008826497843838885,
8724
+ "loss": 2.8268,
8725
+ "step": 12450
8726
+ },
8727
+ {
8728
+ "epoch": 10.288987511611106,
8729
+ "grad_norm": 0.5622872710227966,
8730
+ "learning_rate": 0.000882466281310212,
8731
+ "loss": 2.8424,
8732
+ "step": 12460
8733
+ },
8734
+ {
8735
+ "epoch": 10.297244297657137,
8736
+ "grad_norm": 0.6184718608856201,
8737
+ "learning_rate": 0.0008822827782365355,
8738
+ "loss": 2.8269,
8739
+ "step": 12470
8740
+ },
8741
+ {
8742
+ "epoch": 10.305501083703168,
8743
+ "grad_norm": 0.5796384215354919,
8744
+ "learning_rate": 0.0008820992751628591,
8745
+ "loss": 2.8383,
8746
+ "step": 12480
8747
+ },
8748
+ {
8749
+ "epoch": 10.3137578697492,
8750
+ "grad_norm": 0.617235541343689,
8751
+ "learning_rate": 0.0008819157720891825,
8752
+ "loss": 2.8268,
8753
+ "step": 12490
8754
+ },
8755
+ {
8756
+ "epoch": 10.322014655795233,
8757
+ "grad_norm": 0.5677554607391357,
8758
+ "learning_rate": 0.000881732269015506,
8759
+ "loss": 2.8349,
8760
+ "step": 12500
8761
+ },
8762
+ {
8763
+ "epoch": 10.330271441841264,
8764
+ "grad_norm": 0.5938097238540649,
8765
+ "learning_rate": 0.0008815487659418295,
8766
+ "loss": 2.8362,
8767
+ "step": 12510
8768
+ },
8769
+ {
8770
+ "epoch": 10.338528227887295,
8771
+ "grad_norm": 0.6369422078132629,
8772
+ "learning_rate": 0.0008813652628681531,
8773
+ "loss": 2.8364,
8774
+ "step": 12520
8775
+ },
8776
+ {
8777
+ "epoch": 10.346785013933326,
8778
+ "grad_norm": 0.6142675280570984,
8779
+ "learning_rate": 0.0008811817597944766,
8780
+ "loss": 2.8259,
8781
+ "step": 12530
8782
+ },
8783
+ {
8784
+ "epoch": 10.355041799979357,
8785
+ "grad_norm": 0.5718218684196472,
8786
+ "learning_rate": 0.0008809982567208001,
8787
+ "loss": 2.8473,
8788
+ "step": 12540
8789
+ },
8790
+ {
8791
+ "epoch": 10.36329858602539,
8792
+ "grad_norm": 0.5698361992835999,
8793
+ "learning_rate": 0.0008808147536471236,
8794
+ "loss": 2.8398,
8795
+ "step": 12550
8796
+ },
8797
+ {
8798
+ "epoch": 10.371555372071422,
8799
+ "grad_norm": 0.5833884477615356,
8800
+ "learning_rate": 0.0008806312505734472,
8801
+ "loss": 2.8171,
8802
+ "step": 12560
8803
+ },
8804
+ {
8805
+ "epoch": 10.379812158117453,
8806
+ "grad_norm": 0.6157854795455933,
8807
+ "learning_rate": 0.0008804477474997707,
8808
+ "loss": 2.8409,
8809
+ "step": 12570
8810
+ },
8811
+ {
8812
+ "epoch": 10.388068944163484,
8813
+ "grad_norm": 0.5915418863296509,
8814
+ "learning_rate": 0.0008802642444260942,
8815
+ "loss": 2.8369,
8816
+ "step": 12580
8817
+ },
8818
+ {
8819
+ "epoch": 10.396325730209515,
8820
+ "grad_norm": 0.5849014520645142,
8821
+ "learning_rate": 0.0008800807413524177,
8822
+ "loss": 2.8329,
8823
+ "step": 12590
8824
+ },
8825
+ {
8826
+ "epoch": 10.404582516255548,
8827
+ "grad_norm": 0.6383744478225708,
8828
+ "learning_rate": 0.0008798972382787413,
8829
+ "loss": 2.8299,
8830
+ "step": 12600
8831
+ },
8832
+ {
8833
+ "epoch": 10.41283930230158,
8834
+ "grad_norm": 0.5256903767585754,
8835
+ "learning_rate": 0.0008797137352050648,
8836
+ "loss": 2.8244,
8837
+ "step": 12610
8838
+ },
8839
+ {
8840
+ "epoch": 10.42109608834761,
8841
+ "grad_norm": 0.6176425218582153,
8842
+ "learning_rate": 0.0008795302321313881,
8843
+ "loss": 2.8318,
8844
+ "step": 12620
8845
+ },
8846
+ {
8847
+ "epoch": 10.429352874393642,
8848
+ "grad_norm": 0.625028133392334,
8849
+ "learning_rate": 0.0008793467290577117,
8850
+ "loss": 2.8367,
8851
+ "step": 12630
8852
+ },
8853
+ {
8854
+ "epoch": 10.437609660439675,
8855
+ "grad_norm": 0.626335620880127,
8856
+ "learning_rate": 0.0008791632259840352,
8857
+ "loss": 2.8346,
8858
+ "step": 12640
8859
+ },
8860
+ {
8861
+ "epoch": 10.445866446485706,
8862
+ "grad_norm": 0.5328546166419983,
8863
+ "learning_rate": 0.0008789797229103587,
8864
+ "loss": 2.8279,
8865
+ "step": 12650
8866
+ },
8867
+ {
8868
+ "epoch": 10.454123232531737,
8869
+ "grad_norm": 0.5871540904045105,
8870
+ "learning_rate": 0.0008787962198366822,
8871
+ "loss": 2.8268,
8872
+ "step": 12660
8873
+ },
8874
+ {
8875
+ "epoch": 10.462380018577768,
8876
+ "grad_norm": 0.5590776205062866,
8877
+ "learning_rate": 0.0008786127167630058,
8878
+ "loss": 2.8227,
8879
+ "step": 12670
8880
+ },
8881
+ {
8882
+ "epoch": 10.4706368046238,
8883
+ "grad_norm": 0.5899330973625183,
8884
+ "learning_rate": 0.0008784292136893293,
8885
+ "loss": 2.8186,
8886
+ "step": 12680
8887
+ },
8888
+ {
8889
+ "epoch": 10.478893590669832,
8890
+ "grad_norm": 0.653564989566803,
8891
+ "learning_rate": 0.0008782457106156528,
8892
+ "loss": 2.8333,
8893
+ "step": 12690
8894
+ },
8895
+ {
8896
+ "epoch": 10.487150376715864,
8897
+ "grad_norm": 0.627564013004303,
8898
+ "learning_rate": 0.0008780622075419763,
8899
+ "loss": 2.823,
8900
+ "step": 12700
8901
+ },
8902
+ {
8903
+ "epoch": 10.495407162761895,
8904
+ "grad_norm": 0.6121799945831299,
8905
+ "learning_rate": 0.0008778787044682999,
8906
+ "loss": 2.8394,
8907
+ "step": 12710
8908
+ },
8909
+ {
8910
+ "epoch": 10.503663948807926,
8911
+ "grad_norm": 0.6052922010421753,
8912
+ "learning_rate": 0.0008776952013946234,
8913
+ "loss": 2.815,
8914
+ "step": 12720
8915
+ },
8916
+ {
8917
+ "epoch": 10.511920734853959,
8918
+ "grad_norm": 0.592348039150238,
8919
+ "learning_rate": 0.000877511698320947,
8920
+ "loss": 2.8293,
8921
+ "step": 12730
8922
+ },
8923
+ {
8924
+ "epoch": 10.52017752089999,
8925
+ "grad_norm": 0.5429986119270325,
8926
+ "learning_rate": 0.0008773281952472704,
8927
+ "loss": 2.8258,
8928
+ "step": 12740
8929
+ },
8930
+ {
8931
+ "epoch": 10.528434306946021,
8932
+ "grad_norm": 0.6261007785797119,
8933
+ "learning_rate": 0.0008771446921735939,
8934
+ "loss": 2.8287,
8935
+ "step": 12750
8936
+ },
8937
+ {
8938
+ "epoch": 10.536691092992053,
8939
+ "grad_norm": 0.5362280011177063,
8940
+ "learning_rate": 0.0008769611890999174,
8941
+ "loss": 2.8271,
8942
+ "step": 12760
8943
+ },
8944
+ {
8945
+ "epoch": 10.544947879038084,
8946
+ "grad_norm": 0.5826970338821411,
8947
+ "learning_rate": 0.0008767776860262409,
8948
+ "loss": 2.8285,
8949
+ "step": 12770
8950
+ },
8951
+ {
8952
+ "epoch": 10.553204665084117,
8953
+ "grad_norm": 0.597993791103363,
8954
+ "learning_rate": 0.0008765941829525644,
8955
+ "loss": 2.8213,
8956
+ "step": 12780
8957
+ },
8958
+ {
8959
+ "epoch": 10.561461451130148,
8960
+ "grad_norm": 0.5747185945510864,
8961
+ "learning_rate": 0.000876410679878888,
8962
+ "loss": 2.8282,
8963
+ "step": 12790
8964
+ },
8965
+ {
8966
+ "epoch": 10.569718237176179,
8967
+ "grad_norm": 0.5573180317878723,
8968
+ "learning_rate": 0.0008762271768052115,
8969
+ "loss": 2.824,
8970
+ "step": 12800
8971
+ },
8972
+ {
8973
+ "epoch": 10.57797502322221,
8974
+ "grad_norm": 0.5840964317321777,
8975
+ "learning_rate": 0.000876043673731535,
8976
+ "loss": 2.8178,
8977
+ "step": 12810
8978
+ },
8979
+ {
8980
+ "epoch": 10.586231809268241,
8981
+ "grad_norm": 0.5690692663192749,
8982
+ "learning_rate": 0.0008758601706578585,
8983
+ "loss": 2.8155,
8984
+ "step": 12820
8985
+ },
8986
+ {
8987
+ "epoch": 10.594488595314274,
8988
+ "grad_norm": 0.5685713887214661,
8989
+ "learning_rate": 0.0008756766675841821,
8990
+ "loss": 2.8147,
8991
+ "step": 12830
8992
+ },
8993
+ {
8994
+ "epoch": 10.602745381360306,
8995
+ "grad_norm": 0.6194620132446289,
8996
+ "learning_rate": 0.0008754931645105056,
8997
+ "loss": 2.8374,
8998
+ "step": 12840
8999
+ },
9000
+ {
9001
+ "epoch": 10.611002167406337,
9002
+ "grad_norm": 0.5465943217277527,
9003
+ "learning_rate": 0.0008753096614368291,
9004
+ "loss": 2.8312,
9005
+ "step": 12850
9006
+ },
9007
+ {
9008
+ "epoch": 10.619258953452368,
9009
+ "grad_norm": 0.5942501425743103,
9010
+ "learning_rate": 0.0008751261583631526,
9011
+ "loss": 2.8235,
9012
+ "step": 12860
9013
+ },
9014
+ {
9015
+ "epoch": 10.627515739498401,
9016
+ "grad_norm": 0.5760926008224487,
9017
+ "learning_rate": 0.0008749426552894762,
9018
+ "loss": 2.8196,
9019
+ "step": 12870
9020
+ },
9021
+ {
9022
+ "epoch": 10.635772525544432,
9023
+ "grad_norm": 0.5682793259620667,
9024
+ "learning_rate": 0.0008747591522157996,
9025
+ "loss": 2.8349,
9026
+ "step": 12880
9027
+ },
9028
+ {
9029
+ "epoch": 10.644029311590463,
9030
+ "grad_norm": 0.5754048228263855,
9031
+ "learning_rate": 0.0008745756491421231,
9032
+ "loss": 2.8215,
9033
+ "step": 12890
9034
+ },
9035
+ {
9036
+ "epoch": 10.652286097636495,
9037
+ "grad_norm": 0.5868312120437622,
9038
+ "learning_rate": 0.0008743921460684466,
9039
+ "loss": 2.8359,
9040
+ "step": 12900
9041
+ },
9042
+ {
9043
+ "epoch": 10.660542883682526,
9044
+ "grad_norm": 0.5740572214126587,
9045
+ "learning_rate": 0.0008742086429947702,
9046
+ "loss": 2.8321,
9047
+ "step": 12910
9048
+ },
9049
+ {
9050
+ "epoch": 10.668799669728559,
9051
+ "grad_norm": 0.570972740650177,
9052
+ "learning_rate": 0.0008740251399210937,
9053
+ "loss": 2.8291,
9054
+ "step": 12920
9055
+ },
9056
+ {
9057
+ "epoch": 10.67705645577459,
9058
+ "grad_norm": 0.5573681592941284,
9059
+ "learning_rate": 0.0008738416368474172,
9060
+ "loss": 2.8211,
9061
+ "step": 12930
9062
+ },
9063
+ {
9064
+ "epoch": 10.685313241820621,
9065
+ "grad_norm": 0.6186919212341309,
9066
+ "learning_rate": 0.0008736581337737407,
9067
+ "loss": 2.8122,
9068
+ "step": 12940
9069
+ },
9070
+ {
9071
+ "epoch": 10.693570027866652,
9072
+ "grad_norm": 0.6006292700767517,
9073
+ "learning_rate": 0.0008734746307000643,
9074
+ "loss": 2.8187,
9075
+ "step": 12950
9076
+ },
9077
+ {
9078
+ "epoch": 10.701826813912685,
9079
+ "grad_norm": 0.571305513381958,
9080
+ "learning_rate": 0.0008732911276263878,
9081
+ "loss": 2.8213,
9082
+ "step": 12960
9083
+ },
9084
+ {
9085
+ "epoch": 10.710083599958717,
9086
+ "grad_norm": 0.5861838459968567,
9087
+ "learning_rate": 0.0008731076245527113,
9088
+ "loss": 2.8176,
9089
+ "step": 12970
9090
+ },
9091
+ {
9092
+ "epoch": 10.718340386004748,
9093
+ "grad_norm": 0.618885338306427,
9094
+ "learning_rate": 0.0008729241214790348,
9095
+ "loss": 2.8325,
9096
+ "step": 12980
9097
+ },
9098
+ {
9099
+ "epoch": 10.726597172050779,
9100
+ "grad_norm": 0.6155752539634705,
9101
+ "learning_rate": 0.0008727406184053584,
9102
+ "loss": 2.8359,
9103
+ "step": 12990
9104
+ },
9105
+ {
9106
+ "epoch": 10.73485395809681,
9107
+ "grad_norm": 0.5645089149475098,
9108
+ "learning_rate": 0.0008725571153316819,
9109
+ "loss": 2.8272,
9110
+ "step": 13000
9111
  }
9112
  ],
9113
  "logging_steps": 10,
 
9127
  "attributes": {}
9128
  }
9129
  },
9130
+ "total_flos": 6.655925629005005e+16,
9131
  "train_batch_size": 64,
9132
  "trial_name": null,
9133
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcf51f03fd4d73b628c5742a3052af29cf8509444f1a5f10abab3e54e9efbe0d
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4255e8645ef79d5d89578e0550408329539962f036c0ac03649b791aa1cf604
3
  size 5304