error577 commited on
Commit
e564344
·
verified ·
1 Parent(s): fd25ee3

Training in progress, step 2400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f4451577bf82ae4c14fb8b5f6d15593c695f63d1bc7c8c377049e28c0b6f430
3
  size 500770656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc773204153173a13c1ac40b0d299d63826a9009d800e65a16ac4dff721fee9
3
  size 500770656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435f4a73c69232486ea2c5684eb01e7449a2602d9445e4a4dbe0c21719127715
3
  size 254918356
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2354a9b4460be38c2facc081861e1d39817d9e3f7d6d7818671513775a0f21bd
3
  size 254918356
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6015ab40414177a8cb3a25519cffb5a624e999127e3ac742f7bf693b450cb8e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e8df32598dfacb12011daa77172ba188bcb85dc5dfb5c57bf90f20875c1ee3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e66e55baeee62db229bddf3da45b85b2a91fe7343a6a75e11aba725017a7a321
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfa45a2010848f8ba6bd00a9aefaa39f18e6a555b04b4e25c9be094c299a3176
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.635880708694458,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-1800",
4
- "epoch": 0.303469204772743,
5
  "eval_steps": 200,
6
- "global_step": 2200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -15503,6 +15503,1414 @@
15503
  "eval_samples_per_second": 2.51,
15504
  "eval_steps_per_second": 2.51,
15505
  "step": 2200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15506
  }
15507
  ],
15508
  "logging_steps": 1,
@@ -15517,7 +16925,7 @@
15517
  "early_stopping_threshold": 0.0
15518
  },
15519
  "attributes": {
15520
- "early_stopping_patience_counter": 2
15521
  }
15522
  },
15523
  "TrainerControl": {
@@ -15526,12 +16934,12 @@
15526
  "should_evaluate": false,
15527
  "should_log": false,
15528
  "should_save": true,
15529
- "should_training_stop": false
15530
  },
15531
  "attributes": {}
15532
  }
15533
  },
15534
- "total_flos": 3.483236466111283e+17,
15535
  "train_batch_size": 1,
15536
  "trial_name": null,
15537
  "trial_params": null
 
1
  {
2
  "best_metric": 0.635880708694458,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-1800",
4
+ "epoch": 0.33105731429753776,
5
  "eval_steps": 200,
6
+ "global_step": 2400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
15503
  "eval_samples_per_second": 2.51,
15504
  "eval_steps_per_second": 2.51,
15505
  "step": 2200
15506
+ },
15507
+ {
15508
+ "epoch": 0.3036071453203669,
15509
+ "grad_norm": 0.7370956540107727,
15510
+ "learning_rate": 0.000195108780321177,
15511
+ "loss": 0.4832,
15512
+ "step": 2201
15513
+ },
15514
+ {
15515
+ "epoch": 0.30374508586799087,
15516
+ "grad_norm": 0.6172298192977905,
15517
+ "learning_rate": 0.00019510431046312185,
15518
+ "loss": 0.5685,
15519
+ "step": 2202
15520
+ },
15521
+ {
15522
+ "epoch": 0.3038830264156149,
15523
+ "grad_norm": 0.4689820408821106,
15524
+ "learning_rate": 0.0001950998386148504,
15525
+ "loss": 0.3635,
15526
+ "step": 2203
15527
+ },
15528
+ {
15529
+ "epoch": 0.3040209669632388,
15530
+ "grad_norm": 0.8951042294502258,
15531
+ "learning_rate": 0.00019509536477645617,
15532
+ "loss": 0.8364,
15533
+ "step": 2204
15534
+ },
15535
+ {
15536
+ "epoch": 0.30415890751086283,
15537
+ "grad_norm": 0.6719712018966675,
15538
+ "learning_rate": 0.00019509088894803286,
15539
+ "loss": 0.2531,
15540
+ "step": 2205
15541
+ },
15542
+ {
15543
+ "epoch": 0.3042968480584868,
15544
+ "grad_norm": 0.730803370475769,
15545
+ "learning_rate": 0.00019508641112967408,
15546
+ "loss": 0.5159,
15547
+ "step": 2206
15548
+ },
15549
+ {
15550
+ "epoch": 0.3044347886061108,
15551
+ "grad_norm": 0.7379736304283142,
15552
+ "learning_rate": 0.0001950819313214736,
15553
+ "loss": 0.4162,
15554
+ "step": 2207
15555
+ },
15556
+ {
15557
+ "epoch": 0.30457272915373473,
15558
+ "grad_norm": 0.8285558223724365,
15559
+ "learning_rate": 0.00019507744952352508,
15560
+ "loss": 0.7966,
15561
+ "step": 2208
15562
+ },
15563
+ {
15564
+ "epoch": 0.30471066970135874,
15565
+ "grad_norm": 0.8864738941192627,
15566
+ "learning_rate": 0.00019507296573592235,
15567
+ "loss": 0.7326,
15568
+ "step": 2209
15569
+ },
15570
+ {
15571
+ "epoch": 0.3048486102489827,
15572
+ "grad_norm": 0.7778903841972351,
15573
+ "learning_rate": 0.00019506847995875924,
15574
+ "loss": 0.4939,
15575
+ "step": 2210
15576
+ },
15577
+ {
15578
+ "epoch": 0.30498655079660664,
15579
+ "grad_norm": 0.7180725932121277,
15580
+ "learning_rate": 0.00019506399219212966,
15581
+ "loss": 0.5479,
15582
+ "step": 2211
15583
+ },
15584
+ {
15585
+ "epoch": 0.30512449134423064,
15586
+ "grad_norm": 0.9998635053634644,
15587
+ "learning_rate": 0.00019505950243612746,
15588
+ "loss": 1.02,
15589
+ "step": 2212
15590
+ },
15591
+ {
15592
+ "epoch": 0.3052624318918546,
15593
+ "grad_norm": 1.4325881004333496,
15594
+ "learning_rate": 0.00019505501069084659,
15595
+ "loss": 0.6919,
15596
+ "step": 2213
15597
+ },
15598
+ {
15599
+ "epoch": 0.3054003724394786,
15600
+ "grad_norm": 0.7900728583335876,
15601
+ "learning_rate": 0.00019505051695638113,
15602
+ "loss": 0.3652,
15603
+ "step": 2214
15604
+ },
15605
+ {
15606
+ "epoch": 0.30553831298710254,
15607
+ "grad_norm": 0.8904551863670349,
15608
+ "learning_rate": 0.00019504602123282508,
15609
+ "loss": 0.8051,
15610
+ "step": 2215
15611
+ },
15612
+ {
15613
+ "epoch": 0.30567625353472655,
15614
+ "grad_norm": 0.5742565989494324,
15615
+ "learning_rate": 0.00019504152352027245,
15616
+ "loss": 0.3562,
15617
+ "step": 2216
15618
+ },
15619
+ {
15620
+ "epoch": 0.3058141940823505,
15621
+ "grad_norm": 0.8754223585128784,
15622
+ "learning_rate": 0.00019503702381881745,
15623
+ "loss": 0.7154,
15624
+ "step": 2217
15625
+ },
15626
+ {
15627
+ "epoch": 0.3059521346299745,
15628
+ "grad_norm": 0.834255576133728,
15629
+ "learning_rate": 0.00019503252212855422,
15630
+ "loss": 0.8241,
15631
+ "step": 2218
15632
+ },
15633
+ {
15634
+ "epoch": 0.30609007517759845,
15635
+ "grad_norm": 0.8959856033325195,
15636
+ "learning_rate": 0.00019502801844957697,
15637
+ "loss": 1.1416,
15638
+ "step": 2219
15639
+ },
15640
+ {
15641
+ "epoch": 0.3062280157252224,
15642
+ "grad_norm": 0.76212078332901,
15643
+ "learning_rate": 0.00019502351278197994,
15644
+ "loss": 0.5501,
15645
+ "step": 2220
15646
+ },
15647
+ {
15648
+ "epoch": 0.3063659562728464,
15649
+ "grad_norm": 1.0702933073043823,
15650
+ "learning_rate": 0.0001950190051258574,
15651
+ "loss": 0.5158,
15652
+ "step": 2221
15653
+ },
15654
+ {
15655
+ "epoch": 0.30650389682047036,
15656
+ "grad_norm": 0.9771005511283875,
15657
+ "learning_rate": 0.00019501449548130372,
15658
+ "loss": 0.6492,
15659
+ "step": 2222
15660
+ },
15661
+ {
15662
+ "epoch": 0.30664183736809436,
15663
+ "grad_norm": 0.6449692845344543,
15664
+ "learning_rate": 0.00019500998384841322,
15665
+ "loss": 0.581,
15666
+ "step": 2223
15667
+ },
15668
+ {
15669
+ "epoch": 0.3067797779157183,
15670
+ "grad_norm": 0.6486768126487732,
15671
+ "learning_rate": 0.00019500547022728034,
15672
+ "loss": 0.6896,
15673
+ "step": 2224
15674
+ },
15675
+ {
15676
+ "epoch": 0.3069177184633423,
15677
+ "grad_norm": 0.570933997631073,
15678
+ "learning_rate": 0.00019500095461799955,
15679
+ "loss": 0.4472,
15680
+ "step": 2225
15681
+ },
15682
+ {
15683
+ "epoch": 0.30705565901096626,
15684
+ "grad_norm": 0.6124463081359863,
15685
+ "learning_rate": 0.00019499643702066536,
15686
+ "loss": 0.49,
15687
+ "step": 2226
15688
+ },
15689
+ {
15690
+ "epoch": 0.30719359955859027,
15691
+ "grad_norm": 1.030892014503479,
15692
+ "learning_rate": 0.00019499191743537224,
15693
+ "loss": 0.6116,
15694
+ "step": 2227
15695
+ },
15696
+ {
15697
+ "epoch": 0.3073315401062142,
15698
+ "grad_norm": 0.7422316670417786,
15699
+ "learning_rate": 0.00019498739586221482,
15700
+ "loss": 0.4349,
15701
+ "step": 2228
15702
+ },
15703
+ {
15704
+ "epoch": 0.3074694806538382,
15705
+ "grad_norm": 1.2078644037246704,
15706
+ "learning_rate": 0.00019498287230128775,
15707
+ "loss": 0.8739,
15708
+ "step": 2229
15709
+ },
15710
+ {
15711
+ "epoch": 0.3076074212014622,
15712
+ "grad_norm": 0.6796876788139343,
15713
+ "learning_rate": 0.0001949783467526856,
15714
+ "loss": 0.4402,
15715
+ "step": 2230
15716
+ },
15717
+ {
15718
+ "epoch": 0.3077453617490861,
15719
+ "grad_norm": 0.9108544588088989,
15720
+ "learning_rate": 0.00019497381921650318,
15721
+ "loss": 0.8838,
15722
+ "step": 2231
15723
+ },
15724
+ {
15725
+ "epoch": 0.3078833022967101,
15726
+ "grad_norm": 0.9964629411697388,
15727
+ "learning_rate": 0.00019496928969283517,
15728
+ "loss": 0.7255,
15729
+ "step": 2232
15730
+ },
15731
+ {
15732
+ "epoch": 0.3080212428443341,
15733
+ "grad_norm": 1.5495188236236572,
15734
+ "learning_rate": 0.00019496475818177634,
15735
+ "loss": 1.264,
15736
+ "step": 2233
15737
+ },
15738
+ {
15739
+ "epoch": 0.3081591833919581,
15740
+ "grad_norm": 0.8140445351600647,
15741
+ "learning_rate": 0.0001949602246834216,
15742
+ "loss": 0.9636,
15743
+ "step": 2234
15744
+ },
15745
+ {
15746
+ "epoch": 0.30829712393958203,
15747
+ "grad_norm": 0.6906377077102661,
15748
+ "learning_rate": 0.0001949556891978658,
15749
+ "loss": 0.553,
15750
+ "step": 2235
15751
+ },
15752
+ {
15753
+ "epoch": 0.30843506448720603,
15754
+ "grad_norm": 0.8340548872947693,
15755
+ "learning_rate": 0.00019495115172520378,
15756
+ "loss": 0.5792,
15757
+ "step": 2236
15758
+ },
15759
+ {
15760
+ "epoch": 0.30857300503483,
15761
+ "grad_norm": 1.0296357870101929,
15762
+ "learning_rate": 0.00019494661226553055,
15763
+ "loss": 0.971,
15764
+ "step": 2237
15765
+ },
15766
+ {
15767
+ "epoch": 0.308710945582454,
15768
+ "grad_norm": 0.7610672116279602,
15769
+ "learning_rate": 0.0001949420708189411,
15770
+ "loss": 0.5375,
15771
+ "step": 2238
15772
+ },
15773
+ {
15774
+ "epoch": 0.30884888613007794,
15775
+ "grad_norm": 0.722172200679779,
15776
+ "learning_rate": 0.00019493752738553046,
15777
+ "loss": 0.479,
15778
+ "step": 2239
15779
+ },
15780
+ {
15781
+ "epoch": 0.3089868266777019,
15782
+ "grad_norm": 0.8141410946846008,
15783
+ "learning_rate": 0.00019493298196539375,
15784
+ "loss": 0.8384,
15785
+ "step": 2240
15786
+ },
15787
+ {
15788
+ "epoch": 0.3091247672253259,
15789
+ "grad_norm": 0.7743800282478333,
15790
+ "learning_rate": 0.000194928434558626,
15791
+ "loss": 0.9943,
15792
+ "step": 2241
15793
+ },
15794
+ {
15795
+ "epoch": 0.30926270777294984,
15796
+ "grad_norm": 0.6680206656455994,
15797
+ "learning_rate": 0.00019492388516532247,
15798
+ "loss": 0.4103,
15799
+ "step": 2242
15800
+ },
15801
+ {
15802
+ "epoch": 0.30940064832057385,
15803
+ "grad_norm": 0.9488325715065002,
15804
+ "learning_rate": 0.0001949193337855783,
15805
+ "loss": 0.8465,
15806
+ "step": 2243
15807
+ },
15808
+ {
15809
+ "epoch": 0.3095385888681978,
15810
+ "grad_norm": 0.5857890248298645,
15811
+ "learning_rate": 0.00019491478041948877,
15812
+ "loss": 0.395,
15813
+ "step": 2244
15814
+ },
15815
+ {
15816
+ "epoch": 0.3096765294158218,
15817
+ "grad_norm": 0.5725042223930359,
15818
+ "learning_rate": 0.00019491022506714912,
15819
+ "loss": 0.3626,
15820
+ "step": 2245
15821
+ },
15822
+ {
15823
+ "epoch": 0.30981446996344575,
15824
+ "grad_norm": 0.7076693773269653,
15825
+ "learning_rate": 0.00019490566772865475,
15826
+ "loss": 0.5949,
15827
+ "step": 2246
15828
+ },
15829
+ {
15830
+ "epoch": 0.30995241051106975,
15831
+ "grad_norm": 0.8544387817382812,
15832
+ "learning_rate": 0.00019490110840410097,
15833
+ "loss": 1.0608,
15834
+ "step": 2247
15835
+ },
15836
+ {
15837
+ "epoch": 0.3100903510586937,
15838
+ "grad_norm": 0.832599937915802,
15839
+ "learning_rate": 0.00019489654709358323,
15840
+ "loss": 0.807,
15841
+ "step": 2248
15842
+ },
15843
+ {
15844
+ "epoch": 0.31022829160631765,
15845
+ "grad_norm": 1.0049424171447754,
15846
+ "learning_rate": 0.00019489198379719696,
15847
+ "loss": 0.794,
15848
+ "step": 2249
15849
+ },
15850
+ {
15851
+ "epoch": 0.31036623215394166,
15852
+ "grad_norm": 0.6564392447471619,
15853
+ "learning_rate": 0.00019488741851503765,
15854
+ "loss": 0.5557,
15855
+ "step": 2250
15856
+ },
15857
+ {
15858
+ "epoch": 0.3105041727015656,
15859
+ "grad_norm": 0.5619440078735352,
15860
+ "learning_rate": 0.00019488285124720086,
15861
+ "loss": 0.4077,
15862
+ "step": 2251
15863
+ },
15864
+ {
15865
+ "epoch": 0.3106421132491896,
15866
+ "grad_norm": 0.5860351920127869,
15867
+ "learning_rate": 0.00019487828199378214,
15868
+ "loss": 0.4018,
15869
+ "step": 2252
15870
+ },
15871
+ {
15872
+ "epoch": 0.31078005379681356,
15873
+ "grad_norm": 0.7864125370979309,
15874
+ "learning_rate": 0.00019487371075487713,
15875
+ "loss": 0.6525,
15876
+ "step": 2253
15877
+ },
15878
+ {
15879
+ "epoch": 0.31091799434443756,
15880
+ "grad_norm": 0.6421269178390503,
15881
+ "learning_rate": 0.00019486913753058148,
15882
+ "loss": 0.4446,
15883
+ "step": 2254
15884
+ },
15885
+ {
15886
+ "epoch": 0.3110559348920615,
15887
+ "grad_norm": 1.2416633367538452,
15888
+ "learning_rate": 0.0001948645623209909,
15889
+ "loss": 0.5695,
15890
+ "step": 2255
15891
+ },
15892
+ {
15893
+ "epoch": 0.3111938754396855,
15894
+ "grad_norm": 1.3990689516067505,
15895
+ "learning_rate": 0.00019485998512620113,
15896
+ "loss": 0.8486,
15897
+ "step": 2256
15898
+ },
15899
+ {
15900
+ "epoch": 0.31133181598730947,
15901
+ "grad_norm": 0.8644762635231018,
15902
+ "learning_rate": 0.00019485540594630794,
15903
+ "loss": 0.5197,
15904
+ "step": 2257
15905
+ },
15906
+ {
15907
+ "epoch": 0.3114697565349334,
15908
+ "grad_norm": 0.7197523713111877,
15909
+ "learning_rate": 0.0001948508247814072,
15910
+ "loss": 0.4854,
15911
+ "step": 2258
15912
+ },
15913
+ {
15914
+ "epoch": 0.3116076970825574,
15915
+ "grad_norm": 0.7777307033538818,
15916
+ "learning_rate": 0.00019484624163159474,
15917
+ "loss": 0.8011,
15918
+ "step": 2259
15919
+ },
15920
+ {
15921
+ "epoch": 0.31174563763018137,
15922
+ "grad_norm": 3.498762369155884,
15923
+ "learning_rate": 0.00019484165649696648,
15924
+ "loss": 1.2415,
15925
+ "step": 2260
15926
+ },
15927
+ {
15928
+ "epoch": 0.3118835781778054,
15929
+ "grad_norm": 0.8177916407585144,
15930
+ "learning_rate": 0.00019483706937761837,
15931
+ "loss": 0.6254,
15932
+ "step": 2261
15933
+ },
15934
+ {
15935
+ "epoch": 0.3120215187254293,
15936
+ "grad_norm": 0.8077528476715088,
15937
+ "learning_rate": 0.0001948324802736464,
15938
+ "loss": 1.1841,
15939
+ "step": 2262
15940
+ },
15941
+ {
15942
+ "epoch": 0.31215945927305333,
15943
+ "grad_norm": 0.7529622316360474,
15944
+ "learning_rate": 0.00019482788918514664,
15945
+ "loss": 0.5046,
15946
+ "step": 2263
15947
+ },
15948
+ {
15949
+ "epoch": 0.3122973998206773,
15950
+ "grad_norm": 0.6038236618041992,
15951
+ "learning_rate": 0.0001948232961122151,
15952
+ "loss": 0.5598,
15953
+ "step": 2264
15954
+ },
15955
+ {
15956
+ "epoch": 0.3124353403683013,
15957
+ "grad_norm": 0.6496687531471252,
15958
+ "learning_rate": 0.00019481870105494796,
15959
+ "loss": 0.3127,
15960
+ "step": 2265
15961
+ },
15962
+ {
15963
+ "epoch": 0.31257328091592523,
15964
+ "grad_norm": 0.8372655510902405,
15965
+ "learning_rate": 0.00019481410401344133,
15966
+ "loss": 0.7623,
15967
+ "step": 2266
15968
+ },
15969
+ {
15970
+ "epoch": 0.3127112214635492,
15971
+ "grad_norm": 0.9408671855926514,
15972
+ "learning_rate": 0.00019480950498779144,
15973
+ "loss": 0.913,
15974
+ "step": 2267
15975
+ },
15976
+ {
15977
+ "epoch": 0.3128491620111732,
15978
+ "grad_norm": 1.2297847270965576,
15979
+ "learning_rate": 0.00019480490397809456,
15980
+ "loss": 0.7727,
15981
+ "step": 2268
15982
+ },
15983
+ {
15984
+ "epoch": 0.31298710255879714,
15985
+ "grad_norm": 0.8657265305519104,
15986
+ "learning_rate": 0.0001948003009844469,
15987
+ "loss": 0.7712,
15988
+ "step": 2269
15989
+ },
15990
+ {
15991
+ "epoch": 0.31312504310642114,
15992
+ "grad_norm": 0.6789664030075073,
15993
+ "learning_rate": 0.00019479569600694486,
15994
+ "loss": 0.5377,
15995
+ "step": 2270
15996
+ },
15997
+ {
15998
+ "epoch": 0.3132629836540451,
15999
+ "grad_norm": 0.8153241872787476,
16000
+ "learning_rate": 0.00019479108904568474,
16001
+ "loss": 0.438,
16002
+ "step": 2271
16003
+ },
16004
+ {
16005
+ "epoch": 0.3134009242016691,
16006
+ "grad_norm": 0.820363461971283,
16007
+ "learning_rate": 0.00019478648010076298,
16008
+ "loss": 0.5774,
16009
+ "step": 2272
16010
+ },
16011
+ {
16012
+ "epoch": 0.31353886474929304,
16013
+ "grad_norm": 0.9345502853393555,
16014
+ "learning_rate": 0.00019478186917227605,
16015
+ "loss": 0.7403,
16016
+ "step": 2273
16017
+ },
16018
+ {
16019
+ "epoch": 0.31367680529691705,
16020
+ "grad_norm": 0.6386396884918213,
16021
+ "learning_rate": 0.00019477725626032043,
16022
+ "loss": 0.5016,
16023
+ "step": 2274
16024
+ },
16025
+ {
16026
+ "epoch": 0.313814745844541,
16027
+ "grad_norm": 1.081990122795105,
16028
+ "learning_rate": 0.00019477264136499262,
16029
+ "loss": 0.7868,
16030
+ "step": 2275
16031
+ },
16032
+ {
16033
+ "epoch": 0.313952686392165,
16034
+ "grad_norm": 0.7201882600784302,
16035
+ "learning_rate": 0.00019476802448638924,
16036
+ "loss": 0.488,
16037
+ "step": 2276
16038
+ },
16039
+ {
16040
+ "epoch": 0.31409062693978895,
16041
+ "grad_norm": 0.7955479621887207,
16042
+ "learning_rate": 0.00019476340562460688,
16043
+ "loss": 0.7676,
16044
+ "step": 2277
16045
+ },
16046
+ {
16047
+ "epoch": 0.3142285674874129,
16048
+ "grad_norm": 0.731919527053833,
16049
+ "learning_rate": 0.0001947587847797422,
16050
+ "loss": 0.579,
16051
+ "step": 2278
16052
+ },
16053
+ {
16054
+ "epoch": 0.3143665080350369,
16055
+ "grad_norm": 1.8228474855422974,
16056
+ "learning_rate": 0.00019475416195189192,
16057
+ "loss": 0.8461,
16058
+ "step": 2279
16059
+ },
16060
+ {
16061
+ "epoch": 0.31450444858266086,
16062
+ "grad_norm": 0.5661347508430481,
16063
+ "learning_rate": 0.00019474953714115274,
16064
+ "loss": 0.3593,
16065
+ "step": 2280
16066
+ },
16067
+ {
16068
+ "epoch": 0.31464238913028486,
16069
+ "grad_norm": 0.747999370098114,
16070
+ "learning_rate": 0.00019474491034762145,
16071
+ "loss": 0.6878,
16072
+ "step": 2281
16073
+ },
16074
+ {
16075
+ "epoch": 0.3147803296779088,
16076
+ "grad_norm": 0.9928996562957764,
16077
+ "learning_rate": 0.0001947402815713949,
16078
+ "loss": 0.8761,
16079
+ "step": 2282
16080
+ },
16081
+ {
16082
+ "epoch": 0.3149182702255328,
16083
+ "grad_norm": 0.7003133893013,
16084
+ "learning_rate": 0.00019473565081256996,
16085
+ "loss": 0.4855,
16086
+ "step": 2283
16087
+ },
16088
+ {
16089
+ "epoch": 0.31505621077315676,
16090
+ "grad_norm": 0.6472734808921814,
16091
+ "learning_rate": 0.00019473101807124352,
16092
+ "loss": 0.511,
16093
+ "step": 2284
16094
+ },
16095
+ {
16096
+ "epoch": 0.31519415132078077,
16097
+ "grad_norm": 0.723513662815094,
16098
+ "learning_rate": 0.0001947263833475125,
16099
+ "loss": 0.4892,
16100
+ "step": 2285
16101
+ },
16102
+ {
16103
+ "epoch": 0.3153320918684047,
16104
+ "grad_norm": 1.6176047325134277,
16105
+ "learning_rate": 0.00019472174664147393,
16106
+ "loss": 0.5581,
16107
+ "step": 2286
16108
+ },
16109
+ {
16110
+ "epoch": 0.31547003241602867,
16111
+ "grad_norm": 0.9376205801963806,
16112
+ "learning_rate": 0.00019471710795322485,
16113
+ "loss": 1.091,
16114
+ "step": 2287
16115
+ },
16116
+ {
16117
+ "epoch": 0.31560797296365267,
16118
+ "grad_norm": 1.0848584175109863,
16119
+ "learning_rate": 0.00019471246728286227,
16120
+ "loss": 0.6718,
16121
+ "step": 2288
16122
+ },
16123
+ {
16124
+ "epoch": 0.3157459135112766,
16125
+ "grad_norm": 1.0394634008407593,
16126
+ "learning_rate": 0.00019470782463048336,
16127
+ "loss": 0.4477,
16128
+ "step": 2289
16129
+ },
16130
+ {
16131
+ "epoch": 0.3158838540589006,
16132
+ "grad_norm": 0.8964745998382568,
16133
+ "learning_rate": 0.00019470317999618523,
16134
+ "loss": 0.4769,
16135
+ "step": 2290
16136
+ },
16137
+ {
16138
+ "epoch": 0.3160217946065246,
16139
+ "grad_norm": 0.6246095299720764,
16140
+ "learning_rate": 0.00019469853338006514,
16141
+ "loss": 0.2479,
16142
+ "step": 2291
16143
+ },
16144
+ {
16145
+ "epoch": 0.3161597351541486,
16146
+ "grad_norm": 0.878368079662323,
16147
+ "learning_rate": 0.0001946938847822203,
16148
+ "loss": 0.3964,
16149
+ "step": 2292
16150
+ },
16151
+ {
16152
+ "epoch": 0.31629767570177253,
16153
+ "grad_norm": 0.6446416974067688,
16154
+ "learning_rate": 0.00019468923420274797,
16155
+ "loss": 0.6782,
16156
+ "step": 2293
16157
+ },
16158
+ {
16159
+ "epoch": 0.31643561624939653,
16160
+ "grad_norm": 0.8462199568748474,
16161
+ "learning_rate": 0.0001946845816417455,
16162
+ "loss": 0.6378,
16163
+ "step": 2294
16164
+ },
16165
+ {
16166
+ "epoch": 0.3165735567970205,
16167
+ "grad_norm": 0.7193346619606018,
16168
+ "learning_rate": 0.00019467992709931017,
16169
+ "loss": 0.5933,
16170
+ "step": 2295
16171
+ },
16172
+ {
16173
+ "epoch": 0.31671149734464443,
16174
+ "grad_norm": 1.4028959274291992,
16175
+ "learning_rate": 0.00019467527057553952,
16176
+ "loss": 1.1746,
16177
+ "step": 2296
16178
+ },
16179
+ {
16180
+ "epoch": 0.31684943789226844,
16181
+ "grad_norm": 0.8412365913391113,
16182
+ "learning_rate": 0.00019467061207053087,
16183
+ "loss": 0.5632,
16184
+ "step": 2297
16185
+ },
16186
+ {
16187
+ "epoch": 0.3169873784398924,
16188
+ "grad_norm": 0.6352449655532837,
16189
+ "learning_rate": 0.0001946659515843818,
16190
+ "loss": 0.4559,
16191
+ "step": 2298
16192
+ },
16193
+ {
16194
+ "epoch": 0.3171253189875164,
16195
+ "grad_norm": 0.48701727390289307,
16196
+ "learning_rate": 0.00019466128911718982,
16197
+ "loss": 0.2398,
16198
+ "step": 2299
16199
+ },
16200
+ {
16201
+ "epoch": 0.31726325953514034,
16202
+ "grad_norm": 0.5449528098106384,
16203
+ "learning_rate": 0.00019465662466905243,
16204
+ "loss": 0.6206,
16205
+ "step": 2300
16206
+ },
16207
+ {
16208
+ "epoch": 0.31740120008276435,
16209
+ "grad_norm": 1.2383208274841309,
16210
+ "learning_rate": 0.00019465195824006732,
16211
+ "loss": 0.9354,
16212
+ "step": 2301
16213
+ },
16214
+ {
16215
+ "epoch": 0.3175391406303883,
16216
+ "grad_norm": 0.9451349377632141,
16217
+ "learning_rate": 0.00019464728983033212,
16218
+ "loss": 0.9349,
16219
+ "step": 2302
16220
+ },
16221
+ {
16222
+ "epoch": 0.3176770811780123,
16223
+ "grad_norm": 0.7076907753944397,
16224
+ "learning_rate": 0.0001946426194399445,
16225
+ "loss": 0.667,
16226
+ "step": 2303
16227
+ },
16228
+ {
16229
+ "epoch": 0.31781502172563625,
16230
+ "grad_norm": 0.6356270909309387,
16231
+ "learning_rate": 0.00019463794706900224,
16232
+ "loss": 0.2469,
16233
+ "step": 2304
16234
+ },
16235
+ {
16236
+ "epoch": 0.3179529622732602,
16237
+ "grad_norm": 0.8059444427490234,
16238
+ "learning_rate": 0.00019463327271760308,
16239
+ "loss": 0.6322,
16240
+ "step": 2305
16241
+ },
16242
+ {
16243
+ "epoch": 0.3180909028208842,
16244
+ "grad_norm": 0.7126657366752625,
16245
+ "learning_rate": 0.00019462859638584484,
16246
+ "loss": 0.4607,
16247
+ "step": 2306
16248
+ },
16249
+ {
16250
+ "epoch": 0.31822884336850815,
16251
+ "grad_norm": 1.20512855052948,
16252
+ "learning_rate": 0.0001946239180738254,
16253
+ "loss": 0.7065,
16254
+ "step": 2307
16255
+ },
16256
+ {
16257
+ "epoch": 0.31836678391613216,
16258
+ "grad_norm": 1.0039737224578857,
16259
+ "learning_rate": 0.00019461923778164267,
16260
+ "loss": 0.7817,
16261
+ "step": 2308
16262
+ },
16263
+ {
16264
+ "epoch": 0.3185047244637561,
16265
+ "grad_norm": 0.8472278118133545,
16266
+ "learning_rate": 0.00019461455550939455,
16267
+ "loss": 0.7392,
16268
+ "step": 2309
16269
+ },
16270
+ {
16271
+ "epoch": 0.3186426650113801,
16272
+ "grad_norm": 0.8026204109191895,
16273
+ "learning_rate": 0.00019460987125717905,
16274
+ "loss": 0.6547,
16275
+ "step": 2310
16276
+ },
16277
+ {
16278
+ "epoch": 0.31878060555900406,
16279
+ "grad_norm": 0.985788881778717,
16280
+ "learning_rate": 0.00019460518502509422,
16281
+ "loss": 0.3619,
16282
+ "step": 2311
16283
+ },
16284
+ {
16285
+ "epoch": 0.31891854610662806,
16286
+ "grad_norm": 0.913837194442749,
16287
+ "learning_rate": 0.00019460049681323808,
16288
+ "loss": 0.8376,
16289
+ "step": 2312
16290
+ },
16291
+ {
16292
+ "epoch": 0.319056486654252,
16293
+ "grad_norm": 0.6265845894813538,
16294
+ "learning_rate": 0.0001945958066217088,
16295
+ "loss": 0.579,
16296
+ "step": 2313
16297
+ },
16298
+ {
16299
+ "epoch": 0.319194427201876,
16300
+ "grad_norm": 0.9424504637718201,
16301
+ "learning_rate": 0.00019459111445060444,
16302
+ "loss": 0.5184,
16303
+ "step": 2314
16304
+ },
16305
+ {
16306
+ "epoch": 0.31933236774949997,
16307
+ "grad_norm": 0.5835946202278137,
16308
+ "learning_rate": 0.00019458642030002326,
16309
+ "loss": 0.4495,
16310
+ "step": 2315
16311
+ },
16312
+ {
16313
+ "epoch": 0.3194703082971239,
16314
+ "grad_norm": 0.7594127058982849,
16315
+ "learning_rate": 0.00019458172417006347,
16316
+ "loss": 0.7142,
16317
+ "step": 2316
16318
+ },
16319
+ {
16320
+ "epoch": 0.3196082488447479,
16321
+ "grad_norm": 0.6176849007606506,
16322
+ "learning_rate": 0.00019457702606082337,
16323
+ "loss": 0.3594,
16324
+ "step": 2317
16325
+ },
16326
+ {
16327
+ "epoch": 0.31974618939237187,
16328
+ "grad_norm": 1.6596888303756714,
16329
+ "learning_rate": 0.00019457232597240126,
16330
+ "loss": 0.9118,
16331
+ "step": 2318
16332
+ },
16333
+ {
16334
+ "epoch": 0.3198841299399959,
16335
+ "grad_norm": 0.8690287470817566,
16336
+ "learning_rate": 0.00019456762390489548,
16337
+ "loss": 0.566,
16338
+ "step": 2319
16339
+ },
16340
+ {
16341
+ "epoch": 0.3200220704876198,
16342
+ "grad_norm": 1.1131110191345215,
16343
+ "learning_rate": 0.0001945629198584044,
16344
+ "loss": 1.1,
16345
+ "step": 2320
16346
+ },
16347
+ {
16348
+ "epoch": 0.32016001103524383,
16349
+ "grad_norm": 0.7218566536903381,
16350
+ "learning_rate": 0.00019455821383302657,
16351
+ "loss": 0.5501,
16352
+ "step": 2321
16353
+ },
16354
+ {
16355
+ "epoch": 0.3202979515828678,
16356
+ "grad_norm": 0.5688751339912415,
16357
+ "learning_rate": 0.00019455350582886038,
16358
+ "loss": 0.5373,
16359
+ "step": 2322
16360
+ },
16361
+ {
16362
+ "epoch": 0.3204358921304918,
16363
+ "grad_norm": 1.2792819738388062,
16364
+ "learning_rate": 0.00019454879584600437,
16365
+ "loss": 0.733,
16366
+ "step": 2323
16367
+ },
16368
+ {
16369
+ "epoch": 0.32057383267811573,
16370
+ "grad_norm": 0.9383312463760376,
16371
+ "learning_rate": 0.0001945440838845571,
16372
+ "loss": 0.4367,
16373
+ "step": 2324
16374
+ },
16375
+ {
16376
+ "epoch": 0.3207117732257397,
16377
+ "grad_norm": 0.9324066042900085,
16378
+ "learning_rate": 0.00019453936994461718,
16379
+ "loss": 0.9925,
16380
+ "step": 2325
16381
+ },
16382
+ {
16383
+ "epoch": 0.3208497137733637,
16384
+ "grad_norm": 1.0629867315292358,
16385
+ "learning_rate": 0.0001945346540262833,
16386
+ "loss": 0.7296,
16387
+ "step": 2326
16388
+ },
16389
+ {
16390
+ "epoch": 0.32098765432098764,
16391
+ "grad_norm": 0.7863196730613708,
16392
+ "learning_rate": 0.0001945299361296541,
16393
+ "loss": 0.936,
16394
+ "step": 2327
16395
+ },
16396
+ {
16397
+ "epoch": 0.32112559486861164,
16398
+ "grad_norm": 0.6948659420013428,
16399
+ "learning_rate": 0.0001945252162548283,
16400
+ "loss": 0.4759,
16401
+ "step": 2328
16402
+ },
16403
+ {
16404
+ "epoch": 0.3212635354162356,
16405
+ "grad_norm": 0.908307671546936,
16406
+ "learning_rate": 0.00019452049440190473,
16407
+ "loss": 0.8042,
16408
+ "step": 2329
16409
+ },
16410
+ {
16411
+ "epoch": 0.3214014759638596,
16412
+ "grad_norm": 0.814140796661377,
16413
+ "learning_rate": 0.00019451577057098213,
16414
+ "loss": 0.7884,
16415
+ "step": 2330
16416
+ },
16417
+ {
16418
+ "epoch": 0.32153941651148354,
16419
+ "grad_norm": 0.752573549747467,
16420
+ "learning_rate": 0.0001945110447621594,
16421
+ "loss": 0.8405,
16422
+ "step": 2331
16423
+ },
16424
+ {
16425
+ "epoch": 0.32167735705910755,
16426
+ "grad_norm": 0.8677518963813782,
16427
+ "learning_rate": 0.00019450631697553542,
16428
+ "loss": 0.8891,
16429
+ "step": 2332
16430
+ },
16431
+ {
16432
+ "epoch": 0.3218152976067315,
16433
+ "grad_norm": 0.7212129831314087,
16434
+ "learning_rate": 0.00019450158721120916,
16435
+ "loss": 0.5369,
16436
+ "step": 2333
16437
+ },
16438
+ {
16439
+ "epoch": 0.32195323815435545,
16440
+ "grad_norm": 0.6805658936500549,
16441
+ "learning_rate": 0.00019449685546927954,
16442
+ "loss": 0.4181,
16443
+ "step": 2334
16444
+ },
16445
+ {
16446
+ "epoch": 0.32209117870197945,
16447
+ "grad_norm": 0.8572118878364563,
16448
+ "learning_rate": 0.0001944921217498456,
16449
+ "loss": 0.4678,
16450
+ "step": 2335
16451
+ },
16452
+ {
16453
+ "epoch": 0.3222291192496034,
16454
+ "grad_norm": 0.7739250063896179,
16455
+ "learning_rate": 0.00019448738605300645,
16456
+ "loss": 0.8138,
16457
+ "step": 2336
16458
+ },
16459
+ {
16460
+ "epoch": 0.3223670597972274,
16461
+ "grad_norm": 0.9221212863922119,
16462
+ "learning_rate": 0.00019448264837886113,
16463
+ "loss": 0.6867,
16464
+ "step": 2337
16465
+ },
16466
+ {
16467
+ "epoch": 0.32250500034485136,
16468
+ "grad_norm": 0.5943915247917175,
16469
+ "learning_rate": 0.0001944779087275088,
16470
+ "loss": 0.4269,
16471
+ "step": 2338
16472
+ },
16473
+ {
16474
+ "epoch": 0.32264294089247536,
16475
+ "grad_norm": 0.7601683735847473,
16476
+ "learning_rate": 0.00019447316709904865,
16477
+ "loss": 0.4699,
16478
+ "step": 2339
16479
+ },
16480
+ {
16481
+ "epoch": 0.3227808814400993,
16482
+ "grad_norm": 0.8653863072395325,
16483
+ "learning_rate": 0.0001944684234935799,
16484
+ "loss": 0.6408,
16485
+ "step": 2340
16486
+ },
16487
+ {
16488
+ "epoch": 0.3229188219877233,
16489
+ "grad_norm": 0.8126456141471863,
16490
+ "learning_rate": 0.00019446367791120186,
16491
+ "loss": 0.7773,
16492
+ "step": 2341
16493
+ },
16494
+ {
16495
+ "epoch": 0.32305676253534726,
16496
+ "grad_norm": 0.6638123393058777,
16497
+ "learning_rate": 0.00019445893035201383,
16498
+ "loss": 0.4854,
16499
+ "step": 2342
16500
+ },
16501
+ {
16502
+ "epoch": 0.3231947030829712,
16503
+ "grad_norm": 1.3545905351638794,
16504
+ "learning_rate": 0.00019445418081611506,
16505
+ "loss": 0.9794,
16506
+ "step": 2343
16507
+ },
16508
+ {
16509
+ "epoch": 0.3233326436305952,
16510
+ "grad_norm": 0.8681669235229492,
16511
+ "learning_rate": 0.00019444942930360503,
16512
+ "loss": 0.8998,
16513
+ "step": 2344
16514
+ },
16515
+ {
16516
+ "epoch": 0.32347058417821917,
16517
+ "grad_norm": 1.0023455619812012,
16518
+ "learning_rate": 0.00019444467581458322,
16519
+ "loss": 0.7062,
16520
+ "step": 2345
16521
+ },
16522
+ {
16523
+ "epoch": 0.32360852472584317,
16524
+ "grad_norm": 0.8101288676261902,
16525
+ "learning_rate": 0.00019443992034914897,
16526
+ "loss": 0.6581,
16527
+ "step": 2346
16528
+ },
16529
+ {
16530
+ "epoch": 0.3237464652734671,
16531
+ "grad_norm": 1.2586729526519775,
16532
+ "learning_rate": 0.00019443516290740194,
16533
+ "loss": 0.7804,
16534
+ "step": 2347
16535
+ },
16536
+ {
16537
+ "epoch": 0.3238844058210911,
16538
+ "grad_norm": 0.9507285356521606,
16539
+ "learning_rate": 0.00019443040348944156,
16540
+ "loss": 0.6049,
16541
+ "step": 2348
16542
+ },
16543
+ {
16544
+ "epoch": 0.3240223463687151,
16545
+ "grad_norm": 0.6528936624526978,
16546
+ "learning_rate": 0.00019442564209536754,
16547
+ "loss": 0.4616,
16548
+ "step": 2349
16549
+ },
16550
+ {
16551
+ "epoch": 0.3241602869163391,
16552
+ "grad_norm": 0.7113572359085083,
16553
+ "learning_rate": 0.00019442087872527944,
16554
+ "loss": 0.6116,
16555
+ "step": 2350
16556
+ },
16557
+ {
16558
+ "epoch": 0.32429822746396303,
16559
+ "grad_norm": 0.5419871807098389,
16560
+ "learning_rate": 0.00019441611337927696,
16561
+ "loss": 0.2321,
16562
+ "step": 2351
16563
+ },
16564
+ {
16565
+ "epoch": 0.324436168011587,
16566
+ "grad_norm": 0.679607629776001,
16567
+ "learning_rate": 0.00019441134605745986,
16568
+ "loss": 0.459,
16569
+ "step": 2352
16570
+ },
16571
+ {
16572
+ "epoch": 0.324574108559211,
16573
+ "grad_norm": 0.9691960215568542,
16574
+ "learning_rate": 0.00019440657675992787,
16575
+ "loss": 0.4727,
16576
+ "step": 2353
16577
+ },
16578
+ {
16579
+ "epoch": 0.32471204910683493,
16580
+ "grad_norm": 0.8125988841056824,
16581
+ "learning_rate": 0.0001944018054867808,
16582
+ "loss": 0.8017,
16583
+ "step": 2354
16584
+ },
16585
+ {
16586
+ "epoch": 0.32484998965445894,
16587
+ "grad_norm": 1.649573802947998,
16588
+ "learning_rate": 0.00019439703223811847,
16589
+ "loss": 0.829,
16590
+ "step": 2355
16591
+ },
16592
+ {
16593
+ "epoch": 0.3249879302020829,
16594
+ "grad_norm": 0.745305061340332,
16595
+ "learning_rate": 0.00019439225701404085,
16596
+ "loss": 0.4651,
16597
+ "step": 2356
16598
+ },
16599
+ {
16600
+ "epoch": 0.3251258707497069,
16601
+ "grad_norm": 0.6748473048210144,
16602
+ "learning_rate": 0.00019438747981464775,
16603
+ "loss": 0.5996,
16604
+ "step": 2357
16605
+ },
16606
+ {
16607
+ "epoch": 0.32526381129733084,
16608
+ "grad_norm": 1.0531598329544067,
16609
+ "learning_rate": 0.00019438270064003926,
16610
+ "loss": 0.9084,
16611
+ "step": 2358
16612
+ },
16613
+ {
16614
+ "epoch": 0.32540175184495485,
16615
+ "grad_norm": 0.9223348498344421,
16616
+ "learning_rate": 0.00019437791949031535,
16617
+ "loss": 0.7866,
16618
+ "step": 2359
16619
+ },
16620
+ {
16621
+ "epoch": 0.3255396923925788,
16622
+ "grad_norm": 0.7526196837425232,
16623
+ "learning_rate": 0.00019437313636557602,
16624
+ "loss": 0.4975,
16625
+ "step": 2360
16626
+ },
16627
+ {
16628
+ "epoch": 0.3256776329402028,
16629
+ "grad_norm": 1.6201496124267578,
16630
+ "learning_rate": 0.00019436835126592143,
16631
+ "loss": 0.7395,
16632
+ "step": 2361
16633
+ },
16634
+ {
16635
+ "epoch": 0.32581557348782675,
16636
+ "grad_norm": 0.7340310215950012,
16637
+ "learning_rate": 0.00019436356419145166,
16638
+ "loss": 0.6325,
16639
+ "step": 2362
16640
+ },
16641
+ {
16642
+ "epoch": 0.3259535140354507,
16643
+ "grad_norm": 1.1777743101119995,
16644
+ "learning_rate": 0.00019435877514226697,
16645
+ "loss": 0.4779,
16646
+ "step": 2363
16647
+ },
16648
+ {
16649
+ "epoch": 0.3260914545830747,
16650
+ "grad_norm": 0.9242397546768188,
16651
+ "learning_rate": 0.00019435398411846752,
16652
+ "loss": 0.4674,
16653
+ "step": 2364
16654
+ },
16655
+ {
16656
+ "epoch": 0.32622939513069865,
16657
+ "grad_norm": 0.6935853958129883,
16658
+ "learning_rate": 0.00019434919112015355,
16659
+ "loss": 0.3949,
16660
+ "step": 2365
16661
+ },
16662
+ {
16663
+ "epoch": 0.32636733567832266,
16664
+ "grad_norm": 0.7134401202201843,
16665
+ "learning_rate": 0.00019434439614742543,
16666
+ "loss": 0.5659,
16667
+ "step": 2366
16668
+ },
16669
+ {
16670
+ "epoch": 0.3265052762259466,
16671
+ "grad_norm": 0.9489606618881226,
16672
+ "learning_rate": 0.00019433959920038345,
16673
+ "loss": 0.7124,
16674
+ "step": 2367
16675
+ },
16676
+ {
16677
+ "epoch": 0.3266432167735706,
16678
+ "grad_norm": 0.6194107532501221,
16679
+ "learning_rate": 0.000194334800279128,
16680
+ "loss": 0.8171,
16681
+ "step": 2368
16682
+ },
16683
+ {
16684
+ "epoch": 0.32678115732119456,
16685
+ "grad_norm": 0.8815126419067383,
16686
+ "learning_rate": 0.00019432999938375953,
16687
+ "loss": 0.5195,
16688
+ "step": 2369
16689
+ },
16690
+ {
16691
+ "epoch": 0.32691909786881856,
16692
+ "grad_norm": 0.5797806978225708,
16693
+ "learning_rate": 0.0001943251965143785,
16694
+ "loss": 0.4952,
16695
+ "step": 2370
16696
+ },
16697
+ {
16698
+ "epoch": 0.3270570384164425,
16699
+ "grad_norm": 0.9306840300559998,
16700
+ "learning_rate": 0.00019432039167108537,
16701
+ "loss": 0.476,
16702
+ "step": 2371
16703
+ },
16704
+ {
16705
+ "epoch": 0.32719497896406646,
16706
+ "grad_norm": 0.6784822344779968,
16707
+ "learning_rate": 0.00019431558485398076,
16708
+ "loss": 0.641,
16709
+ "step": 2372
16710
+ },
16711
+ {
16712
+ "epoch": 0.32733291951169047,
16713
+ "grad_norm": 0.7142674922943115,
16714
+ "learning_rate": 0.00019431077606316523,
16715
+ "loss": 0.6712,
16716
+ "step": 2373
16717
+ },
16718
+ {
16719
+ "epoch": 0.3274708600593144,
16720
+ "grad_norm": 1.0263147354125977,
16721
+ "learning_rate": 0.00019430596529873938,
16722
+ "loss": 0.8278,
16723
+ "step": 2374
16724
+ },
16725
+ {
16726
+ "epoch": 0.3276088006069384,
16727
+ "grad_norm": 0.672478199005127,
16728
+ "learning_rate": 0.00019430115256080394,
16729
+ "loss": 0.5935,
16730
+ "step": 2375
16731
+ },
16732
+ {
16733
+ "epoch": 0.32774674115456237,
16734
+ "grad_norm": 0.9333507418632507,
16735
+ "learning_rate": 0.0001942963378494596,
16736
+ "loss": 0.664,
16737
+ "step": 2376
16738
+ },
16739
+ {
16740
+ "epoch": 0.3278846817021864,
16741
+ "grad_norm": 0.8227028250694275,
16742
+ "learning_rate": 0.0001942915211648071,
16743
+ "loss": 0.3793,
16744
+ "step": 2377
16745
+ },
16746
+ {
16747
+ "epoch": 0.3280226222498103,
16748
+ "grad_norm": 0.8363267183303833,
16749
+ "learning_rate": 0.00019428670250694728,
16750
+ "loss": 0.534,
16751
+ "step": 2378
16752
+ },
16753
+ {
16754
+ "epoch": 0.32816056279743433,
16755
+ "grad_norm": 0.6801791787147522,
16756
+ "learning_rate": 0.00019428188187598094,
16757
+ "loss": 0.5693,
16758
+ "step": 2379
16759
+ },
16760
+ {
16761
+ "epoch": 0.3282985033450583,
16762
+ "grad_norm": 0.9937869310379028,
16763
+ "learning_rate": 0.00019427705927200896,
16764
+ "loss": 0.4011,
16765
+ "step": 2380
16766
+ },
16767
+ {
16768
+ "epoch": 0.32843644389268223,
16769
+ "grad_norm": 0.7679700255393982,
16770
+ "learning_rate": 0.00019427223469513228,
16771
+ "loss": 0.4928,
16772
+ "step": 2381
16773
+ },
16774
+ {
16775
+ "epoch": 0.32857438444030623,
16776
+ "grad_norm": 1.2696233987808228,
16777
+ "learning_rate": 0.00019426740814545185,
16778
+ "loss": 0.3716,
16779
+ "step": 2382
16780
+ },
16781
+ {
16782
+ "epoch": 0.3287123249879302,
16783
+ "grad_norm": 0.816831648349762,
16784
+ "learning_rate": 0.00019426257962306868,
16785
+ "loss": 0.469,
16786
+ "step": 2383
16787
+ },
16788
+ {
16789
+ "epoch": 0.3288502655355542,
16790
+ "grad_norm": 1.172206163406372,
16791
+ "learning_rate": 0.0001942577491280838,
16792
+ "loss": 0.7011,
16793
+ "step": 2384
16794
+ },
16795
+ {
16796
+ "epoch": 0.32898820608317814,
16797
+ "grad_norm": 0.8468907475471497,
16798
+ "learning_rate": 0.00019425291666059832,
16799
+ "loss": 0.2813,
16800
+ "step": 2385
16801
+ },
16802
+ {
16803
+ "epoch": 0.32912614663080214,
16804
+ "grad_norm": 0.9245859980583191,
16805
+ "learning_rate": 0.00019424808222071337,
16806
+ "loss": 0.5006,
16807
+ "step": 2386
16808
+ },
16809
+ {
16810
+ "epoch": 0.3292640871784261,
16811
+ "grad_norm": 1.3314694166183472,
16812
+ "learning_rate": 0.00019424324580853006,
16813
+ "loss": 0.3318,
16814
+ "step": 2387
16815
+ },
16816
+ {
16817
+ "epoch": 0.3294020277260501,
16818
+ "grad_norm": 0.6868737936019897,
16819
+ "learning_rate": 0.00019423840742414968,
16820
+ "loss": 0.4828,
16821
+ "step": 2388
16822
+ },
16823
+ {
16824
+ "epoch": 0.32953996827367404,
16825
+ "grad_norm": 0.5695831775665283,
16826
+ "learning_rate": 0.00019423356706767343,
16827
+ "loss": 0.4117,
16828
+ "step": 2389
16829
+ },
16830
+ {
16831
+ "epoch": 0.329677908821298,
16832
+ "grad_norm": 0.8199607729911804,
16833
+ "learning_rate": 0.00019422872473920264,
16834
+ "loss": 0.8271,
16835
+ "step": 2390
16836
+ },
16837
+ {
16838
+ "epoch": 0.329815849368922,
16839
+ "grad_norm": 0.9360648989677429,
16840
+ "learning_rate": 0.0001942238804388386,
16841
+ "loss": 0.8707,
16842
+ "step": 2391
16843
+ },
16844
+ {
16845
+ "epoch": 0.32995378991654595,
16846
+ "grad_norm": 0.7775169610977173,
16847
+ "learning_rate": 0.00019421903416668273,
16848
+ "loss": 0.5637,
16849
+ "step": 2392
16850
+ },
16851
+ {
16852
+ "epoch": 0.33009173046416995,
16853
+ "grad_norm": 0.8939715027809143,
16854
+ "learning_rate": 0.0001942141859228364,
16855
+ "loss": 0.681,
16856
+ "step": 2393
16857
+ },
16858
+ {
16859
+ "epoch": 0.3302296710117939,
16860
+ "grad_norm": 0.7903376221656799,
16861
+ "learning_rate": 0.00019420933570740112,
16862
+ "loss": 0.6019,
16863
+ "step": 2394
16864
+ },
16865
+ {
16866
+ "epoch": 0.3303676115594179,
16867
+ "grad_norm": 0.5653364062309265,
16868
+ "learning_rate": 0.00019420448352047833,
16869
+ "loss": 0.4377,
16870
+ "step": 2395
16871
+ },
16872
+ {
16873
+ "epoch": 0.33050555210704186,
16874
+ "grad_norm": 0.6574212312698364,
16875
+ "learning_rate": 0.0001941996293621696,
16876
+ "loss": 0.3995,
16877
+ "step": 2396
16878
+ },
16879
+ {
16880
+ "epoch": 0.33064349265466586,
16881
+ "grad_norm": 0.9487119913101196,
16882
+ "learning_rate": 0.00019419477323257654,
16883
+ "loss": 0.652,
16884
+ "step": 2397
16885
+ },
16886
+ {
16887
+ "epoch": 0.3307814332022898,
16888
+ "grad_norm": 0.8530499339103699,
16889
+ "learning_rate": 0.0001941899151318007,
16890
+ "loss": 0.7224,
16891
+ "step": 2398
16892
+ },
16893
+ {
16894
+ "epoch": 0.3309193737499138,
16895
+ "grad_norm": 0.8137893676757812,
16896
+ "learning_rate": 0.0001941850550599438,
16897
+ "loss": 0.5038,
16898
+ "step": 2399
16899
+ },
16900
+ {
16901
+ "epoch": 0.33105731429753776,
16902
+ "grad_norm": 0.8479599356651306,
16903
+ "learning_rate": 0.00019418019301710757,
16904
+ "loss": 0.4543,
16905
+ "step": 2400
16906
+ },
16907
+ {
16908
+ "epoch": 0.33105731429753776,
16909
+ "eval_loss": 0.6588593125343323,
16910
+ "eval_runtime": 23.4746,
16911
+ "eval_samples_per_second": 2.513,
16912
+ "eval_steps_per_second": 2.513,
16913
+ "step": 2400
16914
  }
16915
  ],
16916
  "logging_steps": 1,
 
16925
  "early_stopping_threshold": 0.0
16926
  },
16927
  "attributes": {
16928
+ "early_stopping_patience_counter": 3
16929
  }
16930
  },
16931
  "TrainerControl": {
 
16934
  "should_evaluate": false,
16935
  "should_log": false,
16936
  "should_save": true,
16937
+ "should_training_stop": true
16938
  },
16939
  "attributes": {}
16940
  }
16941
  },
16942
+ "total_flos": 3.8002757108760576e+17,
16943
  "train_batch_size": 1,
16944
  "trial_name": null,
16945
  "trial_params": null