ErrorAI commited on
Commit
1656ff0
·
verified ·
1 Parent(s): 541d22b

Training in progress, step 738, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bd01a843d72a649467ca0ef8aaef50f27c47ac052da47acba539e63005add66
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a3937bb644e8be3db53724ff37a4cf4df56813bf6b4a93d38ec325890a084a4
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edd8c13f8967274a96ca7b3ebe0650a28c9586ed56d88d318df8624de39af605
3
  size 43123028
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd82f4b1baa9236bf9259ffc67bbc17c841880be954604873e130ffce8157a86
3
  size 43123028
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08416b69f3ea1ef1c206c452662a86dc8094e9ca4c60db81379bdb5238fe1a17
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb2c147343e6526aaa232aefc3fc703d6b9ddcf5d44d0d12953eb06e064afe1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a02f5253505f86b36ab797e2d0183c254e06141b3cd157163c8863bd1f151e5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3482c8bfdc1affb8191f8e392c3195155db922305404a13a0a905ef4dc9ec8d7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5008908119114278,
5
  "eval_steps": 500,
6
- "global_step": 492,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3451,6 +3451,1728 @@
3451
  "learning_rate": 5.032122403295977e-05,
3452
  "loss": 2.3439,
3453
  "step": 492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3454
  }
3455
  ],
3456
  "logging_steps": 1,
@@ -3470,7 +5192,7 @@
3470
  "attributes": {}
3471
  }
3472
  },
3473
- "total_flos": 4.032897809991598e+17,
3474
  "train_batch_size": 4,
3475
  "trial_name": null,
3476
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7513362178671418,
5
  "eval_steps": 500,
6
+ "global_step": 738,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3451
  "learning_rate": 5.032122403295977e-05,
3452
  "loss": 2.3439,
3453
  "step": 492
3454
+ },
3455
+ {
3456
+ "epoch": 0.5019088826673453,
3457
+ "grad_norm": 8.969354629516602,
3458
+ "learning_rate": 5.0160612845131414e-05,
3459
+ "loss": 2.5224,
3460
+ "step": 493
3461
+ },
3462
+ {
3463
+ "epoch": 0.5029269534232629,
3464
+ "grad_norm": 7.22929573059082,
3465
+ "learning_rate": 5e-05,
3466
+ "loss": 1.6864,
3467
+ "step": 494
3468
+ },
3469
+ {
3470
+ "epoch": 0.5039450241791804,
3471
+ "grad_norm": 7.851796627044678,
3472
+ "learning_rate": 4.9839387154868584e-05,
3473
+ "loss": 2.0388,
3474
+ "step": 495
3475
+ },
3476
+ {
3477
+ "epoch": 0.504963094935098,
3478
+ "grad_norm": 9.29887580871582,
3479
+ "learning_rate": 4.967877596704025e-05,
3480
+ "loss": 2.6501,
3481
+ "step": 496
3482
+ },
3483
+ {
3484
+ "epoch": 0.5059811656910155,
3485
+ "grad_norm": 8.79398250579834,
3486
+ "learning_rate": 4.951816809380097e-05,
3487
+ "loss": 2.5759,
3488
+ "step": 497
3489
+ },
3490
+ {
3491
+ "epoch": 0.506999236446933,
3492
+ "grad_norm": 10.286650657653809,
3493
+ "learning_rate": 4.9357565192402525e-05,
3494
+ "loss": 3.1672,
3495
+ "step": 498
3496
+ },
3497
+ {
3498
+ "epoch": 0.5080173072028505,
3499
+ "grad_norm": 8.799942016601562,
3500
+ "learning_rate": 4.919696892004539e-05,
3501
+ "loss": 1.9062,
3502
+ "step": 499
3503
+ },
3504
+ {
3505
+ "epoch": 0.5090353779587682,
3506
+ "grad_norm": 8.252535820007324,
3507
+ "learning_rate": 4.903638093386167e-05,
3508
+ "loss": 1.6945,
3509
+ "step": 500
3510
+ },
3511
+ {
3512
+ "epoch": 0.5100534487146857,
3513
+ "grad_norm": 5.307441234588623,
3514
+ "learning_rate": 4.887580289089787e-05,
3515
+ "loss": 1.7627,
3516
+ "step": 501
3517
+ },
3518
+ {
3519
+ "epoch": 0.5110715194706033,
3520
+ "grad_norm": 7.124861717224121,
3521
+ "learning_rate": 4.8715236448098016e-05,
3522
+ "loss": 2.5286,
3523
+ "step": 502
3524
+ },
3525
+ {
3526
+ "epoch": 0.5120895902265208,
3527
+ "grad_norm": 10.368796348571777,
3528
+ "learning_rate": 4.855468326228638e-05,
3529
+ "loss": 3.6972,
3530
+ "step": 503
3531
+ },
3532
+ {
3533
+ "epoch": 0.5131076609824383,
3534
+ "grad_norm": 10.741311073303223,
3535
+ "learning_rate": 4.8394144990150404e-05,
3536
+ "loss": 4.1789,
3537
+ "step": 504
3538
+ },
3539
+ {
3540
+ "epoch": 0.5141257317383559,
3541
+ "grad_norm": 10.585883140563965,
3542
+ "learning_rate": 4.8233623288223704e-05,
3543
+ "loss": 3.5806,
3544
+ "step": 505
3545
+ },
3546
+ {
3547
+ "epoch": 0.5151438024942734,
3548
+ "grad_norm": 13.6077880859375,
3549
+ "learning_rate": 4.807311981286888e-05,
3550
+ "loss": 3.5391,
3551
+ "step": 506
3552
+ },
3553
+ {
3554
+ "epoch": 0.5161618732501909,
3555
+ "grad_norm": 16.34162712097168,
3556
+ "learning_rate": 4.7912636220260473e-05,
3557
+ "loss": 3.6142,
3558
+ "step": 507
3559
+ },
3560
+ {
3561
+ "epoch": 0.5171799440061084,
3562
+ "grad_norm": 11.734464645385742,
3563
+ "learning_rate": 4.775217416636786e-05,
3564
+ "loss": 2.6898,
3565
+ "step": 508
3566
+ },
3567
+ {
3568
+ "epoch": 0.518198014762026,
3569
+ "grad_norm": 7.3565287590026855,
3570
+ "learning_rate": 4.759173530693814e-05,
3571
+ "loss": 2.0978,
3572
+ "step": 509
3573
+ },
3574
+ {
3575
+ "epoch": 0.5192160855179435,
3576
+ "grad_norm": 5.852792739868164,
3577
+ "learning_rate": 4.7431321297479135e-05,
3578
+ "loss": 1.4297,
3579
+ "step": 510
3580
+ },
3581
+ {
3582
+ "epoch": 0.520234156273861,
3583
+ "grad_norm": 6.781408786773682,
3584
+ "learning_rate": 4.727093379324222e-05,
3585
+ "loss": 1.5329,
3586
+ "step": 511
3587
+ },
3588
+ {
3589
+ "epoch": 0.5212522270297786,
3590
+ "grad_norm": 11.24429702758789,
3591
+ "learning_rate": 4.711057444920522e-05,
3592
+ "loss": 1.6744,
3593
+ "step": 512
3594
+ },
3595
+ {
3596
+ "epoch": 0.5222702977856961,
3597
+ "grad_norm": 7.679388046264648,
3598
+ "learning_rate": 4.695024492005548e-05,
3599
+ "loss": 2.2356,
3600
+ "step": 513
3601
+ },
3602
+ {
3603
+ "epoch": 0.5232883685416136,
3604
+ "grad_norm": 5.93134069442749,
3605
+ "learning_rate": 4.6789946860172634e-05,
3606
+ "loss": 1.2665,
3607
+ "step": 514
3608
+ },
3609
+ {
3610
+ "epoch": 0.5243064392975312,
3611
+ "grad_norm": 6.789477348327637,
3612
+ "learning_rate": 4.6629681923611603e-05,
3613
+ "loss": 1.6466,
3614
+ "step": 515
3615
+ },
3616
+ {
3617
+ "epoch": 0.5253245100534487,
3618
+ "grad_norm": 8.315037727355957,
3619
+ "learning_rate": 4.646945176408555e-05,
3620
+ "loss": 2.065,
3621
+ "step": 516
3622
+ },
3623
+ {
3624
+ "epoch": 0.5263425808093662,
3625
+ "grad_norm": 5.443754196166992,
3626
+ "learning_rate": 4.630925803494877e-05,
3627
+ "loss": 1.0138,
3628
+ "step": 517
3629
+ },
3630
+ {
3631
+ "epoch": 0.5273606515652838,
3632
+ "grad_norm": 6.596680641174316,
3633
+ "learning_rate": 4.6149102389179635e-05,
3634
+ "loss": 1.8229,
3635
+ "step": 518
3636
+ },
3637
+ {
3638
+ "epoch": 0.5283787223212013,
3639
+ "grad_norm": 5.735509872436523,
3640
+ "learning_rate": 4.598898647936354e-05,
3641
+ "loss": 1.5016,
3642
+ "step": 519
3643
+ },
3644
+ {
3645
+ "epoch": 0.5293967930771188,
3646
+ "grad_norm": 7.154899597167969,
3647
+ "learning_rate": 4.58289119576759e-05,
3648
+ "loss": 1.7249,
3649
+ "step": 520
3650
+ },
3651
+ {
3652
+ "epoch": 0.5304148638330364,
3653
+ "grad_norm": 5.887238502502441,
3654
+ "learning_rate": 4.566888047586507e-05,
3655
+ "loss": 1.3531,
3656
+ "step": 521
3657
+ },
3658
+ {
3659
+ "epoch": 0.5314329345889539,
3660
+ "grad_norm": 7.944952964782715,
3661
+ "learning_rate": 4.55088936852352e-05,
3662
+ "loss": 2.0604,
3663
+ "step": 522
3664
+ },
3665
+ {
3666
+ "epoch": 0.5324510053448714,
3667
+ "grad_norm": 8.235894203186035,
3668
+ "learning_rate": 4.5348953236629395e-05,
3669
+ "loss": 1.782,
3670
+ "step": 523
3671
+ },
3672
+ {
3673
+ "epoch": 0.533469076100789,
3674
+ "grad_norm": 9.824324607849121,
3675
+ "learning_rate": 4.518906078041252e-05,
3676
+ "loss": 3.1078,
3677
+ "step": 524
3678
+ },
3679
+ {
3680
+ "epoch": 0.5344871468567065,
3681
+ "grad_norm": 8.053499221801758,
3682
+ "learning_rate": 4.502921796645424e-05,
3683
+ "loss": 2.5225,
3684
+ "step": 525
3685
+ },
3686
+ {
3687
+ "epoch": 0.535505217612624,
3688
+ "grad_norm": 9.53549861907959,
3689
+ "learning_rate": 4.486942644411197e-05,
3690
+ "loss": 3.0847,
3691
+ "step": 526
3692
+ },
3693
+ {
3694
+ "epoch": 0.5365232883685416,
3695
+ "grad_norm": 8.427640914916992,
3696
+ "learning_rate": 4.4709687862213866e-05,
3697
+ "loss": 2.1704,
3698
+ "step": 527
3699
+ },
3700
+ {
3701
+ "epoch": 0.5375413591244591,
3702
+ "grad_norm": 7.989354610443115,
3703
+ "learning_rate": 4.4550003869041845e-05,
3704
+ "loss": 2.3719,
3705
+ "step": 528
3706
+ },
3707
+ {
3708
+ "epoch": 0.5385594298803766,
3709
+ "grad_norm": 7.53865909576416,
3710
+ "learning_rate": 4.439037611231448e-05,
3711
+ "loss": 2.4358,
3712
+ "step": 529
3713
+ },
3714
+ {
3715
+ "epoch": 0.5395775006362942,
3716
+ "grad_norm": 9.102818489074707,
3717
+ "learning_rate": 4.423080623917012e-05,
3718
+ "loss": 3.0774,
3719
+ "step": 530
3720
+ },
3721
+ {
3722
+ "epoch": 0.5405955713922117,
3723
+ "grad_norm": 10.17009162902832,
3724
+ "learning_rate": 4.407129589614979e-05,
3725
+ "loss": 2.719,
3726
+ "step": 531
3727
+ },
3728
+ {
3729
+ "epoch": 0.5416136421481293,
3730
+ "grad_norm": 8.132767677307129,
3731
+ "learning_rate": 4.3911846729180335e-05,
3732
+ "loss": 2.6276,
3733
+ "step": 532
3734
+ },
3735
+ {
3736
+ "epoch": 0.5426317129040469,
3737
+ "grad_norm": 8.669943809509277,
3738
+ "learning_rate": 4.3752460383557195e-05,
3739
+ "loss": 2.2211,
3740
+ "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 0.5436497836599644,
3744
+ "grad_norm": 8.190427780151367,
3745
+ "learning_rate": 4.359313850392772e-05,
3746
+ "loss": 2.2451,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 0.544667854415882,
3751
+ "grad_norm": 7.185608386993408,
3752
+ "learning_rate": 4.3433882734274e-05,
3753
+ "loss": 1.938,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 0.5456859251717995,
3758
+ "grad_norm": 9.735365867614746,
3759
+ "learning_rate": 4.327469471789597e-05,
3760
+ "loss": 3.3738,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 0.546703995927717,
3765
+ "grad_norm": 9.06591796875,
3766
+ "learning_rate": 4.311557609739442e-05,
3767
+ "loss": 3.4894,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 0.5477220666836345,
3772
+ "grad_norm": 8.038829803466797,
3773
+ "learning_rate": 4.295652851465412e-05,
3774
+ "loss": 2.6487,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 0.5487401374395521,
3779
+ "grad_norm": 7.375051498413086,
3780
+ "learning_rate": 4.27975536108268e-05,
3781
+ "loss": 2.4853,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 0.5497582081954696,
3786
+ "grad_norm": 9.910839080810547,
3787
+ "learning_rate": 4.2638653026314224e-05,
3788
+ "loss": 3.1606,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 0.5507762789513871,
3793
+ "grad_norm": 7.77678918838501,
3794
+ "learning_rate": 4.24798284007513e-05,
3795
+ "loss": 2.33,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 0.5517943497073047,
3800
+ "grad_norm": 7.377612113952637,
3801
+ "learning_rate": 4.232108137298919e-05,
3802
+ "loss": 2.299,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 0.5528124204632222,
3807
+ "grad_norm": 9.510624885559082,
3808
+ "learning_rate": 4.216241358107831e-05,
3809
+ "loss": 2.8467,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 0.5538304912191397,
3814
+ "grad_norm": 6.834048748016357,
3815
+ "learning_rate": 4.200382666225141e-05,
3816
+ "loss": 2.0166,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 0.5548485619750573,
3821
+ "grad_norm": 8.245951652526855,
3822
+ "learning_rate": 4.1845322252906864e-05,
3823
+ "loss": 2.672,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 0.5558666327309748,
3828
+ "grad_norm": 7.539649963378906,
3829
+ "learning_rate": 4.16869019885916e-05,
3830
+ "loss": 2.3618,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 0.5568847034868923,
3835
+ "grad_norm": 7.983175754547119,
3836
+ "learning_rate": 4.152856750398426e-05,
3837
+ "loss": 2.2049,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 0.5579027742428099,
3842
+ "grad_norm": 8.641951560974121,
3843
+ "learning_rate": 4.1370320432878404e-05,
3844
+ "loss": 2.2235,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 0.5589208449987274,
3849
+ "grad_norm": 7.9181437492370605,
3850
+ "learning_rate": 4.1212162408165595e-05,
3851
+ "loss": 1.9295,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 0.5599389157546449,
3856
+ "grad_norm": 10.45153522491455,
3857
+ "learning_rate": 4.105409506181854e-05,
3858
+ "loss": 2.1553,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 0.5609569865105625,
3863
+ "grad_norm": 6.951171398162842,
3864
+ "learning_rate": 4.0896120024874286e-05,
3865
+ "loss": 2.5913,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 0.56197505726648,
3870
+ "grad_norm": 9.614657402038574,
3871
+ "learning_rate": 4.073823892741735e-05,
3872
+ "loss": 4.2435,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 0.5629931280223975,
3877
+ "grad_norm": 9.35623550415039,
3878
+ "learning_rate": 4.0580453398563e-05,
3879
+ "loss": 3.7123,
3880
+ "step": 553
3881
+ },
3882
+ {
3883
+ "epoch": 0.5640111987783151,
3884
+ "grad_norm": 10.756424903869629,
3885
+ "learning_rate": 4.042276506644024e-05,
3886
+ "loss": 3.6713,
3887
+ "step": 554
3888
+ },
3889
+ {
3890
+ "epoch": 0.5650292695342326,
3891
+ "grad_norm": 9.823023796081543,
3892
+ "learning_rate": 4.0265175558175265e-05,
3893
+ "loss": 3.7602,
3894
+ "step": 555
3895
+ },
3896
+ {
3897
+ "epoch": 0.5660473402901501,
3898
+ "grad_norm": 13.360715866088867,
3899
+ "learning_rate": 4.0107686499874465e-05,
3900
+ "loss": 3.269,
3901
+ "step": 556
3902
+ },
3903
+ {
3904
+ "epoch": 0.5670654110460677,
3905
+ "grad_norm": 14.194052696228027,
3906
+ "learning_rate": 3.9950299516607766e-05,
3907
+ "loss": 4.3906,
3908
+ "step": 557
3909
+ },
3910
+ {
3911
+ "epoch": 0.5680834818019852,
3912
+ "grad_norm": 16.591251373291016,
3913
+ "learning_rate": 3.979301623239177e-05,
3914
+ "loss": 4.4802,
3915
+ "step": 558
3916
+ },
3917
+ {
3918
+ "epoch": 0.5691015525579027,
3919
+ "grad_norm": 6.6096720695495605,
3920
+ "learning_rate": 3.9635838270173107e-05,
3921
+ "loss": 1.6842,
3922
+ "step": 559
3923
+ },
3924
+ {
3925
+ "epoch": 0.5701196233138203,
3926
+ "grad_norm": 6.252510070800781,
3927
+ "learning_rate": 3.94787672518116e-05,
3928
+ "loss": 1.6248,
3929
+ "step": 560
3930
+ },
3931
+ {
3932
+ "epoch": 0.5711376940697378,
3933
+ "grad_norm": 7.445550441741943,
3934
+ "learning_rate": 3.9321804798063565e-05,
3935
+ "loss": 1.7234,
3936
+ "step": 561
3937
+ },
3938
+ {
3939
+ "epoch": 0.5721557648256553,
3940
+ "grad_norm": 5.321173191070557,
3941
+ "learning_rate": 3.9164952528565057e-05,
3942
+ "loss": 1.2454,
3943
+ "step": 562
3944
+ },
3945
+ {
3946
+ "epoch": 0.5731738355815729,
3947
+ "grad_norm": 4.566540241241455,
3948
+ "learning_rate": 3.900821206181521e-05,
3949
+ "loss": 1.0588,
3950
+ "step": 563
3951
+ },
3952
+ {
3953
+ "epoch": 0.5741919063374905,
3954
+ "grad_norm": 8.349088668823242,
3955
+ "learning_rate": 3.8851585015159536e-05,
3956
+ "loss": 1.5751,
3957
+ "step": 564
3958
+ },
3959
+ {
3960
+ "epoch": 0.575209977093408,
3961
+ "grad_norm": 6.328129291534424,
3962
+ "learning_rate": 3.8695073004773106e-05,
3963
+ "loss": 2.025,
3964
+ "step": 565
3965
+ },
3966
+ {
3967
+ "epoch": 0.5762280478493256,
3968
+ "grad_norm": 8.211170196533203,
3969
+ "learning_rate": 3.8538677645644096e-05,
3970
+ "loss": 1.1548,
3971
+ "step": 566
3972
+ },
3973
+ {
3974
+ "epoch": 0.5772461186052431,
3975
+ "grad_norm": 5.518578052520752,
3976
+ "learning_rate": 3.838240055155692e-05,
3977
+ "loss": 1.2809,
3978
+ "step": 567
3979
+ },
3980
+ {
3981
+ "epoch": 0.5782641893611606,
3982
+ "grad_norm": 6.383520603179932,
3983
+ "learning_rate": 3.822624333507571e-05,
3984
+ "loss": 1.8485,
3985
+ "step": 568
3986
+ },
3987
+ {
3988
+ "epoch": 0.5792822601170782,
3989
+ "grad_norm": 5.425829887390137,
3990
+ "learning_rate": 3.8070207607527584e-05,
3991
+ "loss": 1.4567,
3992
+ "step": 569
3993
+ },
3994
+ {
3995
+ "epoch": 0.5803003308729957,
3996
+ "grad_norm": 8.478185653686523,
3997
+ "learning_rate": 3.791429497898608e-05,
3998
+ "loss": 2.0052,
3999
+ "step": 570
4000
+ },
4001
+ {
4002
+ "epoch": 0.5813184016289132,
4003
+ "grad_norm": 8.863068580627441,
4004
+ "learning_rate": 3.775850705825454e-05,
4005
+ "loss": 2.2554,
4006
+ "step": 571
4007
+ },
4008
+ {
4009
+ "epoch": 0.5823364723848308,
4010
+ "grad_norm": 5.8295183181762695,
4011
+ "learning_rate": 3.7602845452849463e-05,
4012
+ "loss": 1.2544,
4013
+ "step": 572
4014
+ },
4015
+ {
4016
+ "epoch": 0.5833545431407483,
4017
+ "grad_norm": 8.446788787841797,
4018
+ "learning_rate": 3.7447311768983964e-05,
4019
+ "loss": 2.4702,
4020
+ "step": 573
4021
+ },
4022
+ {
4023
+ "epoch": 0.5843726138966658,
4024
+ "grad_norm": 7.7443766593933105,
4025
+ "learning_rate": 3.7291907611551195e-05,
4026
+ "loss": 2.0707,
4027
+ "step": 574
4028
+ },
4029
+ {
4030
+ "epoch": 0.5853906846525834,
4031
+ "grad_norm": 8.347147941589355,
4032
+ "learning_rate": 3.713663458410779e-05,
4033
+ "loss": 1.659,
4034
+ "step": 575
4035
+ },
4036
+ {
4037
+ "epoch": 0.5864087554085009,
4038
+ "grad_norm": 7.487883567810059,
4039
+ "learning_rate": 3.69814942888572e-05,
4040
+ "loss": 2.0328,
4041
+ "step": 576
4042
+ },
4043
+ {
4044
+ "epoch": 0.5874268261644184,
4045
+ "grad_norm": 7.8575286865234375,
4046
+ "learning_rate": 3.682648832663339e-05,
4047
+ "loss": 1.8928,
4048
+ "step": 577
4049
+ },
4050
+ {
4051
+ "epoch": 0.588444896920336,
4052
+ "grad_norm": 8.947505950927734,
4053
+ "learning_rate": 3.6671618296884146e-05,
4054
+ "loss": 1.6774,
4055
+ "step": 578
4056
+ },
4057
+ {
4058
+ "epoch": 0.5894629676762535,
4059
+ "grad_norm": 5.097304821014404,
4060
+ "learning_rate": 3.6516885797654594e-05,
4061
+ "loss": 1.3306,
4062
+ "step": 579
4063
+ },
4064
+ {
4065
+ "epoch": 0.590481038432171,
4066
+ "grad_norm": 6.418907642364502,
4067
+ "learning_rate": 3.636229242557075e-05,
4068
+ "loss": 1.9186,
4069
+ "step": 580
4070
+ },
4071
+ {
4072
+ "epoch": 0.5914991091880886,
4073
+ "grad_norm": 7.3138346672058105,
4074
+ "learning_rate": 3.620783977582305e-05,
4075
+ "loss": 2.4993,
4076
+ "step": 581
4077
+ },
4078
+ {
4079
+ "epoch": 0.5925171799440061,
4080
+ "grad_norm": 7.914095878601074,
4081
+ "learning_rate": 3.605352944214986e-05,
4082
+ "loss": 2.078,
4083
+ "step": 582
4084
+ },
4085
+ {
4086
+ "epoch": 0.5935352506999236,
4087
+ "grad_norm": 10.451981544494629,
4088
+ "learning_rate": 3.5899363016821e-05,
4089
+ "loss": 2.5348,
4090
+ "step": 583
4091
+ },
4092
+ {
4093
+ "epoch": 0.5945533214558412,
4094
+ "grad_norm": 6.191624164581299,
4095
+ "learning_rate": 3.5745342090621405e-05,
4096
+ "loss": 1.6607,
4097
+ "step": 584
4098
+ },
4099
+ {
4100
+ "epoch": 0.5955713922117587,
4101
+ "grad_norm": 7.947683811187744,
4102
+ "learning_rate": 3.559146825283465e-05,
4103
+ "loss": 2.4664,
4104
+ "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 0.5965894629676762,
4108
+ "grad_norm": 7.410199165344238,
4109
+ "learning_rate": 3.5437743091226565e-05,
4110
+ "loss": 2.0212,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 0.5976075337235938,
4115
+ "grad_norm": 8.705409049987793,
4116
+ "learning_rate": 3.528416819202881e-05,
4117
+ "loss": 2.2274,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 0.5986256044795113,
4122
+ "grad_norm": 7.487548351287842,
4123
+ "learning_rate": 3.5130745139922574e-05,
4124
+ "loss": 2.104,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 0.5996436752354288,
4129
+ "grad_norm": 8.788456916809082,
4130
+ "learning_rate": 3.497747551802221e-05,
4131
+ "loss": 2.5106,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 0.6006617459913464,
4136
+ "grad_norm": 7.41387939453125,
4137
+ "learning_rate": 3.482436090785882e-05,
4138
+ "loss": 2.1219,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 0.6016798167472639,
4143
+ "grad_norm": 6.481340408325195,
4144
+ "learning_rate": 3.467140288936407e-05,
4145
+ "loss": 1.9451,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 0.6026978875031814,
4150
+ "grad_norm": 7.278069496154785,
4151
+ "learning_rate": 3.451860304085378e-05,
4152
+ "loss": 1.8661,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 0.603715958259099,
4157
+ "grad_norm": 8.016121864318848,
4158
+ "learning_rate": 3.43659629390117e-05,
4159
+ "loss": 1.9884,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 0.6047340290150165,
4164
+ "grad_norm": 8.917866706848145,
4165
+ "learning_rate": 3.421348415887315e-05,
4166
+ "loss": 2.6266,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 0.605752099770934,
4171
+ "grad_norm": 9.271273612976074,
4172
+ "learning_rate": 3.406116827380889e-05,
4173
+ "loss": 2.6668,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 0.6067701705268517,
4178
+ "grad_norm": 7.660860061645508,
4179
+ "learning_rate": 3.390901685550887e-05,
4180
+ "loss": 2.373,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 0.6077882412827692,
4185
+ "grad_norm": 7.496829032897949,
4186
+ "learning_rate": 3.375703147396583e-05,
4187
+ "loss": 2.137,
4188
+ "step": 597
4189
+ },
4190
+ {
4191
+ "epoch": 0.6088063120386867,
4192
+ "grad_norm": 10.63588809967041,
4193
+ "learning_rate": 3.360521369745937e-05,
4194
+ "loss": 2.0113,
4195
+ "step": 598
4196
+ },
4197
+ {
4198
+ "epoch": 0.6098243827946043,
4199
+ "grad_norm": 8.661003112792969,
4200
+ "learning_rate": 3.345356509253959e-05,
4201
+ "loss": 2.202,
4202
+ "step": 599
4203
+ },
4204
+ {
4205
+ "epoch": 0.6108424535505218,
4206
+ "grad_norm": 6.928518295288086,
4207
+ "learning_rate": 3.330208722401097e-05,
4208
+ "loss": 1.6603,
4209
+ "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 0.6118605243064393,
4213
+ "grad_norm": 5.956086158752441,
4214
+ "learning_rate": 3.315078165491622e-05,
4215
+ "loss": 2.2319,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 0.6128785950623569,
4220
+ "grad_norm": 9.131757736206055,
4221
+ "learning_rate": 3.2999649946520174e-05,
4222
+ "loss": 3.3601,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 0.6138966658182744,
4227
+ "grad_norm": 8.110289573669434,
4228
+ "learning_rate": 3.2848693658293675e-05,
4229
+ "loss": 2.8758,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 0.6149147365741919,
4234
+ "grad_norm": 11.287444114685059,
4235
+ "learning_rate": 3.2697914347897406e-05,
4236
+ "loss": 4.129,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 0.6159328073301095,
4241
+ "grad_norm": 10.69924259185791,
4242
+ "learning_rate": 3.254731357116597e-05,
4243
+ "loss": 4.2776,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 0.616950878086027,
4248
+ "grad_norm": 9.89280891418457,
4249
+ "learning_rate": 3.239689288209168e-05,
4250
+ "loss": 3.1346,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 0.6179689488419445,
4255
+ "grad_norm": 11.832335472106934,
4256
+ "learning_rate": 3.224665383280867e-05,
4257
+ "loss": 3.4148,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 0.6189870195978621,
4262
+ "grad_norm": 13.277129173278809,
4263
+ "learning_rate": 3.2096597973576694e-05,
4264
+ "loss": 3.4906,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 0.6200050903537796,
4269
+ "grad_norm": 6.8787994384765625,
4270
+ "learning_rate": 3.194672685276532e-05,
4271
+ "loss": 1.4383,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 0.6210231611096971,
4276
+ "grad_norm": 5.783747673034668,
4277
+ "learning_rate": 3.179704201683786e-05,
4278
+ "loss": 1.3518,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 0.6220412318656147,
4283
+ "grad_norm": 5.462782859802246,
4284
+ "learning_rate": 3.16475450103354e-05,
4285
+ "loss": 1.249,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 0.6230593026215322,
4290
+ "grad_norm": 5.050539016723633,
4291
+ "learning_rate": 3.1498237375860886e-05,
4292
+ "loss": 1.1348,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 0.6240773733774497,
4297
+ "grad_norm": 8.341720581054688,
4298
+ "learning_rate": 3.1349120654063225e-05,
4299
+ "loss": 1.7345,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 0.6250954441333673,
4304
+ "grad_norm": 4.832444190979004,
4305
+ "learning_rate": 3.120019638362136e-05,
4306
+ "loss": 1.0501,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 0.6261135148892848,
4311
+ "grad_norm": 7.373495578765869,
4312
+ "learning_rate": 3.1051466101228385e-05,
4313
+ "loss": 1.7428,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 0.6271315856452023,
4318
+ "grad_norm": 5.6345319747924805,
4319
+ "learning_rate": 3.090293134157572e-05,
4320
+ "loss": 1.2435,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 0.6281496564011199,
4325
+ "grad_norm": 6.5224609375,
4326
+ "learning_rate": 3.0754593637337276e-05,
4327
+ "loss": 1.4176,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 0.6291677271570374,
4332
+ "grad_norm": 8.80791187286377,
4333
+ "learning_rate": 3.06064545191536e-05,
4334
+ "loss": 2.4285,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 0.6301857979129549,
4339
+ "grad_norm": 9.331201553344727,
4340
+ "learning_rate": 3.0458515515616115e-05,
4341
+ "loss": 2.7192,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 0.6312038686688725,
4346
+ "grad_norm": 9.033586502075195,
4347
+ "learning_rate": 3.0310778153251324e-05,
4348
+ "loss": 1.8652,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 0.63222193942479,
4353
+ "grad_norm": 6.689144134521484,
4354
+ "learning_rate": 3.0163243956505095e-05,
4355
+ "loss": 1.5773,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 0.6332400101807075,
4360
+ "grad_norm": 8.037043571472168,
4361
+ "learning_rate": 3.0015914447726867e-05,
4362
+ "loss": 2.3296,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 0.6342580809366251,
4367
+ "grad_norm": 7.927774906158447,
4368
+ "learning_rate": 2.986879114715403e-05,
4369
+ "loss": 2.2707,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 0.6352761516925426,
4374
+ "grad_norm": 5.514461994171143,
4375
+ "learning_rate": 2.9721875572896157e-05,
4376
+ "loss": 1.7974,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 0.6362942224484601,
4381
+ "grad_norm": 7.439801216125488,
4382
+ "learning_rate": 2.95751692409194e-05,
4383
+ "loss": 2.1823,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 0.6373122932043777,
4388
+ "grad_norm": 7.419183731079102,
4389
+ "learning_rate": 2.942867366503077e-05,
4390
+ "loss": 2.1965,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 0.6383303639602952,
4395
+ "grad_norm": 5.545042037963867,
4396
+ "learning_rate": 2.9282390356862606e-05,
4397
+ "loss": 1.4957,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 0.6393484347162128,
4402
+ "grad_norm": 11.62447738647461,
4403
+ "learning_rate": 2.9136320825856967e-05,
4404
+ "loss": 3.3109,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 0.6403665054721304,
4409
+ "grad_norm": 8.367134094238281,
4410
+ "learning_rate": 2.899046657924992e-05,
4411
+ "loss": 2.2194,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 0.6413845762280479,
4416
+ "grad_norm": 10.391725540161133,
4417
+ "learning_rate": 2.884482912205621e-05,
4418
+ "loss": 2.0195,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 0.6424026469839654,
4423
+ "grad_norm": 8.217406272888184,
4424
+ "learning_rate": 2.8699409957053535e-05,
4425
+ "loss": 2.4132,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 0.643420717739883,
4430
+ "grad_norm": 8.29297161102295,
4431
+ "learning_rate": 2.855421058476719e-05,
4432
+ "loss": 2.4454,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 0.6444387884958005,
4437
+ "grad_norm": 8.815670013427734,
4438
+ "learning_rate": 2.840923250345442e-05,
4439
+ "loss": 2.5413,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 0.645456859251718,
4444
+ "grad_norm": 8.5559720993042,
4445
+ "learning_rate": 2.8264477209089145e-05,
4446
+ "loss": 2.7664,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 0.6464749300076356,
4451
+ "grad_norm": 8.682782173156738,
4452
+ "learning_rate": 2.8119946195346375e-05,
4453
+ "loss": 2.5312,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 0.6474930007635531,
4458
+ "grad_norm": 11.519887924194336,
4459
+ "learning_rate": 2.7975640953586846e-05,
4460
+ "loss": 2.9688,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 0.6485110715194706,
4465
+ "grad_norm": 8.966607093811035,
4466
+ "learning_rate": 2.7831562972841696e-05,
4467
+ "loss": 2.7022,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 0.6495291422753882,
4472
+ "grad_norm": 8.183965682983398,
4473
+ "learning_rate": 2.768771373979697e-05,
4474
+ "loss": 2.3317,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 0.6505472130313057,
4479
+ "grad_norm": 8.993667602539062,
4480
+ "learning_rate": 2.7544094738778436e-05,
4481
+ "loss": 2.7296,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 0.6515652837872232,
4486
+ "grad_norm": 7.731354713439941,
4487
+ "learning_rate": 2.74007074517361e-05,
4488
+ "loss": 1.9501,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 0.6525833545431408,
4493
+ "grad_norm": 6.967146396636963,
4494
+ "learning_rate": 2.7257553358229034e-05,
4495
+ "loss": 1.8838,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 0.6536014252990583,
4500
+ "grad_norm": 6.557554244995117,
4501
+ "learning_rate": 2.7114633935410085e-05,
4502
+ "loss": 1.7431,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 0.6546194960549758,
4507
+ "grad_norm": 10.207218170166016,
4508
+ "learning_rate": 2.6971950658010666e-05,
4509
+ "loss": 2.4966,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 0.6556375668108934,
4514
+ "grad_norm": 7.477417469024658,
4515
+ "learning_rate": 2.682950499832535e-05,
4516
+ "loss": 2.1944,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 0.6566556375668109,
4521
+ "grad_norm": 10.127610206604004,
4522
+ "learning_rate": 2.6687298426196973e-05,
4523
+ "loss": 2.6473,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 0.6576737083227284,
4528
+ "grad_norm": 6.374731540679932,
4529
+ "learning_rate": 2.6545332409001265e-05,
4530
+ "loss": 1.8528,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 0.658691779078646,
4535
+ "grad_norm": 6.7048444747924805,
4536
+ "learning_rate": 2.6403608411631742e-05,
4537
+ "loss": 1.7493,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 0.6597098498345635,
4542
+ "grad_norm": 7.112037181854248,
4543
+ "learning_rate": 2.6262127896484602e-05,
4544
+ "loss": 2.0421,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 0.660727920590481,
4549
+ "grad_norm": 8.483193397521973,
4550
+ "learning_rate": 2.612089232344371e-05,
4551
+ "loss": 1.91,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 0.6617459913463986,
4556
+ "grad_norm": 10.052485466003418,
4557
+ "learning_rate": 2.5979903149865387e-05,
4558
+ "loss": 2.0998,
4559
+ "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 0.6627640621023161,
4563
+ "grad_norm": 8.01032543182373,
4564
+ "learning_rate": 2.5839161830563474e-05,
4565
+ "loss": 2.5145,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 0.6637821328582336,
4570
+ "grad_norm": 9.746928215026855,
4571
+ "learning_rate": 2.569866981779433e-05,
4572
+ "loss": 3.3683,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 0.6648002036141512,
4577
+ "grad_norm": 8.607123374938965,
4578
+ "learning_rate": 2.555842856124182e-05,
4579
+ "loss": 2.9144,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 0.6658182743700687,
4584
+ "grad_norm": 10.463346481323242,
4585
+ "learning_rate": 2.5418439508002258e-05,
4586
+ "loss": 3.9062,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 0.6668363451259862,
4591
+ "grad_norm": 9.336942672729492,
4592
+ "learning_rate": 2.5278704102569662e-05,
4593
+ "loss": 3.3966,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 0.6678544158819038,
4598
+ "grad_norm": 10.415209770202637,
4599
+ "learning_rate": 2.5139223786820747e-05,
4600
+ "loss": 3.7271,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 0.6688724866378213,
4605
+ "grad_norm": 14.631210327148438,
4606
+ "learning_rate": 2.500000000000001e-05,
4607
+ "loss": 3.7071,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 0.6698905573937388,
4612
+ "grad_norm": 13.001562118530273,
4613
+ "learning_rate": 2.486103417870493e-05,
4614
+ "loss": 3.214,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 0.6709086281496563,
4619
+ "grad_norm": 11.307893753051758,
4620
+ "learning_rate": 2.472232775687119e-05,
4621
+ "loss": 2.8893,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 0.6719266989055739,
4626
+ "grad_norm": 7.8647379875183105,
4627
+ "learning_rate": 2.4583882165757766e-05,
4628
+ "loss": 2.0442,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 0.6729447696614915,
4633
+ "grad_norm": 5.790807247161865,
4634
+ "learning_rate": 2.4445698833932234e-05,
4635
+ "loss": 1.3228,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 0.6739628404174091,
4640
+ "grad_norm": 5.694929599761963,
4641
+ "learning_rate": 2.4307779187256064e-05,
4642
+ "loss": 1.3618,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 0.6749809111733266,
4647
+ "grad_norm": 5.114007949829102,
4648
+ "learning_rate": 2.417012464886978e-05,
4649
+ "loss": 1.2137,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 0.6759989819292441,
4654
+ "grad_norm": 7.429940223693848,
4655
+ "learning_rate": 2.4032736639178444e-05,
4656
+ "loss": 1.8593,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 0.6770170526851617,
4661
+ "grad_norm": 5.101173400878906,
4662
+ "learning_rate": 2.389561657583681e-05,
4663
+ "loss": 0.9669,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 0.6780351234410792,
4668
+ "grad_norm": 7.89351224899292,
4669
+ "learning_rate": 2.3758765873734896e-05,
4670
+ "loss": 1.8615,
4671
+ "step": 666
4672
+ },
4673
+ {
4674
+ "epoch": 0.6790531941969967,
4675
+ "grad_norm": 7.043496608734131,
4676
+ "learning_rate": 2.3622185944983188e-05,
4677
+ "loss": 1.7828,
4678
+ "step": 667
4679
+ },
4680
+ {
4681
+ "epoch": 0.6800712649529143,
4682
+ "grad_norm": 7.9154510498046875,
4683
+ "learning_rate": 2.3485878198898252e-05,
4684
+ "loss": 2.2469,
4685
+ "step": 668
4686
+ },
4687
+ {
4688
+ "epoch": 0.6810893357088318,
4689
+ "grad_norm": 6.627047061920166,
4690
+ "learning_rate": 2.3349844041988045e-05,
4691
+ "loss": 1.5789,
4692
+ "step": 669
4693
+ },
4694
+ {
4695
+ "epoch": 0.6821074064647493,
4696
+ "grad_norm": 5.884915828704834,
4697
+ "learning_rate": 2.3214084877937464e-05,
4698
+ "loss": 1.5281,
4699
+ "step": 670
4700
+ },
4701
+ {
4702
+ "epoch": 0.6831254772206669,
4703
+ "grad_norm": 6.640014171600342,
4704
+ "learning_rate": 2.30786021075939e-05,
4705
+ "loss": 1.4942,
4706
+ "step": 671
4707
+ },
4708
+ {
4709
+ "epoch": 0.6841435479765844,
4710
+ "grad_norm": 6.866456985473633,
4711
+ "learning_rate": 2.294339712895271e-05,
4712
+ "loss": 1.674,
4713
+ "step": 672
4714
+ },
4715
+ {
4716
+ "epoch": 0.6851616187325019,
4717
+ "grad_norm": 6.7534990310668945,
4718
+ "learning_rate": 2.28084713371428e-05,
4719
+ "loss": 1.3313,
4720
+ "step": 673
4721
+ },
4722
+ {
4723
+ "epoch": 0.6861796894884195,
4724
+ "grad_norm": 6.38292121887207,
4725
+ "learning_rate": 2.2673826124412312e-05,
4726
+ "loss": 1.6016,
4727
+ "step": 674
4728
+ },
4729
+ {
4730
+ "epoch": 0.687197760244337,
4731
+ "grad_norm": 7.129096031188965,
4732
+ "learning_rate": 2.2539462880114194e-05,
4733
+ "loss": 1.8662,
4734
+ "step": 675
4735
+ },
4736
+ {
4737
+ "epoch": 0.6882158310002545,
4738
+ "grad_norm": 6.555764675140381,
4739
+ "learning_rate": 2.240538299069178e-05,
4740
+ "loss": 1.9315,
4741
+ "step": 676
4742
+ },
4743
+ {
4744
+ "epoch": 0.689233901756172,
4745
+ "grad_norm": 5.772182941436768,
4746
+ "learning_rate": 2.2271587839664672e-05,
4747
+ "loss": 1.3156,
4748
+ "step": 677
4749
+ },
4750
+ {
4751
+ "epoch": 0.6902519725120896,
4752
+ "grad_norm": 7.608791351318359,
4753
+ "learning_rate": 2.213807880761434e-05,
4754
+ "loss": 1.9463,
4755
+ "step": 678
4756
+ },
4757
+ {
4758
+ "epoch": 0.6912700432680071,
4759
+ "grad_norm": 7.279063701629639,
4760
+ "learning_rate": 2.2004857272169876e-05,
4761
+ "loss": 1.9304,
4762
+ "step": 679
4763
+ },
4764
+ {
4765
+ "epoch": 0.6922881140239247,
4766
+ "grad_norm": 9.676162719726562,
4767
+ "learning_rate": 2.1871924607993797e-05,
4768
+ "loss": 2.3767,
4769
+ "step": 680
4770
+ },
4771
+ {
4772
+ "epoch": 0.6933061847798422,
4773
+ "grad_norm": 7.1779093742370605,
4774
+ "learning_rate": 2.1739282186767923e-05,
4775
+ "loss": 1.6381,
4776
+ "step": 681
4777
+ },
4778
+ {
4779
+ "epoch": 0.6943242555357597,
4780
+ "grad_norm": 6.892930030822754,
4781
+ "learning_rate": 2.160693137717912e-05,
4782
+ "loss": 2.134,
4783
+ "step": 682
4784
+ },
4785
+ {
4786
+ "epoch": 0.6953423262916772,
4787
+ "grad_norm": 9.403331756591797,
4788
+ "learning_rate": 2.1474873544905205e-05,
4789
+ "loss": 2.2294,
4790
+ "step": 683
4791
+ },
4792
+ {
4793
+ "epoch": 0.6963603970475948,
4794
+ "grad_norm": 7.7654595375061035,
4795
+ "learning_rate": 2.134311005260093e-05,
4796
+ "loss": 2.0953,
4797
+ "step": 684
4798
+ },
4799
+ {
4800
+ "epoch": 0.6973784678035123,
4801
+ "grad_norm": 10.087757110595703,
4802
+ "learning_rate": 2.1211642259883867e-05,
4803
+ "loss": 2.9221,
4804
+ "step": 685
4805
+ },
4806
+ {
4807
+ "epoch": 0.6983965385594298,
4808
+ "grad_norm": 8.816588401794434,
4809
+ "learning_rate": 2.108047152332028e-05,
4810
+ "loss": 2.6949,
4811
+ "step": 686
4812
+ },
4813
+ {
4814
+ "epoch": 0.6994146093153474,
4815
+ "grad_norm": 8.12427043914795,
4816
+ "learning_rate": 2.0949599196411325e-05,
4817
+ "loss": 1.7944,
4818
+ "step": 687
4819
+ },
4820
+ {
4821
+ "epoch": 0.7004326800712649,
4822
+ "grad_norm": 7.3718461990356445,
4823
+ "learning_rate": 2.0819026629578952e-05,
4824
+ "loss": 2.1142,
4825
+ "step": 688
4826
+ },
4827
+ {
4828
+ "epoch": 0.7014507508271824,
4829
+ "grad_norm": 7.3536577224731445,
4830
+ "learning_rate": 2.0688755170151996e-05,
4831
+ "loss": 2.0029,
4832
+ "step": 689
4833
+ },
4834
+ {
4835
+ "epoch": 0.7024688215831,
4836
+ "grad_norm": 8.220134735107422,
4837
+ "learning_rate": 2.0558786162352244e-05,
4838
+ "loss": 2.2986,
4839
+ "step": 690
4840
+ },
4841
+ {
4842
+ "epoch": 0.7034868923390175,
4843
+ "grad_norm": 9.169322967529297,
4844
+ "learning_rate": 2.0429120947280678e-05,
4845
+ "loss": 2.3455,
4846
+ "step": 691
4847
+ },
4848
+ {
4849
+ "epoch": 0.704504963094935,
4850
+ "grad_norm": 8.935730934143066,
4851
+ "learning_rate": 2.029976086290347e-05,
4852
+ "loss": 2.1588,
4853
+ "step": 692
4854
+ },
4855
+ {
4856
+ "epoch": 0.7055230338508527,
4857
+ "grad_norm": 7.555604934692383,
4858
+ "learning_rate": 2.017070724403835e-05,
4859
+ "loss": 2.2783,
4860
+ "step": 693
4861
+ },
4862
+ {
4863
+ "epoch": 0.7065411046067702,
4864
+ "grad_norm": 7.896771430969238,
4865
+ "learning_rate": 2.0041961422340676e-05,
4866
+ "loss": 1.8964,
4867
+ "step": 694
4868
+ },
4869
+ {
4870
+ "epoch": 0.7075591753626878,
4871
+ "grad_norm": 8.242528915405273,
4872
+ "learning_rate": 1.9913524726289784e-05,
4873
+ "loss": 1.9936,
4874
+ "step": 695
4875
+ },
4876
+ {
4877
+ "epoch": 0.7085772461186053,
4878
+ "grad_norm": 7.946272373199463,
4879
+ "learning_rate": 1.9785398481175294e-05,
4880
+ "loss": 2.1526,
4881
+ "step": 696
4882
+ },
4883
+ {
4884
+ "epoch": 0.7095953168745228,
4885
+ "grad_norm": 8.382307052612305,
4886
+ "learning_rate": 1.965758400908334e-05,
4887
+ "loss": 2.4691,
4888
+ "step": 697
4889
+ },
4890
+ {
4891
+ "epoch": 0.7106133876304404,
4892
+ "grad_norm": 6.839285373687744,
4893
+ "learning_rate": 1.9530082628883056e-05,
4894
+ "loss": 1.7924,
4895
+ "step": 698
4896
+ },
4897
+ {
4898
+ "epoch": 0.7116314583863579,
4899
+ "grad_norm": 12.65297794342041,
4900
+ "learning_rate": 1.9402895656212833e-05,
4901
+ "loss": 2.0093,
4902
+ "step": 699
4903
+ },
4904
+ {
4905
+ "epoch": 0.7126495291422754,
4906
+ "grad_norm": 11.35102653503418,
4907
+ "learning_rate": 1.927602440346687e-05,
4908
+ "loss": 1.7963,
4909
+ "step": 700
4910
+ },
4911
+ {
4912
+ "epoch": 0.713667599898193,
4913
+ "grad_norm": 7.479799747467041,
4914
+ "learning_rate": 1.914947017978153e-05,
4915
+ "loss": 3.4169,
4916
+ "step": 701
4917
+ },
4918
+ {
4919
+ "epoch": 0.7146856706541105,
4920
+ "grad_norm": 9.703947067260742,
4921
+ "learning_rate": 1.9023234291021873e-05,
4922
+ "loss": 2.8178,
4923
+ "step": 702
4924
+ },
4925
+ {
4926
+ "epoch": 0.715703741410028,
4927
+ "grad_norm": 10.218291282653809,
4928
+ "learning_rate": 1.889731803976822e-05,
4929
+ "loss": 2.841,
4930
+ "step": 703
4931
+ },
4932
+ {
4933
+ "epoch": 0.7167218121659455,
4934
+ "grad_norm": 12.210125923156738,
4935
+ "learning_rate": 1.8771722725302643e-05,
4936
+ "loss": 3.9947,
4937
+ "step": 704
4938
+ },
4939
+ {
4940
+ "epoch": 0.7177398829218631,
4941
+ "grad_norm": 9.851053237915039,
4942
+ "learning_rate": 1.8646449643595565e-05,
4943
+ "loss": 2.8836,
4944
+ "step": 705
4945
+ },
4946
+ {
4947
+ "epoch": 0.7187579536777806,
4948
+ "grad_norm": 11.182621955871582,
4949
+ "learning_rate": 1.8521500087292467e-05,
4950
+ "loss": 3.2881,
4951
+ "step": 706
4952
+ },
4953
+ {
4954
+ "epoch": 0.7197760244336981,
4955
+ "grad_norm": 16.472837448120117,
4956
+ "learning_rate": 1.8396875345700497e-05,
4957
+ "loss": 3.6782,
4958
+ "step": 707
4959
+ },
4960
+ {
4961
+ "epoch": 0.7207940951896157,
4962
+ "grad_norm": 13.632477760314941,
4963
+ "learning_rate": 1.8272576704775074e-05,
4964
+ "loss": 3.5599,
4965
+ "step": 708
4966
+ },
4967
+ {
4968
+ "epoch": 0.7218121659455332,
4969
+ "grad_norm": 8.531991958618164,
4970
+ "learning_rate": 1.8148605447106797e-05,
4971
+ "loss": 1.815,
4972
+ "step": 709
4973
+ },
4974
+ {
4975
+ "epoch": 0.7228302367014507,
4976
+ "grad_norm": 6.116468906402588,
4977
+ "learning_rate": 1.8024962851908107e-05,
4978
+ "loss": 1.3279,
4979
+ "step": 710
4980
+ },
4981
+ {
4982
+ "epoch": 0.7238483074573683,
4983
+ "grad_norm": 6.058359622955322,
4984
+ "learning_rate": 1.7901650195000068e-05,
4985
+ "loss": 1.1209,
4986
+ "step": 711
4987
+ },
4988
+ {
4989
+ "epoch": 0.7248663782132858,
4990
+ "grad_norm": 7.301308631896973,
4991
+ "learning_rate": 1.7778668748799242e-05,
4992
+ "loss": 1.6941,
4993
+ "step": 712
4994
+ },
4995
+ {
4996
+ "epoch": 0.7258844489692033,
4997
+ "grad_norm": 6.059625148773193,
4998
+ "learning_rate": 1.76560197823046e-05,
4999
+ "loss": 1.4134,
5000
+ "step": 713
5001
+ },
5002
+ {
5003
+ "epoch": 0.7269025197251209,
5004
+ "grad_norm": 5.40415620803833,
5005
+ "learning_rate": 1.753370456108433e-05,
5006
+ "loss": 1.5117,
5007
+ "step": 714
5008
+ },
5009
+ {
5010
+ "epoch": 0.7279205904810384,
5011
+ "grad_norm": 6.5403008460998535,
5012
+ "learning_rate": 1.7411724347262824e-05,
5013
+ "loss": 1.397,
5014
+ "step": 715
5015
+ },
5016
+ {
5017
+ "epoch": 0.7289386612369559,
5018
+ "grad_norm": 8.339217185974121,
5019
+ "learning_rate": 1.729008039950772e-05,
5020
+ "loss": 1.5315,
5021
+ "step": 716
5022
+ },
5023
+ {
5024
+ "epoch": 0.7299567319928735,
5025
+ "grad_norm": 5.882655620574951,
5026
+ "learning_rate": 1.7168773973016776e-05,
5027
+ "loss": 1.1574,
5028
+ "step": 717
5029
+ },
5030
+ {
5031
+ "epoch": 0.730974802748791,
5032
+ "grad_norm": 6.183307647705078,
5033
+ "learning_rate": 1.7047806319505076e-05,
5034
+ "loss": 1.3367,
5035
+ "step": 718
5036
+ },
5037
+ {
5038
+ "epoch": 0.7319928735047085,
5039
+ "grad_norm": 6.28183126449585,
5040
+ "learning_rate": 1.692717868719195e-05,
5041
+ "loss": 1.5637,
5042
+ "step": 719
5043
+ },
5044
+ {
5045
+ "epoch": 0.7330109442606261,
5046
+ "grad_norm": 4.728903293609619,
5047
+ "learning_rate": 1.680689232078827e-05,
5048
+ "loss": 1.4179,
5049
+ "step": 720
5050
+ },
5051
+ {
5052
+ "epoch": 0.7340290150165436,
5053
+ "grad_norm": 6.95587158203125,
5054
+ "learning_rate": 1.668694846148343e-05,
5055
+ "loss": 1.7837,
5056
+ "step": 721
5057
+ },
5058
+ {
5059
+ "epoch": 0.7350470857724611,
5060
+ "grad_norm": 5.531774997711182,
5061
+ "learning_rate": 1.6567348346932658e-05,
5062
+ "loss": 1.2069,
5063
+ "step": 722
5064
+ },
5065
+ {
5066
+ "epoch": 0.7360651565283787,
5067
+ "grad_norm": 5.498968601226807,
5068
+ "learning_rate": 1.644809321124423e-05,
5069
+ "loss": 1.1316,
5070
+ "step": 723
5071
+ },
5072
+ {
5073
+ "epoch": 0.7370832272842962,
5074
+ "grad_norm": 7.1133809089660645,
5075
+ "learning_rate": 1.6329184284966677e-05,
5076
+ "loss": 2.0335,
5077
+ "step": 724
5078
+ },
5079
+ {
5080
+ "epoch": 0.7381012980402138,
5081
+ "grad_norm": 6.765145301818848,
5082
+ "learning_rate": 1.621062279507617e-05,
5083
+ "loss": 2.0067,
5084
+ "step": 725
5085
+ },
5086
+ {
5087
+ "epoch": 0.7391193687961314,
5088
+ "grad_norm": 7.21923828125,
5089
+ "learning_rate": 1.609240996496378e-05,
5090
+ "loss": 2.2922,
5091
+ "step": 726
5092
+ },
5093
+ {
5094
+ "epoch": 0.7401374395520489,
5095
+ "grad_norm": 5.8889360427856445,
5096
+ "learning_rate": 1.597454701442288e-05,
5097
+ "loss": 1.6363,
5098
+ "step": 727
5099
+ },
5100
+ {
5101
+ "epoch": 0.7411555103079664,
5102
+ "grad_norm": 8.041604042053223,
5103
+ "learning_rate": 1.5857035159636623e-05,
5104
+ "loss": 1.6933,
5105
+ "step": 728
5106
+ },
5107
+ {
5108
+ "epoch": 0.742173581063884,
5109
+ "grad_norm": 7.711045742034912,
5110
+ "learning_rate": 1.5739875613165283e-05,
5111
+ "loss": 1.9258,
5112
+ "step": 729
5113
+ },
5114
+ {
5115
+ "epoch": 0.7431916518198015,
5116
+ "grad_norm": 7.747977256774902,
5117
+ "learning_rate": 1.5623069583933836e-05,
5118
+ "loss": 2.5273,
5119
+ "step": 730
5120
+ },
5121
+ {
5122
+ "epoch": 0.744209722575719,
5123
+ "grad_norm": 8.055684089660645,
5124
+ "learning_rate": 1.550661827721941e-05,
5125
+ "loss": 2.0398,
5126
+ "step": 731
5127
+ },
5128
+ {
5129
+ "epoch": 0.7452277933316366,
5130
+ "grad_norm": 8.75759220123291,
5131
+ "learning_rate": 1.5390522894638938e-05,
5132
+ "loss": 2.5372,
5133
+ "step": 732
5134
+ },
5135
+ {
5136
+ "epoch": 0.7462458640875541,
5137
+ "grad_norm": 6.629666805267334,
5138
+ "learning_rate": 1.527478463413666e-05,
5139
+ "loss": 1.8586,
5140
+ "step": 733
5141
+ },
5142
+ {
5143
+ "epoch": 0.7472639348434716,
5144
+ "grad_norm": 7.634647369384766,
5145
+ "learning_rate": 1.5159404689971795e-05,
5146
+ "loss": 1.7609,
5147
+ "step": 734
5148
+ },
5149
+ {
5150
+ "epoch": 0.7482820055993892,
5151
+ "grad_norm": 8.821757316589355,
5152
+ "learning_rate": 1.5044384252706312e-05,
5153
+ "loss": 2.5073,
5154
+ "step": 735
5155
+ },
5156
+ {
5157
+ "epoch": 0.7493000763553067,
5158
+ "grad_norm": 7.940456867218018,
5159
+ "learning_rate": 1.4929724509192488e-05,
5160
+ "loss": 2.6403,
5161
+ "step": 736
5162
+ },
5163
+ {
5164
+ "epoch": 0.7503181471112242,
5165
+ "grad_norm": 7.819153308868408,
5166
+ "learning_rate": 1.4815426642560754e-05,
5167
+ "loss": 2.3173,
5168
+ "step": 737
5169
+ },
5170
+ {
5171
+ "epoch": 0.7513362178671418,
5172
+ "grad_norm": 7.586490154266357,
5173
+ "learning_rate": 1.470149183220748e-05,
5174
+ "loss": 2.0191,
5175
+ "step": 738
5176
  }
5177
  ],
5178
  "logging_steps": 1,
 
5192
  "attributes": {}
5193
  }
5194
  },
5195
+ "total_flos": 6.046717707288576e+17,
5196
  "train_batch_size": 4,
5197
  "trial_name": null,
5198
  "trial_params": null