antonpolishko commited on
Commit
84f7726
·
verified ·
1 Parent(s): 4e35a22

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11cca19a8bc1b40f8d1f067fb7f18195ad528c0f2dbd848842217ee82d84c32f
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b41c2bc8462b0e47f49b76757e3dfb6d9da0ffef646532492049087d75df0804
3
  size 4976698672
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb82134e45554a9af53f18575ba70ce59cd8f0da61b9efc67d46b1f9f9e420fc
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62173b4011c396c0ad68343f75580a3747fb6885e2d93d0199c4a9f116f5dfbc
3
  size 4999802720
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f32423dd2220104f8666b0a19da4851a2c8edcd9addad96b84ac3600e553006c
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a39aca3f3f901b09c08f052df0856befa492d5a02e4c71de99d616fd45ad76c
3
  size 4915916176
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:537e35f4de984baa9421f75bdb506dd795a0842603b6942f26edf94f8785ba18
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37c6b53f424ac34e0041d6ce2645b9bc702f28c76a993636e35e43b49f969acc
3
  size 1168138808
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a6409b691acfd9602ca4eca918d1c449edef044225fb699dc875f3e4361f191
3
  size 32121299754
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d9ea41c947ef3420f6dd11ecace6682e7d5ffd8bab60ddb1dd5b41723b1ae75
3
  size 32121299754
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac1e9f2863e336135723663af33b8e751efacc94ddbc0f8e24b38d798ef74e64
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0b8bb24713451d2d82dd1f621f66d6e162f73bb6a391a8f262290751eb1a69d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 300,
6
- "global_step": 299,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -427,6 +427,434 @@
427
  "learning_rate": 1.5141027441932214e-06,
428
  "loss": 4.597,
429
  "step": 295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  }
431
  ],
432
  "logging_steps": 5,
@@ -446,7 +874,7 @@
446
  "attributes": {}
447
  }
448
  },
449
- "total_flos": 1.7647318252524667e+18,
450
  "train_batch_size": 8,
451
  "trial_name": null,
452
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 300,
6
+ "global_step": 598,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
427
  "learning_rate": 1.5141027441932214e-06,
428
  "loss": 4.597,
429
  "step": 295
430
+ },
431
+ {
432
+ "epoch": 1.0033444816053512,
433
+ "grad_norm": 2.75,
434
+ "learning_rate": 1.498987493914135e-06,
435
+ "loss": 4.5658,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 1.0033444816053512,
440
+ "eval_loss": 4.585446834564209,
441
+ "eval_runtime": 7.2206,
442
+ "eval_samples_per_second": 80.188,
443
+ "eval_steps_per_second": 2.631,
444
+ "step": 300
445
+ },
446
+ {
447
+ "epoch": 1.020066889632107,
448
+ "grad_norm": 2.90625,
449
+ "learning_rate": 1.4837188871052397e-06,
450
+ "loss": 4.5897,
451
+ "step": 305
452
+ },
453
+ {
454
+ "epoch": 1.0367892976588629,
455
+ "grad_norm": 2.734375,
456
+ "learning_rate": 1.4683016163501854e-06,
457
+ "loss": 4.5885,
458
+ "step": 310
459
+ },
460
+ {
461
+ "epoch": 1.0535117056856187,
462
+ "grad_norm": 2.75,
463
+ "learning_rate": 1.452740419922317e-06,
464
+ "loss": 4.6009,
465
+ "step": 315
466
+ },
467
+ {
468
+ "epoch": 1.0702341137123745,
469
+ "grad_norm": 2.96875,
470
+ "learning_rate": 1.4370400803284373e-06,
471
+ "loss": 4.5789,
472
+ "step": 320
473
+ },
474
+ {
475
+ "epoch": 1.0869565217391304,
476
+ "grad_norm": 3.4375,
477
+ "learning_rate": 1.421205422838971e-06,
478
+ "loss": 4.5435,
479
+ "step": 325
480
+ },
481
+ {
482
+ "epoch": 1.1036789297658862,
483
+ "grad_norm": 2.734375,
484
+ "learning_rate": 1.4052413140049897e-06,
485
+ "loss": 4.566,
486
+ "step": 330
487
+ },
488
+ {
489
+ "epoch": 1.120401337792642,
490
+ "grad_norm": 2.59375,
491
+ "learning_rate": 1.389152660162549e-06,
492
+ "loss": 4.5621,
493
+ "step": 335
494
+ },
495
+ {
496
+ "epoch": 1.137123745819398,
497
+ "grad_norm": 2.65625,
498
+ "learning_rate": 1.3729444059247953e-06,
499
+ "loss": 4.5505,
500
+ "step": 340
501
+ },
502
+ {
503
+ "epoch": 1.1538461538461537,
504
+ "grad_norm": 2.40625,
505
+ "learning_rate": 1.3566215326623129e-06,
506
+ "loss": 4.5709,
507
+ "step": 345
508
+ },
509
+ {
510
+ "epoch": 1.1705685618729098,
511
+ "grad_norm": 2.390625,
512
+ "learning_rate": 1.3401890569721723e-06,
513
+ "loss": 4.5569,
514
+ "step": 350
515
+ },
516
+ {
517
+ "epoch": 1.1872909698996654,
518
+ "grad_norm": 2.46875,
519
+ "learning_rate": 1.3236520291361515e-06,
520
+ "loss": 4.5456,
521
+ "step": 355
522
+ },
523
+ {
524
+ "epoch": 1.2040133779264215,
525
+ "grad_norm": 2.5625,
526
+ "learning_rate": 1.3070155315686059e-06,
527
+ "loss": 4.5543,
528
+ "step": 360
529
+ },
530
+ {
531
+ "epoch": 1.2207357859531773,
532
+ "grad_norm": 2.65625,
533
+ "learning_rate": 1.2902846772544622e-06,
534
+ "loss": 4.5408,
535
+ "step": 365
536
+ },
537
+ {
538
+ "epoch": 1.2374581939799332,
539
+ "grad_norm": 2.90625,
540
+ "learning_rate": 1.273464608177818e-06,
541
+ "loss": 4.5435,
542
+ "step": 370
543
+ },
544
+ {
545
+ "epoch": 1.254180602006689,
546
+ "grad_norm": 2.578125,
547
+ "learning_rate": 1.2565604937416266e-06,
548
+ "loss": 4.5436,
549
+ "step": 375
550
+ },
551
+ {
552
+ "epoch": 1.2709030100334449,
553
+ "grad_norm": 2.484375,
554
+ "learning_rate": 1.2395775291789567e-06,
555
+ "loss": 4.5448,
556
+ "step": 380
557
+ },
558
+ {
559
+ "epoch": 1.2876254180602007,
560
+ "grad_norm": 2.375,
561
+ "learning_rate": 1.2225209339563143e-06,
562
+ "loss": 4.5398,
563
+ "step": 385
564
+ },
565
+ {
566
+ "epoch": 1.3043478260869565,
567
+ "grad_norm": 2.484375,
568
+ "learning_rate": 1.2053959501695144e-06,
569
+ "loss": 4.5383,
570
+ "step": 390
571
+ },
572
+ {
573
+ "epoch": 1.3210702341137124,
574
+ "grad_norm": 2.421875,
575
+ "learning_rate": 1.1882078409326002e-06,
576
+ "loss": 4.5556,
577
+ "step": 395
578
+ },
579
+ {
580
+ "epoch": 1.3377926421404682,
581
+ "grad_norm": 2.703125,
582
+ "learning_rate": 1.1709618887603011e-06,
583
+ "loss": 4.5273,
584
+ "step": 400
585
+ },
586
+ {
587
+ "epoch": 1.354515050167224,
588
+ "grad_norm": 2.5,
589
+ "learning_rate": 1.15366339394453e-06,
590
+ "loss": 4.5262,
591
+ "step": 405
592
+ },
593
+ {
594
+ "epoch": 1.37123745819398,
595
+ "grad_norm": 2.46875,
596
+ "learning_rate": 1.1363176729254144e-06,
597
+ "loss": 4.5391,
598
+ "step": 410
599
+ },
600
+ {
601
+ "epoch": 1.3879598662207357,
602
+ "grad_norm": 2.953125,
603
+ "learning_rate": 1.118930056657367e-06,
604
+ "loss": 4.5218,
605
+ "step": 415
606
+ },
607
+ {
608
+ "epoch": 1.4046822742474916,
609
+ "grad_norm": 2.421875,
610
+ "learning_rate": 1.1015058889706942e-06,
611
+ "loss": 4.5255,
612
+ "step": 420
613
+ },
614
+ {
615
+ "epoch": 1.4214046822742474,
616
+ "grad_norm": 2.53125,
617
+ "learning_rate": 1.0840505249292475e-06,
618
+ "loss": 4.5304,
619
+ "step": 425
620
+ },
621
+ {
622
+ "epoch": 1.4381270903010033,
623
+ "grad_norm": 2.5,
624
+ "learning_rate": 1.0665693291846243e-06,
625
+ "loss": 4.5418,
626
+ "step": 430
627
+ },
628
+ {
629
+ "epoch": 1.4548494983277591,
630
+ "grad_norm": 2.40625,
631
+ "learning_rate": 1.0490676743274181e-06,
632
+ "loss": 4.522,
633
+ "step": 435
634
+ },
635
+ {
636
+ "epoch": 1.471571906354515,
637
+ "grad_norm": 2.5,
638
+ "learning_rate": 1.031550939236033e-06,
639
+ "loss": 4.5346,
640
+ "step": 440
641
+ },
642
+ {
643
+ "epoch": 1.488294314381271,
644
+ "grad_norm": 2.296875,
645
+ "learning_rate": 1.0140245074235622e-06,
646
+ "loss": 4.496,
647
+ "step": 445
648
+ },
649
+ {
650
+ "epoch": 1.5050167224080266,
651
+ "grad_norm": 2.390625,
652
+ "learning_rate": 9.964937653832469e-07,
653
+ "loss": 4.5212,
654
+ "step": 450
655
+ },
656
+ {
657
+ "epoch": 1.5217391304347827,
658
+ "grad_norm": 2.3125,
659
+ "learning_rate": 9.78964100933011e-07,
660
+ "loss": 4.5069,
661
+ "step": 455
662
+ },
663
+ {
664
+ "epoch": 1.5384615384615383,
665
+ "grad_norm": 2.546875,
666
+ "learning_rate": 9.614409015595994e-07,
667
+ "loss": 4.5124,
668
+ "step": 460
669
+ },
670
+ {
671
+ "epoch": 1.5551839464882944,
672
+ "grad_norm": 2.515625,
673
+ "learning_rate": 9.43929552762808e-07,
674
+ "loss": 4.5155,
675
+ "step": 465
676
+ },
677
+ {
678
+ "epoch": 1.57190635451505,
679
+ "grad_norm": 2.46875,
680
+ "learning_rate": 9.264354364003326e-07,
681
+ "loss": 4.5143,
682
+ "step": 470
683
+ },
684
+ {
685
+ "epoch": 1.588628762541806,
686
+ "grad_norm": 2.25,
687
+ "learning_rate": 9.089639290337298e-07,
688
+ "loss": 4.4947,
689
+ "step": 475
690
+ },
691
+ {
692
+ "epoch": 1.605351170568562,
693
+ "grad_norm": 2.53125,
694
+ "learning_rate": 8.915204002760122e-07,
695
+ "loss": 4.5113,
696
+ "step": 480
697
+ },
698
+ {
699
+ "epoch": 1.6220735785953178,
700
+ "grad_norm": 2.359375,
701
+ "learning_rate": 8.741102111413748e-07,
702
+ "loss": 4.5215,
703
+ "step": 485
704
+ },
705
+ {
706
+ "epoch": 1.6387959866220736,
707
+ "grad_norm": 2.390625,
708
+ "learning_rate": 8.567387123975647e-07,
709
+ "loss": 4.4991,
710
+ "step": 490
711
+ },
712
+ {
713
+ "epoch": 1.6555183946488294,
714
+ "grad_norm": 3.296875,
715
+ "learning_rate": 8.394112429214029e-07,
716
+ "loss": 4.5263,
717
+ "step": 495
718
+ },
719
+ {
720
+ "epoch": 1.6722408026755853,
721
+ "grad_norm": 2.28125,
722
+ "learning_rate": 8.221331280579564e-07,
723
+ "loss": 4.5039,
724
+ "step": 500
725
+ },
726
+ {
727
+ "epoch": 1.6889632107023411,
728
+ "grad_norm": 2.46875,
729
+ "learning_rate": 8.049096779838717e-07,
730
+ "loss": 4.5294,
731
+ "step": 505
732
+ },
733
+ {
734
+ "epoch": 1.705685618729097,
735
+ "grad_norm": 2.734375,
736
+ "learning_rate": 7.877461860753696e-07,
737
+ "loss": 4.4868,
738
+ "step": 510
739
+ },
740
+ {
741
+ "epoch": 1.7224080267558528,
742
+ "grad_norm": 2.484375,
743
+ "learning_rate": 7.706479272814022e-07,
744
+ "loss": 4.4988,
745
+ "step": 515
746
+ },
747
+ {
748
+ "epoch": 1.7391304347826086,
749
+ "grad_norm": 2.3125,
750
+ "learning_rate": 7.536201565024767e-07,
751
+ "loss": 4.4843,
752
+ "step": 520
753
+ },
754
+ {
755
+ "epoch": 1.7558528428093645,
756
+ "grad_norm": 2.359375,
757
+ "learning_rate": 7.366681069756351e-07,
758
+ "loss": 4.4878,
759
+ "step": 525
760
+ },
761
+ {
762
+ "epoch": 1.7725752508361206,
763
+ "grad_norm": 2.578125,
764
+ "learning_rate": 7.197969886660984e-07,
765
+ "loss": 4.4925,
766
+ "step": 530
767
+ },
768
+ {
769
+ "epoch": 1.7892976588628762,
770
+ "grad_norm": 2.65625,
771
+ "learning_rate": 7.030119866660565e-07,
772
+ "loss": 4.5185,
773
+ "step": 535
774
+ },
775
+ {
776
+ "epoch": 1.8060200668896322,
777
+ "grad_norm": 2.375,
778
+ "learning_rate": 6.863182596011085e-07,
779
+ "loss": 4.4988,
780
+ "step": 540
781
+ },
782
+ {
783
+ "epoch": 1.8227424749163879,
784
+ "grad_norm": 2.640625,
785
+ "learning_rate": 6.697209380448332e-07,
786
+ "loss": 4.4764,
787
+ "step": 545
788
+ },
789
+ {
790
+ "epoch": 1.839464882943144,
791
+ "grad_norm": 2.296875,
792
+ "learning_rate": 6.532251229419809e-07,
793
+ "loss": 4.4987,
794
+ "step": 550
795
+ },
796
+ {
797
+ "epoch": 1.8561872909698995,
798
+ "grad_norm": 2.46875,
799
+ "learning_rate": 6.368358840407752e-07,
800
+ "loss": 4.5123,
801
+ "step": 555
802
+ },
803
+ {
804
+ "epoch": 1.8729096989966556,
805
+ "grad_norm": 2.375,
806
+ "learning_rate": 6.205582583347973e-07,
807
+ "loss": 4.5019,
808
+ "step": 560
809
+ },
810
+ {
811
+ "epoch": 1.8896321070234112,
812
+ "grad_norm": 2.28125,
813
+ "learning_rate": 6.043972485149414e-07,
814
+ "loss": 4.5041,
815
+ "step": 565
816
+ },
817
+ {
818
+ "epoch": 1.9063545150501673,
819
+ "grad_norm": 2.453125,
820
+ "learning_rate": 5.88357821431908e-07,
821
+ "loss": 4.485,
822
+ "step": 570
823
+ },
824
+ {
825
+ "epoch": 1.9230769230769231,
826
+ "grad_norm": 2.4375,
827
+ "learning_rate": 5.724449065697181e-07,
828
+ "loss": 4.4854,
829
+ "step": 575
830
+ },
831
+ {
832
+ "epoch": 1.939799331103679,
833
+ "grad_norm": 2.5625,
834
+ "learning_rate": 5.566633945307052e-07,
835
+ "loss": 4.5039,
836
+ "step": 580
837
+ },
838
+ {
839
+ "epoch": 1.9565217391304348,
840
+ "grad_norm": 2.359375,
841
+ "learning_rate": 5.410181355324621e-07,
842
+ "loss": 4.507,
843
+ "step": 585
844
+ },
845
+ {
846
+ "epoch": 1.9732441471571907,
847
+ "grad_norm": 2.3125,
848
+ "learning_rate": 5.255139379171966e-07,
849
+ "loss": 4.5087,
850
+ "step": 590
851
+ },
852
+ {
853
+ "epoch": 1.9899665551839465,
854
+ "grad_norm": 2.359375,
855
+ "learning_rate": 5.101555666739563e-07,
856
+ "loss": 4.5007,
857
+ "step": 595
858
  }
859
  ],
860
  "logging_steps": 5,
 
874
  "attributes": {}
875
  }
876
  },
877
+ "total_flos": 3.529463651578675e+18,
878
  "train_batch_size": 8,
879
  "trial_name": null,
880
  "trial_params": null