haoyuw committed
Commit 119632d · verified · 1 Parent(s): 950e704

Model save

Files changed (4)
  1. README.md +5 -4
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +509 -12
README.md CHANGED
@@ -1,16 +1,17 @@
 ---
 base_model: Qwen/Qwen2.5-Math-1.5B-Instruct
- datasets: GAIR/LIMO
 library_name: transformers
+ model_name: Qwen2.5-1.5B-Math-Instruct-LIMO
 tags:
 - generated_from_trainer
- - open-r1
+ - trl
+ - sft
 licence: license
 ---

- # Model Card for None
+ # Model Card for Qwen2.5-1.5B-Math-Instruct-LIMO

- This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) on the [GAIR/LIMO](https://huggingface.co/datasets/GAIR/LIMO) dataset.
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).

 ## Quick start
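The README diff cuts off at the `## Quick start` heading. TRL-generated model cards typically follow that heading with a `transformers` pipeline snippet; a minimal sketch of what it would look like here (the repo id below is a hypothetical placeholder, not taken from this commit):

```python
# Hedged sketch of a typical TRL "Quick start" block; the repo id is a
# placeholder for wherever this checkpoint is pushed on the Hub.
from transformers import pipeline

question = "Find all real solutions of x^2 - 5x + 6 = 0."
generator = pipeline(
    "text-generation",
    model="haoyuw/Qwen2.5-1.5B-Math-Instruct-LIMO",  # placeholder repo id
)
output = generator(
    [{"role": "user", "content": question}],
    max_new_tokens=256,
    return_full_text=False,
)[0]
print(output["generated_text"])
```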
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
- "total_flos": 20283251490816.0,
- "train_loss": 0.7583397939968645,
- "train_runtime": 1935.5651,
+ "total_flos": 40566502981632.0,
+ "train_loss": 0.25444099340545995,
+ "train_runtime": 2064.0536,
 "train_samples": 817,
- "train_samples_per_second": 11.753,
- "train_steps_per_second": 0.184
+ "train_samples_per_second": 22.042,
+ "train_steps_per_second": 0.345
 }
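`all_results.json` is the combined metrics file the Trainer writes at the end of a run; since no evaluation split was configured here, it carries the same fields as `train_results.json` (shown next). A small sketch for inspecting it, assuming the file has been downloaded from the repo into the working directory:

```python
import json

# Assumes all_results.json from this commit is in the working directory.
with open("all_results.json") as f:
    metrics = json.load(f)

print(f"train_loss: {metrics['train_loss']:.4f}")
print(
    f"runtime: {metrics['train_runtime'] / 60:.1f} min "
    f"({metrics['train_samples_per_second']:.1f} samples/s)"
)
```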
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
- "total_flos": 20283251490816.0,
- "train_loss": 0.7583397939968645,
- "train_runtime": 1935.5651,
+ "total_flos": 40566502981632.0,
+ "train_loss": 0.25444099340545995,
+ "train_runtime": 2064.0536,
 "train_samples": 817,
- "train_samples_per_second": 11.753,
- "train_steps_per_second": 0.184
+ "train_samples_per_second": 22.042,
+ "train_steps_per_second": 0.345
 }
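`trainer_state.json` (diffed below) keeps the per-step `log_history` plus a final summary entry with the aggregate training metrics. A short sketch for pulling the loss curve out of it, assuming the file is available locally:

```python
import json

# Assumes trainer_state.json from this commit is in the working directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step logging events carry a "loss" key; the last entry is the
# aggregate summary (train_loss, train_runtime, ...).
step_logs = [e for e in state["log_history"] if "loss" in e]
summary = state["log_history"][-1]

print(f"epochs: {state['epoch']}, global steps: {state['global_step']}")
print(f"logged loss: {step_logs[0]['loss']} -> {step_logs[-1]['loss']}")
print(f"aggregate train_loss: {summary.get('train_loss')}")
```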
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
- "epoch": 4.0,
+ "epoch": 8.0,
 "eval_steps": 500,
- "global_step": 356,
+ "global_step": 712,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -506,19 +506,516 @@
 "step": 355
 },
 {
- "epoch": 4.0,
- "step": 356,
- "total_flos": 20283251490816.0,
- "train_loss": 0.7583397939968645,
- "train_runtime": 1935.5651,
- "train_samples_per_second": 11.753,
- "train_steps_per_second": 0.184
+ "epoch": 4.044943820224719,
+ "grad_norm": 0.4074217412543327,
+ "learning_rate": 1.1585150086919896e-05,
+ "loss": 0.6009,
+ "step": 360
+ },
+ {
+ "epoch": 4.101123595505618,
+ "grad_norm": 0.4271665264127198,
+ "learning_rate": 1.137632369284973e-05,
+ "loss": 0.6207,
+ "step": 365
+ },
+ {
+ "epoch": 4.157303370786517,
+ "grad_norm": 0.43109911858630096,
+ "learning_rate": 1.1167294115304992e-05,
+ "loss": 0.5911,
+ "step": 370
+ },
+ {
+ "epoch": 4.213483146067416,
+ "grad_norm": 0.43839912783355806,
+ "learning_rate": 1.0958174212861062e-05,
+ "loss": 0.6011,
+ "step": 375
+ },
+ {
+ "epoch": 4.269662921348314,
+ "grad_norm": 0.4190687745455898,
+ "learning_rate": 1.0749076892861227e-05,
+ "loss": 0.5983,
+ "step": 380
+ },
+ {
+ "epoch": 4.325842696629214,
+ "grad_norm": 0.4272292754884261,
+ "learning_rate": 1.0540115050456152e-05,
+ "loss": 0.6298,
+ "step": 385
+ },
+ {
+ "epoch": 4.382022471910112,
+ "grad_norm": 0.4188166195248199,
+ "learning_rate": 1.0331401507649868e-05,
+ "loss": 0.614,
+ "step": 390
+ },
+ {
+ "epoch": 4.438202247191011,
+ "grad_norm": 0.46031386712411543,
+ "learning_rate": 1.012304895238529e-05,
+ "loss": 0.5934,
+ "step": 395
+ },
+ {
+ "epoch": 4.49438202247191,
+ "grad_norm": 0.4198160623739865,
+ "learning_rate": 9.915169877702096e-06,
+ "loss": 0.5816,
+ "step": 400
+ },
+ {
+ "epoch": 4.550561797752809,
+ "grad_norm": 0.41877479862746064,
+ "learning_rate": 9.707876520999864e-06,
+ "loss": 0.6071,
+ "step": 405
+ },
+ {
+ "epoch": 4.606741573033708,
+ "grad_norm": 0.445785010864897,
+ "learning_rate": 9.501280803439204e-06,
+ "loss": 0.5816,
+ "step": 410
+ },
+ {
+ "epoch": 4.662921348314606,
+ "grad_norm": 0.4481588101026953,
+ "learning_rate": 9.295494269513659e-06,
+ "loss": 0.6006,
+ "step": 415
+ },
+ {
+ "epoch": 4.719101123595506,
+ "grad_norm": 0.4260960071459342,
+ "learning_rate": 9.090628026824941e-06,
+ "loss": 0.6357,
+ "step": 420
+ },
+ {
+ "epoch": 4.775280898876405,
+ "grad_norm": 0.4192037207998735,
+ "learning_rate": 8.886792686094096e-06,
+ "loss": 0.6006,
+ "step": 425
+ },
+ {
+ "epoch": 4.831460674157303,
+ "grad_norm": 0.43566168980415704,
+ "learning_rate": 8.684098301440903e-06,
+ "loss": 0.6267,
+ "step": 430
+ },
+ {
+ "epoch": 4.887640449438202,
+ "grad_norm": 0.42117325698617175,
+ "learning_rate": 8.482654310963817e-06,
+ "loss": 0.595,
+ "step": 435
+ },
+ {
+ "epoch": 4.943820224719101,
+ "grad_norm": 0.4414097385180421,
+ "learning_rate": 8.2825694776525e-06,
+ "loss": 0.5871,
+ "step": 440
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.41448225752045526,
+ "learning_rate": 8.083951830664867e-06,
+ "loss": 0.5838,
+ "step": 445
+ },
+ {
+ "epoch": 5.056179775280899,
+ "grad_norm": 0.44467080989005614,
+ "learning_rate": 7.886908607000321e-06,
+ "loss": 0.5612,
+ "step": 450
+ },
+ {
+ "epoch": 5.112359550561798,
+ "grad_norm": 0.48267692647228083,
+ "learning_rate": 7.691546193600702e-06,
+ "loss": 0.5438,
+ "step": 455
+ },
+ {
+ "epoch": 5.168539325842697,
+ "grad_norm": 0.46882770217949304,
+ "learning_rate": 7.497970069910192e-06,
+ "loss": 0.5145,
+ "step": 460
+ },
+ {
+ "epoch": 5.224719101123595,
+ "grad_norm": 0.45016082689033277,
+ "learning_rate": 7.306284750925192e-06,
+ "loss": 0.5392,
+ "step": 465
+ },
+ {
+ "epoch": 5.280898876404494,
+ "grad_norm": 0.4904333208280953,
+ "learning_rate": 7.116593730764929e-06,
+ "loss": 0.5172,
+ "step": 470
+ },
+ {
+ "epoch": 5.337078651685394,
+ "grad_norm": 0.5047681974857045,
+ "learning_rate": 6.928999426793234e-06,
+ "loss": 0.5408,
+ "step": 475
+ },
+ {
+ "epoch": 5.393258426966292,
+ "grad_norm": 0.46939981998911995,
+ "learning_rate": 6.743603124321712e-06,
+ "loss": 0.5408,
+ "step": 480
+ },
+ {
+ "epoch": 5.449438202247191,
+ "grad_norm": 0.48583542280728176,
+ "learning_rate": 6.5605049219240635e-06,
+ "loss": 0.5564,
+ "step": 485
+ },
+ {
+ "epoch": 5.50561797752809,
+ "grad_norm": 0.445507769617856,
+ "learning_rate": 6.379803677391223e-06,
+ "loss": 0.5272,
+ "step": 490
+ },
+ {
+ "epoch": 5.561797752808989,
+ "grad_norm": 0.48127792996455504,
+ "learning_rate": 6.201596954356362e-06,
+ "loss": 0.5221,
+ "step": 495
+ },
+ {
+ "epoch": 5.617977528089888,
+ "grad_norm": 0.48763800709480937,
+ "learning_rate": 6.02598096961865e-06,
+ "loss": 0.5017,
+ "step": 500
+ },
+ {
+ "epoch": 5.674157303370786,
+ "grad_norm": 0.45141538643328166,
+ "learning_rate": 5.853050541194187e-06,
+ "loss": 0.5301,
+ "step": 505
+ },
+ {
+ "epoch": 5.730337078651686,
+ "grad_norm": 0.46888316703181054,
+ "learning_rate": 5.682899037122178e-06,
+ "loss": 0.5088,
+ "step": 510
+ },
+ {
+ "epoch": 5.786516853932584,
+ "grad_norm": 0.49578196304683,
+ "learning_rate": 5.515618325053952e-06,
+ "loss": 0.5051,
+ "step": 515
+ },
+ {
+ "epoch": 5.842696629213483,
+ "grad_norm": 0.4766026956952897,
+ "learning_rate": 5.351298722652064e-06,
+ "loss": 0.509,
+ "step": 520
+ },
+ {
+ "epoch": 5.898876404494382,
+ "grad_norm": 0.5255442360578876,
+ "learning_rate": 5.190028948826304e-06,
+ "loss": 0.5465,
+ "step": 525
+ },
+ {
+ "epoch": 5.955056179775281,
+ "grad_norm": 0.5102538127102871,
+ "learning_rate": 5.031896075832846e-06,
+ "loss": 0.5186,
+ "step": 530
+ },
+ {
+ "epoch": 6.01123595505618,
+ "grad_norm": 0.5022519456849229,
+ "learning_rate": 4.876985482262482e-06,
+ "loss": 0.5204,
+ "step": 535
+ },
+ {
+ "epoch": 6.067415730337078,
+ "grad_norm": 0.46834498920984025,
+ "learning_rate": 4.725380806943299e-06,
+ "loss": 0.4707,
+ "step": 540
+ },
+ {
+ "epoch": 6.123595505617978,
+ "grad_norm": 0.5403111606747254,
+ "learning_rate": 4.577163903782655e-06,
+ "loss": 0.4491,
+ "step": 545
+ },
+ {
+ "epoch": 6.179775280898877,
+ "grad_norm": 0.5964108863972761,
+ "learning_rate": 4.432414797572894e-06,
+ "loss": 0.4709,
+ "step": 550
+ },
+ {
+ "epoch": 6.235955056179775,
+ "grad_norm": 0.4994302712693112,
+ "learning_rate": 4.291211640784608e-06,
+ "loss": 0.4708,
+ "step": 555
+ },
+ {
+ "epoch": 6.292134831460674,
+ "grad_norm": 0.49243733113864285,
+ "learning_rate": 4.153630671370821e-06,
+ "loss": 0.4926,
+ "step": 560
+ },
+ {
+ "epoch": 6.348314606741573,
+ "grad_norm": 0.49658882787661995,
+ "learning_rate": 4.019746171604824e-06,
+ "loss": 0.502,
+ "step": 565
+ },
+ {
+ "epoch": 6.404494382022472,
+ "grad_norm": 0.5418840142061598,
+ "learning_rate": 3.889630427973951e-06,
+ "loss": 0.4602,
+ "step": 570
+ },
+ {
+ "epoch": 6.460674157303371,
+ "grad_norm": 0.5008443330249364,
+ "learning_rate": 3.763353692150864e-06,
+ "loss": 0.4665,
+ "step": 575
+ },
+ {
+ "epoch": 6.51685393258427,
+ "grad_norm": 0.48589030681678613,
+ "learning_rate": 3.6409841430635166e-06,
+ "loss": 0.4497,
+ "step": 580
+ },
+ {
+ "epoch": 6.573033707865169,
+ "grad_norm": 0.5820871330261026,
+ "learning_rate": 3.522587850084197e-06,
+ "loss": 0.4629,
+ "step": 585
+ },
+ {
+ "epoch": 6.629213483146067,
+ "grad_norm": 0.49834548702881326,
+ "learning_rate": 3.408228737357575e-06,
+ "loss": 0.483,
+ "step": 590
+ },
+ {
+ "epoch": 6.685393258426966,
+ "grad_norm": 0.5052799618260936,
+ "learning_rate": 3.297968549286974e-06,
+ "loss": 0.4808,
+ "step": 595
+ },
+ {
+ "epoch": 6.741573033707866,
+ "grad_norm": 0.492591898344833,
+ "learning_rate": 3.191866817197539e-06,
+ "loss": 0.472,
+ "step": 600
+ },
+ {
+ "epoch": 6.797752808988764,
+ "grad_norm": 0.49032082408314204,
+ "learning_rate": 3.089980827194276e-06,
+ "loss": 0.4611,
+ "step": 605
+ },
+ {
+ "epoch": 6.853932584269663,
+ "grad_norm": 0.4959126643842184,
+ "learning_rate": 2.9923655892323144e-06,
+ "loss": 0.4726,
+ "step": 610
+ },
+ {
+ "epoch": 6.910112359550562,
+ "grad_norm": 0.5335774103596451,
+ "learning_rate": 2.8990738074161196e-06,
+ "loss": 0.4792,
+ "step": 615
+ },
+ {
+ "epoch": 6.966292134831461,
+ "grad_norm": 0.5201706543464556,
+ "learning_rate": 2.8101558515436506e-06,
+ "loss": 0.4711,
+ "step": 620
+ },
+ {
+ "epoch": 7.022471910112359,
+ "grad_norm": 0.4623475019520009,
+ "learning_rate": 2.725659729910878e-06,
+ "loss": 0.4545,
+ "step": 625
+ },
+ {
+ "epoch": 7.078651685393258,
+ "grad_norm": 0.4834608390252829,
+ "learning_rate": 2.645631063391285e-06,
+ "loss": 0.443,
+ "step": 630
+ },
+ {
+ "epoch": 7.134831460674158,
+ "grad_norm": 0.5359592571307704,
+ "learning_rate": 2.570113060804401e-06,
+ "loss": 0.441,
+ "step": 635
+ },
+ {
+ "epoch": 7.191011235955056,
+ "grad_norm": 0.5433362481606256,
+ "learning_rate": 2.4991464955866314e-06,
+ "loss": 0.4264,
+ "step": 640
+ },
+ {
+ "epoch": 7.247191011235955,
+ "grad_norm": 0.5225953610583668,
+ "learning_rate": 2.432769683776995e-06,
+ "loss": 0.4145,
+ "step": 645
+ },
+ {
+ "epoch": 7.303370786516854,
+ "grad_norm": 0.5205000317985969,
+ "learning_rate": 2.371018463329651e-06,
+ "loss": 0.435,
+ "step": 650
+ },
+ {
+ "epoch": 7.359550561797753,
+ "grad_norm": 0.5007819526288158,
+ "learning_rate": 2.3139261747643925e-06,
+ "loss": 0.41,
+ "step": 655
+ },
+ {
+ "epoch": 7.415730337078652,
+ "grad_norm": 0.518942310039327,
+ "learning_rate": 2.261523643165532e-06,
+ "loss": 0.4502,
+ "step": 660
+ },
+ {
+ "epoch": 7.47191011235955,
+ "grad_norm": 0.49873700284836797,
+ "learning_rate": 2.2138391615389306e-06,
+ "loss": 0.4324,
+ "step": 665
+ },
+ {
+ "epoch": 7.52808988764045,
+ "grad_norm": 0.5208414783528387,
+ "learning_rate": 2.1708984755361205e-06,
+ "loss": 0.4449,
+ "step": 670
+ },
+ {
+ "epoch": 7.584269662921348,
+ "grad_norm": 0.5791900845785815,
+ "learning_rate": 2.1327247695538015e-06,
+ "loss": 0.4484,
+ "step": 675
+ },
+ {
+ "epoch": 7.640449438202247,
+ "grad_norm": 0.5438940813210439,
+ "learning_rate": 2.0993386542161944e-06,
+ "loss": 0.4685,
+ "step": 680
+ },
+ {
+ "epoch": 7.696629213483146,
+ "grad_norm": 0.5127534158556458,
+ "learning_rate": 2.070758155247017e-06,
+ "loss": 0.4339,
+ "step": 685
+ },
+ {
+ "epoch": 7.752808988764045,
+ "grad_norm": 0.5415209164546988,
+ "learning_rate": 2.0469987037371005e-06,
+ "loss": 0.4103,
+ "step": 690
+ },
+ {
+ "epoch": 7.808988764044944,
+ "grad_norm": 0.5136419675345572,
+ "learning_rate": 2.028073127812876e-06,
+ "loss": 0.4169,
+ "step": 695
+ },
+ {
+ "epoch": 7.865168539325842,
+ "grad_norm": 0.5319921953080505,
+ "learning_rate": 2.013991645710262e-06,
+ "loss": 0.4286,
+ "step": 700
+ },
+ {
+ "epoch": 7.921348314606742,
+ "grad_norm": 0.5733506841186511,
+ "learning_rate": 2.0047618602576594e-06,
+ "loss": 0.4505,
+ "step": 705
+ },
+ {
+ "epoch": 7.97752808988764,
+ "grad_norm": 0.5142139821667071,
+ "learning_rate": 2.0003887547710647e-06,
+ "loss": 0.4202,
+ "step": 710
+ },
+ {
+ "epoch": 8.0,
+ "step": 712,
+ "total_flos": 40566502981632.0,
+ "train_loss": 0.25444099340545995,
+ "train_runtime": 2064.0536,
+ "train_samples_per_second": 22.042,
+ "train_steps_per_second": 0.345
 }
 ],
 "logging_steps": 5,
- "max_steps": 356,
+ "max_steps": 712,
 "num_input_tokens_seen": 0,
- "num_train_epochs": 4,
+ "num_train_epochs": 8,
 "save_steps": 200,
 "stateful_callbacks": {
 "TrainerControl": {
@@ -532,7 +1029,7 @@
 "attributes": {}
 }
 },
- "total_flos": 20283251490816.0,
+ "total_flos": 40566502981632.0,
 "train_batch_size": 8,
 "trial_name": null,
 "trial_params": null