Training in progress, step 200, checkpoint

Browse files

Files changed (9) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state_0.pth +2 -2
last-checkpoint/rng_state_1.pth +2 -2
last-checkpoint/rng_state_2.pth +2 -2
last-checkpoint/rng_state_3.pth +2 -2
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +98 -542
last-checkpoint/training_args.bin +1 -1

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dfc7459b8070f26a9a864b85343d1738e3d2dcd6020a04f5fb8b9aca1951ffd2
 size 723674912

 version https://git-lfs.github.com/spec/v1
+oid sha256:6ca635966f2128b90695cdcf1b450ff9388c9812f95f690192973e5b7eefd3c9
 size 723674912

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b945b56cf643054d93848122565520049285ab0e46c861ff6ab04f75d42ed166
-size 735625626

 version https://git-lfs.github.com/spec/v1
+oid sha256:ef2ab014f8101a1dbbd4564e14dff1cbf3c43dda56a1b19089771de3e5eb2e2f
+size 735625370

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0df10be429c2412198f2c4c684866a8c921cbd0d9ee4c865077476da07bda410
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:d497aa3968cd2f05db0d0e8c5e1be496a8a5348df0a825e18ed3fdbaa24257ad
+size 15024

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b14d8363b71c1e824421bf14a513f0f951f1c6c9b9494dcedc75b3fa1fecea91
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:5dda0d87dad890add5a8f3995389ff6a597895845903171a363aa580fa07ac30
+size 15024

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0d27a8e9aed69fdd3b729f8cbf6300af5e0e26e9226d0e2307b0cf80aff9030
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:de656d8e54adb9fa6e0b2ddbe69d4325a775f7e1580ed51c58a759ad9c7520d4
+size 15024

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:23de9e35ad029c7a59ff383920fe892474d8f295c2e2f82ec3e6f109f3f96960
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:9eaa87e4309aa1a00b739cd637a2ec444ea6c757388c653064a1906e4d8dfb2e
+size 15024

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:40b6b717644e21f80a22ec98694b3a2fd9d62a6467e549d64314725dba905d52
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:ca9a25c72339c898b564e0c464a3f6fc75bbeec408008928b7ed05533156b98c
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,619 +1,175 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.23675643681562591,
   "eval_steps": 200,
-  "global_step": 800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0002959455460195324,
-      "eval_loss": 2.584977626800537,
-      "eval_runtime": 23.9896,
-      "eval_samples_per_second": 62.61,
-      "eval_steps_per_second": 15.673,
       "step": 1
     },
     {
-      "epoch": 0.002959455460195324,
-      "grad_norm": 2.140625,
-      "learning_rate": 1.6000000000000003e-05,
-      "loss": 2.3759,
       "step": 10
     },
     {
-      "epoch": 0.005918910920390648,
-      "grad_norm": 3.46875,
-      "learning_rate": 3.2000000000000005e-05,
-      "loss": 2.4589,
       "step": 20
     },
     {
-      "epoch": 0.008878366380585973,
-      "grad_norm": 4.15625,
-      "learning_rate": 4.8e-05,
-      "loss": 2.2745,
       "step": 30
     },
     {
-      "epoch": 0.011837821840781295,
-      "grad_norm": 5.5,
-      "learning_rate": 6.400000000000001e-05,
-      "loss": 2.5814,
       "step": 40
     },
     {
-      "epoch": 0.01479727730097662,
-      "grad_norm": 19.25,
-      "learning_rate": 8e-05,
-      "loss": 2.6502,
       "step": 50
     },
     {
-      "epoch": 0.017756732761171946,
-      "grad_norm": 3.078125,
-      "learning_rate": 9.6e-05,
-      "loss": 2.1861,
       "step": 60
     },
     {
-      "epoch": 0.020716188221367268,
-      "grad_norm": 2.859375,
-      "learning_rate": 0.00011200000000000001,
-      "loss": 2.1366,
       "step": 70
     },
     {
-      "epoch": 0.02367564368156259,
-      "grad_norm": 5.03125,
-      "learning_rate": 0.00012800000000000002,
-      "loss": 2.2049,
       "step": 80
     },
     {
-      "epoch": 0.026635099141757917,
-      "grad_norm": 5.09375,
-      "learning_rate": 0.000144,
-      "loss": 2.0852,
       "step": 90
     },
     {
-      "epoch": 0.02959455460195324,
-      "grad_norm": 15.5,
-      "learning_rate": 0.00016,
-      "loss": 2.5937,
       "step": 100
     },
     {
-      "epoch": 0.032554010062148565,
-      "grad_norm": 2.703125,
-      "learning_rate": 0.00017600000000000002,
-      "loss": 2.3173,
       "step": 110
     },
     {
-      "epoch": 0.03551346552234389,
-      "grad_norm": 3.0,
-      "learning_rate": 0.000192,
-      "loss": 2.156,
       "step": 120
     },
     {
-      "epoch": 0.03847292098253921,
-      "grad_norm": 2.984375,
-      "learning_rate": 0.0001999978128380225,
-      "loss": 2.2033,
       "step": 130
     },
     {
-      "epoch": 0.041432376442734536,
-      "grad_norm": 5.71875,
-      "learning_rate": 0.0001999803161162393,
-      "loss": 2.1184,
       "step": 140
     },
     {
-      "epoch": 0.04439183190292986,
-      "grad_norm": 17.875,
-      "learning_rate": 0.00019994532573409262,
-      "loss": 1.956,
       "step": 150
     },
     {
-      "epoch": 0.04735128736312518,
-      "grad_norm": 2.90625,
-      "learning_rate": 0.00019989284781388617,
-      "loss": 2.2089,
       "step": 160
     },
     {
-      "epoch": 0.05031074282332051,
-      "grad_norm": 2.71875,
-      "learning_rate": 0.00019982289153773646,
-      "loss": 2.1831,
       "step": 170
     },
     {
-      "epoch": 0.053270198283515834,
-      "grad_norm": 4.28125,
-      "learning_rate": 0.00019973546914596623,
-      "loss": 2.2995,
       "step": 180
     },
     {
-      "epoch": 0.05622965374371116,
-      "grad_norm": 7.34375,
-      "learning_rate": 0.00019963059593496268,
-      "loss": 2.0765,
       "step": 190
     },
     {
-      "epoch": 0.05918910920390648,
-      "grad_norm": 15.0625,
-      "learning_rate": 0.00019950829025450114,
-      "loss": 2.3713,
       "step": 200
     },
     {
-      "epoch": 0.05918910920390648,
-      "eval_loss": 2.143343448638916,
-      "eval_runtime": 23.9248,
-      "eval_samples_per_second": 62.78,
-      "eval_steps_per_second": 15.716,
       "step": 200
-    },
-    {
-      "epoch": 0.062148564664101805,
-      "grad_norm": 2.1875,
-      "learning_rate": 0.0001993685735045343,
-      "loss": 2.3077,
-      "step": 210
-    },
-    {
-      "epoch": 0.06510802012429713,
-      "grad_norm": 3.0,
-      "learning_rate": 0.0001992114701314478,
-      "loss": 2.257,
-      "step": 220
-    },
-    {
-      "epoch": 0.06806747558449246,
-      "grad_norm": 3.40625,
-      "learning_rate": 0.000199037007623783,
-      "loss": 2.151,
-      "step": 230
-    },
-    {
-      "epoch": 0.07102693104468778,
-      "grad_norm": 5.75,
-      "learning_rate": 0.00019884521650742715,
-      "loss": 2.1019,
-      "step": 240
-    },
-    {
-      "epoch": 0.0739863865048831,
-      "grad_norm": 21.25,
-      "learning_rate": 0.00019863613034027224,
-      "loss": 2.2643,
-      "step": 250
-    },
-    {
-      "epoch": 0.07694584196507842,
-      "grad_norm": 3.09375,
-      "learning_rate": 0.0001984097857063434,
-      "loss": 2.1065,
-      "step": 260
-    },
-    {
-      "epoch": 0.07990529742527375,
-      "grad_norm": 2.796875,
-      "learning_rate": 0.0001981662222093976,
-      "loss": 2.302,
-      "step": 270
-    },
-    {
-      "epoch": 0.08286475288546907,
-      "grad_norm": 3.296875,
-      "learning_rate": 0.00019790548246599447,
-      "loss": 2.1499,
-      "step": 280
-    },
-    {
-      "epoch": 0.0858242083456644,
-      "grad_norm": 5.3125,
-      "learning_rate": 0.00019762761209803927,
-      "loss": 1.9488,
-      "step": 290
-    },
-    {
-      "epoch": 0.08878366380585972,
-      "grad_norm": 16.375,
-      "learning_rate": 0.0001973326597248006,
-      "loss": 2.0258,
-      "step": 300
-    },
-    {
-      "epoch": 0.09174311926605505,
-      "grad_norm": 2.84375,
-      "learning_rate": 0.00019702067695440332,
-      "loss": 2.3253,
-      "step": 310
-    },
-    {
-      "epoch": 0.09470257472625036,
-      "grad_norm": 3.015625,
-      "learning_rate": 0.00019669171837479873,
-      "loss": 1.9515,
-      "step": 320
-    },
-    {
-      "epoch": 0.09766203018644569,
-      "grad_norm": 3.375,
-      "learning_rate": 0.00019634584154421317,
-      "loss": 2.1889,
-      "step": 330
-    },
-    {
-      "epoch": 0.10062148564664102,
-      "grad_norm": 4.5625,
-      "learning_rate": 0.00019598310698107702,
-      "loss": 2.0052,
-      "step": 340
-    },
-    {
-      "epoch": 0.10358094110683634,
-      "grad_norm": 13.25,
-      "learning_rate": 0.00019560357815343577,
-      "loss": 1.6711,
-      "step": 350
-    },
-    {
-      "epoch": 0.10654039656703167,
-      "grad_norm": 2.40625,
-      "learning_rate": 0.00019520732146784491,
-      "loss": 2.195,
-      "step": 360
-    },
-    {
-      "epoch": 0.109499852027227,
-      "grad_norm": 3.125,
-      "learning_rate": 0.0001947944062577507,
-      "loss": 1.9741,
-      "step": 370
-    },
-    {
-      "epoch": 0.11245930748742232,
-      "grad_norm": 4.4375,
-      "learning_rate": 0.00019436490477135878,
-      "loss": 2.1273,
-      "step": 380
-    },
-    {
-      "epoch": 0.11541876294761765,
-      "grad_norm": 5.5625,
-      "learning_rate": 0.00019391889215899299,
-      "loss": 1.8576,
-      "step": 390
-    },
-    {
-      "epoch": 0.11837821840781296,
-      "grad_norm": 11.25,
-      "learning_rate": 0.0001934564464599461,
-      "loss": 1.6087,
-      "step": 400
-    },
-    {
-      "epoch": 0.11837821840781296,
-      "eval_loss": 2.0912728309631348,
-      "eval_runtime": 23.8357,
-      "eval_samples_per_second": 63.015,
-      "eval_steps_per_second": 15.775,
-      "step": 400
-    },
-    {
-      "epoch": 0.12133767386800828,
-      "grad_norm": 2.15625,
-      "learning_rate": 0.00019297764858882514,
-      "loss": 2.3663,
-      "step": 410
-    },
-    {
-      "epoch": 0.12429712932820361,
-      "grad_norm": 3.15625,
-      "learning_rate": 0.00019248258232139388,
-      "loss": 2.1701,
-      "step": 420
-    },
-    {
-      "epoch": 0.12725658478839894,
-      "grad_norm": 4.34375,
-      "learning_rate": 0.00019197133427991436,
-      "loss": 2.0932,
-      "step": 430
-    },
-    {
-      "epoch": 0.13021604024859426,
-      "grad_norm": 4.21875,
-      "learning_rate": 0.00019144399391799043,
-      "loss": 1.914,
-      "step": 440
-    },
-    {
-      "epoch": 0.1331754957087896,
-      "grad_norm": 18.0,
-      "learning_rate": 0.00019090065350491626,
-      "loss": 2.2622,
-      "step": 450
-    },
-    {
-      "epoch": 0.1361349511689849,
-      "grad_norm": 2.21875,
-      "learning_rate": 0.0001903414081095315,
-      "loss": 2.3188,
-      "step": 460
-    },
-    {
-      "epoch": 0.13909440662918024,
-      "grad_norm": 2.484375,
-      "learning_rate": 0.00018976635558358722,
-      "loss": 2.0455,
-      "step": 470
-    },
-    {
-      "epoch": 0.14205386208937557,
-      "grad_norm": 4.4375,
-      "learning_rate": 0.00018917559654462474,
-      "loss": 2.1161,
-      "step": 480
-    },
-    {
-      "epoch": 0.1450133175495709,
-      "grad_norm": 5.21875,
-      "learning_rate": 0.00018856923435837022,
-      "loss": 2.073,
-      "step": 490
-    },
-    {
-      "epoch": 0.1479727730097662,
-      "grad_norm": 12.25,
-      "learning_rate": 0.0001879473751206489,
-      "loss": 1.5186,
-      "step": 500
-    },
-    {
-      "epoch": 0.15093222846996152,
-      "grad_norm": 2.1875,
-      "learning_rate": 0.00018731012763882133,
-      "loss": 2.3347,
-      "step": 510
-    },
-    {
-      "epoch": 0.15389168393015684,
-      "grad_norm": 3.125,
-      "learning_rate": 0.00018665760341274505,
-      "loss": 2.1125,
-      "step": 520
-    },
-    {
-      "epoch": 0.15685113939035217,
-      "grad_norm": 3.75,
-      "learning_rate": 0.00018598991661526572,
-      "loss": 2.203,
-      "step": 530
-    },
-    {
-      "epoch": 0.1598105948505475,
-      "grad_norm": 3.859375,
-      "learning_rate": 0.00018530718407223974,
-      "loss": 2.1003,
-      "step": 540
-    },
-    {
-      "epoch": 0.16277005031074282,
-      "grad_norm": 15.9375,
-      "learning_rate": 0.00018460952524209355,
-      "loss": 1.8457,
-      "step": 550
-    },
-    {
-      "epoch": 0.16572950577093815,
-      "grad_norm": 2.109375,
-      "learning_rate": 0.00018389706219492147,
-      "loss": 2.1732,
-      "step": 560
-    },
-    {
-      "epoch": 0.16868896123113347,
-      "grad_norm": 2.515625,
-      "learning_rate": 0.00018316991959112716,
-      "loss": 2.309,
-      "step": 570
-    },
-    {
-      "epoch": 0.1716484166913288,
-      "grad_norm": 3.25,
-      "learning_rate": 0.00018242822465961176,
-      "loss": 1.7926,
-      "step": 580
-    },
-    {
-      "epoch": 0.17460787215152412,
-      "grad_norm": 6.96875,
-      "learning_rate": 0.00018167210717551224,
-      "loss": 1.9797,
-      "step": 590
-    },
-    {
-      "epoch": 0.17756732761171945,
-      "grad_norm": 15.0625,
-      "learning_rate": 0.00018090169943749476,
-      "loss": 1.748,
-      "step": 600
-    },
-    {
-      "epoch": 0.17756732761171945,
-      "eval_loss": 2.0752952098846436,
-      "eval_runtime": 23.9564,
-      "eval_samples_per_second": 62.697,
-      "eval_steps_per_second": 15.695,
-      "step": 600
-    },
-    {
-      "epoch": 0.18052678307191478,
-      "grad_norm": 2.8125,
-      "learning_rate": 0.00018011713624460608,
-      "loss": 1.9757,
-      "step": 610
-    },
-    {
-      "epoch": 0.1834862385321101,
-      "grad_norm": 2.46875,
-      "learning_rate": 0.00017931855487268782,
-      "loss": 1.9936,
-      "step": 620
-    },
-    {
-      "epoch": 0.18644569399230543,
-      "grad_norm": 3.171875,
-      "learning_rate": 0.0001785060950503568,
-      "loss": 2.3718,
-      "step": 630
-    },
-    {
-      "epoch": 0.18940514945250073,
-      "grad_norm": 10.125,
-      "learning_rate": 0.00017767989893455698,
-      "loss": 2.0974,
-      "step": 640
-    },
-    {
-      "epoch": 0.19236460491269605,
-      "grad_norm": 16.25,
-      "learning_rate": 0.00017684011108568592,
-      "loss": 2.0872,
-      "step": 650
-    },
-    {
-      "epoch": 0.19532406037289138,
-      "grad_norm": 2.0625,
-      "learning_rate": 0.00017598687844230088,
-      "loss": 2.3763,
-      "step": 660
-    },
-    {
-      "epoch": 0.1982835158330867,
-      "grad_norm": 2.640625,
-      "learning_rate": 0.00017512035029540885,
-      "loss": 2.102,
-      "step": 670
-    },
-    {
-      "epoch": 0.20124297129328203,
-      "grad_norm": 4.03125,
-      "learning_rate": 0.000174240678262345,
-      "loss": 2.1481,
-      "step": 680
-    },
-    {
-      "epoch": 0.20420242675347736,
-      "grad_norm": 6.0625,
-      "learning_rate": 0.000173348016260244,
-      "loss": 1.8523,
-      "step": 690
-    },
-    {
-      "epoch": 0.20716188221367268,
-      "grad_norm": 18.75,
-      "learning_rate": 0.00017244252047910892,
-      "loss": 1.7534,
-      "step": 700
-    },
-    {
-      "epoch": 0.210121337673868,
-      "grad_norm": 2.265625,
-      "learning_rate": 0.00017152434935448256,
-      "loss": 2.1479,
-      "step": 710
-    },
-    {
-      "epoch": 0.21308079313406333,
-      "grad_norm": 2.8125,
-      "learning_rate": 0.0001705936635397259,
-      "loss": 2.2207,
-      "step": 720
-    },
-    {
-      "epoch": 0.21604024859425866,
-      "grad_norm": 3.59375,
-      "learning_rate": 0.00016965062587790823,
-      "loss": 2.0364,
-      "step": 730
-    },
-    {
-      "epoch": 0.218999704054454,
-      "grad_norm": 3.265625,
-      "learning_rate": 0.00016869540137331445,
-      "loss": 1.8523,
-      "step": 740
-    },
-    {
-      "epoch": 0.2219591595146493,
-      "grad_norm": 10.4375,
-      "learning_rate": 0.00016772815716257412,
-      "loss": 1.8113,
-      "step": 750
-    },
-    {
-      "epoch": 0.22491861497484464,
-      "grad_norm": 2.21875,
-      "learning_rate": 0.00016674906248541726,
-      "loss": 2.2365,
-      "step": 760
-    },
-    {
-      "epoch": 0.22787807043503996,
-      "grad_norm": 3.9375,
-      "learning_rate": 0.00016575828865506245,
-      "loss": 2.0369,
-      "step": 770
-    },
-    {
-      "epoch": 0.2308375258952353,
-      "grad_norm": 4.0625,
-      "learning_rate": 0.0001647560090282419,
-      "loss": 1.9434,
-      "step": 780
-    },
-    {
-      "epoch": 0.2337969813554306,
-      "grad_norm": 6.8125,
-      "learning_rate": 0.000163742398974869,
-      "loss": 1.822,
-      "step": 790
-    },
-    {
-      "epoch": 0.23675643681562591,
-      "grad_norm": 29.0,
-      "learning_rate": 0.0001627176358473537,
-      "loss": 1.9161,
-      "step": 800
-    },
-    {
-      "epoch": 0.23675643681562591,
-      "eval_loss": 2.0661604404449463,
-      "eval_runtime": 24.0938,
-      "eval_samples_per_second": 62.34,
-      "eval_steps_per_second": 15.606,
-      "step": 800
     }
   ],
   "logging_steps": 10,
-  "max_steps": 2500,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
-  "save_steps": 400,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -621,13 +177,13 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.24028893790208e+16,
-  "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 25.0,
   "eval_steps": 200,
+  "global_step": 200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.125,
+      "eval_loss": 2.5584230422973633,
+      "eval_runtime": 4.9063,
+      "eval_samples_per_second": 305.933,
+      "eval_steps_per_second": 3.465,
       "step": 1
     },
     {
+      "epoch": 1.25,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0002,
+      "loss": 2.3584,
       "step": 10
     },
     {
+      "epoch": 2.5,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 2.2406,
       "step": 20
     },
     {
+      "epoch": 3.75,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 2.1785,
       "step": 30
     },
     {
+      "epoch": 5.0,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 2.1359,
       "step": 40
     },
     {
+      "epoch": 6.25,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 2.1125,
       "step": 50
     },
     {
+      "epoch": 7.5,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 2.0939,
       "step": 60
     },
     {
+      "epoch": 8.75,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 2.0638,
       "step": 70
     },
     {
+      "epoch": 10.0,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 2.0632,
       "step": 80
     },
     {
+      "epoch": 11.25,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 2.055,
       "step": 90
     },
     {
+      "epoch": 12.5,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 2.0298,
       "step": 100
     },
     {
+      "epoch": 13.75,
+      "grad_norm": 0.142578125,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 2.0271,
       "step": 110
     },
     {
+      "epoch": 15.0,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 2.0168,
       "step": 120
     },
     {
+      "epoch": 16.25,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 2.0126,
       "step": 130
     },
     {
+      "epoch": 17.5,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 2.0188,
       "step": 140
     },
     {
+      "epoch": 18.75,
+      "grad_norm": 0.134765625,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 2.009,
       "step": 150
     },
     {
+      "epoch": 20.0,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 2.1085949060360654e-05,
+      "loss": 2.0108,
       "step": 160
     },
     {
+      "epoch": 21.25,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.2052624879351104e-05,
+      "loss": 2.0101,
       "step": 170
     },
     {
+      "epoch": 22.5,
+      "grad_norm": 0.146484375,
+      "learning_rate": 5.418275829936537e-06,
+      "loss": 2.0168,
       "step": 180
     },
     {
+      "epoch": 23.75,
+      "grad_norm": 0.146484375,
+      "learning_rate": 1.3638696597277679e-06,
+      "loss": 2.0059,
       "step": 190
     },
     {
+      "epoch": 25.0,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0,
+      "loss": 2.011,
       "step": 200
     },
     {
+      "epoch": 25.0,
+      "eval_loss": 2.0616559982299805,
+      "eval_runtime": 4.9912,
+      "eval_samples_per_second": 300.731,
+      "eval_steps_per_second": 3.406,
       "step": 200
     }
   ],
   "logging_steps": 10,
+  "max_steps": 200,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 25,
+  "save_steps": 200,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 7.113876738932736e+16,
+  "train_batch_size": 23,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a1d092fd3f29167bdacdba8b2489c3b0abb5e1e28bd152259e356428fc4a59e
 size 6840

 version https://git-lfs.github.com/spec/v1
+oid sha256:3156bde561d7a483929e0f1d8c097a973dfeb26f4690b823508131f70e6df615
 size 6840