Training in progress, step 2400, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1413 -5

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7f4451577bf82ae4c14fb8b5f6d15593c695f63d1bc7c8c377049e28c0b6f430
 size 500770656

 version https://git-lfs.github.com/spec/v1
+oid sha256:0bc773204153173a13c1ac40b0d299d63826a9009d800e65a16ac4dff721fee9
 size 500770656

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:435f4a73c69232486ea2c5684eb01e7449a2602d9445e4a4dbe0c21719127715
 size 254918356

 version https://git-lfs.github.com/spec/v1
+oid sha256:2354a9b4460be38c2facc081861e1d39817d9e3f7d6d7818671513775a0f21bd
 size 254918356

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6015ab40414177a8cb3a25519cffb5a624e999127e3ac742f7bf693b450cb8e
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:a8e8df32598dfacb12011daa77172ba188bcb85dc5dfb5c57bf90f20875c1ee3
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e66e55baeee62db229bddf3da45b85b2a91fe7343a6a75e11aba725017a7a321
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:dfa45a2010848f8ba6bd00a9aefaa39f18e6a555b04b4e25c9be094c299a3176
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.635880708694458,
   "best_model_checkpoint": "miner_id_24/checkpoint-1800",
-  "epoch": 0.303469204772743,
   "eval_steps": 200,
-  "global_step": 2200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -15503,6 +15503,1414 @@
       "eval_samples_per_second": 2.51,
       "eval_steps_per_second": 2.51,
       "step": 2200
     }
   ],
   "logging_steps": 1,
@@ -15517,7 +16925,7 @@
         "early_stopping_threshold": 0.0
       },
       "attributes": {
-        "early_stopping_patience_counter": 2
       }
     },
     "TrainerControl": {
@@ -15526,12 +16934,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.483236466111283e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.635880708694458,
   "best_model_checkpoint": "miner_id_24/checkpoint-1800",
+  "epoch": 0.33105731429753776,
   "eval_steps": 200,
+  "global_step": 2400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2.51,
       "eval_steps_per_second": 2.51,
       "step": 2200
+    },
+    {
+      "epoch": 0.3036071453203669,
+      "grad_norm": 0.7370956540107727,
+      "learning_rate": 0.000195108780321177,
+      "loss": 0.4832,
+      "step": 2201
+    },
+    {
+      "epoch": 0.30374508586799087,
+      "grad_norm": 0.6172298192977905,
+      "learning_rate": 0.00019510431046312185,
+      "loss": 0.5685,
+      "step": 2202
+    },
+    {
+      "epoch": 0.3038830264156149,
+      "grad_norm": 0.4689820408821106,
+      "learning_rate": 0.0001950998386148504,
+      "loss": 0.3635,
+      "step": 2203
+    },
+    {
+      "epoch": 0.3040209669632388,
+      "grad_norm": 0.8951042294502258,
+      "learning_rate": 0.00019509536477645617,
+      "loss": 0.8364,
+      "step": 2204
+    },
+    {
+      "epoch": 0.30415890751086283,
+      "grad_norm": 0.6719712018966675,
+      "learning_rate": 0.00019509088894803286,
+      "loss": 0.2531,
+      "step": 2205
+    },
+    {
+      "epoch": 0.3042968480584868,
+      "grad_norm": 0.730803370475769,
+      "learning_rate": 0.00019508641112967408,
+      "loss": 0.5159,
+      "step": 2206
+    },
+    {
+      "epoch": 0.3044347886061108,
+      "grad_norm": 0.7379736304283142,
+      "learning_rate": 0.0001950819313214736,
+      "loss": 0.4162,
+      "step": 2207
+    },
+    {
+      "epoch": 0.30457272915373473,
+      "grad_norm": 0.8285558223724365,
+      "learning_rate": 0.00019507744952352508,
+      "loss": 0.7966,
+      "step": 2208
+    },
+    {
+      "epoch": 0.30471066970135874,
+      "grad_norm": 0.8864738941192627,
+      "learning_rate": 0.00019507296573592235,
+      "loss": 0.7326,
+      "step": 2209
+    },
+    {
+      "epoch": 0.3048486102489827,
+      "grad_norm": 0.7778903841972351,
+      "learning_rate": 0.00019506847995875924,
+      "loss": 0.4939,
+      "step": 2210
+    },
+    {
+      "epoch": 0.30498655079660664,
+      "grad_norm": 0.7180725932121277,
+      "learning_rate": 0.00019506399219212966,
+      "loss": 0.5479,
+      "step": 2211
+    },
+    {
+      "epoch": 0.30512449134423064,
+      "grad_norm": 0.9998635053634644,
+      "learning_rate": 0.00019505950243612746,
+      "loss": 1.02,
+      "step": 2212
+    },
+    {
+      "epoch": 0.3052624318918546,
+      "grad_norm": 1.4325881004333496,
+      "learning_rate": 0.00019505501069084659,
+      "loss": 0.6919,
+      "step": 2213
+    },
+    {
+      "epoch": 0.3054003724394786,
+      "grad_norm": 0.7900728583335876,
+      "learning_rate": 0.00019505051695638113,
+      "loss": 0.3652,
+      "step": 2214
+    },
+    {
+      "epoch": 0.30553831298710254,
+      "grad_norm": 0.8904551863670349,
+      "learning_rate": 0.00019504602123282508,
+      "loss": 0.8051,
+      "step": 2215
+    },
+    {
+      "epoch": 0.30567625353472655,
+      "grad_norm": 0.5742565989494324,
+      "learning_rate": 0.00019504152352027245,
+      "loss": 0.3562,
+      "step": 2216
+    },
+    {
+      "epoch": 0.3058141940823505,
+      "grad_norm": 0.8754223585128784,
+      "learning_rate": 0.00019503702381881745,
+      "loss": 0.7154,
+      "step": 2217
+    },
+    {
+      "epoch": 0.3059521346299745,
+      "grad_norm": 0.834255576133728,
+      "learning_rate": 0.00019503252212855422,
+      "loss": 0.8241,
+      "step": 2218
+    },
+    {
+      "epoch": 0.30609007517759845,
+      "grad_norm": 0.8959856033325195,
+      "learning_rate": 0.00019502801844957697,
+      "loss": 1.1416,
+      "step": 2219
+    },
+    {
+      "epoch": 0.3062280157252224,
+      "grad_norm": 0.76212078332901,
+      "learning_rate": 0.00019502351278197994,
+      "loss": 0.5501,
+      "step": 2220
+    },
+    {
+      "epoch": 0.3063659562728464,
+      "grad_norm": 1.0702933073043823,
+      "learning_rate": 0.0001950190051258574,
+      "loss": 0.5158,
+      "step": 2221
+    },
+    {
+      "epoch": 0.30650389682047036,
+      "grad_norm": 0.9771005511283875,
+      "learning_rate": 0.00019501449548130372,
+      "loss": 0.6492,
+      "step": 2222
+    },
+    {
+      "epoch": 0.30664183736809436,
+      "grad_norm": 0.6449692845344543,
+      "learning_rate": 0.00019500998384841322,
+      "loss": 0.581,
+      "step": 2223
+    },
+    {
+      "epoch": 0.3067797779157183,
+      "grad_norm": 0.6486768126487732,
+      "learning_rate": 0.00019500547022728034,
+      "loss": 0.6896,
+      "step": 2224
+    },
+    {
+      "epoch": 0.3069177184633423,
+      "grad_norm": 0.570933997631073,
+      "learning_rate": 0.00019500095461799955,
+      "loss": 0.4472,
+      "step": 2225
+    },
+    {
+      "epoch": 0.30705565901096626,
+      "grad_norm": 0.6124463081359863,
+      "learning_rate": 0.00019499643702066536,
+      "loss": 0.49,
+      "step": 2226
+    },
+    {
+      "epoch": 0.30719359955859027,
+      "grad_norm": 1.030892014503479,
+      "learning_rate": 0.00019499191743537224,
+      "loss": 0.6116,
+      "step": 2227
+    },
+    {
+      "epoch": 0.3073315401062142,
+      "grad_norm": 0.7422316670417786,
+      "learning_rate": 0.00019498739586221482,
+      "loss": 0.4349,
+      "step": 2228
+    },
+    {
+      "epoch": 0.3074694806538382,
+      "grad_norm": 1.2078644037246704,
+      "learning_rate": 0.00019498287230128775,
+      "loss": 0.8739,
+      "step": 2229
+    },
+    {
+      "epoch": 0.3076074212014622,
+      "grad_norm": 0.6796876788139343,
+      "learning_rate": 0.0001949783467526856,
+      "loss": 0.4402,
+      "step": 2230
+    },
+    {
+      "epoch": 0.3077453617490861,
+      "grad_norm": 0.9108544588088989,
+      "learning_rate": 0.00019497381921650318,
+      "loss": 0.8838,
+      "step": 2231
+    },
+    {
+      "epoch": 0.3078833022967101,
+      "grad_norm": 0.9964629411697388,
+      "learning_rate": 0.00019496928969283517,
+      "loss": 0.7255,
+      "step": 2232
+    },
+    {
+      "epoch": 0.3080212428443341,
+      "grad_norm": 1.5495188236236572,
+      "learning_rate": 0.00019496475818177634,
+      "loss": 1.264,
+      "step": 2233
+    },
+    {
+      "epoch": 0.3081591833919581,
+      "grad_norm": 0.8140445351600647,
+      "learning_rate": 0.0001949602246834216,
+      "loss": 0.9636,
+      "step": 2234
+    },
+    {
+      "epoch": 0.30829712393958203,
+      "grad_norm": 0.6906377077102661,
+      "learning_rate": 0.0001949556891978658,
+      "loss": 0.553,
+      "step": 2235
+    },
+    {
+      "epoch": 0.30843506448720603,
+      "grad_norm": 0.8340548872947693,
+      "learning_rate": 0.00019495115172520378,
+      "loss": 0.5792,
+      "step": 2236
+    },
+    {
+      "epoch": 0.30857300503483,
+      "grad_norm": 1.0296357870101929,
+      "learning_rate": 0.00019494661226553055,
+      "loss": 0.971,
+      "step": 2237
+    },
+    {
+      "epoch": 0.308710945582454,
+      "grad_norm": 0.7610672116279602,
+      "learning_rate": 0.0001949420708189411,
+      "loss": 0.5375,
+      "step": 2238
+    },
+    {
+      "epoch": 0.30884888613007794,
+      "grad_norm": 0.722172200679779,
+      "learning_rate": 0.00019493752738553046,
+      "loss": 0.479,
+      "step": 2239
+    },
+    {
+      "epoch": 0.3089868266777019,
+      "grad_norm": 0.8141410946846008,
+      "learning_rate": 0.00019493298196539375,
+      "loss": 0.8384,
+      "step": 2240
+    },
+    {
+      "epoch": 0.3091247672253259,
+      "grad_norm": 0.7743800282478333,
+      "learning_rate": 0.000194928434558626,
+      "loss": 0.9943,
+      "step": 2241
+    },
+    {
+      "epoch": 0.30926270777294984,
+      "grad_norm": 0.6680206656455994,
+      "learning_rate": 0.00019492388516532247,
+      "loss": 0.4103,
+      "step": 2242
+    },
+    {
+      "epoch": 0.30940064832057385,
+      "grad_norm": 0.9488325715065002,
+      "learning_rate": 0.0001949193337855783,
+      "loss": 0.8465,
+      "step": 2243
+    },
+    {
+      "epoch": 0.3095385888681978,
+      "grad_norm": 0.5857890248298645,
+      "learning_rate": 0.00019491478041948877,
+      "loss": 0.395,
+      "step": 2244
+    },
+    {
+      "epoch": 0.3096765294158218,
+      "grad_norm": 0.5725042223930359,
+      "learning_rate": 0.00019491022506714912,
+      "loss": 0.3626,
+      "step": 2245
+    },
+    {
+      "epoch": 0.30981446996344575,
+      "grad_norm": 0.7076693773269653,
+      "learning_rate": 0.00019490566772865475,
+      "loss": 0.5949,
+      "step": 2246
+    },
+    {
+      "epoch": 0.30995241051106975,
+      "grad_norm": 0.8544387817382812,
+      "learning_rate": 0.00019490110840410097,
+      "loss": 1.0608,
+      "step": 2247
+    },
+    {
+      "epoch": 0.3100903510586937,
+      "grad_norm": 0.832599937915802,
+      "learning_rate": 0.00019489654709358323,
+      "loss": 0.807,
+      "step": 2248
+    },
+    {
+      "epoch": 0.31022829160631765,
+      "grad_norm": 1.0049424171447754,
+      "learning_rate": 0.00019489198379719696,
+      "loss": 0.794,
+      "step": 2249
+    },
+    {
+      "epoch": 0.31036623215394166,
+      "grad_norm": 0.6564392447471619,
+      "learning_rate": 0.00019488741851503765,
+      "loss": 0.5557,
+      "step": 2250
+    },
+    {
+      "epoch": 0.3105041727015656,
+      "grad_norm": 0.5619440078735352,
+      "learning_rate": 0.00019488285124720086,
+      "loss": 0.4077,
+      "step": 2251
+    },
+    {
+      "epoch": 0.3106421132491896,
+      "grad_norm": 0.5860351920127869,
+      "learning_rate": 0.00019487828199378214,
+      "loss": 0.4018,
+      "step": 2252
+    },
+    {
+      "epoch": 0.31078005379681356,
+      "grad_norm": 0.7864125370979309,
+      "learning_rate": 0.00019487371075487713,
+      "loss": 0.6525,
+      "step": 2253
+    },
+    {
+      "epoch": 0.31091799434443756,
+      "grad_norm": 0.6421269178390503,
+      "learning_rate": 0.00019486913753058148,
+      "loss": 0.4446,
+      "step": 2254
+    },
+    {
+      "epoch": 0.3110559348920615,
+      "grad_norm": 1.2416633367538452,
+      "learning_rate": 0.0001948645623209909,
+      "loss": 0.5695,
+      "step": 2255
+    },
+    {
+      "epoch": 0.3111938754396855,
+      "grad_norm": 1.3990689516067505,
+      "learning_rate": 0.00019485998512620113,
+      "loss": 0.8486,
+      "step": 2256
+    },
+    {
+      "epoch": 0.31133181598730947,
+      "grad_norm": 0.8644762635231018,
+      "learning_rate": 0.00019485540594630794,
+      "loss": 0.5197,
+      "step": 2257
+    },
+    {
+      "epoch": 0.3114697565349334,
+      "grad_norm": 0.7197523713111877,
+      "learning_rate": 0.0001948508247814072,
+      "loss": 0.4854,
+      "step": 2258
+    },
+    {
+      "epoch": 0.3116076970825574,
+      "grad_norm": 0.7777307033538818,
+      "learning_rate": 0.00019484624163159474,
+      "loss": 0.8011,
+      "step": 2259
+    },
+    {
+      "epoch": 0.31174563763018137,
+      "grad_norm": 3.498762369155884,
+      "learning_rate": 0.00019484165649696648,
+      "loss": 1.2415,
+      "step": 2260
+    },
+    {
+      "epoch": 0.3118835781778054,
+      "grad_norm": 0.8177916407585144,
+      "learning_rate": 0.00019483706937761837,
+      "loss": 0.6254,
+      "step": 2261
+    },
+    {
+      "epoch": 0.3120215187254293,
+      "grad_norm": 0.8077528476715088,
+      "learning_rate": 0.0001948324802736464,
+      "loss": 1.1841,
+      "step": 2262
+    },
+    {
+      "epoch": 0.31215945927305333,
+      "grad_norm": 0.7529622316360474,
+      "learning_rate": 0.00019482788918514664,
+      "loss": 0.5046,
+      "step": 2263
+    },
+    {
+      "epoch": 0.3122973998206773,
+      "grad_norm": 0.6038236618041992,
+      "learning_rate": 0.0001948232961122151,
+      "loss": 0.5598,
+      "step": 2264
+    },
+    {
+      "epoch": 0.3124353403683013,
+      "grad_norm": 0.6496687531471252,
+      "learning_rate": 0.00019481870105494796,
+      "loss": 0.3127,
+      "step": 2265
+    },
+    {
+      "epoch": 0.31257328091592523,
+      "grad_norm": 0.8372655510902405,
+      "learning_rate": 0.00019481410401344133,
+      "loss": 0.7623,
+      "step": 2266
+    },
+    {
+      "epoch": 0.3127112214635492,
+      "grad_norm": 0.9408671855926514,
+      "learning_rate": 0.00019480950498779144,
+      "loss": 0.913,
+      "step": 2267
+    },
+    {
+      "epoch": 0.3128491620111732,
+      "grad_norm": 1.2297847270965576,
+      "learning_rate": 0.00019480490397809456,
+      "loss": 0.7727,
+      "step": 2268
+    },
+    {
+      "epoch": 0.31298710255879714,
+      "grad_norm": 0.8657265305519104,
+      "learning_rate": 0.0001948003009844469,
+      "loss": 0.7712,
+      "step": 2269
+    },
+    {
+      "epoch": 0.31312504310642114,
+      "grad_norm": 0.6789664030075073,
+      "learning_rate": 0.00019479569600694486,
+      "loss": 0.5377,
+      "step": 2270
+    },
+    {
+      "epoch": 0.3132629836540451,
+      "grad_norm": 0.8153241872787476,
+      "learning_rate": 0.00019479108904568474,
+      "loss": 0.438,
+      "step": 2271
+    },
+    {
+      "epoch": 0.3134009242016691,
+      "grad_norm": 0.820363461971283,
+      "learning_rate": 0.00019478648010076298,
+      "loss": 0.5774,
+      "step": 2272
+    },
+    {
+      "epoch": 0.31353886474929304,
+      "grad_norm": 0.9345502853393555,
+      "learning_rate": 0.00019478186917227605,
+      "loss": 0.7403,
+      "step": 2273
+    },
+    {
+      "epoch": 0.31367680529691705,
+      "grad_norm": 0.6386396884918213,
+      "learning_rate": 0.00019477725626032043,
+      "loss": 0.5016,
+      "step": 2274
+    },
+    {
+      "epoch": 0.313814745844541,
+      "grad_norm": 1.081990122795105,
+      "learning_rate": 0.00019477264136499262,
+      "loss": 0.7868,
+      "step": 2275
+    },
+    {
+      "epoch": 0.313952686392165,
+      "grad_norm": 0.7201882600784302,
+      "learning_rate": 0.00019476802448638924,
+      "loss": 0.488,
+      "step": 2276
+    },
+    {
+      "epoch": 0.31409062693978895,
+      "grad_norm": 0.7955479621887207,
+      "learning_rate": 0.00019476340562460688,
+      "loss": 0.7676,
+      "step": 2277
+    },
+    {
+      "epoch": 0.3142285674874129,
+      "grad_norm": 0.731919527053833,
+      "learning_rate": 0.0001947587847797422,
+      "loss": 0.579,
+      "step": 2278
+    },
+    {
+      "epoch": 0.3143665080350369,
+      "grad_norm": 1.8228474855422974,
+      "learning_rate": 0.00019475416195189192,
+      "loss": 0.8461,
+      "step": 2279
+    },
+    {
+      "epoch": 0.31450444858266086,
+      "grad_norm": 0.5661347508430481,
+      "learning_rate": 0.00019474953714115274,
+      "loss": 0.3593,
+      "step": 2280
+    },
+    {
+      "epoch": 0.31464238913028486,
+      "grad_norm": 0.747999370098114,
+      "learning_rate": 0.00019474491034762145,
+      "loss": 0.6878,
+      "step": 2281
+    },
+    {
+      "epoch": 0.3147803296779088,
+      "grad_norm": 0.9928996562957764,
+      "learning_rate": 0.0001947402815713949,
+      "loss": 0.8761,
+      "step": 2282
+    },
+    {
+      "epoch": 0.3149182702255328,
+      "grad_norm": 0.7003133893013,
+      "learning_rate": 0.00019473565081256996,
+      "loss": 0.4855,
+      "step": 2283
+    },
+    {
+      "epoch": 0.31505621077315676,
+      "grad_norm": 0.6472734808921814,
+      "learning_rate": 0.00019473101807124352,
+      "loss": 0.511,
+      "step": 2284
+    },
+    {
+      "epoch": 0.31519415132078077,
+      "grad_norm": 0.723513662815094,
+      "learning_rate": 0.0001947263833475125,
+      "loss": 0.4892,
+      "step": 2285
+    },
+    {
+      "epoch": 0.3153320918684047,
+      "grad_norm": 1.6176047325134277,
+      "learning_rate": 0.00019472174664147393,
+      "loss": 0.5581,
+      "step": 2286
+    },
+    {
+      "epoch": 0.31547003241602867,
+      "grad_norm": 0.9376205801963806,
+      "learning_rate": 0.00019471710795322485,
+      "loss": 1.091,
+      "step": 2287
+    },
+    {
+      "epoch": 0.31560797296365267,
+      "grad_norm": 1.0848584175109863,
+      "learning_rate": 0.00019471246728286227,
+      "loss": 0.6718,
+      "step": 2288
+    },
+    {
+      "epoch": 0.3157459135112766,
+      "grad_norm": 1.0394634008407593,
+      "learning_rate": 0.00019470782463048336,
+      "loss": 0.4477,
+      "step": 2289
+    },
+    {
+      "epoch": 0.3158838540589006,
+      "grad_norm": 0.8964745998382568,
+      "learning_rate": 0.00019470317999618523,
+      "loss": 0.4769,
+      "step": 2290
+    },
+    {
+      "epoch": 0.3160217946065246,
+      "grad_norm": 0.6246095299720764,
+      "learning_rate": 0.00019469853338006514,
+      "loss": 0.2479,
+      "step": 2291
+    },
+    {
+      "epoch": 0.3161597351541486,
+      "grad_norm": 0.878368079662323,
+      "learning_rate": 0.0001946938847822203,
+      "loss": 0.3964,
+      "step": 2292
+    },
+    {
+      "epoch": 0.31629767570177253,
+      "grad_norm": 0.6446416974067688,
+      "learning_rate": 0.00019468923420274797,
+      "loss": 0.6782,
+      "step": 2293
+    },
+    {
+      "epoch": 0.31643561624939653,
+      "grad_norm": 0.8462199568748474,
+      "learning_rate": 0.0001946845816417455,
+      "loss": 0.6378,
+      "step": 2294
+    },
+    {
+      "epoch": 0.3165735567970205,
+      "grad_norm": 0.7193346619606018,
+      "learning_rate": 0.00019467992709931017,
+      "loss": 0.5933,
+      "step": 2295
+    },
+    {
+      "epoch": 0.31671149734464443,
+      "grad_norm": 1.4028959274291992,
+      "learning_rate": 0.00019467527057553952,
+      "loss": 1.1746,
+      "step": 2296
+    },
+    {
+      "epoch": 0.31684943789226844,
+      "grad_norm": 0.8412365913391113,
+      "learning_rate": 0.00019467061207053087,
+      "loss": 0.5632,
+      "step": 2297
+    },
+    {
+      "epoch": 0.3169873784398924,
+      "grad_norm": 0.6352449655532837,
+      "learning_rate": 0.0001946659515843818,
+      "loss": 0.4559,
+      "step": 2298
+    },
+    {
+      "epoch": 0.3171253189875164,
+      "grad_norm": 0.48701727390289307,
+      "learning_rate": 0.00019466128911718982,
+      "loss": 0.2398,
+      "step": 2299
+    },
+    {
+      "epoch": 0.31726325953514034,
+      "grad_norm": 0.5449528098106384,
+      "learning_rate": 0.00019465662466905243,
+      "loss": 0.6206,
+      "step": 2300
+    },
+    {
+      "epoch": 0.31740120008276435,
+      "grad_norm": 1.2383208274841309,
+      "learning_rate": 0.00019465195824006732,
+      "loss": 0.9354,
+      "step": 2301
+    },
+    {
+      "epoch": 0.3175391406303883,
+      "grad_norm": 0.9451349377632141,
+      "learning_rate": 0.00019464728983033212,
+      "loss": 0.9349,
+      "step": 2302
+    },
+    {
+      "epoch": 0.3176770811780123,
+      "grad_norm": 0.7076907753944397,
+      "learning_rate": 0.0001946426194399445,
+      "loss": 0.667,
+      "step": 2303
+    },
+    {
+      "epoch": 0.31781502172563625,
+      "grad_norm": 0.6356270909309387,
+      "learning_rate": 0.00019463794706900224,
+      "loss": 0.2469,
+      "step": 2304
+    },
+    {
+      "epoch": 0.3179529622732602,
+      "grad_norm": 0.8059444427490234,
+      "learning_rate": 0.00019463327271760308,
+      "loss": 0.6322,
+      "step": 2305
+    },
+    {
+      "epoch": 0.3180909028208842,
+      "grad_norm": 0.7126657366752625,
+      "learning_rate": 0.00019462859638584484,
+      "loss": 0.4607,
+      "step": 2306
+    },
+    {
+      "epoch": 0.31822884336850815,
+      "grad_norm": 1.20512855052948,
+      "learning_rate": 0.0001946239180738254,
+      "loss": 0.7065,
+      "step": 2307
+    },
+    {
+      "epoch": 0.31836678391613216,
+      "grad_norm": 1.0039737224578857,
+      "learning_rate": 0.00019461923778164267,
+      "loss": 0.7817,
+      "step": 2308
+    },
+    {
+      "epoch": 0.3185047244637561,
+      "grad_norm": 0.8472278118133545,
+      "learning_rate": 0.00019461455550939455,
+      "loss": 0.7392,
+      "step": 2309
+    },
+    {
+      "epoch": 0.3186426650113801,
+      "grad_norm": 0.8026204109191895,
+      "learning_rate": 0.00019460987125717905,
+      "loss": 0.6547,
+      "step": 2310
+    },
+    {
+      "epoch": 0.31878060555900406,
+      "grad_norm": 0.985788881778717,
+      "learning_rate": 0.00019460518502509422,
+      "loss": 0.3619,
+      "step": 2311
+    },
+    {
+      "epoch": 0.31891854610662806,
+      "grad_norm": 0.913837194442749,
+      "learning_rate": 0.00019460049681323808,
+      "loss": 0.8376,
+      "step": 2312
+    },
+    {
+      "epoch": 0.319056486654252,
+      "grad_norm": 0.6265845894813538,
+      "learning_rate": 0.0001945958066217088,
+      "loss": 0.579,
+      "step": 2313
+    },
+    {
+      "epoch": 0.319194427201876,
+      "grad_norm": 0.9424504637718201,
+      "learning_rate": 0.00019459111445060444,
+      "loss": 0.5184,
+      "step": 2314
+    },
+    {
+      "epoch": 0.31933236774949997,
+      "grad_norm": 0.5835946202278137,
+      "learning_rate": 0.00019458642030002326,
+      "loss": 0.4495,
+      "step": 2315
+    },
+    {
+      "epoch": 0.3194703082971239,
+      "grad_norm": 0.7594127058982849,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 0.7142,
+      "step": 2316
+    },
+    {
+      "epoch": 0.3196082488447479,
+      "grad_norm": 0.6176849007606506,
+      "learning_rate": 0.00019457702606082337,
+      "loss": 0.3594,
+      "step": 2317
+    },
+    {
+      "epoch": 0.31974618939237187,
+      "grad_norm": 1.6596888303756714,
+      "learning_rate": 0.00019457232597240126,
+      "loss": 0.9118,
+      "step": 2318
+    },
+    {
+      "epoch": 0.3198841299399959,
+      "grad_norm": 0.8690287470817566,
+      "learning_rate": 0.00019456762390489548,
+      "loss": 0.566,
+      "step": 2319
+    },
+    {
+      "epoch": 0.3200220704876198,
+      "grad_norm": 1.1131110191345215,
+      "learning_rate": 0.0001945629198584044,
+      "loss": 1.1,
+      "step": 2320
+    },
+    {
+      "epoch": 0.32016001103524383,
+      "grad_norm": 0.7218566536903381,
+      "learning_rate": 0.00019455821383302657,
+      "loss": 0.5501,
+      "step": 2321
+    },
+    {
+      "epoch": 0.3202979515828678,
+      "grad_norm": 0.5688751339912415,
+      "learning_rate": 0.00019455350582886038,
+      "loss": 0.5373,
+      "step": 2322
+    },
+    {
+      "epoch": 0.3204358921304918,
+      "grad_norm": 1.2792819738388062,
+      "learning_rate": 0.00019454879584600437,
+      "loss": 0.733,
+      "step": 2323
+    },
+    {
+      "epoch": 0.32057383267811573,
+      "grad_norm": 0.9383312463760376,
+      "learning_rate": 0.0001945440838845571,
+      "loss": 0.4367,
+      "step": 2324
+    },
+    {
+      "epoch": 0.3207117732257397,
+      "grad_norm": 0.9324066042900085,
+      "learning_rate": 0.00019453936994461718,
+      "loss": 0.9925,
+      "step": 2325
+    },
+    {
+      "epoch": 0.3208497137733637,
+      "grad_norm": 1.0629867315292358,
+      "learning_rate": 0.0001945346540262833,
+      "loss": 0.7296,
+      "step": 2326
+    },
+    {
+      "epoch": 0.32098765432098764,
+      "grad_norm": 0.7863196730613708,
+      "learning_rate": 0.0001945299361296541,
+      "loss": 0.936,
+      "step": 2327
+    },
+    {
+      "epoch": 0.32112559486861164,
+      "grad_norm": 0.6948659420013428,
+      "learning_rate": 0.0001945252162548283,
+      "loss": 0.4759,
+      "step": 2328
+    },
+    {
+      "epoch": 0.3212635354162356,
+      "grad_norm": 0.908307671546936,
+      "learning_rate": 0.00019452049440190473,
+      "loss": 0.8042,
+      "step": 2329
+    },
+    {
+      "epoch": 0.3214014759638596,
+      "grad_norm": 0.814140796661377,
+      "learning_rate": 0.00019451577057098213,
+      "loss": 0.7884,
+      "step": 2330
+    },
+    {
+      "epoch": 0.32153941651148354,
+      "grad_norm": 0.752573549747467,
+      "learning_rate": 0.0001945110447621594,
+      "loss": 0.8405,
+      "step": 2331
+    },
+    {
+      "epoch": 0.32167735705910755,
+      "grad_norm": 0.8677518963813782,
+      "learning_rate": 0.00019450631697553542,
+      "loss": 0.8891,
+      "step": 2332
+    },
+    {
+      "epoch": 0.3218152976067315,
+      "grad_norm": 0.7212129831314087,
+      "learning_rate": 0.00019450158721120916,
+      "loss": 0.5369,
+      "step": 2333
+    },
+    {
+      "epoch": 0.32195323815435545,
+      "grad_norm": 0.6805658936500549,
+      "learning_rate": 0.00019449685546927954,
+      "loss": 0.4181,
+      "step": 2334
+    },
+    {
+      "epoch": 0.32209117870197945,
+      "grad_norm": 0.8572118878364563,
+      "learning_rate": 0.0001944921217498456,
+      "loss": 0.4678,
+      "step": 2335
+    },
+    {
+      "epoch": 0.3222291192496034,
+      "grad_norm": 0.7739250063896179,
+      "learning_rate": 0.00019448738605300645,
+      "loss": 0.8138,
+      "step": 2336
+    },
+    {
+      "epoch": 0.3223670597972274,
+      "grad_norm": 0.9221212863922119,
+      "learning_rate": 0.00019448264837886113,
+      "loss": 0.6867,
+      "step": 2337
+    },
+    {
+      "epoch": 0.32250500034485136,
+      "grad_norm": 0.5943915247917175,
+      "learning_rate": 0.0001944779087275088,
+      "loss": 0.4269,
+      "step": 2338
+    },
+    {
+      "epoch": 0.32264294089247536,
+      "grad_norm": 0.7601683735847473,
+      "learning_rate": 0.00019447316709904865,
+      "loss": 0.4699,
+      "step": 2339
+    },
+    {
+      "epoch": 0.3227808814400993,
+      "grad_norm": 0.8653863072395325,
+      "learning_rate": 0.0001944684234935799,
+      "loss": 0.6408,
+      "step": 2340
+    },
+    {
+      "epoch": 0.3229188219877233,
+      "grad_norm": 0.8126456141471863,
+      "learning_rate": 0.00019446367791120186,
+      "loss": 0.7773,
+      "step": 2341
+    },
+    {
+      "epoch": 0.32305676253534726,
+      "grad_norm": 0.6638123393058777,
+      "learning_rate": 0.00019445893035201383,
+      "loss": 0.4854,
+      "step": 2342
+    },
+    {
+      "epoch": 0.3231947030829712,
+      "grad_norm": 1.3545905351638794,
+      "learning_rate": 0.00019445418081611506,
+      "loss": 0.9794,
+      "step": 2343
+    },
+    {
+      "epoch": 0.3233326436305952,
+      "grad_norm": 0.8681669235229492,
+      "learning_rate": 0.00019444942930360503,
+      "loss": 0.8998,
+      "step": 2344
+    },
+    {
+      "epoch": 0.32347058417821917,
+      "grad_norm": 1.0023455619812012,
+      "learning_rate": 0.00019444467581458322,
+      "loss": 0.7062,
+      "step": 2345
+    },
+    {
+      "epoch": 0.32360852472584317,
+      "grad_norm": 0.8101288676261902,
+      "learning_rate": 0.00019443992034914897,
+      "loss": 0.6581,
+      "step": 2346
+    },
+    {
+      "epoch": 0.3237464652734671,
+      "grad_norm": 1.2586729526519775,
+      "learning_rate": 0.00019443516290740194,
+      "loss": 0.7804,
+      "step": 2347
+    },
+    {
+      "epoch": 0.3238844058210911,
+      "grad_norm": 0.9507285356521606,
+      "learning_rate": 0.00019443040348944156,
+      "loss": 0.6049,
+      "step": 2348
+    },
+    {
+      "epoch": 0.3240223463687151,
+      "grad_norm": 0.6528936624526978,
+      "learning_rate": 0.00019442564209536754,
+      "loss": 0.4616,
+      "step": 2349
+    },
+    {
+      "epoch": 0.3241602869163391,
+      "grad_norm": 0.7113572359085083,
+      "learning_rate": 0.00019442087872527944,
+      "loss": 0.6116,
+      "step": 2350
+    },
+    {
+      "epoch": 0.32429822746396303,
+      "grad_norm": 0.5419871807098389,
+      "learning_rate": 0.00019441611337927696,
+      "loss": 0.2321,
+      "step": 2351
+    },
+    {
+      "epoch": 0.324436168011587,
+      "grad_norm": 0.679607629776001,
+      "learning_rate": 0.00019441134605745986,
+      "loss": 0.459,
+      "step": 2352
+    },
+    {
+      "epoch": 0.324574108559211,
+      "grad_norm": 0.9691960215568542,
+      "learning_rate": 0.00019440657675992787,
+      "loss": 0.4727,
+      "step": 2353
+    },
+    {
+      "epoch": 0.32471204910683493,
+      "grad_norm": 0.8125988841056824,
+      "learning_rate": 0.0001944018054867808,
+      "loss": 0.8017,
+      "step": 2354
+    },
+    {
+      "epoch": 0.32484998965445894,
+      "grad_norm": 1.649573802947998,
+      "learning_rate": 0.00019439703223811847,
+      "loss": 0.829,
+      "step": 2355
+    },
+    {
+      "epoch": 0.3249879302020829,
+      "grad_norm": 0.745305061340332,
+      "learning_rate": 0.00019439225701404085,
+      "loss": 0.4651,
+      "step": 2356
+    },
+    {
+      "epoch": 0.3251258707497069,
+      "grad_norm": 0.6748473048210144,
+      "learning_rate": 0.00019438747981464775,
+      "loss": 0.5996,
+      "step": 2357
+    },
+    {
+      "epoch": 0.32526381129733084,
+      "grad_norm": 1.0531598329544067,
+      "learning_rate": 0.00019438270064003926,
+      "loss": 0.9084,
+      "step": 2358
+    },
+    {
+      "epoch": 0.32540175184495485,
+      "grad_norm": 0.9223348498344421,
+      "learning_rate": 0.00019437791949031535,
+      "loss": 0.7866,
+      "step": 2359
+    },
+    {
+      "epoch": 0.3255396923925788,
+      "grad_norm": 0.7526196837425232,
+      "learning_rate": 0.00019437313636557602,
+      "loss": 0.4975,
+      "step": 2360
+    },
+    {
+      "epoch": 0.3256776329402028,
+      "grad_norm": 1.6201496124267578,
+      "learning_rate": 0.00019436835126592143,
+      "loss": 0.7395,
+      "step": 2361
+    },
+    {
+      "epoch": 0.32581557348782675,
+      "grad_norm": 0.7340310215950012,
+      "learning_rate": 0.00019436356419145166,
+      "loss": 0.6325,
+      "step": 2362
+    },
+    {
+      "epoch": 0.3259535140354507,
+      "grad_norm": 1.1777743101119995,
+      "learning_rate": 0.00019435877514226697,
+      "loss": 0.4779,
+      "step": 2363
+    },
+    {
+      "epoch": 0.3260914545830747,
+      "grad_norm": 0.9242397546768188,
+      "learning_rate": 0.00019435398411846752,
+      "loss": 0.4674,
+      "step": 2364
+    },
+    {
+      "epoch": 0.32622939513069865,
+      "grad_norm": 0.6935853958129883,
+      "learning_rate": 0.00019434919112015355,
+      "loss": 0.3949,
+      "step": 2365
+    },
+    {
+      "epoch": 0.32636733567832266,
+      "grad_norm": 0.7134401202201843,
+      "learning_rate": 0.00019434439614742543,
+      "loss": 0.5659,
+      "step": 2366
+    },
+    {
+      "epoch": 0.3265052762259466,
+      "grad_norm": 0.9489606618881226,
+      "learning_rate": 0.00019433959920038345,
+      "loss": 0.7124,
+      "step": 2367
+    },
+    {
+      "epoch": 0.3266432167735706,
+      "grad_norm": 0.6194107532501221,
+      "learning_rate": 0.000194334800279128,
+      "loss": 0.8171,
+      "step": 2368
+    },
+    {
+      "epoch": 0.32678115732119456,
+      "grad_norm": 0.8815126419067383,
+      "learning_rate": 0.00019432999938375953,
+      "loss": 0.5195,
+      "step": 2369
+    },
+    {
+      "epoch": 0.32691909786881856,
+      "grad_norm": 0.5797806978225708,
+      "learning_rate": 0.0001943251965143785,
+      "loss": 0.4952,
+      "step": 2370
+    },
+    {
+      "epoch": 0.3270570384164425,
+      "grad_norm": 0.9306840300559998,
+      "learning_rate": 0.00019432039167108537,
+      "loss": 0.476,
+      "step": 2371
+    },
+    {
+      "epoch": 0.32719497896406646,
+      "grad_norm": 0.6784822344779968,
+      "learning_rate": 0.00019431558485398076,
+      "loss": 0.641,
+      "step": 2372
+    },
+    {
+      "epoch": 0.32733291951169047,
+      "grad_norm": 0.7142674922943115,
+      "learning_rate": 0.00019431077606316523,
+      "loss": 0.6712,
+      "step": 2373
+    },
+    {
+      "epoch": 0.3274708600593144,
+      "grad_norm": 1.0263147354125977,
+      "learning_rate": 0.00019430596529873938,
+      "loss": 0.8278,
+      "step": 2374
+    },
+    {
+      "epoch": 0.3276088006069384,
+      "grad_norm": 0.672478199005127,
+      "learning_rate": 0.00019430115256080394,
+      "loss": 0.5935,
+      "step": 2375
+    },
+    {
+      "epoch": 0.32774674115456237,
+      "grad_norm": 0.9333507418632507,
+      "learning_rate": 0.0001942963378494596,
+      "loss": 0.664,
+      "step": 2376
+    },
+    {
+      "epoch": 0.3278846817021864,
+      "grad_norm": 0.8227028250694275,
+      "learning_rate": 0.0001942915211648071,
+      "loss": 0.3793,
+      "step": 2377
+    },
+    {
+      "epoch": 0.3280226222498103,
+      "grad_norm": 0.8363267183303833,
+      "learning_rate": 0.00019428670250694728,
+      "loss": 0.534,
+      "step": 2378
+    },
+    {
+      "epoch": 0.32816056279743433,
+      "grad_norm": 0.6801791787147522,
+      "learning_rate": 0.00019428188187598094,
+      "loss": 0.5693,
+      "step": 2379
+    },
+    {
+      "epoch": 0.3282985033450583,
+      "grad_norm": 0.9937869310379028,
+      "learning_rate": 0.00019427705927200896,
+      "loss": 0.4011,
+      "step": 2380
+    },
+    {
+      "epoch": 0.32843644389268223,
+      "grad_norm": 0.7679700255393982,
+      "learning_rate": 0.00019427223469513228,
+      "loss": 0.4928,
+      "step": 2381
+    },
+    {
+      "epoch": 0.32857438444030623,
+      "grad_norm": 1.2696233987808228,
+      "learning_rate": 0.00019426740814545185,
+      "loss": 0.3716,
+      "step": 2382
+    },
+    {
+      "epoch": 0.3287123249879302,
+      "grad_norm": 0.816831648349762,
+      "learning_rate": 0.00019426257962306868,
+      "loss": 0.469,
+      "step": 2383
+    },
+    {
+      "epoch": 0.3288502655355542,
+      "grad_norm": 1.172206163406372,
+      "learning_rate": 0.0001942577491280838,
+      "loss": 0.7011,
+      "step": 2384
+    },
+    {
+      "epoch": 0.32898820608317814,
+      "grad_norm": 0.8468907475471497,
+      "learning_rate": 0.00019425291666059832,
+      "loss": 0.2813,
+      "step": 2385
+    },
+    {
+      "epoch": 0.32912614663080214,
+      "grad_norm": 0.9245859980583191,
+      "learning_rate": 0.00019424808222071337,
+      "loss": 0.5006,
+      "step": 2386
+    },
+    {
+      "epoch": 0.3292640871784261,
+      "grad_norm": 1.3314694166183472,
+      "learning_rate": 0.00019424324580853006,
+      "loss": 0.3318,
+      "step": 2387
+    },
+    {
+      "epoch": 0.3294020277260501,
+      "grad_norm": 0.6868737936019897,
+      "learning_rate": 0.00019423840742414968,
+      "loss": 0.4828,
+      "step": 2388
+    },
+    {
+      "epoch": 0.32953996827367404,
+      "grad_norm": 0.5695831775665283,
+      "learning_rate": 0.00019423356706767343,
+      "loss": 0.4117,
+      "step": 2389
+    },
+    {
+      "epoch": 0.329677908821298,
+      "grad_norm": 0.8199607729911804,
+      "learning_rate": 0.00019422872473920264,
+      "loss": 0.8271,
+      "step": 2390
+    },
+    {
+      "epoch": 0.329815849368922,
+      "grad_norm": 0.9360648989677429,
+      "learning_rate": 0.0001942238804388386,
+      "loss": 0.8707,
+      "step": 2391
+    },
+    {
+      "epoch": 0.32995378991654595,
+      "grad_norm": 0.7775169610977173,
+      "learning_rate": 0.00019421903416668273,
+      "loss": 0.5637,
+      "step": 2392
+    },
+    {
+      "epoch": 0.33009173046416995,
+      "grad_norm": 0.8939715027809143,
+      "learning_rate": 0.0001942141859228364,
+      "loss": 0.681,
+      "step": 2393
+    },
+    {
+      "epoch": 0.3302296710117939,
+      "grad_norm": 0.7903376221656799,
+      "learning_rate": 0.00019420933570740112,
+      "loss": 0.6019,
+      "step": 2394
+    },
+    {
+      "epoch": 0.3303676115594179,
+      "grad_norm": 0.5653364062309265,
+      "learning_rate": 0.00019420448352047833,
+      "loss": 0.4377,
+      "step": 2395
+    },
+    {
+      "epoch": 0.33050555210704186,
+      "grad_norm": 0.6574212312698364,
+      "learning_rate": 0.0001941996293621696,
+      "loss": 0.3995,
+      "step": 2396
+    },
+    {
+      "epoch": 0.33064349265466586,
+      "grad_norm": 0.9487119913101196,
+      "learning_rate": 0.00019419477323257654,
+      "loss": 0.652,
+      "step": 2397
+    },
+    {
+      "epoch": 0.3307814332022898,
+      "grad_norm": 0.8530499339103699,
+      "learning_rate": 0.0001941899151318007,
+      "loss": 0.7224,
+      "step": 2398
+    },
+    {
+      "epoch": 0.3309193737499138,
+      "grad_norm": 0.8137893676757812,
+      "learning_rate": 0.0001941850550599438,
+      "loss": 0.5038,
+      "step": 2399
+    },
+    {
+      "epoch": 0.33105731429753776,
+      "grad_norm": 0.8479599356651306,
+      "learning_rate": 0.00019418019301710757,
+      "loss": 0.4543,
+      "step": 2400
+    },
+    {
+      "epoch": 0.33105731429753776,
+      "eval_loss": 0.6588593125343323,
+      "eval_runtime": 23.4746,
+      "eval_samples_per_second": 2.513,
+      "eval_steps_per_second": 2.513,
+      "step": 2400
     }
   ],
   "logging_steps": 1,
         "early_stopping_threshold": 0.0
       },
       "attributes": {
+        "early_stopping_patience_counter": 3
       }
     },
     "TrainerControl": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.8002757108760576e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null