Training in progress, step 432, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +928 -4

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd44ca5fee68a8b8f778de2f7ea71f9f6f6cfb20463e45c031a770c28b3e3643
 size 671149168

 version https://git-lfs.github.com/spec/v1
+oid sha256:0b3baf7f7a16380fb028f475ca35fb4a1485b5449798007811d88895bd00a770
 size 671149168

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84cad7780694e8033405e2f242fd2aa6390a53e4fd80756c03dc320e1e20c4e9
 size 341314644

 version https://git-lfs.github.com/spec/v1
+oid sha256:5e22ca7071dd076252b43b473fa1704a21a4785031a90f0b9028e8c83e8ba89a
 size 341314644

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d38a8a486a83654e4e32aea3ee442c71560380819051d896a05b6a8cb94f5ff6
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:6d78c488bb947774f6f4ca54910a40ea5bafc41a037da391b9eb2e717aae29e0
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c5b2e441bf9b0fdfc8cb71a6f42776e1a140513500e1faf056d1a979f781bb7
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:25fb31c3275f1b8913e16332b6b15556ec3b58d9bf65928efef64fedc142b95b
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.3306281268596649,
   "best_model_checkpoint": "miner_id_24/checkpoint-300",
-  "epoch": 0.6948465547191662,
   "eval_steps": 150,
-  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2131,6 +2131,930 @@
       "eval_samples_per_second": 23.374,
       "eval_steps_per_second": 5.851,
       "step": 300
     }
   ],
   "logging_steps": 1,
@@ -2154,12 +3078,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 2.2835124298265395e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.3306281268596649,
   "best_model_checkpoint": "miner_id_24/checkpoint-300",
+  "epoch": 1.0011580775911986,
   "eval_steps": 150,
+  "global_step": 432,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 23.374,
       "eval_steps_per_second": 5.851,
       "step": 300
+    },
+    {
+      "epoch": 0.6971627099015634,
+      "grad_norm": 2.477572441101074,
+      "learning_rate": 2.293880768178576e-05,
+      "loss": 1.0048,
+      "step": 301
+    },
+    {
+      "epoch": 0.6994788650839606,
+      "grad_norm": 2.3567538261413574,
+      "learning_rate": 2.2619003103344606e-05,
+      "loss": 1.1464,
+      "step": 302
+    },
+    {
+      "epoch": 0.7017950202663579,
+      "grad_norm": 2.297689199447632,
+      "learning_rate": 2.2300790559367552e-05,
+      "loss": 0.9767,
+      "step": 303
+    },
+    {
+      "epoch": 0.704111175448755,
+      "grad_norm": 2.396148204803467,
+      "learning_rate": 2.1984188551932512e-05,
+      "loss": 0.9108,
+      "step": 304
+    },
+    {
+      "epoch": 0.7064273306311523,
+      "grad_norm": 2.255086660385132,
+      "learning_rate": 2.166921548947466e-05,
+      "loss": 0.6876,
+      "step": 305
+    },
+    {
+      "epoch": 0.7087434858135495,
+      "grad_norm": 2.383619546890259,
+      "learning_rate": 2.1355889685716224e-05,
+      "loss": 0.7666,
+      "step": 306
+    },
+    {
+      "epoch": 0.7110596409959468,
+      "grad_norm": 2.329786777496338,
+      "learning_rate": 2.1044229358601542e-05,
+      "loss": 0.813,
+      "step": 307
+    },
+    {
+      "epoch": 0.7133757961783439,
+      "grad_norm": 2.474313497543335,
+      "learning_rate": 2.0734252629237894e-05,
+      "loss": 0.6183,
+      "step": 308
+    },
+    {
+      "epoch": 0.7156919513607412,
+      "grad_norm": 2.5291049480438232,
+      "learning_rate": 2.0425977520841838e-05,
+      "loss": 0.5384,
+      "step": 309
+    },
+    {
+      "epoch": 0.7180081065431384,
+      "grad_norm": 1.997194766998291,
+      "learning_rate": 2.011942195769122e-05,
+      "loss": 0.5236,
+      "step": 310
+    },
+    {
+      "epoch": 0.7203242617255357,
+      "grad_norm": 1.2435768842697144,
+      "learning_rate": 1.9814603764083112e-05,
+      "loss": 0.2937,
+      "step": 311
+    },
+    {
+      "epoch": 0.7226404169079328,
+      "grad_norm": 1.364814281463623,
+      "learning_rate": 1.9511540663297285e-05,
+      "loss": 0.3747,
+      "step": 312
+    },
+    {
+      "epoch": 0.72495657209033,
+      "grad_norm": 1.3511894941329956,
+      "learning_rate": 1.921025027656587e-05,
+      "loss": 0.4068,
+      "step": 313
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 1.3470934629440308,
+      "learning_rate": 1.8910750122048637e-05,
+      "loss": 0.3442,
+      "step": 314
+    },
+    {
+      "epoch": 0.7295888824551245,
+      "grad_norm": 1.4333728551864624,
+      "learning_rate": 1.8613057613814584e-05,
+      "loss": 0.3411,
+      "step": 315
+    },
+    {
+      "epoch": 0.7319050376375217,
+      "grad_norm": 1.348252296447754,
+      "learning_rate": 1.831719006082924e-05,
+      "loss": 0.3321,
+      "step": 316
+    },
+    {
+      "epoch": 0.7342211928199189,
+      "grad_norm": 0.9950866103172302,
+      "learning_rate": 1.8023164665948456e-05,
+      "loss": 0.1945,
+      "step": 317
+    },
+    {
+      "epoch": 0.7365373480023162,
+      "grad_norm": 1.1160790920257568,
+      "learning_rate": 1.7730998524917957e-05,
+      "loss": 0.263,
+      "step": 318
+    },
+    {
+      "epoch": 0.7388535031847133,
+      "grad_norm": 1.3591563701629639,
+      "learning_rate": 1.7440708625379505e-05,
+      "loss": 0.3155,
+      "step": 319
+    },
+    {
+      "epoch": 0.7411696583671106,
+      "grad_norm": 1.3873182535171509,
+      "learning_rate": 1.7152311845883095e-05,
+      "loss": 0.2736,
+      "step": 320
+    },
+    {
+      "epoch": 0.7434858135495078,
+      "grad_norm": 1.4288737773895264,
+      "learning_rate": 1.686582495490554e-05,
+      "loss": 0.3022,
+      "step": 321
+    },
+    {
+      "epoch": 0.7458019687319051,
+      "grad_norm": 1.7382503747940063,
+      "learning_rate": 1.658126460987558e-05,
+      "loss": 0.2989,
+      "step": 322
+    },
+    {
+      "epoch": 0.7481181239143022,
+      "grad_norm": 1.7087494134902954,
+      "learning_rate": 1.6298647356205254e-05,
+      "loss": 0.3102,
+      "step": 323
+    },
+    {
+      "epoch": 0.7504342790966995,
+      "grad_norm": 1.5120712518692017,
+      "learning_rate": 1.601798962632799e-05,
+      "loss": 0.2155,
+      "step": 324
+    },
+    {
+      "epoch": 0.7527504342790967,
+      "grad_norm": 1.5661240816116333,
+      "learning_rate": 1.5739307738743057e-05,
+      "loss": 0.1865,
+      "step": 325
+    },
+    {
+      "epoch": 0.755066589461494,
+      "grad_norm": 1.2175641059875488,
+      "learning_rate": 1.546261789706686e-05,
+      "loss": 0.1658,
+      "step": 326
+    },
+    {
+      "epoch": 0.7573827446438911,
+      "grad_norm": 1.4670747518539429,
+      "learning_rate": 1.5187936189090669e-05,
+      "loss": 0.1604,
+      "step": 327
+    },
+    {
+      "epoch": 0.7596988998262884,
+      "grad_norm": 1.8161323070526123,
+      "learning_rate": 1.491527858584535e-05,
+      "loss": 0.2243,
+      "step": 328
+    },
+    {
+      "epoch": 0.7620150550086856,
+      "grad_norm": 1.3583159446716309,
+      "learning_rate": 1.4644660940672627e-05,
+      "loss": 0.1281,
+      "step": 329
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 1.2247012853622437,
+      "learning_rate": 1.4376098988303405e-05,
+      "loss": 0.1084,
+      "step": 330
+    },
+    {
+      "epoch": 0.76664736537348,
+      "grad_norm": 1.2522218227386475,
+      "learning_rate": 1.4109608343942854e-05,
+      "loss": 0.1137,
+      "step": 331
+    },
+    {
+      "epoch": 0.7689635205558772,
+      "grad_norm": 1.0144122838974,
+      "learning_rate": 1.384520450236244e-05,
+      "loss": 0.0753,
+      "step": 332
+    },
+    {
+      "epoch": 0.7712796757382745,
+      "grad_norm": 1.0482442378997803,
+      "learning_rate": 1.35829028369991e-05,
+      "loss": 0.0872,
+      "step": 333
+    },
+    {
+      "epoch": 0.7735958309206716,
+      "grad_norm": 1.6394468545913696,
+      "learning_rate": 1.3322718599061251e-05,
+      "loss": 0.1127,
+      "step": 334
+    },
+    {
+      "epoch": 0.7759119861030689,
+      "grad_norm": 1.1458487510681152,
+      "learning_rate": 1.306466691664216e-05,
+      "loss": 0.1022,
+      "step": 335
+    },
+    {
+      "epoch": 0.7782281412854661,
+      "grad_norm": 1.6174955368041992,
+      "learning_rate": 1.2808762793840201e-05,
+      "loss": 0.1394,
+      "step": 336
+    },
+    {
+      "epoch": 0.7805442964678634,
+      "grad_norm": 1.2268593311309814,
+      "learning_rate": 1.2555021109886589e-05,
+      "loss": 0.1056,
+      "step": 337
+    },
+    {
+      "epoch": 0.7828604516502605,
+      "grad_norm": 1.2561674118041992,
+      "learning_rate": 1.2303456618280141e-05,
+      "loss": 0.0783,
+      "step": 338
+    },
+    {
+      "epoch": 0.7851766068326578,
+      "grad_norm": 1.1064704656600952,
+      "learning_rate": 1.2054083945929535e-05,
+      "loss": 0.0762,
+      "step": 339
+    },
+    {
+      "epoch": 0.787492762015055,
+      "grad_norm": 1.174625277519226,
+      "learning_rate": 1.1806917592302762e-05,
+      "loss": 0.0862,
+      "step": 340
+    },
+    {
+      "epoch": 0.7898089171974523,
+      "grad_norm": 1.6088016033172607,
+      "learning_rate": 1.1561971928584159e-05,
+      "loss": 0.0962,
+      "step": 341
+    },
+    {
+      "epoch": 0.7921250723798494,
+      "grad_norm": 1.1425096988677979,
+      "learning_rate": 1.1319261196838782e-05,
+      "loss": 0.0699,
+      "step": 342
+    },
+    {
+      "epoch": 0.7944412275622467,
+      "grad_norm": 1.6147550344467163,
+      "learning_rate": 1.1078799509184246e-05,
+      "loss": 0.0919,
+      "step": 343
+    },
+    {
+      "epoch": 0.7967573827446439,
+      "grad_norm": 1.4701851606369019,
+      "learning_rate": 1.0840600846970334e-05,
+      "loss": 0.0891,
+      "step": 344
+    },
+    {
+      "epoch": 0.7990735379270412,
+      "grad_norm": 1.4786357879638672,
+      "learning_rate": 1.0604679059965922e-05,
+      "loss": 0.0638,
+      "step": 345
+    },
+    {
+      "epoch": 0.8013896931094383,
+      "grad_norm": 2.1240947246551514,
+      "learning_rate": 1.0371047865553846e-05,
+      "loss": 0.1207,
+      "step": 346
+    },
+    {
+      "epoch": 0.8037058482918356,
+      "grad_norm": 1.4813666343688965,
+      "learning_rate": 1.0139720847933166e-05,
+      "loss": 0.0871,
+      "step": 347
+    },
+    {
+      "epoch": 0.8060220034742328,
+      "grad_norm": 1.3989876508712769,
+      "learning_rate": 9.91071145732948e-06,
+      "loss": 0.0964,
+      "step": 348
+    },
+    {
+      "epoch": 0.80833815865663,
+      "grad_norm": 1.3372840881347656,
+      "learning_rate": 9.684033009212752e-06,
+      "loss": 0.0714,
+      "step": 349
+    },
+    {
+      "epoch": 0.8106543138390272,
+      "grad_norm": 2.03450870513916,
+      "learning_rate": 9.459698683523204e-06,
+      "loss": 0.1074,
+      "step": 350
+    },
+    {
+      "epoch": 0.8129704690214244,
+      "grad_norm": 1.326187252998352,
+      "learning_rate": 9.237721523904891e-06,
+      "loss": 1.0425,
+      "step": 351
+    },
+    {
+      "epoch": 0.8152866242038217,
+      "grad_norm": 1.3223427534103394,
+      "learning_rate": 9.018114436947373e-06,
+      "loss": 0.8088,
+      "step": 352
+    },
+    {
+      "epoch": 0.8176027793862188,
+      "grad_norm": 1.4911216497421265,
+      "learning_rate": 8.80089019143524e-06,
+      "loss": 0.784,
+      "step": 353
+    },
+    {
+      "epoch": 0.8199189345686161,
+      "grad_norm": 1.5768686532974243,
+      "learning_rate": 8.586061417605668e-06,
+      "loss": 0.7212,
+      "step": 354
+    },
+    {
+      "epoch": 0.8222350897510133,
+      "grad_norm": 1.8095353841781616,
+      "learning_rate": 8.373640606414096e-06,
+      "loss": 0.5331,
+      "step": 355
+    },
+    {
+      "epoch": 0.8245512449334106,
+      "grad_norm": 1.7539920806884766,
+      "learning_rate": 8.163640108807896e-06,
+      "loss": 0.5814,
+      "step": 356
+    },
+    {
+      "epoch": 0.8268674001158077,
+      "grad_norm": 1.8924403190612793,
+      "learning_rate": 7.956072135008336e-06,
+      "loss": 0.6049,
+      "step": 357
+    },
+    {
+      "epoch": 0.829183555298205,
+      "grad_norm": 1.558434009552002,
+      "learning_rate": 7.750948753800507e-06,
+      "loss": 0.3521,
+      "step": 358
+    },
+    {
+      "epoch": 0.8314997104806022,
+      "grad_norm": 1.900602102279663,
+      "learning_rate": 7.548281891831716e-06,
+      "loss": 0.3716,
+      "step": 359
+    },
+    {
+      "epoch": 0.8338158656629994,
+      "grad_norm": 2.0248496532440186,
+      "learning_rate": 7.348083332917926e-06,
+      "loss": 0.3998,
+      "step": 360
+    },
+    {
+      "epoch": 0.8361320208453966,
+      "grad_norm": 1.3187469244003296,
+      "learning_rate": 7.150364717358698e-06,
+      "loss": 0.2192,
+      "step": 361
+    },
+    {
+      "epoch": 0.8384481760277939,
+      "grad_norm": 1.5400038957595825,
+      "learning_rate": 6.955137541260287e-06,
+      "loss": 0.2471,
+      "step": 362
+    },
+    {
+      "epoch": 0.8407643312101911,
+      "grad_norm": 1.604353666305542,
+      "learning_rate": 6.7624131558672756e-06,
+      "loss": 0.2206,
+      "step": 363
+    },
+    {
+      "epoch": 0.8430804863925883,
+      "grad_norm": 1.945562481880188,
+      "learning_rate": 6.572202766902569e-06,
+      "loss": 0.2775,
+      "step": 364
+    },
+    {
+      "epoch": 0.8453966415749855,
+      "grad_norm": 2.460178852081299,
+      "learning_rate": 6.384517433915793e-06,
+      "loss": 0.4032,
+      "step": 365
+    },
+    {
+      "epoch": 0.8477127967573828,
+      "grad_norm": 1.6772760152816772,
+      "learning_rate": 6.199368069640343e-06,
+      "loss": 0.253,
+      "step": 366
+    },
+    {
+      "epoch": 0.85002895193978,
+      "grad_norm": 1.7364951372146606,
+      "learning_rate": 6.016765439358774e-06,
+      "loss": 0.2342,
+      "step": 367
+    },
+    {
+      "epoch": 0.8523451071221771,
+      "grad_norm": 1.791764259338379,
+      "learning_rate": 5.83672016027697e-06,
+      "loss": 0.1775,
+      "step": 368
+    },
+    {
+      "epoch": 0.8546612623045744,
+      "grad_norm": 1.4430023431777954,
+      "learning_rate": 5.659242700906719e-06,
+      "loss": 0.1663,
+      "step": 369
+    },
+    {
+      "epoch": 0.8569774174869716,
+      "grad_norm": 1.4145193099975586,
+      "learning_rate": 5.484343380457125e-06,
+      "loss": 0.1669,
+      "step": 370
+    },
+    {
+      "epoch": 0.8592935726693689,
+      "grad_norm": 1.4928686618804932,
+      "learning_rate": 5.312032368234526e-06,
+      "loss": 0.19,
+      "step": 371
+    },
+    {
+      "epoch": 0.861609727851766,
+      "grad_norm": 1.48283851146698,
+      "learning_rate": 5.1423196830513e-06,
+      "loss": 0.1979,
+      "step": 372
+    },
+    {
+      "epoch": 0.8639258830341633,
+      "grad_norm": 1.4714758396148682,
+      "learning_rate": 4.975215192643246e-06,
+      "loss": 0.1778,
+      "step": 373
+    },
+    {
+      "epoch": 0.8662420382165605,
+      "grad_norm": 1.187078595161438,
+      "learning_rate": 4.81072861309591e-06,
+      "loss": 0.1187,
+      "step": 374
+    },
+    {
+      "epoch": 0.8685581933989577,
+      "grad_norm": 1.1508448123931885,
+      "learning_rate": 4.648869508279613e-06,
+      "loss": 0.1269,
+      "step": 375
+    },
+    {
+      "epoch": 0.8708743485813549,
+      "grad_norm": 0.9376459717750549,
+      "learning_rate": 4.489647289293369e-06,
+      "loss": 0.1059,
+      "step": 376
+    },
+    {
+      "epoch": 0.8731905037637522,
+      "grad_norm": 1.0897644758224487,
+      "learning_rate": 4.333071213917722e-06,
+      "loss": 0.0853,
+      "step": 377
+    },
+    {
+      "epoch": 0.8755066589461494,
+      "grad_norm": 0.9136444926261902,
+      "learning_rate": 4.179150386076424e-06,
+      "loss": 0.0743,
+      "step": 378
+    },
+    {
+      "epoch": 0.8778228141285466,
+      "grad_norm": 0.8426498174667358,
+      "learning_rate": 4.027893755307144e-06,
+      "loss": 0.0606,
+      "step": 379
+    },
+    {
+      "epoch": 0.8801389693109438,
+      "grad_norm": 0.7755717635154724,
+      "learning_rate": 3.879310116241042e-06,
+      "loss": 0.0617,
+      "step": 380
+    },
+    {
+      "epoch": 0.8824551244933411,
+      "grad_norm": 1.0966241359710693,
+      "learning_rate": 3.733408108091485e-06,
+      "loss": 0.0724,
+      "step": 381
+    },
+    {
+      "epoch": 0.8847712796757383,
+      "grad_norm": 0.8539220094680786,
+      "learning_rate": 3.5901962141516977e-06,
+      "loss": 0.0714,
+      "step": 382
+    },
+    {
+      "epoch": 0.8870874348581355,
+      "grad_norm": 1.0539127588272095,
+      "learning_rate": 3.4496827613015202e-06,
+      "loss": 0.0585,
+      "step": 383
+    },
+    {
+      "epoch": 0.8894035900405327,
+      "grad_norm": 0.7426398396492004,
+      "learning_rate": 3.3118759195232275e-06,
+      "loss": 0.0527,
+      "step": 384
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 0.9449732899665833,
+      "learning_rate": 3.176783701426528e-06,
+      "loss": 0.0689,
+      "step": 385
+    },
+    {
+      "epoch": 0.8940359004053272,
+      "grad_norm": 0.9856773614883423,
+      "learning_rate": 3.0444139617826607e-06,
+      "loss": 0.0671,
+      "step": 386
+    },
+    {
+      "epoch": 0.8963520555877244,
+      "grad_norm": 0.826977014541626,
+      "learning_rate": 2.91477439706771e-06,
+      "loss": 0.0554,
+      "step": 387
+    },
+    {
+      "epoch": 0.8986682107701216,
+      "grad_norm": 0.7119414210319519,
+      "learning_rate": 2.787872545015069e-06,
+      "loss": 0.0525,
+      "step": 388
+    },
+    {
+      "epoch": 0.9009843659525189,
+      "grad_norm": 1.1008604764938354,
+      "learning_rate": 2.663715784177201e-06,
+      "loss": 0.0819,
+      "step": 389
+    },
+    {
+      "epoch": 0.903300521134916,
+      "grad_norm": 1.2398359775543213,
+      "learning_rate": 2.542311333496622e-06,
+      "loss": 0.0695,
+      "step": 390
+    },
+    {
+      "epoch": 0.9056166763173132,
+      "grad_norm": 0.6894932985305786,
+      "learning_rate": 2.423666251886114e-06,
+      "loss": 0.0404,
+      "step": 391
+    },
+    {
+      "epoch": 0.9079328314997105,
+      "grad_norm": 1.0042632818222046,
+      "learning_rate": 2.307787437818365e-06,
+      "loss": 0.063,
+      "step": 392
+    },
+    {
+      "epoch": 0.9102489866821077,
+      "grad_norm": 1.2008248567581177,
+      "learning_rate": 2.194681628924816e-06,
+      "loss": 0.0826,
+      "step": 393
+    },
+    {
+      "epoch": 0.9125651418645049,
+      "grad_norm": 1.0816729068756104,
+      "learning_rate": 2.0843554016039326e-06,
+      "loss": 0.0553,
+      "step": 394
+    },
+    {
+      "epoch": 0.9148812970469021,
+      "grad_norm": 1.0502961874008179,
+      "learning_rate": 1.976815170638802e-06,
+      "loss": 0.0598,
+      "step": 395
+    },
+    {
+      "epoch": 0.9171974522292994,
+      "grad_norm": 1.179869532585144,
+      "learning_rate": 1.8720671888242059e-06,
+      "loss": 0.1096,
+      "step": 396
+    },
+    {
+      "epoch": 0.9195136074116966,
+      "grad_norm": 1.1720819473266602,
+      "learning_rate": 1.7701175466029895e-06,
+      "loss": 0.0732,
+      "step": 397
+    },
+    {
+      "epoch": 0.9218297625940938,
+      "grad_norm": 1.741204857826233,
+      "learning_rate": 1.6709721717120042e-06,
+      "loss": 0.0805,
+      "step": 398
+    },
+    {
+      "epoch": 0.924145917776491,
+      "grad_norm": 1.379157304763794,
+      "learning_rate": 1.5746368288373947e-06,
+      "loss": 0.0735,
+      "step": 399
+    },
+    {
+      "epoch": 0.9264620729588883,
+      "grad_norm": 1.8489305973052979,
+      "learning_rate": 1.4811171192794627e-06,
+      "loss": 0.0918,
+      "step": 400
+    },
+    {
+      "epoch": 0.9287782281412854,
+      "grad_norm": 1.2802451848983765,
+      "learning_rate": 1.3904184806269704e-06,
+      "loss": 0.9387,
+      "step": 401
+    },
+    {
+      "epoch": 0.9310943833236827,
+      "grad_norm": 1.3554513454437256,
+      "learning_rate": 1.3025461864409394e-06,
+      "loss": 0.9213,
+      "step": 402
+    },
+    {
+      "epoch": 0.9334105385060799,
+      "grad_norm": 1.2956500053405762,
+      "learning_rate": 1.2175053459481e-06,
+      "loss": 0.6561,
+      "step": 403
+    },
+    {
+      "epoch": 0.9357266936884772,
+      "grad_norm": 1.4841207265853882,
+      "learning_rate": 1.1353009037437523e-06,
+      "loss": 0.4452,
+      "step": 404
+    },
+    {
+      "epoch": 0.9380428488708743,
+      "grad_norm": 1.3164417743682861,
+      "learning_rate": 1.0559376395043285e-06,
+      "loss": 0.4071,
+      "step": 405
+    },
+    {
+      "epoch": 0.9403590040532716,
+      "grad_norm": 1.8103063106536865,
+      "learning_rate": 9.794201677094162e-07,
+      "loss": 0.3799,
+      "step": 406
+    },
+    {
+      "epoch": 0.9426751592356688,
+      "grad_norm": 1.640889286994934,
+      "learning_rate": 9.05752937373533e-07,
+      "loss": 0.3303,
+      "step": 407
+    },
+    {
+      "epoch": 0.944991314418066,
+      "grad_norm": 1.410726547241211,
+      "learning_rate": 8.349402317873789e-07,
+      "loss": 0.2026,
+      "step": 408
+    },
+    {
+      "epoch": 0.9473074696004632,
+      "grad_norm": 1.7915242910385132,
+      "learning_rate": 7.669861682688239e-07,
+      "loss": 0.2115,
+      "step": 409
+    },
+    {
+      "epoch": 0.9496236247828604,
+      "grad_norm": 1.855295181274414,
+      "learning_rate": 7.018946979234997e-07,
+      "loss": 0.2876,
+      "step": 410
+    },
+    {
+      "epoch": 0.9519397799652577,
+      "grad_norm": 1.1143068075180054,
+      "learning_rate": 6.396696054150719e-07,
+      "loss": 0.1409,
+      "step": 411
+    },
+    {
+      "epoch": 0.9542559351476549,
+      "grad_norm": 1.3461558818817139,
+      "learning_rate": 5.803145087451945e-07,
+      "loss": 0.1312,
+      "step": 412
+    },
+    {
+      "epoch": 0.9565720903300521,
+      "grad_norm": 1.343173623085022,
+      "learning_rate": 5.238328590431162e-07,
+      "loss": 0.1521,
+      "step": 413
+    },
+    {
+      "epoch": 0.9588882455124493,
+      "grad_norm": 1.174078106880188,
+      "learning_rate": 4.7022794036505335e-07,
+      "loss": 0.1421,
+      "step": 414
+    },
+    {
+      "epoch": 0.9612044006948466,
+      "grad_norm": 1.0609129667282104,
+      "learning_rate": 4.1950286950321327e-07,
+      "loss": 0.0897,
+      "step": 415
+    },
+    {
+      "epoch": 0.9635205558772437,
+      "grad_norm": 1.1571158170700073,
+      "learning_rate": 3.716605958046071e-07,
+      "loss": 0.0685,
+      "step": 416
+    },
+    {
+      "epoch": 0.965836711059641,
+      "grad_norm": 0.8536246418952942,
+      "learning_rate": 3.267039009995199e-07,
+      "loss": 0.0694,
+      "step": 417
+    },
+    {
+      "epoch": 0.9681528662420382,
+      "grad_norm": 0.7544981837272644,
+      "learning_rate": 2.846353990398065e-07,
+      "loss": 0.0442,
+      "step": 418
+    },
+    {
+      "epoch": 0.9704690214244355,
+      "grad_norm": 0.7445719242095947,
+      "learning_rate": 2.4545753594688583e-07,
+      "loss": 0.0545,
+      "step": 419
+    },
+    {
+      "epoch": 0.9727851766068326,
+      "grad_norm": 0.631585955619812,
+      "learning_rate": 2.0917258966953733e-07,
+      "loss": 0.0462,
+      "step": 420
+    },
+    {
+      "epoch": 0.9751013317892299,
+      "grad_norm": 0.6365635991096497,
+      "learning_rate": 1.7578266995142978e-07,
+      "loss": 0.0554,
+      "step": 421
+    },
+    {
+      "epoch": 0.9774174869716271,
+      "grad_norm": 0.7627168297767639,
+      "learning_rate": 1.4528971820846893e-07,
+      "loss": 0.0552,
+      "step": 422
+    },
+    {
+      "epoch": 0.9797336421540244,
+      "grad_norm": 0.8680822849273682,
+      "learning_rate": 1.1769550741592139e-07,
+      "loss": 0.0643,
+      "step": 423
+    },
+    {
+      "epoch": 0.9820497973364215,
+      "grad_norm": 1.076411485671997,
+      "learning_rate": 9.300164200530814e-08,
+      "loss": 0.0644,
+      "step": 424
+    },
+    {
+      "epoch": 0.9843659525188188,
+      "grad_norm": 1.0168262720108032,
+      "learning_rate": 7.120955777112914e-08,
+      "loss": 0.0862,
+      "step": 425
+    },
+    {
+      "epoch": 0.986682107701216,
+      "grad_norm": 1.153483271598816,
+      "learning_rate": 5.2320521787385667e-08,
+      "loss": 0.0703,
+      "step": 426
+    },
+    {
+      "epoch": 0.9889982628836133,
+      "grad_norm": 0.9161254167556763,
+      "learning_rate": 3.633563233388926e-08,
+      "loss": 0.0481,
+      "step": 427
+    },
+    {
+      "epoch": 0.9913144180660104,
+      "grad_norm": 0.8356172442436218,
+      "learning_rate": 2.3255818832423894e-08,
+      "loss": 0.0574,
+      "step": 428
+    },
+    {
+      "epoch": 0.9936305732484076,
+      "grad_norm": 1.117587924003601,
+      "learning_rate": 1.3081841792694783e-08,
+      "loss": 0.0647,
+      "step": 429
+    },
+    {
+      "epoch": 0.9959467284308049,
+      "grad_norm": 0.9720324873924255,
+      "learning_rate": 5.814292768108187e-09,
+      "loss": 0.0588,
+      "step": 430
+    },
+    {
+      "epoch": 0.998262883613202,
+      "grad_norm": 1.3996336460113525,
+      "learning_rate": 1.453594321393359e-09,
+      "loss": 0.0588,
+      "step": 431
+    },
+    {
+      "epoch": 1.0011580775911986,
+      "grad_norm": 2.588228702545166,
+      "learning_rate": 0.0,
+      "loss": 0.4998,
+      "step": 432
     }
   ],
   "logging_steps": 1,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.292330885612831e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null