Uploaded from W&B

Browse files

Files changed (7) hide show

model-00001-of-00006.safetensors +1 -1
model-00002-of-00006.safetensors +1 -1
model-00003-of-00006.safetensors +1 -1
model-00004-of-00006.safetensors +1 -1
model-00005-of-00006.safetensors +1 -1
model-00006-of-00006.safetensors +1 -1
trainer_state.json +968 -4

model-00001-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:733d1ad4b591a1e0ea3f24761a4b6d0e99d07f0186201dd2788265b6a256b305
 size 4984780784

 version https://git-lfs.github.com/spec/v1
+oid sha256:19d8c44654d03e9974c989cd3b4270e812ad66efec6f20381c1489a201a5d670
 size 4984780784

model-00002-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ccdcb7db2051bb04578c15fbc8de0908dc382e2e0c11d12e841a659b060a362
 size 4980892048

 version https://git-lfs.github.com/spec/v1
+oid sha256:719054f207c7550419e0a70ef585a181d64a661bbafbb6c3cac116bacd778896
 size 4980892048

model-00003-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:64e3bf40e26d50d168975db66ec0be143153d22ac74b8dab5ed7bb399b25352f
 size 4928485104

 version https://git-lfs.github.com/spec/v1
+oid sha256:e6f0b9916006362401f797946f52af207421b6067cbb1334da7c4da482155994
 size 4928485104

model-00004-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd527b82bf98c9aa46042138f842cfcae9ee3f0029cb362d556a8d2344a907a0
 size 4980892112

 version https://git-lfs.github.com/spec/v1
+oid sha256:1d4829659786275428c6590064abed129a62b07c38df176b8751b97b89ad1265
 size 4980892112

model-00005-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a27920d463f37058f3a01ef15e895935a14b076f65075ce57c76fce4775d593
 size 4928485104

 version https://git-lfs.github.com/spec/v1
+oid sha256:9d55c22cdc249df507d11e2c297cef91135f3b60aa29717a0b29d629a0cafdff
 size 4928485104

model-00006-of-00006.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bae46196571919129ce0bff9017fd324fa4833a01cb82bd6e1d392fa68c3ad9a
 size 4733130504

 version https://git-lfs.github.com/spec/v1
+oid sha256:c9b335b24792043e47d8110ba4af61dd8869ad9681ab9a2ebf6157eb79b960d0
 size 4733130504

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.0,
   "eval_steps": 27,
-  "global_step": 264,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1936,6 +1936,970 @@
       "learning_rate": 1.2622039877423267e-06,
       "loss": 0.1916,
       "step": 264
     }
   ],
   "logging_steps": 1,
@@ -1950,12 +2914,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 5.809316809735668e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 3.0,
   "eval_steps": 27,
+  "global_step": 396,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 1.2622039877423267e-06,
       "loss": 0.1916,
       "step": 264
+    },
+    {
+      "epoch": 2.007575757575758,
+      "grad_norm": 0.281697154045105,
+      "learning_rate": 1.2464987695172266e-06,
+      "loss": 0.178,
+      "step": 265
+    },
+    {
+      "epoch": 2.015151515151515,
+      "grad_norm": 0.3228111267089844,
+      "learning_rate": 1.230822106959742e-06,
+      "loss": 0.1893,
+      "step": 266
+    },
+    {
+      "epoch": 2.022727272727273,
+      "grad_norm": 0.2976820170879364,
+      "learning_rate": 1.2151757659688574e-06,
+      "loss": 0.1755,
+      "step": 267
+    },
+    {
+      "epoch": 2.0303030303030303,
+      "grad_norm": 0.2902336120605469,
+      "learning_rate": 1.1995615090279815e-06,
+      "loss": 0.1826,
+      "step": 268
+    },
+    {
+      "epoch": 2.037878787878788,
+      "grad_norm": 0.32078248262405396,
+      "learning_rate": 1.183981095006411e-06,
+      "loss": 0.2049,
+      "step": 269
+    },
+    {
+      "epoch": 2.0454545454545454,
+      "grad_norm": 0.28856366872787476,
+      "learning_rate": 1.1684362789612054e-06,
+      "loss": 0.1709,
+      "step": 270
+    },
+    {
+      "epoch": 2.0454545454545454,
+      "eval_loss": 0.18923524022102356,
+      "eval_runtime": 3.7815,
+      "eval_samples_per_second": 11.9,
+      "eval_steps_per_second": 0.793,
+      "step": 270
+    },
+    {
+      "epoch": 2.053030303030303,
+      "grad_norm": 0.3012211322784424,
+      "learning_rate": 1.1529288119394879e-06,
+      "loss": 0.1863,
+      "step": 271
+    },
+    {
+      "epoch": 2.0606060606060606,
+      "grad_norm": 0.2844583988189697,
+      "learning_rate": 1.1374604407811962e-06,
+      "loss": 0.1834,
+      "step": 272
+    },
+    {
+      "epoch": 2.0681818181818183,
+      "grad_norm": 0.2947697639465332,
+      "learning_rate": 1.1220329079223124e-06,
+      "loss": 0.1842,
+      "step": 273
+    },
+    {
+      "epoch": 2.0757575757575757,
+      "grad_norm": 0.3138524293899536,
+      "learning_rate": 1.1066479511985838e-06,
+      "loss": 0.1946,
+      "step": 274
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.30240464210510254,
+      "learning_rate": 1.091307303649766e-06,
+      "loss": 0.1921,
+      "step": 275
+    },
+    {
+      "epoch": 2.090909090909091,
+      "grad_norm": 0.2899230420589447,
+      "learning_rate": 1.0760126933244036e-06,
+      "loss": 0.1868,
+      "step": 276
+    },
+    {
+      "epoch": 2.0984848484848486,
+      "grad_norm": 0.3104937672615051,
+      "learning_rate": 1.0607658430851746e-06,
+      "loss": 0.211,
+      "step": 277
+    },
+    {
+      "epoch": 2.106060606060606,
+      "grad_norm": 0.29552116990089417,
+      "learning_rate": 1.0455684704148174e-06,
+      "loss": 0.1914,
+      "step": 278
+    },
+    {
+      "epoch": 2.1136363636363638,
+      "grad_norm": 0.29408350586891174,
+      "learning_rate": 1.030422287222667e-06,
+      "loss": 0.1888,
+      "step": 279
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 0.278326153755188,
+      "learning_rate": 1.0153289996518127e-06,
+      "loss": 0.1642,
+      "step": 280
+    },
+    {
+      "epoch": 2.128787878787879,
+      "grad_norm": 0.3248918354511261,
+      "learning_rate": 1.0002903078869137e-06,
+      "loss": 0.1991,
+      "step": 281
+    },
+    {
+      "epoch": 2.1363636363636362,
+      "grad_norm": 0.3153297007083893,
+      "learning_rate": 9.853079059626806e-07,
+      "loss": 0.1929,
+      "step": 282
+    },
+    {
+      "epoch": 2.143939393939394,
+      "grad_norm": 0.3812675476074219,
+      "learning_rate": 9.703834815730489e-07,
+      "loss": 0.1809,
+      "step": 283
+    },
+    {
+      "epoch": 2.1515151515151514,
+      "grad_norm": 0.28265097737312317,
+      "learning_rate": 9.555187158810704e-07,
+      "loss": 0.1741,
+      "step": 284
+    },
+    {
+      "epoch": 2.159090909090909,
+      "grad_norm": 0.30659934878349304,
+      "learning_rate": 9.407152833295372e-07,
+      "loss": 0.1928,
+      "step": 285
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": 0.2801172137260437,
+      "learning_rate": 9.259748514523654e-07,
+      "loss": 0.1762,
+      "step": 286
+    },
+    {
+      "epoch": 2.1742424242424243,
+      "grad_norm": 0.3246462047100067,
+      "learning_rate": 9.112990806867544e-07,
+      "loss": 0.2035,
+      "step": 287
+    },
+    {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 0.29880762100219727,
+      "learning_rate": 8.966896241861474e-07,
+      "loss": 0.1765,
+      "step": 288
+    },
+    {
+      "epoch": 2.1893939393939394,
+      "grad_norm": 0.32259929180145264,
+      "learning_rate": 8.821481276340112e-07,
+      "loss": 0.2088,
+      "step": 289
+    },
+    {
+      "epoch": 2.196969696969697,
+      "grad_norm": 0.3025321662425995,
+      "learning_rate": 8.676762290584585e-07,
+      "loss": 0.1718,
+      "step": 290
+    },
+    {
+      "epoch": 2.2045454545454546,
+      "grad_norm": 0.3053520917892456,
+      "learning_rate": 8.532755586477326e-07,
+      "loss": 0.196,
+      "step": 291
+    },
+    {
+      "epoch": 2.212121212121212,
+      "grad_norm": 0.28334543108940125,
+      "learning_rate": 8.389477385665733e-07,
+      "loss": 0.1764,
+      "step": 292
+    },
+    {
+      "epoch": 2.2196969696969697,
+      "grad_norm": 0.29423749446868896,
+      "learning_rate": 8.246943827734898e-07,
+      "loss": 0.1805,
+      "step": 293
+    },
+    {
+      "epoch": 2.227272727272727,
+      "grad_norm": 0.2939305007457733,
+      "learning_rate": 8.105170968389552e-07,
+      "loss": 0.1806,
+      "step": 294
+    },
+    {
+      "epoch": 2.234848484848485,
+      "grad_norm": 0.3079436719417572,
+      "learning_rate": 7.964174777645448e-07,
+      "loss": 0.1891,
+      "step": 295
+    },
+    {
+      "epoch": 2.242424242424242,
+      "grad_norm": 0.3062392771244049,
+      "learning_rate": 7.823971138030467e-07,
+      "loss": 0.1859,
+      "step": 296
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.31168290972709656,
+      "learning_rate": 7.684575842795485e-07,
+      "loss": 0.2003,
+      "step": 297
+    },
+    {
+      "epoch": 2.25,
+      "eval_loss": 0.18853209912776947,
+      "eval_runtime": 4.3319,
+      "eval_samples_per_second": 10.388,
+      "eval_steps_per_second": 0.693,
+      "step": 297
+    },
+    {
+      "epoch": 2.257575757575758,
+      "grad_norm": 0.29143401980400085,
+      "learning_rate": 7.546004594135357e-07,
+      "loss": 0.1889,
+      "step": 298
+    },
+    {
+      "epoch": 2.265151515151515,
+      "grad_norm": 0.2893548607826233,
+      "learning_rate": 7.408273001420153e-07,
+      "loss": 0.1752,
+      "step": 299
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.27464550733566284,
+      "learning_rate": 7.271396579436825e-07,
+      "loss": 0.163,
+      "step": 300
+    },
+    {
+      "epoch": 2.2803030303030303,
+      "grad_norm": 0.2979916036128998,
+      "learning_rate": 7.135390746641527e-07,
+      "loss": 0.1814,
+      "step": 301
+    },
+    {
+      "epoch": 2.287878787878788,
+      "grad_norm": 0.314494252204895,
+      "learning_rate": 7.000270823422838e-07,
+      "loss": 0.1906,
+      "step": 302
+    },
+    {
+      "epoch": 2.2954545454545454,
+      "grad_norm": 0.30358201265335083,
+      "learning_rate": 6.866052030375974e-07,
+      "loss": 0.1807,
+      "step": 303
+    },
+    {
+      "epoch": 2.303030303030303,
+      "grad_norm": 0.32943934202194214,
+      "learning_rate": 6.732749486588266e-07,
+      "loss": 0.2031,
+      "step": 304
+    },
+    {
+      "epoch": 2.3106060606060606,
+      "grad_norm": 0.29630428552627563,
+      "learning_rate": 6.60037820793607e-07,
+      "loss": 0.1941,
+      "step": 305
+    },
+    {
+      "epoch": 2.3181818181818183,
+      "grad_norm": 0.30913493037223816,
+      "learning_rate": 6.468953105393297e-07,
+      "loss": 0.1774,
+      "step": 306
+    },
+    {
+      "epoch": 2.3257575757575757,
+      "grad_norm": 0.2982107400894165,
+      "learning_rate": 6.338488983351778e-07,
+      "loss": 0.19,
+      "step": 307
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 0.29869621992111206,
+      "learning_rate": 6.209000537953606e-07,
+      "loss": 0.1866,
+      "step": 308
+    },
+    {
+      "epoch": 2.340909090909091,
+      "grad_norm": 0.30003929138183594,
+      "learning_rate": 6.080502355435701e-07,
+      "loss": 0.1842,
+      "step": 309
+    },
+    {
+      "epoch": 2.3484848484848486,
+      "grad_norm": 0.2731933891773224,
+      "learning_rate": 5.953008910486739e-07,
+      "loss": 0.1683,
+      "step": 310
+    },
+    {
+      "epoch": 2.356060606060606,
+      "grad_norm": 0.27522197365760803,
+      "learning_rate": 5.826534564616633e-07,
+      "loss": 0.1493,
+      "step": 311
+    },
+    {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 0.2997157573699951,
+      "learning_rate": 5.701093564538807e-07,
+      "loss": 0.1707,
+      "step": 312
+    },
+    {
+      "epoch": 2.371212121212121,
+      "grad_norm": 0.28120705485343933,
+      "learning_rate": 5.576700040565365e-07,
+      "loss": 0.1693,
+      "step": 313
+    },
+    {
+      "epoch": 2.378787878787879,
+      "grad_norm": 0.2777678966522217,
+      "learning_rate": 5.453368005015363e-07,
+      "loss": 0.1789,
+      "step": 314
+    },
+    {
+      "epoch": 2.3863636363636362,
+      "grad_norm": 0.2859909236431122,
+      "learning_rate": 5.331111350636413e-07,
+      "loss": 0.1654,
+      "step": 315
+    },
+    {
+      "epoch": 2.393939393939394,
+      "grad_norm": 0.3338193893432617,
+      "learning_rate": 5.209943849039722e-07,
+      "loss": 0.2114,
+      "step": 316
+    },
+    {
+      "epoch": 2.4015151515151514,
+      "grad_norm": 0.30051618814468384,
+      "learning_rate": 5.089879149148781e-07,
+      "loss": 0.19,
+      "step": 317
+    },
+    {
+      "epoch": 2.409090909090909,
+      "grad_norm": 0.3063294291496277,
+      "learning_rate": 4.970930775661899e-07,
+      "loss": 0.1813,
+      "step": 318
+    },
+    {
+      "epoch": 2.4166666666666665,
+      "grad_norm": 0.3148181438446045,
+      "learning_rate": 4.853112127528699e-07,
+      "loss": 0.174,
+      "step": 319
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.2876652777194977,
+      "learning_rate": 4.736436476440792e-07,
+      "loss": 0.1658,
+      "step": 320
+    },
+    {
+      "epoch": 2.4318181818181817,
+      "grad_norm": 0.28111517429351807,
+      "learning_rate": 4.620916965336809e-07,
+      "loss": 0.1784,
+      "step": 321
+    },
+    {
+      "epoch": 2.4393939393939394,
+      "grad_norm": 0.27875590324401855,
+      "learning_rate": 4.506566606921865e-07,
+      "loss": 0.1737,
+      "step": 322
+    },
+    {
+      "epoch": 2.446969696969697,
+      "grad_norm": 0.28706124424934387,
+      "learning_rate": 4.3933982822017883e-07,
+      "loss": 0.1678,
+      "step": 323
+    },
+    {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 0.28674250841140747,
+      "learning_rate": 4.281424739032122e-07,
+      "loss": 0.1757,
+      "step": 324
+    },
+    {
+      "epoch": 2.4545454545454546,
+      "eval_loss": 0.18774640560150146,
+      "eval_runtime": 3.7707,
+      "eval_samples_per_second": 11.934,
+      "eval_steps_per_second": 0.796,
+      "step": 324
+    },
+    {
+      "epoch": 2.462121212121212,
+      "grad_norm": 0.30930793285369873,
+      "learning_rate": 4.170658590682134e-07,
+      "loss": 0.2137,
+      "step": 325
+    },
+    {
+      "epoch": 2.4696969696969697,
+      "grad_norm": 0.28437668085098267,
+      "learning_rate": 4.0611123144140083e-07,
+      "loss": 0.1743,
+      "step": 326
+    },
+    {
+      "epoch": 2.4772727272727275,
+      "grad_norm": 0.2986258864402771,
+      "learning_rate": 3.952798250077318e-07,
+      "loss": 0.1813,
+      "step": 327
+    },
+    {
+      "epoch": 2.484848484848485,
+      "grad_norm": 0.28527748584747314,
+      "learning_rate": 3.8457285987190406e-07,
+      "loss": 0.1782,
+      "step": 328
+    },
+    {
+      "epoch": 2.492424242424242,
+      "grad_norm": 0.3192787766456604,
+      "learning_rate": 3.7399154212091333e-07,
+      "loss": 0.1922,
+      "step": 329
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.3025767505168915,
+      "learning_rate": 3.635370636881958e-07,
+      "loss": 0.1775,
+      "step": 330
+    },
+    {
+      "epoch": 2.507575757575758,
+      "grad_norm": 0.30758988857269287,
+      "learning_rate": 3.532106022193615e-07,
+      "loss": 0.2018,
+      "step": 331
+    },
+    {
+      "epoch": 2.515151515151515,
+      "grad_norm": 0.2812161445617676,
+      "learning_rate": 3.4301332093953813e-07,
+      "loss": 0.175,
+      "step": 332
+    },
+    {
+      "epoch": 2.5227272727272725,
+      "grad_norm": 0.2952413260936737,
+      "learning_rate": 3.3294636852234106e-07,
+      "loss": 0.1815,
+      "step": 333
+    },
+    {
+      "epoch": 2.5303030303030303,
+      "grad_norm": 0.27842938899993896,
+      "learning_rate": 3.230108789604792e-07,
+      "loss": 0.1685,
+      "step": 334
+    },
+    {
+      "epoch": 2.537878787878788,
+      "grad_norm": 0.3101184070110321,
+      "learning_rate": 3.132079714380172e-07,
+      "loss": 0.1805,
+      "step": 335
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 0.28867068886756897,
+      "learning_rate": 3.035387502043052e-07,
+      "loss": 0.1697,
+      "step": 336
+    },
+    {
+      "epoch": 2.5530303030303028,
+      "grad_norm": 0.2969781458377838,
+      "learning_rate": 2.9400430444958937e-07,
+      "loss": 0.1912,
+      "step": 337
+    },
+    {
+      "epoch": 2.5606060606060606,
+      "grad_norm": 0.28201431035995483,
+      "learning_rate": 2.8460570818232014e-07,
+      "loss": 0.1801,
+      "step": 338
+    },
+    {
+      "epoch": 2.5681818181818183,
+      "grad_norm": 0.2861873209476471,
+      "learning_rate": 2.753440201081716e-07,
+      "loss": 0.1833,
+      "step": 339
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 0.2919836640357971,
+      "learning_rate": 2.662202835107828e-07,
+      "loss": 0.1702,
+      "step": 340
+    },
+    {
+      "epoch": 2.5833333333333335,
+      "grad_norm": 0.28204408288002014,
+      "learning_rate": 2.572355261342369e-07,
+      "loss": 0.1693,
+      "step": 341
+    },
+    {
+      "epoch": 2.590909090909091,
+      "grad_norm": 0.28755688667297363,
+      "learning_rate": 2.4839076006729086e-07,
+      "loss": 0.1747,
+      "step": 342
+    },
+    {
+      "epoch": 2.5984848484848486,
+      "grad_norm": 0.2947797477245331,
+      "learning_rate": 2.3968698162936857e-07,
+      "loss": 0.1842,
+      "step": 343
+    },
+    {
+      "epoch": 2.606060606060606,
+      "grad_norm": 0.2840390205383301,
+      "learning_rate": 2.3112517125833071e-07,
+      "loss": 0.1859,
+      "step": 344
+    },
+    {
+      "epoch": 2.6136363636363638,
+      "grad_norm": 0.3154660165309906,
+      "learning_rate": 2.2270629340003308e-07,
+      "loss": 0.1898,
+      "step": 345
+    },
+    {
+      "epoch": 2.621212121212121,
+      "grad_norm": 0.27370041608810425,
+      "learning_rate": 2.1443129639968617e-07,
+      "loss": 0.1641,
+      "step": 346
+    },
+    {
+      "epoch": 2.628787878787879,
+      "grad_norm": 0.28645506501197815,
+      "learning_rate": 2.0630111239502954e-07,
+      "loss": 0.1664,
+      "step": 347
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 0.3576584458351135,
+      "learning_rate": 1.9831665721132957e-07,
+      "loss": 0.1571,
+      "step": 348
+    },
+    {
+      "epoch": 2.643939393939394,
+      "grad_norm": 0.3097120225429535,
+      "learning_rate": 1.9047883025821777e-07,
+      "loss": 0.1918,
+      "step": 349
+    },
+    {
+      "epoch": 2.6515151515151514,
+      "grad_norm": 0.2783317565917969,
+      "learning_rate": 1.827885144283769e-07,
+      "loss": 0.1703,
+      "step": 350
+    },
+    {
+      "epoch": 2.659090909090909,
+      "grad_norm": 0.282617449760437,
+      "learning_rate": 1.7524657599808603e-07,
+      "loss": 0.1743,
+      "step": 351
+    },
+    {
+      "epoch": 2.659090909090909,
+      "eval_loss": 0.18733149766921997,
+      "eval_runtime": 3.7928,
+      "eval_samples_per_second": 11.864,
+      "eval_steps_per_second": 0.791,
+      "step": 351
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.3042234778404236,
+      "learning_rate": 1.6785386452963914e-07,
+      "loss": 0.1945,
+      "step": 352
+    },
+    {
+      "epoch": 2.6742424242424243,
+      "grad_norm": 0.3053586483001709,
+      "learning_rate": 1.6061121277564746e-07,
+      "loss": 0.1968,
+      "step": 353
+    },
+    {
+      "epoch": 2.6818181818181817,
+      "grad_norm": 0.30519694089889526,
+      "learning_rate": 1.5351943658523153e-07,
+      "loss": 0.1854,
+      "step": 354
+    },
+    {
+      "epoch": 2.6893939393939394,
+      "grad_norm": 0.2871159613132477,
+      "learning_rate": 1.4657933481212243e-07,
+      "loss": 0.1891,
+      "step": 355
+    },
+    {
+      "epoch": 2.6969696969696972,
+      "grad_norm": 0.2823057770729065,
+      "learning_rate": 1.39791689224673e-07,
+      "loss": 0.1788,
+      "step": 356
+    },
+    {
+      "epoch": 2.7045454545454546,
+      "grad_norm": 0.3045274615287781,
+      "learning_rate": 1.3315726441779629e-07,
+      "loss": 0.1808,
+      "step": 357
+    },
+    {
+      "epoch": 2.712121212121212,
+      "grad_norm": 0.2801673114299774,
+      "learning_rate": 1.2667680772683826e-07,
+      "loss": 0.1724,
+      "step": 358
+    },
+    {
+      "epoch": 2.7196969696969697,
+      "grad_norm": 0.27538782358169556,
+      "learning_rate": 1.203510491433919e-07,
+      "loss": 0.168,
+      "step": 359
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.2858332097530365,
+      "learning_rate": 1.141807012330699e-07,
+      "loss": 0.1781,
+      "step": 360
+    },
+    {
+      "epoch": 2.734848484848485,
+      "grad_norm": 0.26091665029525757,
+      "learning_rate": 1.0816645905523597e-07,
+      "loss": 0.1618,
+      "step": 361
+    },
+    {
+      "epoch": 2.742424242424242,
+      "grad_norm": 0.2917574346065521,
+      "learning_rate": 1.0230900008471073e-07,
+      "loss": 0.1867,
+      "step": 362
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.25802233815193176,
+      "learning_rate": 9.660898413545694e-08,
+      "loss": 0.1661,
+      "step": 363
+    },
+    {
+      "epoch": 2.757575757575758,
+      "grad_norm": 0.29267409443855286,
+      "learning_rate": 9.106705328625408e-08,
+      "loss": 0.1777,
+      "step": 364
+    },
+    {
+      "epoch": 2.765151515151515,
+      "grad_norm": 0.2785191237926483,
+      "learning_rate": 8.568383180837369e-08,
+      "loss": 0.1731,
+      "step": 365
+    },
+    {
+      "epoch": 2.7727272727272725,
+      "grad_norm": 0.2805950939655304,
+      "learning_rate": 8.045992609525571e-08,
+      "loss": 0.1727,
+      "step": 366
+    },
+    {
+      "epoch": 2.7803030303030303,
+      "grad_norm": 0.29362258315086365,
+      "learning_rate": 7.539592459420219e-08,
+      "loss": 0.1751,
+      "step": 367
+    },
+    {
+      "epoch": 2.787878787878788,
+      "grad_norm": 0.28130269050598145,
+      "learning_rate": 7.049239774009214e-08,
+      "loss": 0.1861,
+      "step": 368
+    },
+    {
+      "epoch": 2.7954545454545454,
+      "grad_norm": 0.29464268684387207,
+      "learning_rate": 6.574989789112374e-08,
+      "loss": 0.1967,
+      "step": 369
+    },
+    {
+      "epoch": 2.8030303030303028,
+      "grad_norm": 0.2706058621406555,
+      "learning_rate": 6.11689592665951e-08,
+      "loss": 0.1656,
+      "step": 370
+    },
+    {
+      "epoch": 2.8106060606060606,
+      "grad_norm": 0.31391721963882446,
+      "learning_rate": 5.675009788672597e-08,
+      "loss": 0.1395,
+      "step": 371
+    },
+    {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 0.3240405023097992,
+      "learning_rate": 5.249381151453164e-08,
+      "loss": 0.1889,
+      "step": 372
+    },
+    {
+      "epoch": 2.8257575757575757,
+      "grad_norm": 0.286594033241272,
+      "learning_rate": 4.8400579599751696e-08,
+      "loss": 0.1758,
+      "step": 373
+    },
+    {
+      "epoch": 2.8333333333333335,
+      "grad_norm": 0.29338592290878296,
+      "learning_rate": 4.447086322484251e-08,
+      "loss": 0.1869,
+      "step": 374
+    },
+    {
+      "epoch": 2.840909090909091,
+      "grad_norm": 0.29780998826026917,
+      "learning_rate": 4.070510505303815e-08,
+      "loss": 0.1775,
+      "step": 375
+    },
+    {
+      "epoch": 2.8484848484848486,
+      "grad_norm": 0.3508531153202057,
+      "learning_rate": 3.7103729278487766e-08,
+      "loss": 0.189,
+      "step": 376
+    },
+    {
+      "epoch": 2.856060606060606,
+      "grad_norm": 0.28885170817375183,
+      "learning_rate": 3.3667141578470783e-08,
+      "loss": 0.1744,
+      "step": 377
+    },
+    {
+      "epoch": 2.8636363636363638,
+      "grad_norm": 0.31082722544670105,
+      "learning_rate": 3.039572906770033e-08,
+      "loss": 0.1869,
+      "step": 378
+    },
+    {
+      "epoch": 2.8636363636363638,
+      "eval_loss": 0.1873067021369934,
+      "eval_runtime": 4.1732,
+      "eval_samples_per_second": 10.783,
+      "eval_steps_per_second": 0.719,
+      "step": 378
+    },
+    {
+      "epoch": 2.871212121212121,
+      "grad_norm": 0.30285000801086426,
+      "learning_rate": 2.7289860254716416e-08,
+      "loss": 0.1871,
+      "step": 379
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 0.2864581048488617,
+      "learning_rate": 2.434988500037466e-08,
+      "loss": 0.1778,
+      "step": 380
+    },
+    {
+      "epoch": 2.8863636363636362,
+      "grad_norm": 0.2580985724925995,
+      "learning_rate": 2.1576134478437316e-08,
+      "loss": 0.1421,
+      "step": 381
+    },
+    {
+      "epoch": 2.893939393939394,
+      "grad_norm": 0.2779240608215332,
+      "learning_rate": 1.896892113826709e-08,
+      "loss": 0.1685,
+      "step": 382
+    },
+    {
+      "epoch": 2.9015151515151514,
+      "grad_norm": 0.27729371190071106,
+      "learning_rate": 1.6528538669631998e-08,
+      "loss": 0.1854,
+      "step": 383
+    },
+    {
+      "epoch": 2.909090909090909,
+      "grad_norm": 0.28567954897880554,
+      "learning_rate": 1.4255261969622457e-08,
+      "loss": 0.1767,
+      "step": 384
+    },
+    {
+      "epoch": 2.9166666666666665,
+      "grad_norm": 0.3133796453475952,
+      "learning_rate": 1.214934711168475e-08,
+      "loss": 0.2042,
+      "step": 385
+    },
+    {
+      "epoch": 2.9242424242424243,
+      "grad_norm": 0.31388959288597107,
+      "learning_rate": 1.021103131677692e-08,
+      "loss": 0.2098,
+      "step": 386
+    },
+    {
+      "epoch": 2.9318181818181817,
+      "grad_norm": 0.30285680294036865,
+      "learning_rate": 8.440532926646316e-09,
+      "loss": 0.1935,
+      "step": 387
+    },
+    {
+      "epoch": 2.9393939393939394,
+      "grad_norm": 0.29303839802742004,
+      "learning_rate": 6.8380513792341e-09,
+      "loss": 0.1649,
+      "step": 388
+    },
+    {
+      "epoch": 2.9469696969696972,
+      "grad_norm": 0.28131937980651855,
+      "learning_rate": 5.403767186210218e-09,
+      "loss": 0.1741,
+      "step": 389
+    },
+    {
+      "epoch": 2.9545454545454546,
+      "grad_norm": 0.27338099479675293,
+      "learning_rate": 4.1378419126393285e-09,
+      "loss": 0.1688,
+      "step": 390
+    },
+    {
+      "epoch": 2.962121212121212,
+      "grad_norm": 0.2868610918521881,
+      "learning_rate": 3.0404181587811996e-09,
+      "loss": 0.1774,
+      "step": 391
+    },
+    {
+      "epoch": 2.9696969696969697,
+      "grad_norm": 0.28331971168518066,
+      "learning_rate": 2.1116195440278876e-09,
+      "loss": 0.1636,
+      "step": 392
+    },
+    {
+      "epoch": 2.9772727272727275,
+      "grad_norm": 0.2694188356399536,
+      "learning_rate": 1.3515506929778764e-09,
+      "loss": 0.1611,
+      "step": 393
+    },
+    {
+      "epoch": 2.984848484848485,
+      "grad_norm": 0.2967832088470459,
+      "learning_rate": 7.602972236513405e-10,
+      "loss": 0.1834,
+      "step": 394
+    },
+    {
+      "epoch": 2.992424242424242,
+      "grad_norm": 0.2818254828453064,
+      "learning_rate": 3.379257378458567e-10,
+      "loss": 0.1668,
+      "step": 395
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.291985422372818,
+      "learning_rate": 8.448381363307389e-11,
+      "loss": 0.1836,
+      "step": 396
     }
   ],
   "logging_steps": 1,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 8.713975214603502e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null