End of training

Browse files

Files changed (6) hide show

README.md +18 -4
all_results.json +10 -10
eval_results.json +5 -5
log_history.json +1319 -1137
train_results.json +5 -5
trainer_state.json +1324 -1142

README.md CHANGED Viewed

@@ -1,6 +1,7 @@
 ---
 library_name: transformers
 tags:
 - generated_from_trainer
 datasets:
 - voxceleb
@@ -8,7 +9,20 @@ metrics:
 - accuracy
 model-index:
 - name: ecapa-tdnn-voxceleb1-c512-aam
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,10 +30,10 @@ should probably proofread and complete it, then remove this comment. -->
 # ecapa-tdnn-voxceleb1-c512-aam
-This model is a fine-tuned version of [](https://huggingface.co/) on the voxceleb dataset.
 It achieves the following results on the evaluation set:
-- Loss: nan
-- Accuracy: 0.0007
 ## Model description

 ---
 library_name: transformers
 tags:
+- audio-classification
 - generated_from_trainer
 datasets:
 - voxceleb
 - accuracy
 model-index:
 - name: ecapa-tdnn-voxceleb1-c512-aam
+  results:
+  - task:
+      name: Audio Classification
+      type: audio-classification
+    dataset:
+      name: confit/voxceleb
+      type: voxceleb
+      config: verification
+      split: train
+      args: verification
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.9757901815736382
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # ecapa-tdnn-voxceleb1-c512-aam
+This model is a fine-tuned version of [](https://huggingface.co/) on the confit/voxceleb dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.5840
+- Accuracy: 0.9758
 ## Model description

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 10.0,
-    "eval_accuracy": 0.8030272452068618,
-    "eval_loss": 4.700281620025635,
-    "eval_runtime": 77.9967,
-    "eval_samples_per_second": 190.585,
-    "eval_steps_per_second": 190.585,
-    "total_flos": 2.49073133395968e+18,
-    "train_loss": 7.888943860726876,
-    "train_runtime": 28748.4048,
-    "train_samples_per_second": 46.534,
-    "train_steps_per_second": 0.182
 }

 {
     "epoch": 10.0,
+    "eval_accuracy": 0.9757901815736382,
+    "eval_loss": 0.5840117335319519,
+    "eval_runtime": 13.1656,
+    "eval_samples_per_second": 112.946,
+    "eval_steps_per_second": 112.946,
+    "total_flos": 2.7398100529152e+18,
+    "train_loss": 2.9414075751926587,
+    "train_runtime": 59857.6179,
+    "train_samples_per_second": 24.584,
+    "train_steps_per_second": 0.096
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 10.0,
-    "eval_accuracy": 0.8030272452068618,
-    "eval_loss": 4.700281620025635,
-    "eval_runtime": 77.9967,
-    "eval_samples_per_second": 190.585,
-    "eval_steps_per_second": 190.585
 }

 {
     "epoch": 10.0,
+    "eval_accuracy": 0.9757901815736382,
+    "eval_loss": 0.5840117335319519,
+    "eval_runtime": 13.1656,
+    "eval_samples_per_second": 112.946,
+    "eval_steps_per_second": 112.946
 }

log_history.json CHANGED Viewed

@@ -1,1928 +1,2110 @@
 [
     {
-        "loss": 13.2232,
-        "grad_norm": 6.157718181610107,
-        "learning_rate": 3.824091778202677e-06,
-        "epoch": 0.03824091778202677,
         "step": 20
     },
     {
-        "loss": 13.2113,
-        "grad_norm": 6.144223213195801,
-        "learning_rate": 7.648183556405354e-06,
-        "epoch": 0.07648183556405354,
         "step": 40
     },
     {
-        "loss": 13.1625,
-        "grad_norm": 6.032691955566406,
-        "learning_rate": 1.147227533460803e-05,
-        "epoch": 0.1147227533460803,
         "step": 60
     },
     {
-        "loss": 13.1174,
-        "grad_norm": 5.916826248168945,
-        "learning_rate": 1.529636711281071e-05,
-        "epoch": 0.15296367112810708,
         "step": 80
     },
     {
-        "loss": 13.0512,
-        "grad_norm": 5.7198004722595215,
-        "learning_rate": 1.9120458891013384e-05,
-        "epoch": 0.19120458891013384,
         "step": 100
     },
     {
-        "loss": 12.9931,
-        "grad_norm": 5.554529666900635,
-        "learning_rate": 2.294455066921606e-05,
-        "epoch": 0.2294455066921606,
         "step": 120
     },
     {
-        "loss": 12.9042,
-        "grad_norm": 5.364482879638672,
-        "learning_rate": 2.6768642447418742e-05,
-        "epoch": 0.2676864244741874,
         "step": 140
     },
     {
-        "loss": 12.8488,
-        "grad_norm": 5.091818809509277,
-        "learning_rate": 3.059273422562142e-05,
-        "epoch": 0.30592734225621415,
         "step": 160
     },
     {
-        "loss": 12.7715,
-        "grad_norm": 5.035643577575684,
-        "learning_rate": 3.441682600382409e-05,
-        "epoch": 0.3441682600382409,
         "step": 180
     },
     {
-        "loss": 12.6747,
-        "grad_norm": 4.819056987762451,
-        "learning_rate": 3.824091778202677e-05,
-        "epoch": 0.3824091778202677,
         "step": 200
     },
     {
-        "loss": 12.6366,
-        "grad_norm": 4.597919464111328,
-        "learning_rate": 4.2065009560229444e-05,
-        "epoch": 0.42065009560229444,
         "step": 220
     },
     {
-        "loss": 12.5388,
-        "grad_norm": 4.551054954528809,
-        "learning_rate": 4.588910133843212e-05,
-        "epoch": 0.4588910133843212,
         "step": 240
     },
     {
-        "loss": 12.4527,
-        "grad_norm": 4.289029598236084,
-        "learning_rate": 4.97131931166348e-05,
-        "epoch": 0.497131931166348,
         "step": 260
     },
     {
-        "loss": 12.3809,
-        "grad_norm": 4.291126728057861,
-        "learning_rate": 5.3537284894837484e-05,
-        "epoch": 0.5353728489483748,
         "step": 280
     },
     {
-        "loss": 12.3185,
-        "grad_norm": 4.090356826782227,
-        "learning_rate": 5.736137667304016e-05,
-        "epoch": 0.5736137667304015,
         "step": 300
     },
     {
-        "loss": 12.2101,
-        "grad_norm": 3.9066805839538574,
-        "learning_rate": 6.118546845124283e-05,
-        "epoch": 0.6118546845124283,
         "step": 320
     },
     {
-        "loss": 12.1255,
-        "grad_norm": 3.937908887863159,
-        "learning_rate": 6.50095602294455e-05,
-        "epoch": 0.6500956022944551,
         "step": 340
     },
     {
-        "loss": 12.0543,
-        "grad_norm": 3.919820547103882,
-        "learning_rate": 6.883365200764819e-05,
-        "epoch": 0.6883365200764818,
         "step": 360
     },
     {
-        "loss": 11.9417,
-        "grad_norm": 3.8298187255859375,
-        "learning_rate": 7.265774378585087e-05,
-        "epoch": 0.7265774378585086,
         "step": 380
     },
     {
-        "loss": 11.8644,
-        "grad_norm": 3.7290520668029785,
-        "learning_rate": 7.648183556405354e-05,
-        "epoch": 0.7648183556405354,
         "step": 400
     },
     {
-        "loss": 11.8122,
-        "grad_norm": 3.76938533782959,
-        "learning_rate": 8.030592734225622e-05,
-        "epoch": 0.8030592734225621,
         "step": 420
     },
     {
-        "loss": 11.7117,
-        "grad_norm": 3.8729827404022217,
-        "learning_rate": 8.413001912045889e-05,
-        "epoch": 0.8413001912045889,
         "step": 440
     },
     {
-        "loss": 11.6245,
-        "grad_norm": 3.7178924083709717,
-        "learning_rate": 8.795411089866157e-05,
-        "epoch": 0.8795411089866156,
         "step": 460
     },
     {
-        "loss": 11.547,
-        "grad_norm": 3.7744827270507812,
-        "learning_rate": 9.177820267686424e-05,
-        "epoch": 0.9177820267686424,
         "step": 480
     },
     {
-        "loss": 11.4699,
-        "grad_norm": 3.6705052852630615,
-        "learning_rate": 9.560229445506692e-05,
-        "epoch": 0.9560229445506692,
         "step": 500
     },
     {
-        "loss": 11.3851,
-        "grad_norm": 3.6992719173431396,
-        "learning_rate": 9.94263862332696e-05,
-        "epoch": 0.994263862332696,
         "step": 520
     },
     {
-        "eval_loss": 11.029301643371582,
-        "eval_accuracy": 0.18062563067608475,
-        "eval_runtime": 592.6353,
-        "eval_samples_per_second": 25.083,
-        "eval_steps_per_second": 25.083,
-        "epoch": 1.0,
-        "step": 523
-    },
-    {
-        "loss": 11.2589,
-        "grad_norm": 3.6838159561157227,
-        "learning_rate": 9.963883577650308e-05,
-        "epoch": 1.0325047801147227,
         "step": 540
     },
     {
-        "loss": 11.1668,
-        "grad_norm": 3.7846293449401855,
-        "learning_rate": 9.921393669003612e-05,
-        "epoch": 1.0707456978967496,
         "step": 560
     },
     {
-        "loss": 11.1053,
-        "grad_norm": 3.688416004180908,
-        "learning_rate": 9.878903760356916e-05,
-        "epoch": 1.1089866156787762,
         "step": 580
     },
     {
-        "loss": 11.019,
-        "grad_norm": 3.724273204803467,
-        "learning_rate": 9.836413851710219e-05,
-        "epoch": 1.147227533460803,
         "step": 600
     },
     {
-        "loss": 10.9731,
-        "grad_norm": 3.840388536453247,
-        "learning_rate": 9.793923943063523e-05,
-        "epoch": 1.1854684512428297,
         "step": 620
     },
     {
-        "loss": 10.875,
-        "grad_norm": 3.828228235244751,
-        "learning_rate": 9.751434034416827e-05,
-        "epoch": 1.2237093690248566,
         "step": 640
     },
     {
-        "loss": 10.8111,
-        "grad_norm": 3.891911745071411,
-        "learning_rate": 9.70894412577013e-05,
-        "epoch": 1.2619502868068833,
         "step": 660
     },
     {
-        "loss": 10.7717,
-        "grad_norm": 3.8076562881469727,
-        "learning_rate": 9.666454217123433e-05,
-        "epoch": 1.3001912045889101,
         "step": 680
     },
     {
-        "loss": 10.6723,
-        "grad_norm": 3.8521881103515625,
-        "learning_rate": 9.623964308476737e-05,
-        "epoch": 1.338432122370937,
         "step": 700
     },
     {
-        "loss": 10.5961,
-        "grad_norm": 3.8576488494873047,
-        "learning_rate": 9.58147439983004e-05,
-        "epoch": 1.3766730401529637,
         "step": 720
     },
     {
-        "loss": 10.5392,
-        "grad_norm": 4.002715587615967,
-        "learning_rate": 9.538984491183345e-05,
-        "epoch": 1.4149139579349903,
         "step": 740
     },
     {
-        "loss": 10.5018,
-        "grad_norm": 3.8657026290893555,
-        "learning_rate": 9.496494582536648e-05,
-        "epoch": 1.4531548757170172,
         "step": 760
     },
     {
-        "loss": 10.4325,
-        "grad_norm": 3.9424169063568115,
-        "learning_rate": 9.454004673889951e-05,
-        "epoch": 1.491395793499044,
         "step": 780
     },
     {
-        "loss": 10.3722,
-        "grad_norm": 3.9783968925476074,
-        "learning_rate": 9.411514765243256e-05,
-        "epoch": 1.5296367112810707,
         "step": 800
     },
     {
-        "loss": 10.3069,
-        "grad_norm": 4.081951141357422,
-        "learning_rate": 9.369024856596559e-05,
-        "epoch": 1.5678776290630974,
         "step": 820
     },
     {
-        "loss": 10.2527,
-        "grad_norm": 4.141290187835693,
-        "learning_rate": 9.326534947949863e-05,
-        "epoch": 1.6061185468451242,
         "step": 840
     },
     {
-        "loss": 10.2271,
-        "grad_norm": 4.294083595275879,
-        "learning_rate": 9.284045039303167e-05,
-        "epoch": 1.644359464627151,
         "step": 860
     },
     {
-        "loss": 10.1756,
-        "grad_norm": 4.727543354034424,
-        "learning_rate": 9.241555130656469e-05,
-        "epoch": 1.682600382409178,
         "step": 880
     },
     {
-        "loss": 10.0936,
-        "grad_norm": 4.068965911865234,
-        "learning_rate": 9.199065222009773e-05,
-        "epoch": 1.7208413001912046,
         "step": 900
     },
     {
-        "loss": 10.0937,
-        "grad_norm": 4.025643825531006,
-        "learning_rate": 9.156575313363077e-05,
-        "epoch": 1.7590822179732313,
         "step": 920
     },
     {
-        "loss": 10.0217,
-        "grad_norm": 4.317354679107666,
-        "learning_rate": 9.11408540471638e-05,
-        "epoch": 1.7973231357552581,
         "step": 940
     },
     {
-        "loss": 9.9743,
-        "grad_norm": 4.101060390472412,
-        "learning_rate": 9.071595496069684e-05,
-        "epoch": 1.835564053537285,
         "step": 960
     },
     {
-        "loss": 9.9879,
-        "grad_norm": 4.225609302520752,
-        "learning_rate": 9.029105587422988e-05,
-        "epoch": 1.8738049713193117,
         "step": 980
     },
     {
-        "loss": 9.8273,
-        "grad_norm": 4.3140668869018555,
-        "learning_rate": 8.986615678776292e-05,
-        "epoch": 1.9120458891013383,
         "step": 1000
     },
     {
-        "loss": 9.8136,
-        "grad_norm": 4.199500560760498,
-        "learning_rate": 8.944125770129594e-05,
-        "epoch": 1.9502868068833652,
         "step": 1020
     },
     {
-        "loss": 9.7596,
-        "grad_norm": 4.457912445068359,
-        "learning_rate": 8.901635861482898e-05,
-        "epoch": 1.988527724665392,
         "step": 1040
     },
     {
-        "eval_loss": 9.140138626098633,
-        "eval_accuracy": 0.3849983181971073,
-        "eval_runtime": 461.2724,
-        "eval_samples_per_second": 32.226,
-        "eval_steps_per_second": 32.226,
-        "epoch": 2.0,
-        "step": 1046
-    },
-    {
-        "loss": 9.714,
-        "grad_norm": 4.428006172180176,
-        "learning_rate": 8.859145952836202e-05,
-        "epoch": 2.026768642447419,
         "step": 1060
     },
     {
-        "loss": 9.5508,
-        "grad_norm": 4.372852325439453,
-        "learning_rate": 8.816656044189505e-05,
-        "epoch": 2.0650095602294454,
         "step": 1080
     },
     {
-        "loss": 9.6096,
-        "grad_norm": 4.381687641143799,
-        "learning_rate": 8.774166135542809e-05,
-        "epoch": 2.1032504780114722,
         "step": 1100
     },
     {
-        "loss": 9.5077,
-        "grad_norm": 4.5865631103515625,
-        "learning_rate": 8.731676226896113e-05,
-        "epoch": 2.141491395793499,
         "step": 1120
     },
     {
-        "loss": 9.5044,
-        "grad_norm": 4.363910675048828,
-        "learning_rate": 8.689186318249416e-05,
-        "epoch": 2.179732313575526,
         "step": 1140
     },
     {
-        "loss": 9.4205,
-        "grad_norm": 4.577084541320801,
-        "learning_rate": 8.646696409602721e-05,
-        "epoch": 2.2179732313575524,
         "step": 1160
     },
     {
-        "loss": 9.4317,
-        "grad_norm": 4.576254367828369,
-        "learning_rate": 8.604206500956024e-05,
-        "epoch": 2.2562141491395793,
         "step": 1180
     },
     {
-        "loss": 9.3607,
-        "grad_norm": 4.4399847984313965,
-        "learning_rate": 8.561716592309326e-05,
-        "epoch": 2.294455066921606,
         "step": 1200
     },
     {
-        "loss": 9.2533,
-        "grad_norm": 4.595015525817871,
-        "learning_rate": 8.51922668366263e-05,
-        "epoch": 2.332695984703633,
         "step": 1220
     },
     {
-        "loss": 9.3384,
-        "grad_norm": 4.900874614715576,
-        "learning_rate": 8.476736775015934e-05,
-        "epoch": 2.3709369024856595,
         "step": 1240
     },
     {
-        "loss": 9.293,
-        "grad_norm": 4.594742774963379,
-        "learning_rate": 8.434246866369238e-05,
-        "epoch": 2.4091778202676863,
         "step": 1260
     },
     {
-        "loss": 9.1986,
-        "grad_norm": 4.587216377258301,
-        "learning_rate": 8.391756957722541e-05,
-        "epoch": 2.447418738049713,
         "step": 1280
     },
     {
-        "loss": 9.1358,
-        "grad_norm": 4.735275745391846,
-        "learning_rate": 8.349267049075845e-05,
-        "epoch": 2.48565965583174,
         "step": 1300
     },
     {
-        "loss": 9.1284,
-        "grad_norm": 4.627840995788574,
-        "learning_rate": 8.306777140429149e-05,
-        "epoch": 2.5239005736137665,
         "step": 1320
     },
     {
-        "loss": 9.0949,
-        "grad_norm": 4.658718585968018,
-        "learning_rate": 8.264287231782451e-05,
-        "epoch": 2.5621414913957934,
         "step": 1340
     },
     {
-        "loss": 9.0312,
-        "grad_norm": 4.875549793243408,
-        "learning_rate": 8.221797323135755e-05,
-        "epoch": 2.6003824091778203,
         "step": 1360
     },
     {
-        "loss": 8.9949,
-        "grad_norm": 4.683437347412109,
-        "learning_rate": 8.179307414489059e-05,
-        "epoch": 2.638623326959847,
         "step": 1380
     },
     {
-        "loss": 8.9705,
-        "grad_norm": 4.861114025115967,
-        "learning_rate": 8.136817505842362e-05,
-        "epoch": 2.676864244741874,
         "step": 1400
     },
     {
-        "loss": 8.9483,
-        "grad_norm": 4.727562427520752,
-        "learning_rate": 8.094327597195667e-05,
-        "epoch": 2.7151051625239004,
         "step": 1420
     },
     {
-        "loss": 8.9254,
-        "grad_norm": 4.8202948570251465,
-        "learning_rate": 8.05183768854897e-05,
-        "epoch": 2.7533460803059273,
         "step": 1440
     },
     {
-        "loss": 8.8768,
-        "grad_norm": 4.926464557647705,
-        "learning_rate": 8.009347779902273e-05,
-        "epoch": 2.791586998087954,
         "step": 1460
     },
     {
-        "loss": 8.8044,
-        "grad_norm": 4.7756028175354,
-        "learning_rate": 7.966857871255578e-05,
-        "epoch": 2.8298279158699806,
         "step": 1480
     },
     {
-        "loss": 8.7788,
-        "grad_norm": 4.888403415679932,
-        "learning_rate": 7.92436796260888e-05,
-        "epoch": 2.8680688336520075,
         "step": 1500
     },
     {
-        "loss": 8.8032,
-        "grad_norm": 4.943230152130127,
-        "learning_rate": 7.881878053962184e-05,
-        "epoch": 2.9063097514340344,
         "step": 1520
     },
     {
-        "loss": 8.7507,
-        "grad_norm": 5.011119842529297,
-        "learning_rate": 7.839388145315488e-05,
-        "epoch": 2.9445506692160612,
         "step": 1540
     },
     {
-        "loss": 8.7136,
-        "grad_norm": 5.068637847900391,
-        "learning_rate": 7.796898236668791e-05,
-        "epoch": 2.982791586998088,
         "step": 1560
     },
     {
-        "eval_loss": 7.882061958312988,
-        "eval_accuracy": 0.52418432559704,
-        "eval_runtime": 418.5795,
-        "eval_samples_per_second": 35.513,
-        "eval_steps_per_second": 35.513,
-        "epoch": 3.0,
-        "step": 1569
-    },
-    {
-        "loss": 8.6104,
-        "grad_norm": 4.895749092102051,
-        "learning_rate": 7.754408328022095e-05,
-        "epoch": 3.0210325047801145,
         "step": 1580
     },
     {
-        "loss": 8.6136,
-        "grad_norm": 5.138400077819824,
-        "learning_rate": 7.711918419375399e-05,
-        "epoch": 3.0592734225621414,
         "step": 1600
     },
     {
-        "loss": 8.5866,
-        "grad_norm": 5.270049571990967,
-        "learning_rate": 7.669428510728702e-05,
-        "epoch": 3.0975143403441683,
         "step": 1620
     },
     {
-        "loss": 8.492,
-        "grad_norm": 5.178355693817139,
-        "learning_rate": 7.626938602082006e-05,
-        "epoch": 3.135755258126195,
         "step": 1640
     },
     {
-        "loss": 8.4897,
-        "grad_norm": 5.312692165374756,
-        "learning_rate": 7.58444869343531e-05,
-        "epoch": 3.173996175908222,
         "step": 1660
     },
     {
-        "loss": 8.4441,
-        "grad_norm": 5.227985382080078,
-        "learning_rate": 7.541958784788614e-05,
-        "epoch": 3.2122370936902485,
         "step": 1680
     },
     {
-        "loss": 8.4722,
-        "grad_norm": 5.042078495025635,
-        "learning_rate": 7.499468876141916e-05,
-        "epoch": 3.2504780114722753,
         "step": 1700
     },
     {
-        "loss": 8.3105,
-        "grad_norm": 5.250526428222656,
-        "learning_rate": 7.45697896749522e-05,
-        "epoch": 3.288718929254302,
         "step": 1720
     },
     {
-        "loss": 8.3308,
-        "grad_norm": 5.22187614440918,
-        "learning_rate": 7.414489058848524e-05,
-        "epoch": 3.3269598470363286,
         "step": 1740
     },
     {
-        "loss": 8.2969,
-        "grad_norm": 5.491254806518555,
-        "learning_rate": 7.371999150201827e-05,
-        "epoch": 3.3652007648183555,
         "step": 1760
     },
     {
-        "loss": 8.2593,
-        "grad_norm": 5.482990741729736,
-        "learning_rate": 7.329509241555131e-05,
-        "epoch": 3.4034416826003824,
         "step": 1780
     },
     {
-        "loss": 8.3087,
-        "grad_norm": 5.359766960144043,
-        "learning_rate": 7.287019332908435e-05,
-        "epoch": 3.4416826003824093,
         "step": 1800
     },
     {
-        "loss": 8.2664,
-        "grad_norm": 5.788363456726074,
-        "learning_rate": 7.244529424261737e-05,
-        "epoch": 3.479923518164436,
         "step": 1820
     },
     {
-        "loss": 8.2543,
-        "grad_norm": 5.335551738739014,
-        "learning_rate": 7.202039515615043e-05,
-        "epoch": 3.5181644359464626,
         "step": 1840
     },
     {
-        "loss": 8.2604,
-        "grad_norm": 5.465627193450928,
-        "learning_rate": 7.159549606968345e-05,
-        "epoch": 3.5564053537284894,
         "step": 1860
     },
     {
-        "loss": 8.1616,
-        "grad_norm": 5.594823837280273,
-        "learning_rate": 7.117059698321648e-05,
-        "epoch": 3.5946462715105163,
         "step": 1880
     },
     {
-        "loss": 8.1582,
-        "grad_norm": 5.58858060836792,
-        "learning_rate": 7.074569789674953e-05,
-        "epoch": 3.632887189292543,
         "step": 1900
     },
     {
-        "loss": 8.1061,
-        "grad_norm": 5.514508247375488,
-        "learning_rate": 7.032079881028256e-05,
-        "epoch": 3.67112810707457,
         "step": 1920
     },
     {
-        "loss": 8.0912,
-        "grad_norm": 5.644900321960449,
-        "learning_rate": 6.98958997238156e-05,
-        "epoch": 3.7093690248565965,
         "step": 1940
     },
     {
-        "loss": 7.9596,
-        "grad_norm": 5.701168060302734,
-        "learning_rate": 6.947100063734864e-05,
-        "epoch": 3.7476099426386233,
         "step": 1960
     },
     {
-        "loss": 8.0403,
-        "grad_norm": 5.880733013153076,
-        "learning_rate": 6.904610155088167e-05,
-        "epoch": 3.78585086042065,
         "step": 1980
     },
     {
-        "loss": 7.9666,
-        "grad_norm": 5.638689994812012,
-        "learning_rate": 6.86212024644147e-05,
-        "epoch": 3.8240917782026767,
         "step": 2000
     },
     {
-        "loss": 7.9633,
-        "grad_norm": 6.002101421356201,
-        "learning_rate": 6.819630337794775e-05,
-        "epoch": 3.8623326959847035,
         "step": 2020
     },
     {
-        "loss": 7.8817,
-        "grad_norm": 5.628067493438721,
-        "learning_rate": 6.777140429148077e-05,
-        "epoch": 3.9005736137667304,
         "step": 2040
     },
     {
-        "loss": 7.9118,
-        "grad_norm": 6.128510475158691,
-        "learning_rate": 6.734650520501381e-05,
-        "epoch": 3.9388145315487573,
         "step": 2060
     },
     {
-        "loss": 7.848,
-        "grad_norm": 5.620929718017578,
-        "learning_rate": 6.692160611854685e-05,
-        "epoch": 3.977055449330784,
         "step": 2080
     },
     {
-        "eval_loss": 6.945113658905029,
-        "eval_accuracy": 0.6143962327615203,
-        "eval_runtime": 367.1966,
-        "eval_samples_per_second": 40.482,
-        "eval_steps_per_second": 40.482,
-        "epoch": 4.0,
-        "step": 2092
-    },
-    {
-        "loss": 7.8607,
-        "grad_norm": 5.820804595947266,
-        "learning_rate": 6.649670703207989e-05,
-        "epoch": 4.015296367112811,
         "step": 2100
     },
     {
-        "loss": 7.7072,
-        "grad_norm": 5.6448493003845215,
-        "learning_rate": 6.607180794561292e-05,
-        "epoch": 4.053537284894838,
         "step": 2120
     },
     {
-        "loss": 7.772,
-        "grad_norm": 6.283373832702637,
-        "learning_rate": 6.564690885914596e-05,
-        "epoch": 4.091778202676864,
         "step": 2140
     },
     {
-        "loss": 7.7211,
-        "grad_norm": 6.125846862792969,
-        "learning_rate": 6.5222009772679e-05,
-        "epoch": 4.130019120458891,
         "step": 2160
     },
     {
-        "loss": 7.6563,
-        "grad_norm": 5.701002597808838,
-        "learning_rate": 6.479711068621202e-05,
-        "epoch": 4.168260038240918,
         "step": 2180
     },
     {
-        "loss": 7.711,
-        "grad_norm": 5.910340785980225,
-        "learning_rate": 6.437221159974506e-05,
-        "epoch": 4.2065009560229445,
         "step": 2200
     },
     {
-        "loss": 7.7582,
-        "grad_norm": 5.8003082275390625,
-        "learning_rate": 6.39473125132781e-05,
-        "epoch": 4.244741873804971,
         "step": 2220
     },
     {
-        "loss": 7.6215,
-        "grad_norm": 5.95621395111084,
-        "learning_rate": 6.352241342681113e-05,
-        "epoch": 4.282982791586998,
         "step": 2240
     },
     {
-        "loss": 7.5932,
-        "grad_norm": 5.836912155151367,
-        "learning_rate": 6.309751434034417e-05,
-        "epoch": 4.321223709369025,
         "step": 2260
     },
     {
-        "loss": 7.5122,
-        "grad_norm": 6.156320095062256,
-        "learning_rate": 6.267261525387721e-05,
-        "epoch": 4.359464627151052,
         "step": 2280
     },
     {
-        "loss": 7.5488,
-        "grad_norm": 5.937085151672363,
-        "learning_rate": 6.224771616741024e-05,
-        "epoch": 4.397705544933078,
         "step": 2300
     },
     {
-        "loss": 7.5972,
-        "grad_norm": 5.949016571044922,
-        "learning_rate": 6.182281708094328e-05,
-        "epoch": 4.435946462715105,
         "step": 2320
     },
     {
-        "loss": 7.4327,
-        "grad_norm": 6.26347541809082,
-        "learning_rate": 6.139791799447631e-05,
-        "epoch": 4.474187380497132,
         "step": 2340
     },
     {
-        "loss": 7.555,
-        "grad_norm": 6.376476287841797,
-        "learning_rate": 6.097301890800935e-05,
-        "epoch": 4.512428298279159,
         "step": 2360
     },
     {
-        "loss": 7.5463,
-        "grad_norm": 6.2988200187683105,
-        "learning_rate": 6.054811982154238e-05,
-        "epoch": 4.550669216061186,
         "step": 2380
     },
     {
-        "loss": 7.4637,
-        "grad_norm": 5.916903972625732,
-        "learning_rate": 6.012322073507543e-05,
-        "epoch": 4.588910133843212,
         "step": 2400
     },
     {
-        "loss": 7.3857,
-        "grad_norm": 5.896063327789307,
-        "learning_rate": 5.969832164860846e-05,
-        "epoch": 4.627151051625239,
         "step": 2420
     },
     {
-        "loss": 7.4363,
-        "grad_norm": 6.14431619644165,
-        "learning_rate": 5.927342256214149e-05,
-        "epoch": 4.665391969407266,
         "step": 2440
     },
     {
-        "loss": 7.406,
-        "grad_norm": 6.2994256019592285,
-        "learning_rate": 5.8848523475674533e-05,
-        "epoch": 4.7036328871892925,
         "step": 2460
     },
     {
-        "loss": 7.338,
-        "grad_norm": 6.134793758392334,
-        "learning_rate": 5.8423624389207567e-05,
-        "epoch": 4.741873804971319,
         "step": 2480
     },
     {
-        "loss": 7.3912,
-        "grad_norm": 6.245213031768799,
-        "learning_rate": 5.79987253027406e-05,
-        "epoch": 4.780114722753346,
         "step": 2500
     },
     {
-        "loss": 7.3548,
-        "grad_norm": 6.118636131286621,
-        "learning_rate": 5.757382621627364e-05,
-        "epoch": 4.818355640535373,
         "step": 2520
     },
     {
-        "loss": 7.3119,
-        "grad_norm": 6.391002178192139,
-        "learning_rate": 5.714892712980667e-05,
-        "epoch": 4.8565965583174,
         "step": 2540
     },
     {
-        "loss": 7.2119,
-        "grad_norm": 6.539446830749512,
-        "learning_rate": 5.6724028043339705e-05,
-        "epoch": 4.894837476099426,
         "step": 2560
     },
     {
-        "loss": 7.2505,
-        "grad_norm": 6.162653923034668,
-        "learning_rate": 5.6299128956872745e-05,
-        "epoch": 4.933078393881453,
         "step": 2580
     },
     {
-        "loss": 7.1912,
-        "grad_norm": 6.580591678619385,
-        "learning_rate": 5.587422987040578e-05,
-        "epoch": 4.97131931166348,
         "step": 2600
     },
     {
-        "eval_loss": 6.262951850891113,
-        "eval_accuracy": 0.6821392532795156,
-        "eval_runtime": 76.4531,
-        "eval_samples_per_second": 194.433,
-        "eval_steps_per_second": 194.433,
-        "epoch": 5.0,
-        "step": 2615
-    },
-    {
-        "loss": 7.1863,
-        "grad_norm": 6.838705062866211,
-        "learning_rate": 5.544933078393881e-05,
-        "epoch": 5.009560229445507,
         "step": 2620
     },
     {
-        "loss": 7.1259,
-        "grad_norm": 6.260281562805176,
-        "learning_rate": 5.502443169747186e-05,
-        "epoch": 5.047801147227533,
         "step": 2640
     },
     {
-        "loss": 7.1559,
-        "grad_norm": 6.463006496429443,
-        "learning_rate": 5.459953261100489e-05,
-        "epoch": 5.08604206500956,
         "step": 2660
     },
     {
-        "loss": 7.1318,
-        "grad_norm": 6.499185562133789,
-        "learning_rate": 5.4174633524537924e-05,
-        "epoch": 5.124282982791587,
         "step": 2680
     },
     {
-        "loss": 7.0993,
-        "grad_norm": 6.508650302886963,
-        "learning_rate": 5.3749734438070964e-05,
-        "epoch": 5.162523900573614,
         "step": 2700
     },
     {
-        "loss": 7.0823,
-        "grad_norm": 6.573218822479248,
-        "learning_rate": 5.3324835351604e-05,
-        "epoch": 5.2007648183556405,
         "step": 2720
     },
     {
-        "loss": 7.0839,
-        "grad_norm": 6.863697052001953,
-        "learning_rate": 5.289993626513703e-05,
-        "epoch": 5.239005736137667,
         "step": 2740
     },
     {
-        "loss": 7.0723,
-        "grad_norm": 6.305070877075195,
-        "learning_rate": 5.247503717867007e-05,
-        "epoch": 5.277246653919694,
         "step": 2760
     },
     {
-        "loss": 6.9592,
-        "grad_norm": 6.715279579162598,
-        "learning_rate": 5.20501380922031e-05,
-        "epoch": 5.315487571701721,
         "step": 2780
     },
     {
-        "loss": 7.0275,
-        "grad_norm": 6.625701904296875,
-        "learning_rate": 5.1625239005736136e-05,
-        "epoch": 5.353728489483748,
         "step": 2800
     },
     {
-        "loss": 6.9146,
-        "grad_norm": 6.717496871948242,
-        "learning_rate": 5.120033991926918e-05,
-        "epoch": 5.3919694072657744,
         "step": 2820
     },
     {
-        "loss": 6.9984,
-        "grad_norm": 6.500243186950684,
-        "learning_rate": 5.0775440832802216e-05,
-        "epoch": 5.430210325047801,
         "step": 2840
     },
     {
-        "loss": 6.9367,
-        "grad_norm": 6.41347074508667,
-        "learning_rate": 5.035054174633524e-05,
-        "epoch": 5.468451242829828,
         "step": 2860
     },
     {
-        "loss": 6.9997,
-        "grad_norm": 6.83429479598999,
-        "learning_rate": 4.992564265986828e-05,
-        "epoch": 5.506692160611855,
         "step": 2880
     },
     {
-        "loss": 6.9204,
-        "grad_norm": 6.565597057342529,
-        "learning_rate": 4.950074357340132e-05,
-        "epoch": 5.544933078393882,
         "step": 2900
     },
     {
-        "loss": 6.8926,
-        "grad_norm": 6.9456095695495605,
-        "learning_rate": 4.907584448693436e-05,
-        "epoch": 5.583173996175908,
         "step": 2920
     },
     {
-        "loss": 6.8993,
-        "grad_norm": 7.052099704742432,
-        "learning_rate": 4.865094540046739e-05,
-        "epoch": 5.621414913957935,
         "step": 2940
     },
     {
-        "loss": 6.8474,
-        "grad_norm": 7.128490924835205,
-        "learning_rate": 4.822604631400043e-05,
-        "epoch": 5.659655831739962,
         "step": 2960
     },
     {
-        "loss": 6.8509,
-        "grad_norm": 6.792144298553467,
-        "learning_rate": 4.780114722753346e-05,
-        "epoch": 5.6978967495219885,
         "step": 2980
     },
     {
-        "loss": 6.9141,
-        "grad_norm": 6.853285312652588,
-        "learning_rate": 4.73762481410665e-05,
-        "epoch": 5.736137667304015,
         "step": 3000
     },
     {
-        "loss": 6.7391,
-        "grad_norm": 7.153258800506592,
-        "learning_rate": 4.695134905459953e-05,
-        "epoch": 5.774378585086042,
         "step": 3020
     },
     {
-        "loss": 6.7554,
-        "grad_norm": 6.9271321296691895,
-        "learning_rate": 4.6526449968132566e-05,
-        "epoch": 5.812619502868069,
         "step": 3040
     },
     {
-        "loss": 6.8172,
-        "grad_norm": 7.218133926391602,
-        "learning_rate": 4.6101550881665606e-05,
-        "epoch": 5.850860420650095,
         "step": 3060
     },
     {
-        "loss": 6.8442,
-        "grad_norm": 7.0558695793151855,
-        "learning_rate": 4.5676651795198646e-05,
-        "epoch": 5.8891013384321225,
         "step": 3080
     },
     {
-        "loss": 6.696,
-        "grad_norm": 6.762065887451172,
-        "learning_rate": 4.525175270873168e-05,
-        "epoch": 5.927342256214149,
         "step": 3100
     },
     {
-        "loss": 6.6763,
-        "grad_norm": 6.8173604011535645,
-        "learning_rate": 4.482685362226471e-05,
-        "epoch": 5.965583173996176,
         "step": 3120
     },
     {
-        "eval_loss": 5.7182440757751465,
-        "eval_accuracy": 0.7291624621594349,
-        "eval_runtime": 444.003,
-        "eval_samples_per_second": 33.48,
-        "eval_steps_per_second": 33.48,
-        "epoch": 6.0,
-        "step": 3138
-    },
-    {
-        "loss": 6.6927,
-        "grad_norm": 7.1014723777771,
-        "learning_rate": 4.440195453579775e-05,
-        "epoch": 6.003824091778203,
         "step": 3140
     },
     {
-        "loss": 6.6538,
-        "grad_norm": 6.958450794219971,
-        "learning_rate": 4.3977055449330785e-05,
-        "epoch": 6.042065009560229,
         "step": 3160
     },
     {
-        "loss": 6.5479,
-        "grad_norm": 6.920003890991211,
-        "learning_rate": 4.3552156362863825e-05,
-        "epoch": 6.080305927342256,
         "step": 3180
     },
     {
-        "loss": 6.5668,
-        "grad_norm": 7.053244113922119,
-        "learning_rate": 4.312725727639686e-05,
-        "epoch": 6.118546845124283,
         "step": 3200
     },
     {
-        "loss": 6.6722,
-        "grad_norm": 6.9157185554504395,
-        "learning_rate": 4.270235818992989e-05,
-        "epoch": 6.15678776290631,
         "step": 3220
     },
     {
-        "loss": 6.6397,
-        "grad_norm": 7.149935722351074,
-        "learning_rate": 4.227745910346293e-05,
-        "epoch": 6.195028680688337,
         "step": 3240
     },
     {
-        "loss": 6.6041,
-        "grad_norm": 7.318164825439453,
-        "learning_rate": 4.185256001699597e-05,
-        "epoch": 6.233269598470363,
         "step": 3260
     },
     {
-        "loss": 6.5492,
-        "grad_norm": 7.044018268585205,
-        "learning_rate": 4.1427660930529e-05,
-        "epoch": 6.27151051625239,
         "step": 3280
     },
     {
-        "loss": 6.5679,
-        "grad_norm": 7.045164585113525,
-        "learning_rate": 4.1002761844062037e-05,
-        "epoch": 6.309751434034417,
         "step": 3300
     },
     {
-        "loss": 6.5695,
-        "grad_norm": 7.092489242553711,
-        "learning_rate": 4.0577862757595076e-05,
-        "epoch": 6.347992351816444,
         "step": 3320
     },
     {
-        "loss": 6.4842,
-        "grad_norm": 6.940147399902344,
-        "learning_rate": 4.015296367112811e-05,
-        "epoch": 6.3862332695984705,
         "step": 3340
     },
     {
-        "loss": 6.5317,
-        "grad_norm": 7.10172176361084,
-        "learning_rate": 3.972806458466114e-05,
-        "epoch": 6.424474187380497,
         "step": 3360
     },
     {
-        "loss": 6.4702,
-        "grad_norm": 7.129051208496094,
-        "learning_rate": 3.930316549819418e-05,
-        "epoch": 6.462715105162524,
         "step": 3380
     },
     {
-        "loss": 6.3999,
-        "grad_norm": 7.501070499420166,
-        "learning_rate": 3.8878266411727215e-05,
-        "epoch": 6.500956022944551,
         "step": 3400
     },
     {
-        "loss": 6.4932,
-        "grad_norm": 7.325244426727295,
-        "learning_rate": 3.8453367325260255e-05,
-        "epoch": 6.539196940726577,
         "step": 3420
     },
     {
-        "loss": 6.3927,
-        "grad_norm": 7.361093521118164,
-        "learning_rate": 3.802846823879329e-05,
-        "epoch": 6.577437858508604,
         "step": 3440
     },
     {
-        "loss": 6.4861,
-        "grad_norm": 7.228673458099365,
-        "learning_rate": 3.760356915232632e-05,
-        "epoch": 6.615678776290631,
         "step": 3460
     },
     {
-        "loss": 6.4623,
-        "grad_norm": 7.602611064910889,
-        "learning_rate": 3.717867006585936e-05,
-        "epoch": 6.653919694072657,
         "step": 3480
     },
     {
-        "loss": 6.4282,
-        "grad_norm": 7.901960372924805,
-        "learning_rate": 3.6753770979392394e-05,
-        "epoch": 6.692160611854685,
         "step": 3500
     },
     {
-        "loss": 6.3799,
-        "grad_norm": 7.1125383377075195,
-        "learning_rate": 3.6328871892925434e-05,
-        "epoch": 6.730401529636711,
         "step": 3520
     },
     {
-        "loss": 6.3707,
-        "grad_norm": 7.1385884284973145,
-        "learning_rate": 3.590397280645847e-05,
-        "epoch": 6.768642447418738,
         "step": 3540
     },
     {
-        "loss": 6.4388,
-        "grad_norm": 7.548192977905273,
-        "learning_rate": 3.54790737199915e-05,
-        "epoch": 6.806883365200765,
         "step": 3560
     },
     {
-        "loss": 6.4223,
-        "grad_norm": 7.492359161376953,
-        "learning_rate": 3.505417463352454e-05,
-        "epoch": 6.845124282982791,
         "step": 3580
     },
     {
-        "loss": 6.3552,
-        "grad_norm": 7.575985431671143,
-        "learning_rate": 3.462927554705758e-05,
-        "epoch": 6.8833652007648185,
         "step": 3600
     },
     {
-        "loss": 6.3379,
-        "grad_norm": 7.351112365722656,
-        "learning_rate": 3.4204376460590606e-05,
-        "epoch": 6.921606118546845,
         "step": 3620
     },
     {
-        "loss": 6.3429,
-        "grad_norm": 7.33430290222168,
-        "learning_rate": 3.3779477374123646e-05,
-        "epoch": 6.959847036328872,
         "step": 3640
     },
     {
-        "loss": 6.3112,
-        "grad_norm": 7.511825084686279,
-        "learning_rate": 3.3354578287656686e-05,
-        "epoch": 6.998087954110899,
         "step": 3660
     },
     {
-        "eval_loss": 5.265278339385986,
-        "eval_accuracy": 0.7632021527077026,
-        "eval_runtime": 484.395,
-        "eval_samples_per_second": 30.688,
-        "eval_steps_per_second": 30.688,
-        "epoch": 7.0,
-        "step": 3661
-    },
-    {
-        "loss": 6.1764,
-        "grad_norm": 7.424711227416992,
-        "learning_rate": 3.292967920118972e-05,
-        "epoch": 7.036328871892925,
         "step": 3680
     },
     {
-        "loss": 6.2389,
-        "grad_norm": 7.648799896240234,
-        "learning_rate": 3.250478011472275e-05,
-        "epoch": 7.074569789674952,
         "step": 3700
     },
     {
-        "loss": 6.2506,
-        "grad_norm": 7.4450483322143555,
-        "learning_rate": 3.207988102825579e-05,
-        "epoch": 7.112810707456979,
         "step": 3720
     },
     {
-        "loss": 6.2049,
-        "grad_norm": 7.422061443328857,
-        "learning_rate": 3.1654981941788825e-05,
-        "epoch": 7.151051625239006,
         "step": 3740
     },
     {
-        "loss": 6.2906,
-        "grad_norm": 7.345204830169678,
-        "learning_rate": 3.1230082855321864e-05,
-        "epoch": 7.189292543021033,
         "step": 3760
     },
     {
-        "loss": 6.2644,
-        "grad_norm": 7.486473083496094,
-        "learning_rate": 3.08051837688549e-05,
-        "epoch": 7.227533460803059,
         "step": 3780
     },
     {
-        "loss": 6.2421,
-        "grad_norm": 7.317290782928467,
-        "learning_rate": 3.0380284682387934e-05,
-        "epoch": 7.265774378585086,
         "step": 3800
     },
     {
-        "loss": 6.1406,
-        "grad_norm": 7.4384002685546875,
-        "learning_rate": 2.995538559592097e-05,
-        "epoch": 7.304015296367113,
         "step": 3820
     },
     {
-        "loss": 6.2031,
-        "grad_norm": 7.7606000900268555,
-        "learning_rate": 2.9530486509454007e-05,
-        "epoch": 7.342256214149139,
         "step": 3840
     },
     {
-        "loss": 6.127,
-        "grad_norm": 7.305050373077393,
-        "learning_rate": 2.910558742298704e-05,
-        "epoch": 7.3804971319311665,
         "step": 3860
     },
     {
-        "loss": 6.1474,
-        "grad_norm": 7.713500022888184,
-        "learning_rate": 2.868068833652008e-05,
-        "epoch": 7.418738049713193,
         "step": 3880
     },
     {
-        "loss": 6.1542,
-        "grad_norm": 8.028603553771973,
-        "learning_rate": 2.8255789250053116e-05,
-        "epoch": 7.45697896749522,
         "step": 3900
     },
     {
-        "loss": 6.225,
-        "grad_norm": 7.4730329513549805,
-        "learning_rate": 2.783089016358615e-05,
-        "epoch": 7.495219885277247,
         "step": 3920
     },
     {
-        "loss": 6.1674,
-        "grad_norm": 7.52304220199585,
-        "learning_rate": 2.7405991077119186e-05,
-        "epoch": 7.533460803059273,
         "step": 3940
     },
     {
-        "loss": 6.1169,
-        "grad_norm": 7.616427898406982,
-        "learning_rate": 2.6981091990652225e-05,
-        "epoch": 7.5717017208413,
         "step": 3960
     },
     {
-        "loss": 6.1041,
-        "grad_norm": 7.784472465515137,
-        "learning_rate": 2.6556192904185255e-05,
-        "epoch": 7.609942638623327,
         "step": 3980
     },
     {
-        "loss": 6.1069,
-        "grad_norm": 7.819777011871338,
-        "learning_rate": 2.6131293817718295e-05,
-        "epoch": 7.648183556405353,
         "step": 4000
     },
     {
-        "loss": 5.9985,
-        "grad_norm": 7.889120101928711,
-        "learning_rate": 2.5706394731251328e-05,
-        "epoch": 7.686424474187381,
         "step": 4020
     },
     {
-        "loss": 6.0437,
-        "grad_norm": 7.858097076416016,
-        "learning_rate": 2.5281495644784364e-05,
-        "epoch": 7.724665391969407,
         "step": 4040
     },
     {
-        "loss": 6.1376,
-        "grad_norm": 7.739562511444092,
-        "learning_rate": 2.48565965583174e-05,
-        "epoch": 7.762906309751434,
         "step": 4060
     },
     {
-        "loss": 6.2084,
-        "grad_norm": 7.778552532196045,
-        "learning_rate": 2.4431697471850437e-05,
-        "epoch": 7.801147227533461,
         "step": 4080
     },
     {
-        "loss": 6.0325,
-        "grad_norm": 7.536991596221924,
-        "learning_rate": 2.4006798385383474e-05,
-        "epoch": 7.839388145315487,
         "step": 4100
     },
     {
-        "loss": 6.098,
-        "grad_norm": 7.846856594085693,
-        "learning_rate": 2.3581899298916507e-05,
-        "epoch": 7.8776290630975145,
         "step": 4120
     },
     {
-        "loss": 5.9765,
-        "grad_norm": 7.760807991027832,
-        "learning_rate": 2.3157000212449547e-05,
-        "epoch": 7.915869980879541,
         "step": 4140
     },
     {
-        "loss": 5.9915,
-        "grad_norm": 7.827345371246338,
-        "learning_rate": 2.273210112598258e-05,
-        "epoch": 7.954110898661568,
         "step": 4160
     },
     {
-        "loss": 6.0255,
-        "grad_norm": 8.129748344421387,
-        "learning_rate": 2.2307202039515616e-05,
-        "epoch": 7.992351816443595,
         "step": 4180
     },
     {
-        "eval_loss": 4.966301918029785,
-        "eval_accuracy": 0.782643794147326,
-        "eval_runtime": 260.149,
-        "eval_samples_per_second": 57.14,
-        "eval_steps_per_second": 57.14,
-        "epoch": 8.0,
-        "step": 4184
-    },
-    {
-        "loss": 6.0763,
-        "grad_norm": 7.686340808868408,
-        "learning_rate": 2.1882302953048652e-05,
-        "epoch": 8.030592734225621,
         "step": 4200
     },
     {
-        "loss": 5.868,
-        "grad_norm": 7.666318893432617,
-        "learning_rate": 2.145740386658169e-05,
-        "epoch": 8.068833652007648,
         "step": 4220
     },
     {
-        "loss": 5.8964,
-        "grad_norm": 7.686400890350342,
-        "learning_rate": 2.1032504780114722e-05,
-        "epoch": 8.107074569789676,
         "step": 4240
     },
     {
-        "loss": 5.8408,
-        "grad_norm": 7.418490886688232,
-        "learning_rate": 2.0607605693647762e-05,
-        "epoch": 8.145315487571702,
         "step": 4260
     },
     {
-        "loss": 5.9742,
-        "grad_norm": 7.769067287445068,
-        "learning_rate": 2.0182706607180795e-05,
-        "epoch": 8.183556405353729,
         "step": 4280
     },
     {
-        "loss": 5.913,
-        "grad_norm": 7.915468215942383,
-        "learning_rate": 1.975780752071383e-05,
-        "epoch": 8.221797323135755,
         "step": 4300
     },
     {
-        "loss": 5.8613,
-        "grad_norm": 7.884761810302734,
-        "learning_rate": 1.9332908434246868e-05,
-        "epoch": 8.260038240917781,
         "step": 4320
     },
     {
-        "loss": 5.9791,
-        "grad_norm": 7.765011787414551,
-        "learning_rate": 1.8908009347779904e-05,
-        "epoch": 8.29827915869981,
         "step": 4340
     },
     {
-        "loss": 5.9675,
-        "grad_norm": 8.110984802246094,
-        "learning_rate": 1.8483110261312937e-05,
-        "epoch": 8.336520076481836,
         "step": 4360
     },
     {
-        "loss": 5.9804,
-        "grad_norm": 8.114306449890137,
-        "learning_rate": 1.8058211174845974e-05,
-        "epoch": 8.374760994263863,
         "step": 4380
     },
     {
-        "loss": 5.8832,
-        "grad_norm": 7.981202125549316,
-        "learning_rate": 1.763331208837901e-05,
-        "epoch": 8.413001912045889,
         "step": 4400
     },
     {
-        "loss": 5.9301,
-        "grad_norm": 7.628136157989502,
-        "learning_rate": 1.7208413001912046e-05,
-        "epoch": 8.451242829827915,
         "step": 4420
     },
     {
-        "loss": 5.8983,
-        "grad_norm": 7.863382816314697,
-        "learning_rate": 1.6783513915445083e-05,
-        "epoch": 8.489483747609942,
         "step": 4440
     },
     {
-        "loss": 5.8938,
-        "grad_norm": 7.82211971282959,
-        "learning_rate": 1.635861482897812e-05,
-        "epoch": 8.52772466539197,
         "step": 4460
     },
     {
-        "loss": 5.8945,
-        "grad_norm": 8.038976669311523,
-        "learning_rate": 1.5933715742511156e-05,
-        "epoch": 8.565965583173996,
         "step": 4480
     },
     {
-        "loss": 5.8895,
-        "grad_norm": 7.884932518005371,
-        "learning_rate": 1.550881665604419e-05,
-        "epoch": 8.604206500956023,
         "step": 4500
     },
     {
-        "loss": 5.9617,
-        "grad_norm": 7.975419521331787,
-        "learning_rate": 1.5083917569577227e-05,
-        "epoch": 8.64244741873805,
         "step": 4520
     },
     {
-        "loss": 5.8659,
-        "grad_norm": 7.786068916320801,
-        "learning_rate": 1.4659018483110262e-05,
-        "epoch": 8.680688336520076,
         "step": 4540
     },
     {
-        "loss": 5.9116,
-        "grad_norm": 8.130301475524902,
-        "learning_rate": 1.4234119396643298e-05,
-        "epoch": 8.718929254302104,
         "step": 4560
     },
     {
-        "loss": 5.8536,
-        "grad_norm": 8.042682647705078,
-        "learning_rate": 1.3809220310176335e-05,
-        "epoch": 8.75717017208413,
         "step": 4580
     },
     {
-        "loss": 5.9241,
-        "grad_norm": 8.327803611755371,
-        "learning_rate": 1.3384321223709371e-05,
-        "epoch": 8.795411089866157,
         "step": 4600
     },
     {
-        "loss": 5.864,
-        "grad_norm": 7.880401134490967,
-        "learning_rate": 1.2959422137242406e-05,
-        "epoch": 8.833652007648183,
         "step": 4620
     },
     {
-        "loss": 5.9457,
-        "grad_norm": 7.6825127601623535,
-        "learning_rate": 1.253452305077544e-05,
-        "epoch": 8.87189292543021,
         "step": 4640
     },
     {
-        "loss": 5.8329,
-        "grad_norm": 7.971193313598633,
-        "learning_rate": 1.2109623964308479e-05,
-        "epoch": 8.910133843212238,
         "step": 4660
     },
     {
-        "loss": 5.8671,
-        "grad_norm": 8.04354476928711,
-        "learning_rate": 1.1684724877841513e-05,
-        "epoch": 8.948374760994264,
         "step": 4680
     },
     {
-        "loss": 5.8091,
-        "grad_norm": 7.942180633544922,
-        "learning_rate": 1.125982579137455e-05,
-        "epoch": 8.98661567877629,
         "step": 4700
     },
     {
-        "eval_loss": 4.778744220733643,
-        "eval_accuracy": 0.7956945845946855,
-        "eval_runtime": 531.1827,
-        "eval_samples_per_second": 27.985,
-        "eval_steps_per_second": 27.985,
-        "epoch": 9.0,
-        "step": 4707
-    },
-    {
-        "loss": 5.7978,
-        "grad_norm": 7.77038049697876,
-        "learning_rate": 1.0834926704907584e-05,
-        "epoch": 9.024856596558317,
         "step": 4720
     },
     {
-        "loss": 5.7849,
-        "grad_norm": 7.850288391113281,
-        "learning_rate": 1.0410027618440621e-05,
-        "epoch": 9.063097514340344,
         "step": 4740
     },
     {
-        "loss": 5.7891,
-        "grad_norm": 8.032878875732422,
-        "learning_rate": 9.985128531973657e-06,
-        "epoch": 9.101338432122372,
         "step": 4760
     },
     {
-        "loss": 5.781,
-        "grad_norm": 7.886658668518066,
-        "learning_rate": 9.560229445506692e-06,
-        "epoch": 9.139579349904398,
         "step": 4780
     },
     {
-        "loss": 5.8584,
-        "grad_norm": 7.953343868255615,
-        "learning_rate": 9.135330359039729e-06,
-        "epoch": 9.177820267686425,
         "step": 4800
     },
     {
-        "loss": 5.8192,
-        "grad_norm": 7.899537563323975,
-        "learning_rate": 8.710431272572763e-06,
-        "epoch": 9.216061185468451,
         "step": 4820
     },
     {
-        "loss": 5.7122,
-        "grad_norm": 8.269824028015137,
-        "learning_rate": 8.2855321861058e-06,
-        "epoch": 9.254302103250478,
         "step": 4840
     },
     {
-        "loss": 5.7634,
-        "grad_norm": 7.824770450592041,
-        "learning_rate": 7.860633099638836e-06,
-        "epoch": 9.292543021032504,
         "step": 4860
     },
     {
-        "loss": 5.8083,
-        "grad_norm": 7.953860759735107,
-        "learning_rate": 7.435734013171872e-06,
-        "epoch": 9.330783938814532,
         "step": 4880
     },
     {
-        "loss": 5.8012,
-        "grad_norm": 8.25514030456543,
-        "learning_rate": 7.010834926704908e-06,
-        "epoch": 9.369024856596559,
         "step": 4900
     },
     {
-        "loss": 5.7938,
-        "grad_norm": 8.2761869430542,
-        "learning_rate": 6.585935840237943e-06,
-        "epoch": 9.407265774378585,
         "step": 4920
     },
     {
-        "loss": 5.6735,
-        "grad_norm": 7.865163803100586,
-        "learning_rate": 6.161036753770979e-06,
-        "epoch": 9.445506692160611,
         "step": 4940
     },
     {
-        "loss": 5.7914,
-        "grad_norm": 8.172937393188477,
-        "learning_rate": 5.736137667304015e-06,
-        "epoch": 9.483747609942638,
         "step": 4960
     },
     {
-        "loss": 5.7702,
-        "grad_norm": 8.558911323547363,
-        "learning_rate": 5.311238580837051e-06,
-        "epoch": 9.521988527724666,
         "step": 4980
     },
     {
-        "loss": 5.7283,
-        "grad_norm": 8.265515327453613,
-        "learning_rate": 4.886339494370088e-06,
-        "epoch": 9.560229445506693,
         "step": 5000
     },
     {
-        "loss": 5.8007,
-        "grad_norm": 8.17795467376709,
-        "learning_rate": 4.461440407903123e-06,
-        "epoch": 9.598470363288719,
         "step": 5020
     },
     {
-        "loss": 5.8121,
-        "grad_norm": 8.109586715698242,
-        "learning_rate": 4.036541321436159e-06,
-        "epoch": 9.636711281070745,
         "step": 5040
     },
     {
-        "loss": 5.789,
-        "grad_norm": 7.911646842956543,
-        "learning_rate": 3.6116422349691954e-06,
-        "epoch": 9.674952198852772,
         "step": 5060
     },
     {
-        "loss": 5.7266,
-        "grad_norm": 8.030941009521484,
-        "learning_rate": 3.186743148502231e-06,
-        "epoch": 9.7131931166348,
         "step": 5080
     },
     {
-        "loss": 5.761,
-        "grad_norm": 8.059958457946777,
-        "learning_rate": 2.7618440620352666e-06,
-        "epoch": 9.751434034416826,
         "step": 5100
     },
     {
-        "loss": 5.7338,
-        "grad_norm": 8.002403259277344,
-        "learning_rate": 2.3369449755683026e-06,
-        "epoch": 9.789674952198853,
         "step": 5120
     },
     {
-        "loss": 5.7088,
-        "grad_norm": 8.306962966918945,
-        "learning_rate": 1.9120458891013386e-06,
-        "epoch": 9.82791586998088,
         "step": 5140
     },
     {
-        "loss": 5.6973,
-        "grad_norm": 8.018095970153809,
-        "learning_rate": 1.4871468026343744e-06,
-        "epoch": 9.866156787762906,
         "step": 5160
     },
     {
-        "loss": 5.6422,
-        "grad_norm": 8.168917655944824,
-        "learning_rate": 1.0622477161674104e-06,
-        "epoch": 9.904397705544934,
         "step": 5180
     },
     {
-        "loss": 5.7399,
-        "grad_norm": 7.939206123352051,
-        "learning_rate": 6.373486297004462e-07,
-        "epoch": 9.94263862332696,
         "step": 5200
     },
     {
-        "loss": 5.7269,
-        "grad_norm": 7.970940589904785,
-        "learning_rate": 2.1244954323348205e-07,
-        "epoch": 9.980879541108987,
         "step": 5220
     },
     {
-        "eval_loss": 4.700281620025635,
-        "eval_accuracy": 0.8030272452068618,
-        "eval_runtime": 552.5641,
-        "eval_samples_per_second": 26.902,
-        "eval_steps_per_second": 26.902,
         "epoch": 10.0,
-        "step": 5230
     },
     {
-        "train_runtime": 28748.4048,
-        "train_samples_per_second": 46.534,
-        "train_steps_per_second": 0.182,
-        "total_flos": 2.49073133395968e+18,
-        "train_loss": 7.888943860726876,
         "epoch": 10.0,
-        "step": 5230
     }
 ]

 [
     {
+        "loss": 13.2026,
+        "grad_norm": 6.155358791351318,
+        "learning_rate": 1.739130434782609e-05,
+        "epoch": 0.034782608695652174,
         "step": 20
     },
     {
+        "loss": 13.1252,
+        "grad_norm": 5.816741943359375,
+        "learning_rate": 3.478260869565218e-05,
+        "epoch": 0.06956521739130435,
         "step": 40
     },
     {
+        "loss": 13.0001,
+        "grad_norm": 5.273156642913818,
+        "learning_rate": 5.2173913043478256e-05,
+        "epoch": 0.10434782608695652,
         "step": 60
     },
     {
+        "loss": 12.8639,
+        "grad_norm": 4.86655330657959,
+        "learning_rate": 6.956521739130436e-05,
+        "epoch": 0.1391304347826087,
         "step": 80
     },
     {
+        "loss": 12.7376,
+        "grad_norm": 4.438321113586426,
+        "learning_rate": 8.695652173913044e-05,
+        "epoch": 0.17391304347826086,
         "step": 100
     },
     {
+        "loss": 12.5722,
+        "grad_norm": 4.164404392242432,
+        "learning_rate": 0.00010434782608695651,
+        "epoch": 0.20869565217391303,
         "step": 120
     },
     {
+        "loss": 12.4229,
+        "grad_norm": 3.858990430831909,
+        "learning_rate": 0.00012173913043478261,
+        "epoch": 0.24347826086956523,
         "step": 140
     },
     {
+        "loss": 12.2581,
+        "grad_norm": 3.6574394702911377,
+        "learning_rate": 0.0001391304347826087,
+        "epoch": 0.2782608695652174,
         "step": 160
     },
     {
+        "loss": 12.0753,
+        "grad_norm": 3.3787951469421387,
+        "learning_rate": 0.0001565217391304348,
+        "epoch": 0.3130434782608696,
         "step": 180
     },
     {
+        "loss": 11.9261,
+        "grad_norm": 3.323820114135742,
+        "learning_rate": 0.00017391304347826088,
+        "epoch": 0.34782608695652173,
         "step": 200
     },
     {
+        "loss": 11.7417,
+        "grad_norm": 3.247619152069092,
+        "learning_rate": 0.00019130434782608697,
+        "epoch": 0.3826086956521739,
         "step": 220
     },
     {
+        "loss": 11.5771,
+        "grad_norm": 3.2254152297973633,
+        "learning_rate": 0.00020869565217391303,
+        "epoch": 0.41739130434782606,
         "step": 240
     },
     {
+        "loss": 11.3969,
+        "grad_norm": 3.1803464889526367,
+        "learning_rate": 0.00022608695652173914,
+        "epoch": 0.45217391304347826,
         "step": 260
     },
     {
+        "loss": 11.2684,
+        "grad_norm": 3.41034197807312,
+        "learning_rate": 0.00024347826086956522,
+        "epoch": 0.48695652173913045,
         "step": 280
     },
     {
+        "loss": 11.0744,
+        "grad_norm": 3.246403217315674,
+        "learning_rate": 0.0002608695652173913,
+        "epoch": 0.5217391304347826,
         "step": 300
     },
     {
+        "loss": 10.8929,
+        "grad_norm": 3.202021360397339,
+        "learning_rate": 0.0002782608695652174,
+        "epoch": 0.5565217391304348,
         "step": 320
     },
     {
+        "loss": 10.7468,
+        "grad_norm": 3.1231367588043213,
+        "learning_rate": 0.0002956521739130435,
+        "epoch": 0.591304347826087,
         "step": 340
     },
     {
+        "loss": 10.606,
+        "grad_norm": 3.1820390224456787,
+        "learning_rate": 0.0003130434782608696,
+        "epoch": 0.6260869565217392,
         "step": 360
     },
     {
+        "loss": 10.4871,
+        "grad_norm": 3.2470555305480957,
+        "learning_rate": 0.0003304347826086956,
+        "epoch": 0.6608695652173913,
         "step": 380
     },
     {
+        "loss": 10.2836,
+        "grad_norm": 3.2452709674835205,
+        "learning_rate": 0.00034782608695652176,
+        "epoch": 0.6956521739130435,
         "step": 400
     },
     {
+        "loss": 10.1154,
+        "grad_norm": 3.203894853591919,
+        "learning_rate": 0.00036521739130434785,
+        "epoch": 0.7304347826086957,
         "step": 420
     },
     {
+        "loss": 9.9283,
+        "grad_norm": 3.269970178604126,
+        "learning_rate": 0.00038260869565217393,
+        "epoch": 0.7652173913043478,
         "step": 440
     },
     {
+        "loss": 9.8674,
+        "grad_norm": 3.261357545852661,
+        "learning_rate": 0.0004,
+        "epoch": 0.8,
         "step": 460
     },
     {
+        "loss": 9.6224,
+        "grad_norm": 3.393953323364258,
+        "learning_rate": 0.00041739130434782605,
+        "epoch": 0.8347826086956521,
         "step": 480
     },
     {
+        "loss": 9.524,
+        "grad_norm": 3.321411609649658,
+        "learning_rate": 0.0004347826086956522,
+        "epoch": 0.8695652173913043,
         "step": 500
     },
     {
+        "loss": 9.384,
+        "grad_norm": 3.3886823654174805,
+        "learning_rate": 0.0004521739130434783,
+        "epoch": 0.9043478260869565,
         "step": 520
     },
     {
+        "loss": 9.1767,
+        "grad_norm": 3.4735491275787354,
+        "learning_rate": 0.00046956521739130436,
+        "epoch": 0.9391304347826087,
         "step": 540
     },
     {
+        "loss": 9.047,
+        "grad_norm": 3.416966676712036,
+        "learning_rate": 0.00048695652173913045,
+        "epoch": 0.9739130434782609,
         "step": 560
     },
     {
+        "eval_loss": 8.366157531738281,
+        "eval_accuracy": 0.43039677202420984,
+        "eval_runtime": 42.3364,
+        "eval_samples_per_second": 35.123,
+        "eval_steps_per_second": 35.123,
+        "epoch": 1.0,
+        "step": 575
+    },
+    {
+        "loss": 8.8835,
+        "grad_norm": 3.446899890899658,
+        "learning_rate": 0.0004995169082125604,
+        "epoch": 1.008695652173913,
         "step": 580
     },
     {
+        "loss": 8.6436,
+        "grad_norm": 3.5842247009277344,
+        "learning_rate": 0.0004975845410628019,
+        "epoch": 1.0434782608695652,
         "step": 600
     },
     {
+        "loss": 8.4775,
+        "grad_norm": 3.5029306411743164,
+        "learning_rate": 0.0004956521739130435,
+        "epoch": 1.0782608695652174,
         "step": 620
     },
     {
+        "loss": 8.322,
+        "grad_norm": 3.5451033115386963,
+        "learning_rate": 0.0004937198067632851,
+        "epoch": 1.1130434782608696,
         "step": 640
     },
     {
+        "loss": 8.1264,
+        "grad_norm": 3.5502634048461914,
+        "learning_rate": 0.0004917874396135266,
+        "epoch": 1.1478260869565218,
         "step": 660
     },
     {
+        "loss": 7.9905,
+        "grad_norm": 3.607395648956299,
+        "learning_rate": 0.0004898550724637681,
+        "epoch": 1.182608695652174,
         "step": 680
     },
     {
+        "loss": 7.8252,
+        "grad_norm": 3.6438565254211426,
+        "learning_rate": 0.0004879227053140097,
+        "epoch": 1.2173913043478262,
         "step": 700
     },
     {
+        "loss": 7.7737,
+        "grad_norm": 3.656705141067505,
+        "learning_rate": 0.0004859903381642512,
+        "epoch": 1.2521739130434781,
         "step": 720
     },
     {
+        "loss": 7.5822,
+        "grad_norm": 3.7424328327178955,
+        "learning_rate": 0.0004840579710144928,
+        "epoch": 1.2869565217391306,
         "step": 740
     },
     {
+        "loss": 7.4563,
+        "grad_norm": 3.673156261444092,
+        "learning_rate": 0.0004821256038647343,
+        "epoch": 1.3217391304347825,
         "step": 760
     },
     {
+        "loss": 7.3379,
+        "grad_norm": 3.6774067878723145,
+        "learning_rate": 0.0004801932367149758,
+        "epoch": 1.3565217391304347,
         "step": 780
     },
     {
+        "loss": 7.1559,
+        "grad_norm": 3.811283826828003,
+        "learning_rate": 0.0004782608695652174,
+        "epoch": 1.391304347826087,
         "step": 800
     },
     {
+        "loss": 7.0834,
+        "grad_norm": 3.7899839878082275,
+        "learning_rate": 0.00047632850241545894,
+        "epoch": 1.4260869565217391,
         "step": 820
     },
     {
+        "loss": 6.9172,
+        "grad_norm": 3.583247423171997,
+        "learning_rate": 0.00047439613526570047,
+        "epoch": 1.4608695652173913,
         "step": 840
     },
     {
+        "loss": 6.7251,
+        "grad_norm": 3.8192331790924072,
+        "learning_rate": 0.00047246376811594206,
+        "epoch": 1.4956521739130435,
         "step": 860
     },
     {
+        "loss": 6.7871,
+        "grad_norm": 3.8098299503326416,
+        "learning_rate": 0.0004705314009661836,
+        "epoch": 1.5304347826086957,
         "step": 880
     },
     {
+        "loss": 6.6103,
+        "grad_norm": 3.7341325283050537,
+        "learning_rate": 0.0004685990338164252,
+        "epoch": 1.5652173913043477,
         "step": 900
     },
     {
+        "loss": 6.4507,
+        "grad_norm": 3.9190495014190674,
+        "learning_rate": 0.00046666666666666666,
+        "epoch": 1.6,
         "step": 920
     },
     {
+        "loss": 6.3619,
+        "grad_norm": 3.9456422328948975,
+        "learning_rate": 0.0004647342995169082,
+        "epoch": 1.634782608695652,
         "step": 940
     },
     {
+        "loss": 6.2957,
+        "grad_norm": 3.899134874343872,
+        "learning_rate": 0.0004628019323671498,
+        "epoch": 1.6695652173913045,
         "step": 960
     },
     {
+        "loss": 6.1362,
+        "grad_norm": 3.878810167312622,
+        "learning_rate": 0.0004608695652173913,
+        "epoch": 1.7043478260869565,
         "step": 980
     },
     {
+        "loss": 5.9814,
+        "grad_norm": 3.9270784854888916,
+        "learning_rate": 0.00045893719806763285,
+        "epoch": 1.7391304347826086,
         "step": 1000
     },
     {
+        "loss": 5.9095,
+        "grad_norm": 3.8247644901275635,
+        "learning_rate": 0.00045700483091787444,
+        "epoch": 1.7739130434782608,
         "step": 1020
     },
     {
+        "loss": 5.7793,
+        "grad_norm": 3.8870134353637695,
+        "learning_rate": 0.000455072463768116,
+        "epoch": 1.808695652173913,
         "step": 1040
     },
     {
+        "loss": 5.7754,
+        "grad_norm": 3.9533441066741943,
+        "learning_rate": 0.00045314009661835745,
+        "epoch": 1.8434782608695652,
         "step": 1060
     },
     {
+        "loss": 5.5886,
+        "grad_norm": 3.9928998947143555,
+        "learning_rate": 0.00045120772946859904,
+        "epoch": 1.8782608695652174,
         "step": 1080
     },
     {
+        "loss": 5.5482,
+        "grad_norm": 4.030064582824707,
+        "learning_rate": 0.0004492753623188406,
+        "epoch": 1.9130434782608696,
         "step": 1100
     },
     {
+        "loss": 5.4807,
+        "grad_norm": 3.961806297302246,
+        "learning_rate": 0.0004473429951690821,
+        "epoch": 1.9478260869565216,
         "step": 1120
     },
     {
+        "loss": 5.3508,
+        "grad_norm": 4.003119945526123,
+        "learning_rate": 0.0004454106280193237,
+        "epoch": 1.982608695652174,
         "step": 1140
     },
     {
+        "eval_loss": 4.025164604187012,
+        "eval_accuracy": 0.8190988567585743,
+        "eval_runtime": 42.7144,
+        "eval_samples_per_second": 34.813,
+        "eval_steps_per_second": 34.813,
+        "epoch": 2.0,
+        "step": 1150
+    },
+    {
+        "loss": 5.1229,
+        "grad_norm": 3.958116292953491,
+        "learning_rate": 0.00044347826086956523,
+        "epoch": 2.017391304347826,
         "step": 1160
     },
     {
+        "loss": 4.8146,
+        "grad_norm": 3.864279270172119,
+        "learning_rate": 0.00044154589371980677,
+        "epoch": 2.0521739130434784,
         "step": 1180
     },
     {
+        "loss": 4.8843,
+        "grad_norm": 4.045077323913574,
+        "learning_rate": 0.0004396135265700483,
+        "epoch": 2.0869565217391304,
         "step": 1200
     },
     {
+        "loss": 4.8078,
+        "grad_norm": 4.061978816986084,
+        "learning_rate": 0.00043768115942028983,
+        "epoch": 2.121739130434783,
         "step": 1220
     },
     {
+        "loss": 4.6812,
+        "grad_norm": 4.040159225463867,
+        "learning_rate": 0.0004357487922705314,
+        "epoch": 2.1565217391304348,
         "step": 1240
     },
     {
+        "loss": 4.6701,
+        "grad_norm": 4.234623908996582,
+        "learning_rate": 0.00043381642512077296,
+        "epoch": 2.1913043478260867,
         "step": 1260
     },
     {
+        "loss": 4.6221,
+        "grad_norm": 4.030038356781006,
+        "learning_rate": 0.0004318840579710145,
+        "epoch": 2.226086956521739,
         "step": 1280
     },
     {
+        "loss": 4.5647,
+        "grad_norm": 3.9954497814178467,
+        "learning_rate": 0.0004299516908212561,
+        "epoch": 2.260869565217391,
         "step": 1300
     },
     {
+        "loss": 4.4502,
+        "grad_norm": 4.188636779785156,
+        "learning_rate": 0.0004280193236714976,
+        "epoch": 2.2956521739130435,
         "step": 1320
     },
     {
+        "loss": 4.359,
+        "grad_norm": 4.185456275939941,
+        "learning_rate": 0.00042608695652173915,
+        "epoch": 2.3304347826086955,
         "step": 1340
     },
     {
+        "loss": 4.2863,
+        "grad_norm": 4.123263359069824,
+        "learning_rate": 0.0004241545893719807,
+        "epoch": 2.365217391304348,
         "step": 1360
     },
     {
+        "loss": 4.3354,
+        "grad_norm": 4.194387435913086,
+        "learning_rate": 0.0004222222222222222,
+        "epoch": 2.4,
         "step": 1380
     },
     {
+        "loss": 4.2176,
+        "grad_norm": 4.065763473510742,
+        "learning_rate": 0.00042028985507246375,
+        "epoch": 2.4347826086956523,
         "step": 1400
     },
     {
+        "loss": 4.0597,
+        "grad_norm": 4.120363712310791,
+        "learning_rate": 0.00041835748792270534,
+        "epoch": 2.4695652173913043,
         "step": 1420
     },
     {
+        "loss": 4.028,
+        "grad_norm": 4.3197174072265625,
+        "learning_rate": 0.00041642512077294687,
+        "epoch": 2.5043478260869563,
         "step": 1440
     },
     {
+        "loss": 3.9833,
+        "grad_norm": 4.2683610916137695,
+        "learning_rate": 0.0004144927536231884,
+        "epoch": 2.5391304347826087,
         "step": 1460
     },
     {
+        "loss": 4.0065,
+        "grad_norm": 4.15448522567749,
+        "learning_rate": 0.00041256038647343,
+        "epoch": 2.573913043478261,
         "step": 1480
     },
     {
+        "loss": 3.8134,
+        "grad_norm": 4.348177433013916,
+        "learning_rate": 0.0004106280193236715,
+        "epoch": 2.608695652173913,
         "step": 1500
     },
     {
+        "loss": 3.8548,
+        "grad_norm": 4.100021839141846,
+        "learning_rate": 0.00040869565217391306,
+        "epoch": 2.643478260869565,
         "step": 1520
     },
     {
+        "loss": 3.7814,
+        "grad_norm": 4.344174385070801,
+        "learning_rate": 0.0004067632850241546,
+        "epoch": 2.6782608695652175,
         "step": 1540
     },
     {
+        "loss": 3.7578,
+        "grad_norm": 4.240079402923584,
+        "learning_rate": 0.00040483091787439613,
+        "epoch": 2.7130434782608694,
         "step": 1560
     },
     {
+        "loss": 3.7331,
+        "grad_norm": 4.468689918518066,
+        "learning_rate": 0.0004028985507246377,
+        "epoch": 2.747826086956522,
         "step": 1580
     },
     {
+        "loss": 3.6396,
+        "grad_norm": 4.28464937210083,
+        "learning_rate": 0.00040096618357487925,
+        "epoch": 2.782608695652174,
         "step": 1600
     },
     {
+        "loss": 3.5799,
+        "grad_norm": 4.166805744171143,
+        "learning_rate": 0.0003990338164251208,
+        "epoch": 2.8173913043478263,
         "step": 1620
     },
     {
+        "loss": 3.4734,
+        "grad_norm": 4.237683296203613,
+        "learning_rate": 0.0003971014492753624,
+        "epoch": 2.8521739130434782,
         "step": 1640
     },
     {
+        "loss": 3.5183,
+        "grad_norm": 4.153097152709961,
+        "learning_rate": 0.00039516908212560385,
+        "epoch": 2.8869565217391306,
         "step": 1660
     },
     {
+        "loss": 3.3963,
+        "grad_norm": 4.2313947677612305,
+        "learning_rate": 0.0003932367149758454,
+        "epoch": 2.9217391304347826,
         "step": 1680
     },
     {
+        "loss": 3.3081,
+        "grad_norm": 3.992475748062134,
+        "learning_rate": 0.000391304347826087,
+        "epoch": 2.9565217391304346,
         "step": 1700
     },
     {
+        "loss": 3.3124,
+        "grad_norm": 4.4731059074401855,
+        "learning_rate": 0.0003893719806763285,
+        "epoch": 2.991304347826087,
         "step": 1720
     },
     {
+        "eval_loss": 2.1082653999328613,
+        "eval_accuracy": 0.9260255548083389,
+        "eval_runtime": 22.1676,
+        "eval_samples_per_second": 67.08,
+        "eval_steps_per_second": 67.08,
+        "epoch": 3.0,
+        "step": 1725
+    },
+    {
+        "loss": 3.1247,
+        "grad_norm": 4.272000312805176,
+        "learning_rate": 0.00038743961352657004,
+        "epoch": 3.026086956521739,
         "step": 1740
     },
     {
+        "loss": 3.1064,
+        "grad_norm": 4.102330207824707,
+        "learning_rate": 0.00038550724637681163,
+        "epoch": 3.0608695652173914,
         "step": 1760
     },
     {
+        "loss": 2.9371,
+        "grad_norm": 4.381846904754639,
+        "learning_rate": 0.00038357487922705317,
+        "epoch": 3.0956521739130434,
         "step": 1780
     },
     {
+        "loss": 2.9355,
+        "grad_norm": 4.1588921546936035,
+        "learning_rate": 0.00038164251207729465,
+        "epoch": 3.130434782608696,
         "step": 1800
     },
     {
+        "loss": 2.8545,
+        "grad_norm": 4.279609203338623,
+        "learning_rate": 0.00037971014492753623,
+        "epoch": 3.1652173913043478,
         "step": 1820
     },
     {
+        "loss": 2.8096,
+        "grad_norm": 4.240756988525391,
+        "learning_rate": 0.00037777777777777777,
+        "epoch": 3.2,
         "step": 1840
     },
     {
+        "loss": 2.8138,
+        "grad_norm": 4.11091947555542,
+        "learning_rate": 0.00037584541062801936,
+        "epoch": 3.234782608695652,
         "step": 1860
     },
     {
+        "loss": 2.7417,
+        "grad_norm": 4.078794479370117,
+        "learning_rate": 0.0003739130434782609,
+        "epoch": 3.269565217391304,
         "step": 1880
     },
     {
+        "loss": 2.7937,
+        "grad_norm": 4.368116855621338,
+        "learning_rate": 0.0003719806763285024,
+        "epoch": 3.3043478260869565,
         "step": 1900
     },
     {
+        "loss": 2.7361,
+        "grad_norm": 4.044319152832031,
+        "learning_rate": 0.000370048309178744,
+        "epoch": 3.3391304347826085,
         "step": 1920
     },
     {
+        "loss": 2.7054,
+        "grad_norm": 4.314040184020996,
+        "learning_rate": 0.0003681159420289855,
+        "epoch": 3.373913043478261,
         "step": 1940
     },
     {
+        "loss": 2.6682,
+        "grad_norm": 4.185855388641357,
+        "learning_rate": 0.000366183574879227,
+        "epoch": 3.408695652173913,
         "step": 1960
     },
     {
+        "loss": 2.6644,
+        "grad_norm": 4.433622360229492,
+        "learning_rate": 0.0003642512077294686,
+        "epoch": 3.4434782608695653,
         "step": 1980
     },
     {
+        "loss": 2.618,
+        "grad_norm": 4.048947811126709,
+        "learning_rate": 0.00036231884057971015,
+        "epoch": 3.4782608695652173,
         "step": 2000
     },
     {
+        "loss": 2.5982,
+        "grad_norm": 4.145406246185303,
+        "learning_rate": 0.0003603864734299517,
+        "epoch": 3.5130434782608697,
         "step": 2020
     },
     {
+        "loss": 2.6138,
+        "grad_norm": 4.2812910079956055,
+        "learning_rate": 0.00035845410628019327,
+        "epoch": 3.5478260869565217,
         "step": 2040
     },
     {
+        "loss": 2.5039,
+        "grad_norm": 4.400162220001221,
+        "learning_rate": 0.0003565217391304348,
+        "epoch": 3.5826086956521737,
         "step": 2060
     },
     {
+        "loss": 2.5249,
+        "grad_norm": 4.217800617218018,
+        "learning_rate": 0.0003545893719806763,
+        "epoch": 3.617391304347826,
         "step": 2080
     },
     {
+        "loss": 2.4547,
+        "grad_norm": 4.076215744018555,
+        "learning_rate": 0.0003526570048309179,
+        "epoch": 3.6521739130434785,
         "step": 2100
     },
     {
+        "loss": 2.4315,
+        "grad_norm": 4.139514446258545,
+        "learning_rate": 0.0003507246376811594,
+        "epoch": 3.6869565217391305,
         "step": 2120
     },
     {
+        "loss": 2.3836,
+        "grad_norm": 4.118022918701172,
+        "learning_rate": 0.00034879227053140094,
+        "epoch": 3.7217391304347824,
         "step": 2140
     },
     {
+        "loss": 2.3284,
+        "grad_norm": 4.137601852416992,
+        "learning_rate": 0.00034685990338164253,
+        "epoch": 3.756521739130435,
         "step": 2160
     },
     {
+        "loss": 2.3095,
+        "grad_norm": 4.023979663848877,
+        "learning_rate": 0.00034492753623188406,
+        "epoch": 3.791304347826087,
         "step": 2180
     },
     {
+        "loss": 2.305,
+        "grad_norm": 4.042725086212158,
+        "learning_rate": 0.00034299516908212565,
+        "epoch": 3.8260869565217392,
         "step": 2200
     },
     {
+        "loss": 2.3237,
+        "grad_norm": 4.265875339508057,
+        "learning_rate": 0.0003410628019323672,
+        "epoch": 3.860869565217391,
         "step": 2220
     },
     {
+        "loss": 2.335,
+        "grad_norm": 4.205041408538818,
+        "learning_rate": 0.00033913043478260867,
+        "epoch": 3.8956521739130436,
         "step": 2240
     },
     {
+        "loss": 2.2341,
+        "grad_norm": 4.1344709396362305,
+        "learning_rate": 0.00033719806763285025,
+        "epoch": 3.9304347826086956,
         "step": 2260
     },
     {
+        "loss": 2.251,
+        "grad_norm": 4.247790813446045,
+        "learning_rate": 0.0003352657004830918,
+        "epoch": 3.965217391304348,
         "step": 2280
     },
     {
+        "loss": 2.3212,
+        "grad_norm": 4.859626770019531,
+        "learning_rate": 0.0003333333333333333,
+        "epoch": 4.0,
         "step": 2300
     },
     {
+        "eval_loss": 1.2223739624023438,
+        "eval_accuracy": 0.9435104236718225,
+        "eval_runtime": 14.8513,
+        "eval_samples_per_second": 100.126,
+        "eval_steps_per_second": 100.126,
+        "epoch": 4.0,
+        "step": 2300
+    },
+    {
+        "loss": 1.9133,
+        "grad_norm": 4.098020553588867,
+        "learning_rate": 0.0003314009661835749,
+        "epoch": 4.034782608695652,
         "step": 2320
     },
     {
+        "loss": 1.9814,
+        "grad_norm": 4.198029041290283,
+        "learning_rate": 0.00032946859903381644,
+        "epoch": 4.069565217391304,
         "step": 2340
     },
     {
+        "loss": 1.9505,
+        "grad_norm": 3.960844039916992,
+        "learning_rate": 0.000327536231884058,
+        "epoch": 4.104347826086957,
         "step": 2360
     },
     {
+        "loss": 1.8815,
+        "grad_norm": 4.0190300941467285,
+        "learning_rate": 0.0003256038647342995,
+        "epoch": 4.139130434782609,
         "step": 2380
     },
     {
+        "loss": 1.8365,
+        "grad_norm": 4.040708541870117,
+        "learning_rate": 0.00032367149758454105,
+        "epoch": 4.173913043478261,
         "step": 2400
     },
     {
+        "loss": 1.84,
+        "grad_norm": 4.077364444732666,
+        "learning_rate": 0.0003217391304347826,
+        "epoch": 4.208695652173913,
         "step": 2420
     },
     {
+        "loss": 1.8864,
+        "grad_norm": 4.267309188842773,
+        "learning_rate": 0.0003199033816425121,
+        "epoch": 4.243478260869566,
         "step": 2440
     },
     {
+        "loss": 1.9015,
+        "grad_norm": 3.978663921356201,
+        "learning_rate": 0.00031797101449275363,
+        "epoch": 4.278260869565218,
         "step": 2460
     },
     {
+        "loss": 1.8388,
+        "grad_norm": 4.089256763458252,
+        "learning_rate": 0.0003160386473429952,
+        "epoch": 4.3130434782608695,
         "step": 2480
     },
     {
+        "loss": 1.7845,
+        "grad_norm": 3.9317057132720947,
+        "learning_rate": 0.0003141062801932367,
+        "epoch": 4.3478260869565215,
         "step": 2500
     },
     {
+        "loss": 1.7725,
+        "grad_norm": 3.9738080501556396,
+        "learning_rate": 0.00031217391304347823,
+        "epoch": 4.3826086956521735,
         "step": 2520
     },
     {
+        "loss": 1.852,
+        "grad_norm": 4.232215881347656,
+        "learning_rate": 0.0003102415458937198,
+        "epoch": 4.417391304347826,
         "step": 2540
     },
     {
+        "loss": 1.8234,
+        "grad_norm": 4.050131797790527,
+        "learning_rate": 0.00030830917874396136,
+        "epoch": 4.452173913043478,
         "step": 2560
     },
     {
+        "loss": 1.8148,
+        "grad_norm": 4.217935085296631,
+        "learning_rate": 0.0003063768115942029,
+        "epoch": 4.48695652173913,
         "step": 2580
     },
     {
+        "loss": 1.7134,
+        "grad_norm": 3.9807074069976807,
+        "learning_rate": 0.0003044444444444445,
+        "epoch": 4.521739130434782,
         "step": 2600
     },
     {
+        "loss": 1.6752,
+        "grad_norm": 4.05940580368042,
+        "learning_rate": 0.000302512077294686,
+        "epoch": 4.556521739130435,
         "step": 2620
     },
     {
+        "loss": 1.8413,
+        "grad_norm": 4.454566955566406,
+        "learning_rate": 0.00030057971014492755,
+        "epoch": 4.591304347826087,
         "step": 2640
     },
     {
+        "loss": 1.7948,
+        "grad_norm": 4.144088268280029,
+        "learning_rate": 0.0002986473429951691,
+        "epoch": 4.626086956521739,
         "step": 2660
     },
     {
+        "loss": 1.7468,
+        "grad_norm": 3.940176010131836,
+        "learning_rate": 0.0002967149758454106,
+        "epoch": 4.660869565217391,
         "step": 2680
     },
     {
+        "loss": 1.709,
+        "grad_norm": 4.198675632476807,
+        "learning_rate": 0.0002948792270531401,
+        "epoch": 4.695652173913043,
         "step": 2700
     },
     {
+        "loss": 1.6506,
+        "grad_norm": 3.976001501083374,
+        "learning_rate": 0.00029294685990338167,
+        "epoch": 4.730434782608696,
         "step": 2720
     },
     {
+        "loss": 1.7042,
+        "grad_norm": 4.033059120178223,
+        "learning_rate": 0.0002910144927536232,
+        "epoch": 4.765217391304348,
         "step": 2740
     },
     {
+        "loss": 1.6795,
+        "grad_norm": 4.062041759490967,
+        "learning_rate": 0.0002890821256038648,
+        "epoch": 4.8,
         "step": 2760
     },
     {
+        "loss": 1.7029,
+        "grad_norm": 3.988589286804199,
+        "learning_rate": 0.00028714975845410627,
+        "epoch": 4.834782608695652,
         "step": 2780
     },
     {
+        "loss": 1.6641,
+        "grad_norm": 4.16325044631958,
+        "learning_rate": 0.0002852173913043478,
+        "epoch": 4.869565217391305,
         "step": 2800
     },
     {
+        "loss": 1.6953,
+        "grad_norm": 4.323537349700928,
+        "learning_rate": 0.0002832850241545894,
+        "epoch": 4.904347826086957,
         "step": 2820
     },
     {
+        "loss": 1.5863,
+        "grad_norm": 3.8293144702911377,
+        "learning_rate": 0.0002813526570048309,
+        "epoch": 4.939130434782609,
         "step": 2840
     },
     {
+        "loss": 1.6276,
+        "grad_norm": 3.8955535888671875,
+        "learning_rate": 0.00027942028985507246,
+        "epoch": 4.973913043478261,
         "step": 2860
     },
     {
+        "eval_loss": 0.8229038715362549,
+        "eval_accuracy": 0.9677202420981843,
+        "eval_runtime": 88.6744,
+        "eval_samples_per_second": 16.769,
+        "eval_steps_per_second": 16.769,
+        "epoch": 5.0,
+        "step": 2875
+    },
+    {
+        "loss": 1.5701,
+        "grad_norm": 3.8480091094970703,
+        "learning_rate": 0.00027748792270531405,
+        "epoch": 5.008695652173913,
         "step": 2880
     },
     {
+        "loss": 1.3786,
+        "grad_norm": 3.679872512817383,
+        "learning_rate": 0.0002755555555555556,
+        "epoch": 5.043478260869565,
         "step": 2900
     },
     {
+        "loss": 1.3563,
+        "grad_norm": 4.13381290435791,
+        "learning_rate": 0.00027362318840579706,
+        "epoch": 5.078260869565217,
         "step": 2920
     },
     {
+        "loss": 1.3588,
+        "grad_norm": 3.7467329502105713,
+        "learning_rate": 0.00027169082125603865,
+        "epoch": 5.113043478260869,
         "step": 2940
     },
     {
+        "loss": 1.3782,
+        "grad_norm": 3.5837419033050537,
+        "learning_rate": 0.0002698550724637681,
+        "epoch": 5.147826086956521,
         "step": 2960
     },
     {
+        "loss": 1.3969,
+        "grad_norm": 4.077097415924072,
+        "learning_rate": 0.00026792270531400964,
+        "epoch": 5.182608695652174,
         "step": 2980
     },
     {
+        "loss": 1.3346,
+        "grad_norm": 3.5995211601257324,
+        "learning_rate": 0.00026599033816425123,
+        "epoch": 5.217391304347826,
         "step": 3000
     },
     {
+        "loss": 1.3772,
+        "grad_norm": 3.714010000228882,
+        "learning_rate": 0.00026405797101449277,
+        "epoch": 5.252173913043478,
         "step": 3020
     },
     {
+        "loss": 1.3452,
+        "grad_norm": 3.807094097137451,
+        "learning_rate": 0.00026231884057971016,
+        "epoch": 5.28695652173913,
         "step": 3040
     },
     {
+        "loss": 1.3161,
+        "grad_norm": 4.012477397918701,
+        "learning_rate": 0.0002603864734299517,
+        "epoch": 5.321739130434783,
         "step": 3060
     },
     {
+        "loss": 1.3146,
+        "grad_norm": 3.850520372390747,
+        "learning_rate": 0.0002584541062801932,
+        "epoch": 5.356521739130435,
         "step": 3080
     },
     {
+        "loss": 1.3057,
+        "grad_norm": NaN,
+        "learning_rate": 0.00025661835748792274,
+        "epoch": 5.391304347826087,
         "step": 3100
     },
     {
+        "loss": 1.2619,
+        "grad_norm": 3.697744607925415,
+        "learning_rate": 0.0002546859903381643,
+        "epoch": 5.426086956521739,
         "step": 3120
     },
     {
+        "loss": 1.3436,
+        "grad_norm": 4.125018119812012,
+        "learning_rate": 0.00025275362318840576,
+        "epoch": 5.460869565217392,
         "step": 3140
     },
     {
+        "loss": 1.3289,
+        "grad_norm": 4.1491899490356445,
+        "learning_rate": 0.00025082125603864735,
+        "epoch": 5.495652173913044,
         "step": 3160
     },
     {
+        "loss": 1.218,
+        "grad_norm": 3.9294846057891846,
+        "learning_rate": 0.0002488888888888889,
+        "epoch": 5.530434782608696,
         "step": 3180
     },
     {
+        "loss": 1.3219,
+        "grad_norm": 3.9030706882476807,
+        "learning_rate": 0.00024695652173913047,
+        "epoch": 5.565217391304348,
         "step": 3200
     },
     {
+        "loss": 1.2694,
+        "grad_norm": 4.124849319458008,
+        "learning_rate": 0.000245024154589372,
+        "epoch": 5.6,
         "step": 3220
     },
     {
+        "loss": 1.2379,
+        "grad_norm": 4.1668500900268555,
+        "learning_rate": 0.0002432850241545894,
+        "epoch": 5.6347826086956525,
         "step": 3240
     },
     {
+        "loss": 1.2892,
+        "grad_norm": 4.098198890686035,
+        "learning_rate": 0.00024135265700483093,
+        "epoch": 5.6695652173913045,
         "step": 3260
     },
     {
+        "loss": 1.2742,
+        "grad_norm": 3.690241813659668,
+        "learning_rate": 0.00023942028985507246,
+        "epoch": 5.7043478260869565,
         "step": 3280
     },
     {
+        "loss": 1.1755,
+        "grad_norm": 3.978963613510132,
+        "learning_rate": 0.00023748792270531402,
+        "epoch": 5.739130434782608,
         "step": 3300
     },
     {
+        "loss": 1.2256,
+        "grad_norm": 3.7397215366363525,
+        "learning_rate": 0.00023574879227053139,
+        "epoch": 5.773913043478261,
         "step": 3320
     },
     {
+        "loss": 1.238,
+        "grad_norm": 3.9201064109802246,
+        "learning_rate": 0.00023391304347826088,
+        "epoch": 5.808695652173913,
         "step": 3340
     },
     {
+        "loss": 1.1706,
+        "grad_norm": 3.725389242172241,
+        "learning_rate": 0.0002319806763285024,
+        "epoch": 5.843478260869565,
         "step": 3360
     },
     {
+        "loss": 1.1644,
+        "grad_norm": 3.5844123363494873,
+        "learning_rate": 0.00023004830917874397,
+        "epoch": 5.878260869565217,
         "step": 3380
     },
     {
+        "loss": 1.2256,
+        "grad_norm": 3.79936146736145,
+        "learning_rate": 0.00022821256038647343,
+        "epoch": 5.913043478260869,
         "step": 3400
     },
     {
+        "loss": 1.2488,
+        "grad_norm": 3.5947725772857666,
+        "learning_rate": 0.00022628019323671497,
+        "epoch": 5.947826086956522,
         "step": 3420
     },
     {
+        "loss": 1.1418,
+        "grad_norm": NaN,
+        "learning_rate": 0.00022444444444444446,
+        "epoch": 5.982608695652174,
         "step": 3440
     },
     {
+        "eval_loss": 0.5840117335319519,
+        "eval_accuracy": 0.9757901815736382,
+        "eval_runtime": 97.2696,
+        "eval_samples_per_second": 15.287,
+        "eval_steps_per_second": 15.287,
+        "epoch": 6.0,
+        "step": 3450
+    },
+    {
+        "loss": 1.1254,
+        "grad_norm": 3.5959298610687256,
+        "learning_rate": 0.00022260869565217392,
+        "epoch": 6.017391304347826,
         "step": 3460
     },
     {
+        "loss": 1.0343,
+        "grad_norm": 3.9623775482177734,
+        "learning_rate": 0.00022067632850241545,
+        "epoch": 6.052173913043478,
         "step": 3480
     },
     {
+        "loss": 1.0348,
+        "grad_norm": 3.735102415084839,
+        "learning_rate": 0.00021874396135265702,
+        "epoch": 6.086956521739131,
         "step": 3500
     },
     {
+        "loss": 0.9796,
+        "grad_norm": 3.4255013465881348,
+        "learning_rate": 0.00021681159420289855,
+        "epoch": 6.121739130434783,
         "step": 3520
     },
     {
+        "loss": 0.9865,
+        "grad_norm": 3.981841564178467,
+        "learning_rate": 0.00021497584541062804,
+        "epoch": 6.156521739130435,
         "step": 3540
     },
     {
+        "loss": 1.0054,
+        "grad_norm": 3.9057116508483887,
+        "learning_rate": 0.00021314009661835748,
+        "epoch": 6.191304347826087,
         "step": 3560
     },
     {
+        "loss": 1.0012,
+        "grad_norm": 3.626560688018799,
+        "learning_rate": 0.00021120772946859904,
+        "epoch": 6.226086956521739,
         "step": 3580
     },
     {
+        "loss": 1.0129,
+        "grad_norm": 3.687683582305908,
+        "learning_rate": 0.0002093719806763285,
+        "epoch": 6.260869565217392,
         "step": 3600
     },
     {
+        "loss": 0.9333,
+        "grad_norm": 3.8632826805114746,
+        "learning_rate": 0.00020763285024154592,
+        "epoch": 6.2956521739130435,
         "step": 3620
     },
     {
+        "loss": 1.0259,
+        "grad_norm": 4.089422702789307,
+        "learning_rate": 0.0002058937198067633,
+        "epoch": 6.3304347826086955,
         "step": 3640
     },
     {
+        "loss": 1.0184,
+        "grad_norm": 4.261268615722656,
+        "learning_rate": 0.00020415458937198067,
+        "epoch": 6.3652173913043475,
         "step": 3660
     },
     {
+        "loss": 1.0293,
+        "grad_norm": 2.3901586532592773,
+        "learning_rate": 0.0002026086956521739,
+        "epoch": 6.4,
         "step": 3680
     },
     {
+        "loss": 1.0026,
+        "grad_norm": 2.233633518218994,
+        "learning_rate": 0.00020067632850241546,
+        "epoch": 6.434782608695652,
         "step": 3700
     },
     {
+        "loss": 1.0426,
+        "grad_norm": 2.049773693084717,
+        "learning_rate": 0.00019893719806763285,
+        "epoch": 6.469565217391304,
         "step": 3720
     },
     {
+        "loss": 1.0324,
+        "grad_norm": 2.21939754486084,
+        "learning_rate": 0.0001970048309178744,
+        "epoch": 6.504347826086956,
         "step": 3740
     },
     {
+        "loss": 1.0666,
+        "grad_norm": 2.2138895988464355,
+        "learning_rate": 0.00019516908212560387,
+        "epoch": 6.539130434782608,
         "step": 3760
     },
     {
+        "loss": 1.0724,
+        "grad_norm": 1.9186855554580688,
+        "learning_rate": 0.0001932367149758454,
+        "epoch": 6.573913043478261,
         "step": 3780
     },
     {
+        "loss": 1.0867,
+        "grad_norm": 1.302451729774475,
+        "learning_rate": 0.00019159420289855073,
+        "epoch": 6.608695652173913,
         "step": 3800
     },
     {
+        "loss": 1.0659,
+        "grad_norm": 1.1770459413528442,
+        "learning_rate": 0.00018975845410628022,
+        "epoch": 6.643478260869565,
         "step": 3820
     },
     {
+        "loss": 1.0494,
+        "grad_norm": 0.2651650309562683,
+        "learning_rate": 0.0001881159420289855,
+        "epoch": 6.678260869565217,
         "step": 3840
     },
     {
+        "loss": 1.0464,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001867632850241546,
+        "epoch": 6.71304347826087,
         "step": 3860
     },
     {
+        "loss": 1.0457,
+        "grad_norm": 0.0,
+        "learning_rate": 0.000185024154589372,
+        "epoch": 6.747826086956522,
         "step": 3880
     },
     {
+        "loss": 0.9815,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00018328502415458937,
+        "epoch": 6.782608695652174,
         "step": 3900
     },
     {
+        "loss": 1.0094,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001816425120772947,
+        "epoch": 6.817391304347826,
         "step": 3920
     },
     {
+        "loss": 1.0023,
+        "grad_norm": NaN,
+        "learning_rate": 0.00018028985507246377,
+        "epoch": 6.852173913043478,
         "step": 3940
     },
     {
+        "loss": 1.0278,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00017893719806763288,
+        "epoch": 6.886956521739131,
         "step": 3960
     },
     {
+        "loss": 1.0123,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001771014492753623,
+        "epoch": 6.921739130434783,
         "step": 3980
     },
     {
+        "loss": 1.0774,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00017565217391304346,
+        "epoch": 6.956521739130435,
         "step": 4000
     },
     {
+        "loss": 1.0484,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00017391304347826088,
+        "epoch": 6.9913043478260875,
         "step": 4020
     },
     {
+        "eval_loss": 0.5780686736106873,
+        "eval_accuracy": 0.9737726967047747,
+        "eval_runtime": 118.8154,
+        "eval_samples_per_second": 12.515,
+        "eval_steps_per_second": 12.515,
+        "epoch": 7.0,
+        "step": 4025
+    },
+    {
+        "loss": 0.9799,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001723671497584541,
+        "epoch": 7.026086956521739,
         "step": 4040
     },
     {
+        "loss": 0.9588,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00017091787439613525,
+        "epoch": 7.060869565217391,
         "step": 4060
     },
     {
+        "loss": 0.9421,
+        "grad_norm": NaN,
+        "learning_rate": 0.00016966183574879226,
+        "epoch": 7.095652173913043,
         "step": 4080
     },
     {
+        "loss": 0.9551,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00016782608695652175,
+        "epoch": 7.130434782608695,
         "step": 4100
     },
     {
+        "loss": 0.9622,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00016618357487922704,
+        "epoch": 7.165217391304348,
         "step": 4120
     },
     {
+        "loss": 0.9712,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00016444444444444446,
+        "epoch": 7.2,
         "step": 4140
     },
     {
+        "loss": 0.9834,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00016299516908212561,
+        "epoch": 7.234782608695652,
         "step": 4160
     },
     {
+        "loss": 0.9968,
+        "grad_norm": NaN,
+        "learning_rate": 0.00016135265700483093,
+        "epoch": 7.269565217391304,
         "step": 4180
     },
     {
+        "loss": 0.956,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00015961352657004833,
+        "epoch": 7.304347826086957,
         "step": 4200
     },
     {
+        "loss": 0.8981,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00015806763285024155,
+        "epoch": 7.339130434782609,
         "step": 4220
     },
     {
+        "loss": 0.9515,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00015642512077294684,
+        "epoch": 7.373913043478261,
         "step": 4240
     },
     {
+        "loss": 0.9535,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001548792270531401,
+        "epoch": 7.408695652173913,
         "step": 4260
     },
     {
+        "loss": 0.9646,
+        "grad_norm": NaN,
+        "learning_rate": 0.00015333333333333334,
+        "epoch": 7.443478260869565,
         "step": 4280
     },
     {
+        "loss": 0.9821,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00015140096618357487,
+        "epoch": 7.478260869565218,
         "step": 4300
     },
     {
+        "loss": 0.9259,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00015014492753623188,
+        "epoch": 7.51304347826087,
         "step": 4320
     },
     {
+        "loss": 0.9494,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00014869565217391303,
+        "epoch": 7.547826086956522,
         "step": 4340
     },
     {
+        "loss": 0.9305,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00014714975845410628,
+        "epoch": 7.582608695652174,
         "step": 4360
     },
     {
+        "loss": 0.8889,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0001455072463768116,
+        "epoch": 7.6173913043478265,
         "step": 4380
     },
     {
+        "loss": 0.9524,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00014396135265700482,
+        "epoch": 7.6521739130434785,
         "step": 4400
     },
     {
+        "loss": 0.9065,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00014231884057971014,
+        "epoch": 7.6869565217391305,
         "step": 4420
     },
     {
+        "loss": 0.9153,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00014048309178743963,
+        "epoch": 7.721739130434782,
         "step": 4440
     },
     {
+        "loss": 0.6675,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.756521739130434,
         "step": 4460
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.791304347826087,
         "step": 4480
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.826086956521739,
         "step": 4500
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.860869565217391,
         "step": 4520
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.895652173913043,
         "step": 4540
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.930434782608696,
         "step": 4560
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 7.965217391304348,
         "step": 4580
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.0,
         "step": 4600
     },
     {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0006724949562878278,
+        "eval_runtime": 129.6238,
+        "eval_samples_per_second": 11.472,
+        "eval_steps_per_second": 11.472,
+        "epoch": 8.0,
+        "step": 4600
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.034782608695652,
         "step": 4620
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.069565217391304,
         "step": 4640
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.104347826086956,
         "step": 4660
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.139130434782608,
         "step": 4680
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.173913043478262,
         "step": 4700
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.208695652173914,
         "step": 4720
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.243478260869566,
         "step": 4740
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.278260869565218,
         "step": 4760
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.31304347826087,
         "step": 4780
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.347826086956522,
         "step": 4800
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.382608695652173,
         "step": 4820
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.417391304347825,
         "step": 4840
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.452173913043477,
         "step": 4860
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.486956521739131,
         "step": 4880
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.521739130434783,
         "step": 4900
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.556521739130435,
         "step": 4920
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.591304347826087,
         "step": 4940
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.626086956521739,
         "step": 4960
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.660869565217391,
         "step": 4980
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.695652173913043,
         "step": 5000
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.730434782608695,
         "step": 5020
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.765217391304347,
         "step": 5040
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.8,
         "step": 5060
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.834782608695653,
         "step": 5080
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.869565217391305,
         "step": 5100
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.904347826086957,
         "step": 5120
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.939130434782609,
         "step": 5140
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 8.97391304347826,
         "step": 5160
     },
     {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0006724949562878278,
+        "eval_runtime": 117.1288,
+        "eval_samples_per_second": 12.695,
+        "eval_steps_per_second": 12.695,
+        "epoch": 9.0,
+        "step": 5175
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.008695652173913,
         "step": 5180
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.043478260869565,
         "step": 5200
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.078260869565218,
         "step": 5220
     },
     {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.11304347826087,
+        "step": 5240
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.147826086956522,
+        "step": 5260
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.182608695652174,
+        "step": 5280
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.217391304347826,
+        "step": 5300
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.252173913043478,
+        "step": 5320
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.28695652173913,
+        "step": 5340
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.321739130434782,
+        "step": 5360
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.356521739130434,
+        "step": 5380
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.391304347826088,
+        "step": 5400
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.42608695652174,
+        "step": 5420
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.460869565217392,
+        "step": 5440
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.495652173913044,
+        "step": 5460
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.530434782608696,
+        "step": 5480
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.565217391304348,
+        "step": 5500
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.6,
+        "step": 5520
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.634782608695652,
+        "step": 5540
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.669565217391304,
+        "step": 5560
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.704347826086957,
+        "step": 5580
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.73913043478261,
+        "step": 5600
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.773913043478261,
+        "step": 5620
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.808695652173913,
+        "step": 5640
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.843478260869565,
+        "step": 5660
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.878260869565217,
+        "step": 5680
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.91304347826087,
+        "step": 5700
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.947826086956521,
+        "step": 5720
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0001403864734299517,
+        "epoch": 9.982608695652173,
+        "step": 5740
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0006724949562878278,
+        "eval_runtime": 103.3199,
+        "eval_samples_per_second": 14.392,
+        "eval_steps_per_second": 14.392,
         "epoch": 10.0,
+        "step": 5750
     },
     {
+        "train_runtime": 59857.6179,
+        "train_samples_per_second": 24.584,
+        "train_steps_per_second": 0.096,
+        "total_flos": 2.7398100529152e+18,
+        "train_loss": 2.9414075751926587,
         "epoch": 10.0,
+        "step": 5750
     }
 ]

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 10.0,
-    "total_flos": 2.49073133395968e+18,
-    "train_loss": 7.888943860726876,
-    "train_runtime": 28748.4048,
-    "train_samples_per_second": 46.534,
-    "train_steps_per_second": 0.182
 }

 {
     "epoch": 10.0,
+    "total_flos": 2.7398100529152e+18,
+    "train_loss": 2.9414075751926587,
+    "train_runtime": 59857.6179,
+    "train_samples_per_second": 24.584,
+    "train_steps_per_second": 0.096
 }

trainer_state.json CHANGED Viewed

@@ -1,1942 +1,2124 @@
 {
-  "best_metric": 0.8030272452068618,
-  "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/pretrain/c512-aam-len3-bs256-lr1e-4/checkpoint-5230",
   "epoch": 10.0,
   "eval_steps": 500,
-  "global_step": 5230,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.03824091778202677,
-      "grad_norm": 6.157718181610107,
-      "learning_rate": 3.824091778202677e-06,
-      "loss": 13.2232,
       "step": 20
     },
     {
-      "epoch": 0.07648183556405354,
-      "grad_norm": 6.144223213195801,
-      "learning_rate": 7.648183556405354e-06,
-      "loss": 13.2113,
       "step": 40
     },
     {
-      "epoch": 0.1147227533460803,
-      "grad_norm": 6.032691955566406,
-      "learning_rate": 1.147227533460803e-05,
-      "loss": 13.1625,
       "step": 60
     },
     {
-      "epoch": 0.15296367112810708,
-      "grad_norm": 5.916826248168945,
-      "learning_rate": 1.529636711281071e-05,
-      "loss": 13.1174,
       "step": 80
     },
     {
-      "epoch": 0.19120458891013384,
-      "grad_norm": 5.7198004722595215,
-      "learning_rate": 1.9120458891013384e-05,
-      "loss": 13.0512,
       "step": 100
     },
     {
-      "epoch": 0.2294455066921606,
-      "grad_norm": 5.554529666900635,
-      "learning_rate": 2.294455066921606e-05,
-      "loss": 12.9931,
       "step": 120
     },
     {
-      "epoch": 0.2676864244741874,
-      "grad_norm": 5.364482879638672,
-      "learning_rate": 2.6768642447418742e-05,
-      "loss": 12.9042,
       "step": 140
     },
     {
-      "epoch": 0.30592734225621415,
-      "grad_norm": 5.091818809509277,
-      "learning_rate": 3.059273422562142e-05,
-      "loss": 12.8488,
       "step": 160
     },
     {
-      "epoch": 0.3441682600382409,
-      "grad_norm": 5.035643577575684,
-      "learning_rate": 3.441682600382409e-05,
-      "loss": 12.7715,
       "step": 180
     },
     {
-      "epoch": 0.3824091778202677,
-      "grad_norm": 4.819056987762451,
-      "learning_rate": 3.824091778202677e-05,
-      "loss": 12.6747,
       "step": 200
     },
     {
-      "epoch": 0.42065009560229444,
-      "grad_norm": 4.597919464111328,
-      "learning_rate": 4.2065009560229444e-05,
-      "loss": 12.6366,
       "step": 220
     },
     {
-      "epoch": 0.4588910133843212,
-      "grad_norm": 4.551054954528809,
-      "learning_rate": 4.588910133843212e-05,
-      "loss": 12.5388,
       "step": 240
     },
     {
-      "epoch": 0.497131931166348,
-      "grad_norm": 4.289029598236084,
-      "learning_rate": 4.97131931166348e-05,
-      "loss": 12.4527,
       "step": 260
     },
     {
-      "epoch": 0.5353728489483748,
-      "grad_norm": 4.291126728057861,
-      "learning_rate": 5.3537284894837484e-05,
-      "loss": 12.3809,
       "step": 280
     },
     {
-      "epoch": 0.5736137667304015,
-      "grad_norm": 4.090356826782227,
-      "learning_rate": 5.736137667304016e-05,
-      "loss": 12.3185,
       "step": 300
     },
     {
-      "epoch": 0.6118546845124283,
-      "grad_norm": 3.9066805839538574,
-      "learning_rate": 6.118546845124283e-05,
-      "loss": 12.2101,
       "step": 320
     },
     {
-      "epoch": 0.6500956022944551,
-      "grad_norm": 3.937908887863159,
-      "learning_rate": 6.50095602294455e-05,
-      "loss": 12.1255,
       "step": 340
     },
     {
-      "epoch": 0.6883365200764818,
-      "grad_norm": 3.919820547103882,
-      "learning_rate": 6.883365200764819e-05,
-      "loss": 12.0543,
       "step": 360
     },
     {
-      "epoch": 0.7265774378585086,
-      "grad_norm": 3.8298187255859375,
-      "learning_rate": 7.265774378585087e-05,
-      "loss": 11.9417,
       "step": 380
     },
     {
-      "epoch": 0.7648183556405354,
-      "grad_norm": 3.7290520668029785,
-      "learning_rate": 7.648183556405354e-05,
-      "loss": 11.8644,
       "step": 400
     },
     {
-      "epoch": 0.8030592734225621,
-      "grad_norm": 3.76938533782959,
-      "learning_rate": 8.030592734225622e-05,
-      "loss": 11.8122,
       "step": 420
     },
     {
-      "epoch": 0.8413001912045889,
-      "grad_norm": 3.8729827404022217,
-      "learning_rate": 8.413001912045889e-05,
-      "loss": 11.7117,
       "step": 440
     },
     {
-      "epoch": 0.8795411089866156,
-      "grad_norm": 3.7178924083709717,
-      "learning_rate": 8.795411089866157e-05,
-      "loss": 11.6245,
       "step": 460
     },
     {
-      "epoch": 0.9177820267686424,
-      "grad_norm": 3.7744827270507812,
-      "learning_rate": 9.177820267686424e-05,
-      "loss": 11.547,
       "step": 480
     },
     {
-      "epoch": 0.9560229445506692,
-      "grad_norm": 3.6705052852630615,
-      "learning_rate": 9.560229445506692e-05,
-      "loss": 11.4699,
       "step": 500
     },
     {
-      "epoch": 0.994263862332696,
-      "grad_norm": 3.6992719173431396,
-      "learning_rate": 9.94263862332696e-05,
-      "loss": 11.3851,
       "step": 520
     },
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.18062563067608475,
-      "eval_loss": 11.029301643371582,
-      "eval_runtime": 592.6353,
-      "eval_samples_per_second": 25.083,
-      "eval_steps_per_second": 25.083,
-      "step": 523
-    },
-    {
-      "epoch": 1.0325047801147227,
-      "grad_norm": 3.6838159561157227,
-      "learning_rate": 9.963883577650308e-05,
-      "loss": 11.2589,
       "step": 540
     },
     {
-      "epoch": 1.0707456978967496,
-      "grad_norm": 3.7846293449401855,
-      "learning_rate": 9.921393669003612e-05,
-      "loss": 11.1668,
       "step": 560
     },
     {
-      "epoch": 1.1089866156787762,
-      "grad_norm": 3.688416004180908,
-      "learning_rate": 9.878903760356916e-05,
-      "loss": 11.1053,
       "step": 580
     },
     {
-      "epoch": 1.147227533460803,
-      "grad_norm": 3.724273204803467,
-      "learning_rate": 9.836413851710219e-05,
-      "loss": 11.019,
       "step": 600
     },
     {
-      "epoch": 1.1854684512428297,
-      "grad_norm": 3.840388536453247,
-      "learning_rate": 9.793923943063523e-05,
-      "loss": 10.9731,
       "step": 620
     },
     {
-      "epoch": 1.2237093690248566,
-      "grad_norm": 3.828228235244751,
-      "learning_rate": 9.751434034416827e-05,
-      "loss": 10.875,
       "step": 640
     },
     {
-      "epoch": 1.2619502868068833,
-      "grad_norm": 3.891911745071411,
-      "learning_rate": 9.70894412577013e-05,
-      "loss": 10.8111,
       "step": 660
     },
     {
-      "epoch": 1.3001912045889101,
-      "grad_norm": 3.8076562881469727,
-      "learning_rate": 9.666454217123433e-05,
-      "loss": 10.7717,
       "step": 680
     },
     {
-      "epoch": 1.338432122370937,
-      "grad_norm": 3.8521881103515625,
-      "learning_rate": 9.623964308476737e-05,
-      "loss": 10.6723,
       "step": 700
     },
     {
-      "epoch": 1.3766730401529637,
-      "grad_norm": 3.8576488494873047,
-      "learning_rate": 9.58147439983004e-05,
-      "loss": 10.5961,
       "step": 720
     },
     {
-      "epoch": 1.4149139579349903,
-      "grad_norm": 4.002715587615967,
-      "learning_rate": 9.538984491183345e-05,
-      "loss": 10.5392,
       "step": 740
     },
     {
-      "epoch": 1.4531548757170172,
-      "grad_norm": 3.8657026290893555,
-      "learning_rate": 9.496494582536648e-05,
-      "loss": 10.5018,
       "step": 760
     },
     {
-      "epoch": 1.491395793499044,
-      "grad_norm": 3.9424169063568115,
-      "learning_rate": 9.454004673889951e-05,
-      "loss": 10.4325,
       "step": 780
     },
     {
-      "epoch": 1.5296367112810707,
-      "grad_norm": 3.9783968925476074,
-      "learning_rate": 9.411514765243256e-05,
-      "loss": 10.3722,
       "step": 800
     },
     {
-      "epoch": 1.5678776290630974,
-      "grad_norm": 4.081951141357422,
-      "learning_rate": 9.369024856596559e-05,
-      "loss": 10.3069,
       "step": 820
     },
     {
-      "epoch": 1.6061185468451242,
-      "grad_norm": 4.141290187835693,
-      "learning_rate": 9.326534947949863e-05,
-      "loss": 10.2527,
       "step": 840
     },
     {
-      "epoch": 1.644359464627151,
-      "grad_norm": 4.294083595275879,
-      "learning_rate": 9.284045039303167e-05,
-      "loss": 10.2271,
       "step": 860
     },
     {
-      "epoch": 1.682600382409178,
-      "grad_norm": 4.727543354034424,
-      "learning_rate": 9.241555130656469e-05,
-      "loss": 10.1756,
       "step": 880
     },
     {
-      "epoch": 1.7208413001912046,
-      "grad_norm": 4.068965911865234,
-      "learning_rate": 9.199065222009773e-05,
-      "loss": 10.0936,
       "step": 900
     },
     {
-      "epoch": 1.7590822179732313,
-      "grad_norm": 4.025643825531006,
-      "learning_rate": 9.156575313363077e-05,
-      "loss": 10.0937,
       "step": 920
     },
     {
-      "epoch": 1.7973231357552581,
-      "grad_norm": 4.317354679107666,
-      "learning_rate": 9.11408540471638e-05,
-      "loss": 10.0217,
       "step": 940
     },
     {
-      "epoch": 1.835564053537285,
-      "grad_norm": 4.101060390472412,
-      "learning_rate": 9.071595496069684e-05,
-      "loss": 9.9743,
       "step": 960
     },
     {
-      "epoch": 1.8738049713193117,
-      "grad_norm": 4.225609302520752,
-      "learning_rate": 9.029105587422988e-05,
-      "loss": 9.9879,
       "step": 980
     },
     {
-      "epoch": 1.9120458891013383,
-      "grad_norm": 4.3140668869018555,
-      "learning_rate": 8.986615678776292e-05,
-      "loss": 9.8273,
       "step": 1000
     },
     {
-      "epoch": 1.9502868068833652,
-      "grad_norm": 4.199500560760498,
-      "learning_rate": 8.944125770129594e-05,
-      "loss": 9.8136,
       "step": 1020
     },
     {
-      "epoch": 1.988527724665392,
-      "grad_norm": 4.457912445068359,
-      "learning_rate": 8.901635861482898e-05,
-      "loss": 9.7596,
       "step": 1040
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.3849983181971073,
-      "eval_loss": 9.140138626098633,
-      "eval_runtime": 461.2724,
-      "eval_samples_per_second": 32.226,
-      "eval_steps_per_second": 32.226,
-      "step": 1046
-    },
-    {
-      "epoch": 2.026768642447419,
-      "grad_norm": 4.428006172180176,
-      "learning_rate": 8.859145952836202e-05,
-      "loss": 9.714,
       "step": 1060
     },
     {
-      "epoch": 2.0650095602294454,
-      "grad_norm": 4.372852325439453,
-      "learning_rate": 8.816656044189505e-05,
-      "loss": 9.5508,
       "step": 1080
     },
     {
-      "epoch": 2.1032504780114722,
-      "grad_norm": 4.381687641143799,
-      "learning_rate": 8.774166135542809e-05,
-      "loss": 9.6096,
       "step": 1100
     },
     {
-      "epoch": 2.141491395793499,
-      "grad_norm": 4.5865631103515625,
-      "learning_rate": 8.731676226896113e-05,
-      "loss": 9.5077,
       "step": 1120
     },
     {
-      "epoch": 2.179732313575526,
-      "grad_norm": 4.363910675048828,
-      "learning_rate": 8.689186318249416e-05,
-      "loss": 9.5044,
       "step": 1140
     },
     {
-      "epoch": 2.2179732313575524,
-      "grad_norm": 4.577084541320801,
-      "learning_rate": 8.646696409602721e-05,
-      "loss": 9.4205,
       "step": 1160
     },
     {
-      "epoch": 2.2562141491395793,
-      "grad_norm": 4.576254367828369,
-      "learning_rate": 8.604206500956024e-05,
-      "loss": 9.4317,
       "step": 1180
     },
     {
-      "epoch": 2.294455066921606,
-      "grad_norm": 4.4399847984313965,
-      "learning_rate": 8.561716592309326e-05,
-      "loss": 9.3607,
       "step": 1200
     },
     {
-      "epoch": 2.332695984703633,
-      "grad_norm": 4.595015525817871,
-      "learning_rate": 8.51922668366263e-05,
-      "loss": 9.2533,
       "step": 1220
     },
     {
-      "epoch": 2.3709369024856595,
-      "grad_norm": 4.900874614715576,
-      "learning_rate": 8.476736775015934e-05,
-      "loss": 9.3384,
       "step": 1240
     },
     {
-      "epoch": 2.4091778202676863,
-      "grad_norm": 4.594742774963379,
-      "learning_rate": 8.434246866369238e-05,
-      "loss": 9.293,
       "step": 1260
     },
     {
-      "epoch": 2.447418738049713,
-      "grad_norm": 4.587216377258301,
-      "learning_rate": 8.391756957722541e-05,
-      "loss": 9.1986,
       "step": 1280
     },
     {
-      "epoch": 2.48565965583174,
-      "grad_norm": 4.735275745391846,
-      "learning_rate": 8.349267049075845e-05,
-      "loss": 9.1358,
       "step": 1300
     },
     {
-      "epoch": 2.5239005736137665,
-      "grad_norm": 4.627840995788574,
-      "learning_rate": 8.306777140429149e-05,
-      "loss": 9.1284,
       "step": 1320
     },
     {
-      "epoch": 2.5621414913957934,
-      "grad_norm": 4.658718585968018,
-      "learning_rate": 8.264287231782451e-05,
-      "loss": 9.0949,
       "step": 1340
     },
     {
-      "epoch": 2.6003824091778203,
-      "grad_norm": 4.875549793243408,
-      "learning_rate": 8.221797323135755e-05,
-      "loss": 9.0312,
       "step": 1360
     },
     {
-      "epoch": 2.638623326959847,
-      "grad_norm": 4.683437347412109,
-      "learning_rate": 8.179307414489059e-05,
-      "loss": 8.9949,
       "step": 1380
     },
     {
-      "epoch": 2.676864244741874,
-      "grad_norm": 4.861114025115967,
-      "learning_rate": 8.136817505842362e-05,
-      "loss": 8.9705,
       "step": 1400
     },
     {
-      "epoch": 2.7151051625239004,
-      "grad_norm": 4.727562427520752,
-      "learning_rate": 8.094327597195667e-05,
-      "loss": 8.9483,
       "step": 1420
     },
     {
-      "epoch": 2.7533460803059273,
-      "grad_norm": 4.8202948570251465,
-      "learning_rate": 8.05183768854897e-05,
-      "loss": 8.9254,
       "step": 1440
     },
     {
-      "epoch": 2.791586998087954,
-      "grad_norm": 4.926464557647705,
-      "learning_rate": 8.009347779902273e-05,
-      "loss": 8.8768,
       "step": 1460
     },
     {
-      "epoch": 2.8298279158699806,
-      "grad_norm": 4.7756028175354,
-      "learning_rate": 7.966857871255578e-05,
-      "loss": 8.8044,
       "step": 1480
     },
     {
-      "epoch": 2.8680688336520075,
-      "grad_norm": 4.888403415679932,
-      "learning_rate": 7.92436796260888e-05,
-      "loss": 8.7788,
       "step": 1500
     },
     {
-      "epoch": 2.9063097514340344,
-      "grad_norm": 4.943230152130127,
-      "learning_rate": 7.881878053962184e-05,
-      "loss": 8.8032,
       "step": 1520
     },
     {
-      "epoch": 2.9445506692160612,
-      "grad_norm": 5.011119842529297,
-      "learning_rate": 7.839388145315488e-05,
-      "loss": 8.7507,
       "step": 1540
     },
     {
-      "epoch": 2.982791586998088,
-      "grad_norm": 5.068637847900391,
-      "learning_rate": 7.796898236668791e-05,
-      "loss": 8.7136,
       "step": 1560
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.52418432559704,
-      "eval_loss": 7.882061958312988,
-      "eval_runtime": 418.5795,
-      "eval_samples_per_second": 35.513,
-      "eval_steps_per_second": 35.513,
-      "step": 1569
-    },
-    {
-      "epoch": 3.0210325047801145,
-      "grad_norm": 4.895749092102051,
-      "learning_rate": 7.754408328022095e-05,
-      "loss": 8.6104,
       "step": 1580
     },
     {
-      "epoch": 3.0592734225621414,
-      "grad_norm": 5.138400077819824,
-      "learning_rate": 7.711918419375399e-05,
-      "loss": 8.6136,
       "step": 1600
     },
     {
-      "epoch": 3.0975143403441683,
-      "grad_norm": 5.270049571990967,
-      "learning_rate": 7.669428510728702e-05,
-      "loss": 8.5866,
       "step": 1620
     },
     {
-      "epoch": 3.135755258126195,
-      "grad_norm": 5.178355693817139,
-      "learning_rate": 7.626938602082006e-05,
-      "loss": 8.492,
       "step": 1640
     },
     {
-      "epoch": 3.173996175908222,
-      "grad_norm": 5.312692165374756,
-      "learning_rate": 7.58444869343531e-05,
-      "loss": 8.4897,
       "step": 1660
     },
     {
-      "epoch": 3.2122370936902485,
-      "grad_norm": 5.227985382080078,
-      "learning_rate": 7.541958784788614e-05,
-      "loss": 8.4441,
       "step": 1680
     },
     {
-      "epoch": 3.2504780114722753,
-      "grad_norm": 5.042078495025635,
-      "learning_rate": 7.499468876141916e-05,
-      "loss": 8.4722,
       "step": 1700
     },
     {
-      "epoch": 3.288718929254302,
-      "grad_norm": 5.250526428222656,
-      "learning_rate": 7.45697896749522e-05,
-      "loss": 8.3105,
       "step": 1720
     },
     {
-      "epoch": 3.3269598470363286,
-      "grad_norm": 5.22187614440918,
-      "learning_rate": 7.414489058848524e-05,
-      "loss": 8.3308,
       "step": 1740
     },
     {
-      "epoch": 3.3652007648183555,
-      "grad_norm": 5.491254806518555,
-      "learning_rate": 7.371999150201827e-05,
-      "loss": 8.2969,
       "step": 1760
     },
     {
-      "epoch": 3.4034416826003824,
-      "grad_norm": 5.482990741729736,
-      "learning_rate": 7.329509241555131e-05,
-      "loss": 8.2593,
       "step": 1780
     },
     {
-      "epoch": 3.4416826003824093,
-      "grad_norm": 5.359766960144043,
-      "learning_rate": 7.287019332908435e-05,
-      "loss": 8.3087,
       "step": 1800
     },
     {
-      "epoch": 3.479923518164436,
-      "grad_norm": 5.788363456726074,
-      "learning_rate": 7.244529424261737e-05,
-      "loss": 8.2664,
       "step": 1820
     },
     {
-      "epoch": 3.5181644359464626,
-      "grad_norm": 5.335551738739014,
-      "learning_rate": 7.202039515615043e-05,
-      "loss": 8.2543,
       "step": 1840
     },
     {
-      "epoch": 3.5564053537284894,
-      "grad_norm": 5.465627193450928,
-      "learning_rate": 7.159549606968345e-05,
-      "loss": 8.2604,
       "step": 1860
     },
     {
-      "epoch": 3.5946462715105163,
-      "grad_norm": 5.594823837280273,
-      "learning_rate": 7.117059698321648e-05,
-      "loss": 8.1616,
       "step": 1880
     },
     {
-      "epoch": 3.632887189292543,
-      "grad_norm": 5.58858060836792,
-      "learning_rate": 7.074569789674953e-05,
-      "loss": 8.1582,
       "step": 1900
     },
     {
-      "epoch": 3.67112810707457,
-      "grad_norm": 5.514508247375488,
-      "learning_rate": 7.032079881028256e-05,
-      "loss": 8.1061,
       "step": 1920
     },
     {
-      "epoch": 3.7093690248565965,
-      "grad_norm": 5.644900321960449,
-      "learning_rate": 6.98958997238156e-05,
-      "loss": 8.0912,
       "step": 1940
     },
     {
-      "epoch": 3.7476099426386233,
-      "grad_norm": 5.701168060302734,
-      "learning_rate": 6.947100063734864e-05,
-      "loss": 7.9596,
       "step": 1960
     },
     {
-      "epoch": 3.78585086042065,
-      "grad_norm": 5.880733013153076,
-      "learning_rate": 6.904610155088167e-05,
-      "loss": 8.0403,
       "step": 1980
     },
     {
-      "epoch": 3.8240917782026767,
-      "grad_norm": 5.638689994812012,
-      "learning_rate": 6.86212024644147e-05,
-      "loss": 7.9666,
       "step": 2000
     },
     {
-      "epoch": 3.8623326959847035,
-      "grad_norm": 6.002101421356201,
-      "learning_rate": 6.819630337794775e-05,
-      "loss": 7.9633,
       "step": 2020
     },
     {
-      "epoch": 3.9005736137667304,
-      "grad_norm": 5.628067493438721,
-      "learning_rate": 6.777140429148077e-05,
-      "loss": 7.8817,
       "step": 2040
     },
     {
-      "epoch": 3.9388145315487573,
-      "grad_norm": 6.128510475158691,
-      "learning_rate": 6.734650520501381e-05,
-      "loss": 7.9118,
       "step": 2060
     },
     {
-      "epoch": 3.977055449330784,
-      "grad_norm": 5.620929718017578,
-      "learning_rate": 6.692160611854685e-05,
-      "loss": 7.848,
       "step": 2080
     },
     {
-      "epoch": 4.0,
-      "eval_accuracy": 0.6143962327615203,
-      "eval_loss": 6.945113658905029,
-      "eval_runtime": 367.1966,
-      "eval_samples_per_second": 40.482,
-      "eval_steps_per_second": 40.482,
-      "step": 2092
-    },
-    {
-      "epoch": 4.015296367112811,
-      "grad_norm": 5.820804595947266,
-      "learning_rate": 6.649670703207989e-05,
-      "loss": 7.8607,
       "step": 2100
     },
     {
-      "epoch": 4.053537284894838,
-      "grad_norm": 5.6448493003845215,
-      "learning_rate": 6.607180794561292e-05,
-      "loss": 7.7072,
       "step": 2120
     },
     {
-      "epoch": 4.091778202676864,
-      "grad_norm": 6.283373832702637,
-      "learning_rate": 6.564690885914596e-05,
-      "loss": 7.772,
       "step": 2140
     },
     {
-      "epoch": 4.130019120458891,
-      "grad_norm": 6.125846862792969,
-      "learning_rate": 6.5222009772679e-05,
-      "loss": 7.7211,
       "step": 2160
     },
     {
-      "epoch": 4.168260038240918,
-      "grad_norm": 5.701002597808838,
-      "learning_rate": 6.479711068621202e-05,
-      "loss": 7.6563,
       "step": 2180
     },
     {
-      "epoch": 4.2065009560229445,
-      "grad_norm": 5.910340785980225,
-      "learning_rate": 6.437221159974506e-05,
-      "loss": 7.711,
       "step": 2200
     },
     {
-      "epoch": 4.244741873804971,
-      "grad_norm": 5.8003082275390625,
-      "learning_rate": 6.39473125132781e-05,
-      "loss": 7.7582,
       "step": 2220
     },
     {
-      "epoch": 4.282982791586998,
-      "grad_norm": 5.95621395111084,
-      "learning_rate": 6.352241342681113e-05,
-      "loss": 7.6215,
       "step": 2240
     },
     {
-      "epoch": 4.321223709369025,
-      "grad_norm": 5.836912155151367,
-      "learning_rate": 6.309751434034417e-05,
-      "loss": 7.5932,
       "step": 2260
     },
     {
-      "epoch": 4.359464627151052,
-      "grad_norm": 6.156320095062256,
-      "learning_rate": 6.267261525387721e-05,
-      "loss": 7.5122,
       "step": 2280
     },
     {
-      "epoch": 4.397705544933078,
-      "grad_norm": 5.937085151672363,
-      "learning_rate": 6.224771616741024e-05,
-      "loss": 7.5488,
       "step": 2300
     },
     {
-      "epoch": 4.435946462715105,
-      "grad_norm": 5.949016571044922,
-      "learning_rate": 6.182281708094328e-05,
-      "loss": 7.5972,
       "step": 2320
     },
     {
-      "epoch": 4.474187380497132,
-      "grad_norm": 6.26347541809082,
-      "learning_rate": 6.139791799447631e-05,
-      "loss": 7.4327,
       "step": 2340
     },
     {
-      "epoch": 4.512428298279159,
-      "grad_norm": 6.376476287841797,
-      "learning_rate": 6.097301890800935e-05,
-      "loss": 7.555,
       "step": 2360
     },
     {
-      "epoch": 4.550669216061186,
-      "grad_norm": 6.2988200187683105,
-      "learning_rate": 6.054811982154238e-05,
-      "loss": 7.5463,
       "step": 2380
     },
     {
-      "epoch": 4.588910133843212,
-      "grad_norm": 5.916903972625732,
-      "learning_rate": 6.012322073507543e-05,
-      "loss": 7.4637,
       "step": 2400
     },
     {
-      "epoch": 4.627151051625239,
-      "grad_norm": 5.896063327789307,
-      "learning_rate": 5.969832164860846e-05,
-      "loss": 7.3857,
       "step": 2420
     },
     {
-      "epoch": 4.665391969407266,
-      "grad_norm": 6.14431619644165,
-      "learning_rate": 5.927342256214149e-05,
-      "loss": 7.4363,
       "step": 2440
     },
     {
-      "epoch": 4.7036328871892925,
-      "grad_norm": 6.2994256019592285,
-      "learning_rate": 5.8848523475674533e-05,
-      "loss": 7.406,
       "step": 2460
     },
     {
-      "epoch": 4.741873804971319,
-      "grad_norm": 6.134793758392334,
-      "learning_rate": 5.8423624389207567e-05,
-      "loss": 7.338,
       "step": 2480
     },
     {
-      "epoch": 4.780114722753346,
-      "grad_norm": 6.245213031768799,
-      "learning_rate": 5.79987253027406e-05,
-      "loss": 7.3912,
       "step": 2500
     },
     {
-      "epoch": 4.818355640535373,
-      "grad_norm": 6.118636131286621,
-      "learning_rate": 5.757382621627364e-05,
-      "loss": 7.3548,
       "step": 2520
     },
     {
-      "epoch": 4.8565965583174,
-      "grad_norm": 6.391002178192139,
-      "learning_rate": 5.714892712980667e-05,
-      "loss": 7.3119,
       "step": 2540
     },
     {
-      "epoch": 4.894837476099426,
-      "grad_norm": 6.539446830749512,
-      "learning_rate": 5.6724028043339705e-05,
-      "loss": 7.2119,
       "step": 2560
     },
     {
-      "epoch": 4.933078393881453,
-      "grad_norm": 6.162653923034668,
-      "learning_rate": 5.6299128956872745e-05,
-      "loss": 7.2505,
       "step": 2580
     },
     {
-      "epoch": 4.97131931166348,
-      "grad_norm": 6.580591678619385,
-      "learning_rate": 5.587422987040578e-05,
-      "loss": 7.1912,
       "step": 2600
     },
     {
-      "epoch": 5.0,
-      "eval_accuracy": 0.6821392532795156,
-      "eval_loss": 6.262951850891113,
-      "eval_runtime": 76.4531,
-      "eval_samples_per_second": 194.433,
-      "eval_steps_per_second": 194.433,
-      "step": 2615
-    },
-    {
-      "epoch": 5.009560229445507,
-      "grad_norm": 6.838705062866211,
-      "learning_rate": 5.544933078393881e-05,
-      "loss": 7.1863,
       "step": 2620
     },
     {
-      "epoch": 5.047801147227533,
-      "grad_norm": 6.260281562805176,
-      "learning_rate": 5.502443169747186e-05,
-      "loss": 7.1259,
       "step": 2640
     },
     {
-      "epoch": 5.08604206500956,
-      "grad_norm": 6.463006496429443,
-      "learning_rate": 5.459953261100489e-05,
-      "loss": 7.1559,
       "step": 2660
     },
     {
-      "epoch": 5.124282982791587,
-      "grad_norm": 6.499185562133789,
-      "learning_rate": 5.4174633524537924e-05,
-      "loss": 7.1318,
       "step": 2680
     },
     {
-      "epoch": 5.162523900573614,
-      "grad_norm": 6.508650302886963,
-      "learning_rate": 5.3749734438070964e-05,
-      "loss": 7.0993,
       "step": 2700
     },
     {
-      "epoch": 5.2007648183556405,
-      "grad_norm": 6.573218822479248,
-      "learning_rate": 5.3324835351604e-05,
-      "loss": 7.0823,
       "step": 2720
     },
     {
-      "epoch": 5.239005736137667,
-      "grad_norm": 6.863697052001953,
-      "learning_rate": 5.289993626513703e-05,
-      "loss": 7.0839,
       "step": 2740
     },
     {
-      "epoch": 5.277246653919694,
-      "grad_norm": 6.305070877075195,
-      "learning_rate": 5.247503717867007e-05,
-      "loss": 7.0723,
       "step": 2760
     },
     {
-      "epoch": 5.315487571701721,
-      "grad_norm": 6.715279579162598,
-      "learning_rate": 5.20501380922031e-05,
-      "loss": 6.9592,
       "step": 2780
     },
     {
-      "epoch": 5.353728489483748,
-      "grad_norm": 6.625701904296875,
-      "learning_rate": 5.1625239005736136e-05,
-      "loss": 7.0275,
       "step": 2800
     },
     {
-      "epoch": 5.3919694072657744,
-      "grad_norm": 6.717496871948242,
-      "learning_rate": 5.120033991926918e-05,
-      "loss": 6.9146,
       "step": 2820
     },
     {
-      "epoch": 5.430210325047801,
-      "grad_norm": 6.500243186950684,
-      "learning_rate": 5.0775440832802216e-05,
-      "loss": 6.9984,
       "step": 2840
     },
     {
-      "epoch": 5.468451242829828,
-      "grad_norm": 6.41347074508667,
-      "learning_rate": 5.035054174633524e-05,
-      "loss": 6.9367,
       "step": 2860
     },
     {
-      "epoch": 5.506692160611855,
-      "grad_norm": 6.83429479598999,
-      "learning_rate": 4.992564265986828e-05,
-      "loss": 6.9997,
       "step": 2880
     },
     {
-      "epoch": 5.544933078393882,
-      "grad_norm": 6.565597057342529,
-      "learning_rate": 4.950074357340132e-05,
-      "loss": 6.9204,
       "step": 2900
     },
     {
-      "epoch": 5.583173996175908,
-      "grad_norm": 6.9456095695495605,
-      "learning_rate": 4.907584448693436e-05,
-      "loss": 6.8926,
       "step": 2920
     },
     {
-      "epoch": 5.621414913957935,
-      "grad_norm": 7.052099704742432,
-      "learning_rate": 4.865094540046739e-05,
-      "loss": 6.8993,
       "step": 2940
     },
     {
-      "epoch": 5.659655831739962,
-      "grad_norm": 7.128490924835205,
-      "learning_rate": 4.822604631400043e-05,
-      "loss": 6.8474,
       "step": 2960
     },
     {
-      "epoch": 5.6978967495219885,
-      "grad_norm": 6.792144298553467,
-      "learning_rate": 4.780114722753346e-05,
-      "loss": 6.8509,
       "step": 2980
     },
     {
-      "epoch": 5.736137667304015,
-      "grad_norm": 6.853285312652588,
-      "learning_rate": 4.73762481410665e-05,
-      "loss": 6.9141,
       "step": 3000
     },
     {
-      "epoch": 5.774378585086042,
-      "grad_norm": 7.153258800506592,
-      "learning_rate": 4.695134905459953e-05,
-      "loss": 6.7391,
       "step": 3020
     },
     {
-      "epoch": 5.812619502868069,
-      "grad_norm": 6.9271321296691895,
-      "learning_rate": 4.6526449968132566e-05,
-      "loss": 6.7554,
       "step": 3040
     },
     {
-      "epoch": 5.850860420650095,
-      "grad_norm": 7.218133926391602,
-      "learning_rate": 4.6101550881665606e-05,
-      "loss": 6.8172,
       "step": 3060
     },
     {
-      "epoch": 5.8891013384321225,
-      "grad_norm": 7.0558695793151855,
-      "learning_rate": 4.5676651795198646e-05,
-      "loss": 6.8442,
       "step": 3080
     },
     {
-      "epoch": 5.927342256214149,
-      "grad_norm": 6.762065887451172,
-      "learning_rate": 4.525175270873168e-05,
-      "loss": 6.696,
       "step": 3100
     },
     {
-      "epoch": 5.965583173996176,
-      "grad_norm": 6.8173604011535645,
-      "learning_rate": 4.482685362226471e-05,
-      "loss": 6.6763,
       "step": 3120
     },
     {
-      "epoch": 6.0,
-      "eval_accuracy": 0.7291624621594349,
-      "eval_loss": 5.7182440757751465,
-      "eval_runtime": 444.003,
-      "eval_samples_per_second": 33.48,
-      "eval_steps_per_second": 33.48,
-      "step": 3138
-    },
-    {
-      "epoch": 6.003824091778203,
-      "grad_norm": 7.1014723777771,
-      "learning_rate": 4.440195453579775e-05,
-      "loss": 6.6927,
       "step": 3140
     },
     {
-      "epoch": 6.042065009560229,
-      "grad_norm": 6.958450794219971,
-      "learning_rate": 4.3977055449330785e-05,
-      "loss": 6.6538,
       "step": 3160
     },
     {
-      "epoch": 6.080305927342256,
-      "grad_norm": 6.920003890991211,
-      "learning_rate": 4.3552156362863825e-05,
-      "loss": 6.5479,
       "step": 3180
     },
     {
-      "epoch": 6.118546845124283,
-      "grad_norm": 7.053244113922119,
-      "learning_rate": 4.312725727639686e-05,
-      "loss": 6.5668,
       "step": 3200
     },
     {
-      "epoch": 6.15678776290631,
-      "grad_norm": 6.9157185554504395,
-      "learning_rate": 4.270235818992989e-05,
-      "loss": 6.6722,
       "step": 3220
     },
     {
-      "epoch": 6.195028680688337,
-      "grad_norm": 7.149935722351074,
-      "learning_rate": 4.227745910346293e-05,
-      "loss": 6.6397,
       "step": 3240
     },
     {
-      "epoch": 6.233269598470363,
-      "grad_norm": 7.318164825439453,
-      "learning_rate": 4.185256001699597e-05,
-      "loss": 6.6041,
       "step": 3260
     },
     {
-      "epoch": 6.27151051625239,
-      "grad_norm": 7.044018268585205,
-      "learning_rate": 4.1427660930529e-05,
-      "loss": 6.5492,
       "step": 3280
     },
     {
-      "epoch": 6.309751434034417,
-      "grad_norm": 7.045164585113525,
-      "learning_rate": 4.1002761844062037e-05,
-      "loss": 6.5679,
       "step": 3300
     },
     {
-      "epoch": 6.347992351816444,
-      "grad_norm": 7.092489242553711,
-      "learning_rate": 4.0577862757595076e-05,
-      "loss": 6.5695,
       "step": 3320
     },
     {
-      "epoch": 6.3862332695984705,
-      "grad_norm": 6.940147399902344,
-      "learning_rate": 4.015296367112811e-05,
-      "loss": 6.4842,
       "step": 3340
     },
     {
-      "epoch": 6.424474187380497,
-      "grad_norm": 7.10172176361084,
-      "learning_rate": 3.972806458466114e-05,
-      "loss": 6.5317,
       "step": 3360
     },
     {
-      "epoch": 6.462715105162524,
-      "grad_norm": 7.129051208496094,
-      "learning_rate": 3.930316549819418e-05,
-      "loss": 6.4702,
       "step": 3380
     },
     {
-      "epoch": 6.500956022944551,
-      "grad_norm": 7.501070499420166,
-      "learning_rate": 3.8878266411727215e-05,
-      "loss": 6.3999,
       "step": 3400
     },
     {
-      "epoch": 6.539196940726577,
-      "grad_norm": 7.325244426727295,
-      "learning_rate": 3.8453367325260255e-05,
-      "loss": 6.4932,
       "step": 3420
     },
     {
-      "epoch": 6.577437858508604,
-      "grad_norm": 7.361093521118164,
-      "learning_rate": 3.802846823879329e-05,
-      "loss": 6.3927,
       "step": 3440
     },
     {
-      "epoch": 6.615678776290631,
-      "grad_norm": 7.228673458099365,
-      "learning_rate": 3.760356915232632e-05,
-      "loss": 6.4861,
       "step": 3460
     },
     {
-      "epoch": 6.653919694072657,
-      "grad_norm": 7.602611064910889,
-      "learning_rate": 3.717867006585936e-05,
-      "loss": 6.4623,
       "step": 3480
     },
     {
-      "epoch": 6.692160611854685,
-      "grad_norm": 7.901960372924805,
-      "learning_rate": 3.6753770979392394e-05,
-      "loss": 6.4282,
       "step": 3500
     },
     {
-      "epoch": 6.730401529636711,
-      "grad_norm": 7.1125383377075195,
-      "learning_rate": 3.6328871892925434e-05,
-      "loss": 6.3799,
       "step": 3520
     },
     {
-      "epoch": 6.768642447418738,
-      "grad_norm": 7.1385884284973145,
-      "learning_rate": 3.590397280645847e-05,
-      "loss": 6.3707,
       "step": 3540
     },
     {
-      "epoch": 6.806883365200765,
-      "grad_norm": 7.548192977905273,
-      "learning_rate": 3.54790737199915e-05,
-      "loss": 6.4388,
       "step": 3560
     },
     {
-      "epoch": 6.845124282982791,
-      "grad_norm": 7.492359161376953,
-      "learning_rate": 3.505417463352454e-05,
-      "loss": 6.4223,
       "step": 3580
     },
     {
-      "epoch": 6.8833652007648185,
-      "grad_norm": 7.575985431671143,
-      "learning_rate": 3.462927554705758e-05,
-      "loss": 6.3552,
       "step": 3600
     },
     {
-      "epoch": 6.921606118546845,
-      "grad_norm": 7.351112365722656,
-      "learning_rate": 3.4204376460590606e-05,
-      "loss": 6.3379,
       "step": 3620
     },
     {
-      "epoch": 6.959847036328872,
-      "grad_norm": 7.33430290222168,
-      "learning_rate": 3.3779477374123646e-05,
-      "loss": 6.3429,
       "step": 3640
     },
     {
-      "epoch": 6.998087954110899,
-      "grad_norm": 7.511825084686279,
-      "learning_rate": 3.3354578287656686e-05,
-      "loss": 6.3112,
       "step": 3660
     },
     {
-      "epoch": 7.0,
-      "eval_accuracy": 0.7632021527077026,
-      "eval_loss": 5.265278339385986,
-      "eval_runtime": 484.395,
-      "eval_samples_per_second": 30.688,
-      "eval_steps_per_second": 30.688,
-      "step": 3661
-    },
-    {
-      "epoch": 7.036328871892925,
-      "grad_norm": 7.424711227416992,
-      "learning_rate": 3.292967920118972e-05,
-      "loss": 6.1764,
       "step": 3680
     },
     {
-      "epoch": 7.074569789674952,
-      "grad_norm": 7.648799896240234,
-      "learning_rate": 3.250478011472275e-05,
-      "loss": 6.2389,
       "step": 3700
     },
     {
-      "epoch": 7.112810707456979,
-      "grad_norm": 7.4450483322143555,
-      "learning_rate": 3.207988102825579e-05,
-      "loss": 6.2506,
       "step": 3720
     },
     {
-      "epoch": 7.151051625239006,
-      "grad_norm": 7.422061443328857,
-      "learning_rate": 3.1654981941788825e-05,
-      "loss": 6.2049,
       "step": 3740
     },
     {
-      "epoch": 7.189292543021033,
-      "grad_norm": 7.345204830169678,
-      "learning_rate": 3.1230082855321864e-05,
-      "loss": 6.2906,
       "step": 3760
     },
     {
-      "epoch": 7.227533460803059,
-      "grad_norm": 7.486473083496094,
-      "learning_rate": 3.08051837688549e-05,
-      "loss": 6.2644,
       "step": 3780
     },
     {
-      "epoch": 7.265774378585086,
-      "grad_norm": 7.317290782928467,
-      "learning_rate": 3.0380284682387934e-05,
-      "loss": 6.2421,
       "step": 3800
     },
     {
-      "epoch": 7.304015296367113,
-      "grad_norm": 7.4384002685546875,
-      "learning_rate": 2.995538559592097e-05,
-      "loss": 6.1406,
       "step": 3820
     },
     {
-      "epoch": 7.342256214149139,
-      "grad_norm": 7.7606000900268555,
-      "learning_rate": 2.9530486509454007e-05,
-      "loss": 6.2031,
       "step": 3840
     },
     {
-      "epoch": 7.3804971319311665,
-      "grad_norm": 7.305050373077393,
-      "learning_rate": 2.910558742298704e-05,
-      "loss": 6.127,
       "step": 3860
     },
     {
-      "epoch": 7.418738049713193,
-      "grad_norm": 7.713500022888184,
-      "learning_rate": 2.868068833652008e-05,
-      "loss": 6.1474,
       "step": 3880
     },
     {
-      "epoch": 7.45697896749522,
-      "grad_norm": 8.028603553771973,
-      "learning_rate": 2.8255789250053116e-05,
-      "loss": 6.1542,
       "step": 3900
     },
     {
-      "epoch": 7.495219885277247,
-      "grad_norm": 7.4730329513549805,
-      "learning_rate": 2.783089016358615e-05,
-      "loss": 6.225,
       "step": 3920
     },
     {
-      "epoch": 7.533460803059273,
-      "grad_norm": 7.52304220199585,
-      "learning_rate": 2.7405991077119186e-05,
-      "loss": 6.1674,
       "step": 3940
     },
     {
-      "epoch": 7.5717017208413,
-      "grad_norm": 7.616427898406982,
-      "learning_rate": 2.6981091990652225e-05,
-      "loss": 6.1169,
       "step": 3960
     },
     {
-      "epoch": 7.609942638623327,
-      "grad_norm": 7.784472465515137,
-      "learning_rate": 2.6556192904185255e-05,
-      "loss": 6.1041,
       "step": 3980
     },
     {
-      "epoch": 7.648183556405353,
-      "grad_norm": 7.819777011871338,
-      "learning_rate": 2.6131293817718295e-05,
-      "loss": 6.1069,
       "step": 4000
     },
     {
-      "epoch": 7.686424474187381,
-      "grad_norm": 7.889120101928711,
-      "learning_rate": 2.5706394731251328e-05,
-      "loss": 5.9985,
       "step": 4020
     },
     {
-      "epoch": 7.724665391969407,
-      "grad_norm": 7.858097076416016,
-      "learning_rate": 2.5281495644784364e-05,
-      "loss": 6.0437,
       "step": 4040
     },
     {
-      "epoch": 7.762906309751434,
-      "grad_norm": 7.739562511444092,
-      "learning_rate": 2.48565965583174e-05,
-      "loss": 6.1376,
       "step": 4060
     },
     {
-      "epoch": 7.801147227533461,
-      "grad_norm": 7.778552532196045,
-      "learning_rate": 2.4431697471850437e-05,
-      "loss": 6.2084,
       "step": 4080
     },
     {
-      "epoch": 7.839388145315487,
-      "grad_norm": 7.536991596221924,
-      "learning_rate": 2.4006798385383474e-05,
-      "loss": 6.0325,
       "step": 4100
     },
     {
-      "epoch": 7.8776290630975145,
-      "grad_norm": 7.846856594085693,
-      "learning_rate": 2.3581899298916507e-05,
-      "loss": 6.098,
       "step": 4120
     },
     {
-      "epoch": 7.915869980879541,
-      "grad_norm": 7.760807991027832,
-      "learning_rate": 2.3157000212449547e-05,
-      "loss": 5.9765,
       "step": 4140
     },
     {
-      "epoch": 7.954110898661568,
-      "grad_norm": 7.827345371246338,
-      "learning_rate": 2.273210112598258e-05,
-      "loss": 5.9915,
       "step": 4160
     },
     {
-      "epoch": 7.992351816443595,
-      "grad_norm": 8.129748344421387,
-      "learning_rate": 2.2307202039515616e-05,
-      "loss": 6.0255,
       "step": 4180
     },
     {
-      "epoch": 8.0,
-      "eval_accuracy": 0.782643794147326,
-      "eval_loss": 4.966301918029785,
-      "eval_runtime": 260.149,
-      "eval_samples_per_second": 57.14,
-      "eval_steps_per_second": 57.14,
-      "step": 4184
-    },
-    {
-      "epoch": 8.030592734225621,
-      "grad_norm": 7.686340808868408,
-      "learning_rate": 2.1882302953048652e-05,
-      "loss": 6.0763,
       "step": 4200
     },
     {
-      "epoch": 8.068833652007648,
-      "grad_norm": 7.666318893432617,
-      "learning_rate": 2.145740386658169e-05,
-      "loss": 5.868,
       "step": 4220
     },
     {
-      "epoch": 8.107074569789676,
-      "grad_norm": 7.686400890350342,
-      "learning_rate": 2.1032504780114722e-05,
-      "loss": 5.8964,
       "step": 4240
     },
     {
-      "epoch": 8.145315487571702,
-      "grad_norm": 7.418490886688232,
-      "learning_rate": 2.0607605693647762e-05,
-      "loss": 5.8408,
       "step": 4260
     },
     {
-      "epoch": 8.183556405353729,
-      "grad_norm": 7.769067287445068,
-      "learning_rate": 2.0182706607180795e-05,
-      "loss": 5.9742,
       "step": 4280
     },
     {
-      "epoch": 8.221797323135755,
-      "grad_norm": 7.915468215942383,
-      "learning_rate": 1.975780752071383e-05,
-      "loss": 5.913,
       "step": 4300
     },
     {
-      "epoch": 8.260038240917781,
-      "grad_norm": 7.884761810302734,
-      "learning_rate": 1.9332908434246868e-05,
-      "loss": 5.8613,
       "step": 4320
     },
     {
-      "epoch": 8.29827915869981,
-      "grad_norm": 7.765011787414551,
-      "learning_rate": 1.8908009347779904e-05,
-      "loss": 5.9791,
       "step": 4340
     },
     {
-      "epoch": 8.336520076481836,
-      "grad_norm": 8.110984802246094,
-      "learning_rate": 1.8483110261312937e-05,
-      "loss": 5.9675,
       "step": 4360
     },
     {
-      "epoch": 8.374760994263863,
-      "grad_norm": 8.114306449890137,
-      "learning_rate": 1.8058211174845974e-05,
-      "loss": 5.9804,
       "step": 4380
     },
     {
-      "epoch": 8.413001912045889,
-      "grad_norm": 7.981202125549316,
-      "learning_rate": 1.763331208837901e-05,
-      "loss": 5.8832,
       "step": 4400
     },
     {
-      "epoch": 8.451242829827915,
-      "grad_norm": 7.628136157989502,
-      "learning_rate": 1.7208413001912046e-05,
-      "loss": 5.9301,
       "step": 4420
     },
     {
-      "epoch": 8.489483747609942,
-      "grad_norm": 7.863382816314697,
-      "learning_rate": 1.6783513915445083e-05,
-      "loss": 5.8983,
       "step": 4440
     },
     {
-      "epoch": 8.52772466539197,
-      "grad_norm": 7.82211971282959,
-      "learning_rate": 1.635861482897812e-05,
-      "loss": 5.8938,
       "step": 4460
     },
     {
-      "epoch": 8.565965583173996,
-      "grad_norm": 8.038976669311523,
-      "learning_rate": 1.5933715742511156e-05,
-      "loss": 5.8945,
       "step": 4480
     },
     {
-      "epoch": 8.604206500956023,
-      "grad_norm": 7.884932518005371,
-      "learning_rate": 1.550881665604419e-05,
-      "loss": 5.8895,
       "step": 4500
     },
     {
-      "epoch": 8.64244741873805,
-      "grad_norm": 7.975419521331787,
-      "learning_rate": 1.5083917569577227e-05,
-      "loss": 5.9617,
       "step": 4520
     },
     {
-      "epoch": 8.680688336520076,
-      "grad_norm": 7.786068916320801,
-      "learning_rate": 1.4659018483110262e-05,
-      "loss": 5.8659,
       "step": 4540
     },
     {
-      "epoch": 8.718929254302104,
-      "grad_norm": 8.130301475524902,
-      "learning_rate": 1.4234119396643298e-05,
-      "loss": 5.9116,
       "step": 4560
     },
     {
-      "epoch": 8.75717017208413,
-      "grad_norm": 8.042682647705078,
-      "learning_rate": 1.3809220310176335e-05,
-      "loss": 5.8536,
       "step": 4580
     },
     {
-      "epoch": 8.795411089866157,
-      "grad_norm": 8.327803611755371,
-      "learning_rate": 1.3384321223709371e-05,
-      "loss": 5.9241,
       "step": 4600
     },
     {
-      "epoch": 8.833652007648183,
-      "grad_norm": 7.880401134490967,
-      "learning_rate": 1.2959422137242406e-05,
-      "loss": 5.864,
       "step": 4620
     },
     {
-      "epoch": 8.87189292543021,
-      "grad_norm": 7.6825127601623535,
-      "learning_rate": 1.253452305077544e-05,
-      "loss": 5.9457,
       "step": 4640
     },
     {
-      "epoch": 8.910133843212238,
-      "grad_norm": 7.971193313598633,
-      "learning_rate": 1.2109623964308479e-05,
-      "loss": 5.8329,
       "step": 4660
     },
     {
-      "epoch": 8.948374760994264,
-      "grad_norm": 8.04354476928711,
-      "learning_rate": 1.1684724877841513e-05,
-      "loss": 5.8671,
       "step": 4680
     },
     {
-      "epoch": 8.98661567877629,
-      "grad_norm": 7.942180633544922,
-      "learning_rate": 1.125982579137455e-05,
-      "loss": 5.8091,
       "step": 4700
     },
     {
-      "epoch": 9.0,
-      "eval_accuracy": 0.7956945845946855,
-      "eval_loss": 4.778744220733643,
-      "eval_runtime": 531.1827,
-      "eval_samples_per_second": 27.985,
-      "eval_steps_per_second": 27.985,
-      "step": 4707
-    },
-    {
-      "epoch": 9.024856596558317,
-      "grad_norm": 7.77038049697876,
-      "learning_rate": 1.0834926704907584e-05,
-      "loss": 5.7978,
       "step": 4720
     },
     {
-      "epoch": 9.063097514340344,
-      "grad_norm": 7.850288391113281,
-      "learning_rate": 1.0410027618440621e-05,
-      "loss": 5.7849,
       "step": 4740
     },
     {
-      "epoch": 9.101338432122372,
-      "grad_norm": 8.032878875732422,
-      "learning_rate": 9.985128531973657e-06,
-      "loss": 5.7891,
       "step": 4760
     },
     {
-      "epoch": 9.139579349904398,
-      "grad_norm": 7.886658668518066,
-      "learning_rate": 9.560229445506692e-06,
-      "loss": 5.781,
       "step": 4780
     },
     {
-      "epoch": 9.177820267686425,
-      "grad_norm": 7.953343868255615,
-      "learning_rate": 9.135330359039729e-06,
-      "loss": 5.8584,
       "step": 4800
     },
     {
-      "epoch": 9.216061185468451,
-      "grad_norm": 7.899537563323975,
-      "learning_rate": 8.710431272572763e-06,
-      "loss": 5.8192,
       "step": 4820
     },
     {
-      "epoch": 9.254302103250478,
-      "grad_norm": 8.269824028015137,
-      "learning_rate": 8.2855321861058e-06,
-      "loss": 5.7122,
       "step": 4840
     },
     {
-      "epoch": 9.292543021032504,
-      "grad_norm": 7.824770450592041,
-      "learning_rate": 7.860633099638836e-06,
-      "loss": 5.7634,
       "step": 4860
     },
     {
-      "epoch": 9.330783938814532,
-      "grad_norm": 7.953860759735107,
-      "learning_rate": 7.435734013171872e-06,
-      "loss": 5.8083,
       "step": 4880
     },
     {
-      "epoch": 9.369024856596559,
-      "grad_norm": 8.25514030456543,
-      "learning_rate": 7.010834926704908e-06,
-      "loss": 5.8012,
       "step": 4900
     },
     {
-      "epoch": 9.407265774378585,
-      "grad_norm": 8.2761869430542,
-      "learning_rate": 6.585935840237943e-06,
-      "loss": 5.7938,
       "step": 4920
     },
     {
-      "epoch": 9.445506692160611,
-      "grad_norm": 7.865163803100586,
-      "learning_rate": 6.161036753770979e-06,
-      "loss": 5.6735,
       "step": 4940
     },
     {
-      "epoch": 9.483747609942638,
-      "grad_norm": 8.172937393188477,
-      "learning_rate": 5.736137667304015e-06,
-      "loss": 5.7914,
       "step": 4960
     },
     {
-      "epoch": 9.521988527724666,
-      "grad_norm": 8.558911323547363,
-      "learning_rate": 5.311238580837051e-06,
-      "loss": 5.7702,
       "step": 4980
     },
     {
-      "epoch": 9.560229445506693,
-      "grad_norm": 8.265515327453613,
-      "learning_rate": 4.886339494370088e-06,
-      "loss": 5.7283,
       "step": 5000
     },
     {
-      "epoch": 9.598470363288719,
-      "grad_norm": 8.17795467376709,
-      "learning_rate": 4.461440407903123e-06,
-      "loss": 5.8007,
       "step": 5020
     },
     {
-      "epoch": 9.636711281070745,
-      "grad_norm": 8.109586715698242,
-      "learning_rate": 4.036541321436159e-06,
-      "loss": 5.8121,
       "step": 5040
     },
     {
-      "epoch": 9.674952198852772,
-      "grad_norm": 7.911646842956543,
-      "learning_rate": 3.6116422349691954e-06,
-      "loss": 5.789,
       "step": 5060
     },
     {
-      "epoch": 9.7131931166348,
-      "grad_norm": 8.030941009521484,
-      "learning_rate": 3.186743148502231e-06,
-      "loss": 5.7266,
       "step": 5080
     },
     {
-      "epoch": 9.751434034416826,
-      "grad_norm": 8.059958457946777,
-      "learning_rate": 2.7618440620352666e-06,
-      "loss": 5.761,
       "step": 5100
     },
     {
-      "epoch": 9.789674952198853,
-      "grad_norm": 8.002403259277344,
-      "learning_rate": 2.3369449755683026e-06,
-      "loss": 5.7338,
       "step": 5120
     },
     {
-      "epoch": 9.82791586998088,
-      "grad_norm": 8.306962966918945,
-      "learning_rate": 1.9120458891013386e-06,
-      "loss": 5.7088,
       "step": 5140
     },
     {
-      "epoch": 9.866156787762906,
-      "grad_norm": 8.018095970153809,
-      "learning_rate": 1.4871468026343744e-06,
-      "loss": 5.6973,
       "step": 5160
     },
     {
-      "epoch": 9.904397705544934,
-      "grad_norm": 8.168917655944824,
-      "learning_rate": 1.0622477161674104e-06,
-      "loss": 5.6422,
       "step": 5180
     },
     {
-      "epoch": 9.94263862332696,
-      "grad_norm": 7.939206123352051,
-      "learning_rate": 6.373486297004462e-07,
-      "loss": 5.7399,
       "step": 5200
     },
     {
-      "epoch": 9.980879541108987,
-      "grad_norm": 7.970940589904785,
-      "learning_rate": 2.1244954323348205e-07,
-      "loss": 5.7269,
       "step": 5220
     },
     {
       "epoch": 10.0,
-      "eval_accuracy": 0.8030272452068618,
-      "eval_loss": 4.700281620025635,
-      "eval_runtime": 552.5641,
-      "eval_samples_per_second": 26.902,
-      "eval_steps_per_second": 26.902,
-      "step": 5230
     },
     {
       "epoch": 10.0,
-      "step": 5230,
-      "total_flos": 2.49073133395968e+18,
-      "train_loss": 7.888943860726876,
-      "train_runtime": 28748.4048,
-      "train_samples_per_second": 46.534,
-      "train_steps_per_second": 0.182
     }
   ],
   "logging_steps": 20,
-  "max_steps": 5230,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
@@ -1952,7 +2134,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.49073133395968e+18,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.9757901815736382,
+  "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/pretrain/c512-aam-len3-bs256-lr5e-4/checkpoint-3450",
   "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 5750,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.034782608695652174,
+      "grad_norm": 6.155358791351318,
+      "learning_rate": 1.739130434782609e-05,
+      "loss": 13.2026,
       "step": 20
     },
     {
+      "epoch": 0.06956521739130435,
+      "grad_norm": 5.816741943359375,
+      "learning_rate": 3.478260869565218e-05,
+      "loss": 13.1252,
       "step": 40
     },
     {
+      "epoch": 0.10434782608695652,
+      "grad_norm": 5.273156642913818,
+      "learning_rate": 5.2173913043478256e-05,
+      "loss": 13.0001,
       "step": 60
     },
     {
+      "epoch": 0.1391304347826087,
+      "grad_norm": 4.86655330657959,
+      "learning_rate": 6.956521739130436e-05,
+      "loss": 12.8639,
       "step": 80
     },
     {
+      "epoch": 0.17391304347826086,
+      "grad_norm": 4.438321113586426,
+      "learning_rate": 8.695652173913044e-05,
+      "loss": 12.7376,
       "step": 100
     },
     {
+      "epoch": 0.20869565217391303,
+      "grad_norm": 4.164404392242432,
+      "learning_rate": 0.00010434782608695651,
+      "loss": 12.5722,
       "step": 120
     },
     {
+      "epoch": 0.24347826086956523,
+      "grad_norm": 3.858990430831909,
+      "learning_rate": 0.00012173913043478261,
+      "loss": 12.4229,
       "step": 140
     },
     {
+      "epoch": 0.2782608695652174,
+      "grad_norm": 3.6574394702911377,
+      "learning_rate": 0.0001391304347826087,
+      "loss": 12.2581,
       "step": 160
     },
     {
+      "epoch": 0.3130434782608696,
+      "grad_norm": 3.3787951469421387,
+      "learning_rate": 0.0001565217391304348,
+      "loss": 12.0753,
       "step": 180
     },
     {
+      "epoch": 0.34782608695652173,
+      "grad_norm": 3.323820114135742,
+      "learning_rate": 0.00017391304347826088,
+      "loss": 11.9261,
       "step": 200
     },
     {
+      "epoch": 0.3826086956521739,
+      "grad_norm": 3.247619152069092,
+      "learning_rate": 0.00019130434782608697,
+      "loss": 11.7417,
       "step": 220
     },
     {
+      "epoch": 0.41739130434782606,
+      "grad_norm": 3.2254152297973633,
+      "learning_rate": 0.00020869565217391303,
+      "loss": 11.5771,
       "step": 240
     },
     {
+      "epoch": 0.45217391304347826,
+      "grad_norm": 3.1803464889526367,
+      "learning_rate": 0.00022608695652173914,
+      "loss": 11.3969,
       "step": 260
     },
     {
+      "epoch": 0.48695652173913045,
+      "grad_norm": 3.41034197807312,
+      "learning_rate": 0.00024347826086956522,
+      "loss": 11.2684,
       "step": 280
     },
     {
+      "epoch": 0.5217391304347826,
+      "grad_norm": 3.246403217315674,
+      "learning_rate": 0.0002608695652173913,
+      "loss": 11.0744,
       "step": 300
     },
     {
+      "epoch": 0.5565217391304348,
+      "grad_norm": 3.202021360397339,
+      "learning_rate": 0.0002782608695652174,
+      "loss": 10.8929,
       "step": 320
     },
     {
+      "epoch": 0.591304347826087,
+      "grad_norm": 3.1231367588043213,
+      "learning_rate": 0.0002956521739130435,
+      "loss": 10.7468,
       "step": 340
     },
     {
+      "epoch": 0.6260869565217392,
+      "grad_norm": 3.1820390224456787,
+      "learning_rate": 0.0003130434782608696,
+      "loss": 10.606,
       "step": 360
     },
     {
+      "epoch": 0.6608695652173913,
+      "grad_norm": 3.2470555305480957,
+      "learning_rate": 0.0003304347826086956,
+      "loss": 10.4871,
       "step": 380
     },
     {
+      "epoch": 0.6956521739130435,
+      "grad_norm": 3.2452709674835205,
+      "learning_rate": 0.00034782608695652176,
+      "loss": 10.2836,
       "step": 400
     },
     {
+      "epoch": 0.7304347826086957,
+      "grad_norm": 3.203894853591919,
+      "learning_rate": 0.00036521739130434785,
+      "loss": 10.1154,
       "step": 420
     },
     {
+      "epoch": 0.7652173913043478,
+      "grad_norm": 3.269970178604126,
+      "learning_rate": 0.00038260869565217393,
+      "loss": 9.9283,
       "step": 440
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 3.261357545852661,
+      "learning_rate": 0.0004,
+      "loss": 9.8674,
       "step": 460
     },
     {
+      "epoch": 0.8347826086956521,
+      "grad_norm": 3.393953323364258,
+      "learning_rate": 0.00041739130434782605,
+      "loss": 9.6224,
       "step": 480
     },
     {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 3.321411609649658,
+      "learning_rate": 0.0004347826086956522,
+      "loss": 9.524,
       "step": 500
     },
     {
+      "epoch": 0.9043478260869565,
+      "grad_norm": 3.3886823654174805,
+      "learning_rate": 0.0004521739130434783,
+      "loss": 9.384,
       "step": 520
     },
     {
+      "epoch": 0.9391304347826087,
+      "grad_norm": 3.4735491275787354,
+      "learning_rate": 0.00046956521739130436,
+      "loss": 9.1767,
       "step": 540
     },
     {
+      "epoch": 0.9739130434782609,
+      "grad_norm": 3.416966676712036,
+      "learning_rate": 0.00048695652173913045,
+      "loss": 9.047,
       "step": 560
     },
     {
+      "epoch": 1.0,
+      "eval_accuracy": 0.43039677202420984,
+      "eval_loss": 8.366157531738281,
+      "eval_runtime": 42.3364,
+      "eval_samples_per_second": 35.123,
+      "eval_steps_per_second": 35.123,
+      "step": 575
+    },
+    {
+      "epoch": 1.008695652173913,
+      "grad_norm": 3.446899890899658,
+      "learning_rate": 0.0004995169082125604,
+      "loss": 8.8835,
       "step": 580
     },
     {
+      "epoch": 1.0434782608695652,
+      "grad_norm": 3.5842247009277344,
+      "learning_rate": 0.0004975845410628019,
+      "loss": 8.6436,
       "step": 600
     },
     {
+      "epoch": 1.0782608695652174,
+      "grad_norm": 3.5029306411743164,
+      "learning_rate": 0.0004956521739130435,
+      "loss": 8.4775,
       "step": 620
     },
     {
+      "epoch": 1.1130434782608696,
+      "grad_norm": 3.5451033115386963,
+      "learning_rate": 0.0004937198067632851,
+      "loss": 8.322,
       "step": 640
     },
     {
+      "epoch": 1.1478260869565218,
+      "grad_norm": 3.5502634048461914,
+      "learning_rate": 0.0004917874396135266,
+      "loss": 8.1264,
       "step": 660
     },
     {
+      "epoch": 1.182608695652174,
+      "grad_norm": 3.607395648956299,
+      "learning_rate": 0.0004898550724637681,
+      "loss": 7.9905,
       "step": 680
     },
     {
+      "epoch": 1.2173913043478262,
+      "grad_norm": 3.6438565254211426,
+      "learning_rate": 0.0004879227053140097,
+      "loss": 7.8252,
       "step": 700
     },
     {
+      "epoch": 1.2521739130434781,
+      "grad_norm": 3.656705141067505,
+      "learning_rate": 0.0004859903381642512,
+      "loss": 7.7737,
       "step": 720
     },
     {
+      "epoch": 1.2869565217391306,
+      "grad_norm": 3.7424328327178955,
+      "learning_rate": 0.0004840579710144928,
+      "loss": 7.5822,
       "step": 740
     },
     {
+      "epoch": 1.3217391304347825,
+      "grad_norm": 3.673156261444092,
+      "learning_rate": 0.0004821256038647343,
+      "loss": 7.4563,
       "step": 760
     },
     {
+      "epoch": 1.3565217391304347,
+      "grad_norm": 3.6774067878723145,
+      "learning_rate": 0.0004801932367149758,
+      "loss": 7.3379,
       "step": 780
     },
     {
+      "epoch": 1.391304347826087,
+      "grad_norm": 3.811283826828003,
+      "learning_rate": 0.0004782608695652174,
+      "loss": 7.1559,
       "step": 800
     },
     {
+      "epoch": 1.4260869565217391,
+      "grad_norm": 3.7899839878082275,
+      "learning_rate": 0.00047632850241545894,
+      "loss": 7.0834,
       "step": 820
     },
     {
+      "epoch": 1.4608695652173913,
+      "grad_norm": 3.583247423171997,
+      "learning_rate": 0.00047439613526570047,
+      "loss": 6.9172,
       "step": 840
     },
     {
+      "epoch": 1.4956521739130435,
+      "grad_norm": 3.8192331790924072,
+      "learning_rate": 0.00047246376811594206,
+      "loss": 6.7251,
       "step": 860
     },
     {
+      "epoch": 1.5304347826086957,
+      "grad_norm": 3.8098299503326416,
+      "learning_rate": 0.0004705314009661836,
+      "loss": 6.7871,
       "step": 880
     },
     {
+      "epoch": 1.5652173913043477,
+      "grad_norm": 3.7341325283050537,
+      "learning_rate": 0.0004685990338164252,
+      "loss": 6.6103,
       "step": 900
     },
     {
+      "epoch": 1.6,
+      "grad_norm": 3.9190495014190674,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 6.4507,
       "step": 920
     },
     {
+      "epoch": 1.634782608695652,
+      "grad_norm": 3.9456422328948975,
+      "learning_rate": 0.0004647342995169082,
+      "loss": 6.3619,
       "step": 940
     },
     {
+      "epoch": 1.6695652173913045,
+      "grad_norm": 3.899134874343872,
+      "learning_rate": 0.0004628019323671498,
+      "loss": 6.2957,
       "step": 960
     },
     {
+      "epoch": 1.7043478260869565,
+      "grad_norm": 3.878810167312622,
+      "learning_rate": 0.0004608695652173913,
+      "loss": 6.1362,
       "step": 980
     },
     {
+      "epoch": 1.7391304347826086,
+      "grad_norm": 3.9270784854888916,
+      "learning_rate": 0.00045893719806763285,
+      "loss": 5.9814,
       "step": 1000
     },
     {
+      "epoch": 1.7739130434782608,
+      "grad_norm": 3.8247644901275635,
+      "learning_rate": 0.00045700483091787444,
+      "loss": 5.9095,
       "step": 1020
     },
     {
+      "epoch": 1.808695652173913,
+      "grad_norm": 3.8870134353637695,
+      "learning_rate": 0.000455072463768116,
+      "loss": 5.7793,
       "step": 1040
     },
     {
+      "epoch": 1.8434782608695652,
+      "grad_norm": 3.9533441066741943,
+      "learning_rate": 0.00045314009661835745,
+      "loss": 5.7754,
       "step": 1060
     },
     {
+      "epoch": 1.8782608695652174,
+      "grad_norm": 3.9928998947143555,
+      "learning_rate": 0.00045120772946859904,
+      "loss": 5.5886,
       "step": 1080
     },
     {
+      "epoch": 1.9130434782608696,
+      "grad_norm": 4.030064582824707,
+      "learning_rate": 0.0004492753623188406,
+      "loss": 5.5482,
       "step": 1100
     },
     {
+      "epoch": 1.9478260869565216,
+      "grad_norm": 3.961806297302246,
+      "learning_rate": 0.0004473429951690821,
+      "loss": 5.4807,
       "step": 1120
     },
     {
+      "epoch": 1.982608695652174,
+      "grad_norm": 4.003119945526123,
+      "learning_rate": 0.0004454106280193237,
+      "loss": 5.3508,
       "step": 1140
     },
     {
+      "epoch": 2.0,
+      "eval_accuracy": 0.8190988567585743,
+      "eval_loss": 4.025164604187012,
+      "eval_runtime": 42.7144,
+      "eval_samples_per_second": 34.813,
+      "eval_steps_per_second": 34.813,
+      "step": 1150
+    },
+    {
+      "epoch": 2.017391304347826,
+      "grad_norm": 3.958116292953491,
+      "learning_rate": 0.00044347826086956523,
+      "loss": 5.1229,
       "step": 1160
     },
     {
+      "epoch": 2.0521739130434784,
+      "grad_norm": 3.864279270172119,
+      "learning_rate": 0.00044154589371980677,
+      "loss": 4.8146,
       "step": 1180
     },
     {
+      "epoch": 2.0869565217391304,
+      "grad_norm": 4.045077323913574,
+      "learning_rate": 0.0004396135265700483,
+      "loss": 4.8843,
       "step": 1200
     },
     {
+      "epoch": 2.121739130434783,
+      "grad_norm": 4.061978816986084,
+      "learning_rate": 0.00043768115942028983,
+      "loss": 4.8078,
       "step": 1220
     },
     {
+      "epoch": 2.1565217391304348,
+      "grad_norm": 4.040159225463867,
+      "learning_rate": 0.0004357487922705314,
+      "loss": 4.6812,
       "step": 1240
     },
     {
+      "epoch": 2.1913043478260867,
+      "grad_norm": 4.234623908996582,
+      "learning_rate": 0.00043381642512077296,
+      "loss": 4.6701,
       "step": 1260
     },
     {
+      "epoch": 2.226086956521739,
+      "grad_norm": 4.030038356781006,
+      "learning_rate": 0.0004318840579710145,
+      "loss": 4.6221,
       "step": 1280
     },
     {
+      "epoch": 2.260869565217391,
+      "grad_norm": 3.9954497814178467,
+      "learning_rate": 0.0004299516908212561,
+      "loss": 4.5647,
       "step": 1300
     },
     {
+      "epoch": 2.2956521739130435,
+      "grad_norm": 4.188636779785156,
+      "learning_rate": 0.0004280193236714976,
+      "loss": 4.4502,
       "step": 1320
     },
     {
+      "epoch": 2.3304347826086955,
+      "grad_norm": 4.185456275939941,
+      "learning_rate": 0.00042608695652173915,
+      "loss": 4.359,
       "step": 1340
     },
     {
+      "epoch": 2.365217391304348,
+      "grad_norm": 4.123263359069824,
+      "learning_rate": 0.0004241545893719807,
+      "loss": 4.2863,
       "step": 1360
     },
     {
+      "epoch": 2.4,
+      "grad_norm": 4.194387435913086,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 4.3354,
       "step": 1380
     },
     {
+      "epoch": 2.4347826086956523,
+      "grad_norm": 4.065763473510742,
+      "learning_rate": 0.00042028985507246375,
+      "loss": 4.2176,
       "step": 1400
     },
     {
+      "epoch": 2.4695652173913043,
+      "grad_norm": 4.120363712310791,
+      "learning_rate": 0.00041835748792270534,
+      "loss": 4.0597,
       "step": 1420
     },
     {
+      "epoch": 2.5043478260869563,
+      "grad_norm": 4.3197174072265625,
+      "learning_rate": 0.00041642512077294687,
+      "loss": 4.028,
       "step": 1440
     },
     {
+      "epoch": 2.5391304347826087,
+      "grad_norm": 4.2683610916137695,
+      "learning_rate": 0.0004144927536231884,
+      "loss": 3.9833,
       "step": 1460
     },
     {
+      "epoch": 2.573913043478261,
+      "grad_norm": 4.15448522567749,
+      "learning_rate": 0.00041256038647343,
+      "loss": 4.0065,
       "step": 1480
     },
     {
+      "epoch": 2.608695652173913,
+      "grad_norm": 4.348177433013916,
+      "learning_rate": 0.0004106280193236715,
+      "loss": 3.8134,
       "step": 1500
     },
     {
+      "epoch": 2.643478260869565,
+      "grad_norm": 4.100021839141846,
+      "learning_rate": 0.00040869565217391306,
+      "loss": 3.8548,
       "step": 1520
     },
     {
+      "epoch": 2.6782608695652175,
+      "grad_norm": 4.344174385070801,
+      "learning_rate": 0.0004067632850241546,
+      "loss": 3.7814,
       "step": 1540
     },
     {
+      "epoch": 2.7130434782608694,
+      "grad_norm": 4.240079402923584,
+      "learning_rate": 0.00040483091787439613,
+      "loss": 3.7578,
       "step": 1560
     },
     {
+      "epoch": 2.747826086956522,
+      "grad_norm": 4.468689918518066,
+      "learning_rate": 0.0004028985507246377,
+      "loss": 3.7331,
       "step": 1580
     },
     {
+      "epoch": 2.782608695652174,
+      "grad_norm": 4.28464937210083,
+      "learning_rate": 0.00040096618357487925,
+      "loss": 3.6396,
       "step": 1600
     },
     {
+      "epoch": 2.8173913043478263,
+      "grad_norm": 4.166805744171143,
+      "learning_rate": 0.0003990338164251208,
+      "loss": 3.5799,
       "step": 1620
     },
     {
+      "epoch": 2.8521739130434782,
+      "grad_norm": 4.237683296203613,
+      "learning_rate": 0.0003971014492753624,
+      "loss": 3.4734,
       "step": 1640
     },
     {
+      "epoch": 2.8869565217391306,
+      "grad_norm": 4.153097152709961,
+      "learning_rate": 0.00039516908212560385,
+      "loss": 3.5183,
       "step": 1660
     },
     {
+      "epoch": 2.9217391304347826,
+      "grad_norm": 4.2313947677612305,
+      "learning_rate": 0.0003932367149758454,
+      "loss": 3.3963,
       "step": 1680
     },
     {
+      "epoch": 2.9565217391304346,
+      "grad_norm": 3.992475748062134,
+      "learning_rate": 0.000391304347826087,
+      "loss": 3.3081,
       "step": 1700
     },
     {
+      "epoch": 2.991304347826087,
+      "grad_norm": 4.4731059074401855,
+      "learning_rate": 0.0003893719806763285,
+      "loss": 3.3124,
       "step": 1720
     },
     {
+      "epoch": 3.0,
+      "eval_accuracy": 0.9260255548083389,
+      "eval_loss": 2.1082653999328613,
+      "eval_runtime": 22.1676,
+      "eval_samples_per_second": 67.08,
+      "eval_steps_per_second": 67.08,
+      "step": 1725
+    },
+    {
+      "epoch": 3.026086956521739,
+      "grad_norm": 4.272000312805176,
+      "learning_rate": 0.00038743961352657004,
+      "loss": 3.1247,
       "step": 1740
     },
     {
+      "epoch": 3.0608695652173914,
+      "grad_norm": 4.102330207824707,
+      "learning_rate": 0.00038550724637681163,
+      "loss": 3.1064,
       "step": 1760
     },
     {
+      "epoch": 3.0956521739130434,
+      "grad_norm": 4.381846904754639,
+      "learning_rate": 0.00038357487922705317,
+      "loss": 2.9371,
       "step": 1780
     },
     {
+      "epoch": 3.130434782608696,
+      "grad_norm": 4.1588921546936035,
+      "learning_rate": 0.00038164251207729465,
+      "loss": 2.9355,
       "step": 1800
     },
     {
+      "epoch": 3.1652173913043478,
+      "grad_norm": 4.279609203338623,
+      "learning_rate": 0.00037971014492753623,
+      "loss": 2.8545,
       "step": 1820
     },
     {
+      "epoch": 3.2,
+      "grad_norm": 4.240756988525391,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 2.8096,
       "step": 1840
     },
     {
+      "epoch": 3.234782608695652,
+      "grad_norm": 4.11091947555542,
+      "learning_rate": 0.00037584541062801936,
+      "loss": 2.8138,
       "step": 1860
     },
     {
+      "epoch": 3.269565217391304,
+      "grad_norm": 4.078794479370117,
+      "learning_rate": 0.0003739130434782609,
+      "loss": 2.7417,
       "step": 1880
     },
     {
+      "epoch": 3.3043478260869565,
+      "grad_norm": 4.368116855621338,
+      "learning_rate": 0.0003719806763285024,
+      "loss": 2.7937,
       "step": 1900
     },
     {
+      "epoch": 3.3391304347826085,
+      "grad_norm": 4.044319152832031,
+      "learning_rate": 0.000370048309178744,
+      "loss": 2.7361,
       "step": 1920
     },
     {
+      "epoch": 3.373913043478261,
+      "grad_norm": 4.314040184020996,
+      "learning_rate": 0.0003681159420289855,
+      "loss": 2.7054,
       "step": 1940
     },
     {
+      "epoch": 3.408695652173913,
+      "grad_norm": 4.185855388641357,
+      "learning_rate": 0.000366183574879227,
+      "loss": 2.6682,
       "step": 1960
     },
     {
+      "epoch": 3.4434782608695653,
+      "grad_norm": 4.433622360229492,
+      "learning_rate": 0.0003642512077294686,
+      "loss": 2.6644,
       "step": 1980
     },
     {
+      "epoch": 3.4782608695652173,
+      "grad_norm": 4.048947811126709,
+      "learning_rate": 0.00036231884057971015,
+      "loss": 2.618,
       "step": 2000
     },
     {
+      "epoch": 3.5130434782608697,
+      "grad_norm": 4.145406246185303,
+      "learning_rate": 0.0003603864734299517,
+      "loss": 2.5982,
       "step": 2020
     },
     {
+      "epoch": 3.5478260869565217,
+      "grad_norm": 4.2812910079956055,
+      "learning_rate": 0.00035845410628019327,
+      "loss": 2.6138,
       "step": 2040
     },
     {
+      "epoch": 3.5826086956521737,
+      "grad_norm": 4.400162220001221,
+      "learning_rate": 0.0003565217391304348,
+      "loss": 2.5039,
       "step": 2060
     },
     {
+      "epoch": 3.617391304347826,
+      "grad_norm": 4.217800617218018,
+      "learning_rate": 0.0003545893719806763,
+      "loss": 2.5249,
       "step": 2080
     },
     {
+      "epoch": 3.6521739130434785,
+      "grad_norm": 4.076215744018555,
+      "learning_rate": 0.0003526570048309179,
+      "loss": 2.4547,
       "step": 2100
     },
     {
+      "epoch": 3.6869565217391305,
+      "grad_norm": 4.139514446258545,
+      "learning_rate": 0.0003507246376811594,
+      "loss": 2.4315,
       "step": 2120
     },
     {
+      "epoch": 3.7217391304347824,
+      "grad_norm": 4.118022918701172,
+      "learning_rate": 0.00034879227053140094,
+      "loss": 2.3836,
       "step": 2140
     },
     {
+      "epoch": 3.756521739130435,
+      "grad_norm": 4.137601852416992,
+      "learning_rate": 0.00034685990338164253,
+      "loss": 2.3284,
       "step": 2160
     },
     {
+      "epoch": 3.791304347826087,
+      "grad_norm": 4.023979663848877,
+      "learning_rate": 0.00034492753623188406,
+      "loss": 2.3095,
       "step": 2180
     },
     {
+      "epoch": 3.8260869565217392,
+      "grad_norm": 4.042725086212158,
+      "learning_rate": 0.00034299516908212565,
+      "loss": 2.305,
       "step": 2200
     },
     {
+      "epoch": 3.860869565217391,
+      "grad_norm": 4.265875339508057,
+      "learning_rate": 0.0003410628019323672,
+      "loss": 2.3237,
       "step": 2220
     },
     {
+      "epoch": 3.8956521739130436,
+      "grad_norm": 4.205041408538818,
+      "learning_rate": 0.00033913043478260867,
+      "loss": 2.335,
       "step": 2240
     },
     {
+      "epoch": 3.9304347826086956,
+      "grad_norm": 4.1344709396362305,
+      "learning_rate": 0.00033719806763285025,
+      "loss": 2.2341,
       "step": 2260
     },
     {
+      "epoch": 3.965217391304348,
+      "grad_norm": 4.247790813446045,
+      "learning_rate": 0.0003352657004830918,
+      "loss": 2.251,
       "step": 2280
     },
     {
+      "epoch": 4.0,
+      "grad_norm": 4.859626770019531,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 2.3212,
+      "step": 2300
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.9435104236718225,
+      "eval_loss": 1.2223739624023438,
+      "eval_runtime": 14.8513,
+      "eval_samples_per_second": 100.126,
+      "eval_steps_per_second": 100.126,
       "step": 2300
     },
     {
+      "epoch": 4.034782608695652,
+      "grad_norm": 4.098020553588867,
+      "learning_rate": 0.0003314009661835749,
+      "loss": 1.9133,
       "step": 2320
     },
     {
+      "epoch": 4.069565217391304,
+      "grad_norm": 4.198029041290283,
+      "learning_rate": 0.00032946859903381644,
+      "loss": 1.9814,
       "step": 2340
     },
     {
+      "epoch": 4.104347826086957,
+      "grad_norm": 3.960844039916992,
+      "learning_rate": 0.000327536231884058,
+      "loss": 1.9505,
       "step": 2360
     },
     {
+      "epoch": 4.139130434782609,
+      "grad_norm": 4.0190300941467285,
+      "learning_rate": 0.0003256038647342995,
+      "loss": 1.8815,
       "step": 2380
     },
     {
+      "epoch": 4.173913043478261,
+      "grad_norm": 4.040708541870117,
+      "learning_rate": 0.00032367149758454105,
+      "loss": 1.8365,
       "step": 2400
     },
     {
+      "epoch": 4.208695652173913,
+      "grad_norm": 4.077364444732666,
+      "learning_rate": 0.0003217391304347826,
+      "loss": 1.84,
       "step": 2420
     },
     {
+      "epoch": 4.243478260869566,
+      "grad_norm": 4.267309188842773,
+      "learning_rate": 0.0003199033816425121,
+      "loss": 1.8864,
       "step": 2440
     },
     {
+      "epoch": 4.278260869565218,
+      "grad_norm": 3.978663921356201,
+      "learning_rate": 0.00031797101449275363,
+      "loss": 1.9015,
       "step": 2460
     },
     {
+      "epoch": 4.3130434782608695,
+      "grad_norm": 4.089256763458252,
+      "learning_rate": 0.0003160386473429952,
+      "loss": 1.8388,
       "step": 2480
     },
     {
+      "epoch": 4.3478260869565215,
+      "grad_norm": 3.9317057132720947,
+      "learning_rate": 0.0003141062801932367,
+      "loss": 1.7845,
       "step": 2500
     },
     {
+      "epoch": 4.3826086956521735,
+      "grad_norm": 3.9738080501556396,
+      "learning_rate": 0.00031217391304347823,
+      "loss": 1.7725,
       "step": 2520
     },
     {
+      "epoch": 4.417391304347826,
+      "grad_norm": 4.232215881347656,
+      "learning_rate": 0.0003102415458937198,
+      "loss": 1.852,
       "step": 2540
     },
     {
+      "epoch": 4.452173913043478,
+      "grad_norm": 4.050131797790527,
+      "learning_rate": 0.00030830917874396136,
+      "loss": 1.8234,
       "step": 2560
     },
     {
+      "epoch": 4.48695652173913,
+      "grad_norm": 4.217935085296631,
+      "learning_rate": 0.0003063768115942029,
+      "loss": 1.8148,
       "step": 2580
     },
     {
+      "epoch": 4.521739130434782,
+      "grad_norm": 3.9807074069976807,
+      "learning_rate": 0.0003044444444444445,
+      "loss": 1.7134,
       "step": 2600
     },
     {
+      "epoch": 4.556521739130435,
+      "grad_norm": 4.05940580368042,
+      "learning_rate": 0.000302512077294686,
+      "loss": 1.6752,
       "step": 2620
     },
     {
+      "epoch": 4.591304347826087,
+      "grad_norm": 4.454566955566406,
+      "learning_rate": 0.00030057971014492755,
+      "loss": 1.8413,
       "step": 2640
     },
     {
+      "epoch": 4.626086956521739,
+      "grad_norm": 4.144088268280029,
+      "learning_rate": 0.0002986473429951691,
+      "loss": 1.7948,
       "step": 2660
     },
     {
+      "epoch": 4.660869565217391,
+      "grad_norm": 3.940176010131836,
+      "learning_rate": 0.0002967149758454106,
+      "loss": 1.7468,
       "step": 2680
     },
     {
+      "epoch": 4.695652173913043,
+      "grad_norm": 4.198675632476807,
+      "learning_rate": 0.0002948792270531401,
+      "loss": 1.709,
       "step": 2700
     },
     {
+      "epoch": 4.730434782608696,
+      "grad_norm": 3.976001501083374,
+      "learning_rate": 0.00029294685990338167,
+      "loss": 1.6506,
       "step": 2720
     },
     {
+      "epoch": 4.765217391304348,
+      "grad_norm": 4.033059120178223,
+      "learning_rate": 0.0002910144927536232,
+      "loss": 1.7042,
       "step": 2740
     },
     {
+      "epoch": 4.8,
+      "grad_norm": 4.062041759490967,
+      "learning_rate": 0.0002890821256038648,
+      "loss": 1.6795,
       "step": 2760
     },
     {
+      "epoch": 4.834782608695652,
+      "grad_norm": 3.988589286804199,
+      "learning_rate": 0.00028714975845410627,
+      "loss": 1.7029,
       "step": 2780
     },
     {
+      "epoch": 4.869565217391305,
+      "grad_norm": 4.16325044631958,
+      "learning_rate": 0.0002852173913043478,
+      "loss": 1.6641,
       "step": 2800
     },
     {
+      "epoch": 4.904347826086957,
+      "grad_norm": 4.323537349700928,
+      "learning_rate": 0.0002832850241545894,
+      "loss": 1.6953,
       "step": 2820
     },
     {
+      "epoch": 4.939130434782609,
+      "grad_norm": 3.8293144702911377,
+      "learning_rate": 0.0002813526570048309,
+      "loss": 1.5863,
       "step": 2840
     },
     {
+      "epoch": 4.973913043478261,
+      "grad_norm": 3.8955535888671875,
+      "learning_rate": 0.00027942028985507246,
+      "loss": 1.6276,
       "step": 2860
     },
     {
+      "epoch": 5.0,
+      "eval_accuracy": 0.9677202420981843,
+      "eval_loss": 0.8229038715362549,
+      "eval_runtime": 88.6744,
+      "eval_samples_per_second": 16.769,
+      "eval_steps_per_second": 16.769,
+      "step": 2875
+    },
+    {
+      "epoch": 5.008695652173913,
+      "grad_norm": 3.8480091094970703,
+      "learning_rate": 0.00027748792270531405,
+      "loss": 1.5701,
       "step": 2880
     },
     {
+      "epoch": 5.043478260869565,
+      "grad_norm": 3.679872512817383,
+      "learning_rate": 0.0002755555555555556,
+      "loss": 1.3786,
       "step": 2900
     },
     {
+      "epoch": 5.078260869565217,
+      "grad_norm": 4.13381290435791,
+      "learning_rate": 0.00027362318840579706,
+      "loss": 1.3563,
       "step": 2920
     },
     {
+      "epoch": 5.113043478260869,
+      "grad_norm": 3.7467329502105713,
+      "learning_rate": 0.00027169082125603865,
+      "loss": 1.3588,
       "step": 2940
     },
     {
+      "epoch": 5.147826086956521,
+      "grad_norm": 3.5837419033050537,
+      "learning_rate": 0.0002698550724637681,
+      "loss": 1.3782,
       "step": 2960
     },
     {
+      "epoch": 5.182608695652174,
+      "grad_norm": 4.077097415924072,
+      "learning_rate": 0.00026792270531400964,
+      "loss": 1.3969,
       "step": 2980
     },
     {
+      "epoch": 5.217391304347826,
+      "grad_norm": 3.5995211601257324,
+      "learning_rate": 0.00026599033816425123,
+      "loss": 1.3346,
       "step": 3000
     },
     {
+      "epoch": 5.252173913043478,
+      "grad_norm": 3.714010000228882,
+      "learning_rate": 0.00026405797101449277,
+      "loss": 1.3772,
       "step": 3020
     },
     {
+      "epoch": 5.28695652173913,
+      "grad_norm": 3.807094097137451,
+      "learning_rate": 0.00026231884057971016,
+      "loss": 1.3452,
       "step": 3040
     },
     {
+      "epoch": 5.321739130434783,
+      "grad_norm": 4.012477397918701,
+      "learning_rate": 0.0002603864734299517,
+      "loss": 1.3161,
       "step": 3060
     },
     {
+      "epoch": 5.356521739130435,
+      "grad_norm": 3.850520372390747,
+      "learning_rate": 0.0002584541062801932,
+      "loss": 1.3146,
       "step": 3080
     },
     {
+      "epoch": 5.391304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.00025661835748792274,
+      "loss": 1.3057,
       "step": 3100
     },
     {
+      "epoch": 5.426086956521739,
+      "grad_norm": 3.697744607925415,
+      "learning_rate": 0.0002546859903381643,
+      "loss": 1.2619,
       "step": 3120
     },
     {
+      "epoch": 5.460869565217392,
+      "grad_norm": 4.125018119812012,
+      "learning_rate": 0.00025275362318840576,
+      "loss": 1.3436,
       "step": 3140
     },
     {
+      "epoch": 5.495652173913044,
+      "grad_norm": 4.1491899490356445,
+      "learning_rate": 0.00025082125603864735,
+      "loss": 1.3289,
       "step": 3160
     },
     {
+      "epoch": 5.530434782608696,
+      "grad_norm": 3.9294846057891846,
+      "learning_rate": 0.0002488888888888889,
+      "loss": 1.218,
       "step": 3180
     },
     {
+      "epoch": 5.565217391304348,
+      "grad_norm": 3.9030706882476807,
+      "learning_rate": 0.00024695652173913047,
+      "loss": 1.3219,
       "step": 3200
     },
     {
+      "epoch": 5.6,
+      "grad_norm": 4.124849319458008,
+      "learning_rate": 0.000245024154589372,
+      "loss": 1.2694,
       "step": 3220
     },
     {
+      "epoch": 5.6347826086956525,
+      "grad_norm": 4.1668500900268555,
+      "learning_rate": 0.0002432850241545894,
+      "loss": 1.2379,
       "step": 3240
     },
     {
+      "epoch": 5.6695652173913045,
+      "grad_norm": 4.098198890686035,
+      "learning_rate": 0.00024135265700483093,
+      "loss": 1.2892,
       "step": 3260
     },
     {
+      "epoch": 5.7043478260869565,
+      "grad_norm": 3.690241813659668,
+      "learning_rate": 0.00023942028985507246,
+      "loss": 1.2742,
       "step": 3280
     },
     {
+      "epoch": 5.739130434782608,
+      "grad_norm": 3.978963613510132,
+      "learning_rate": 0.00023748792270531402,
+      "loss": 1.1755,
       "step": 3300
     },
     {
+      "epoch": 5.773913043478261,
+      "grad_norm": 3.7397215366363525,
+      "learning_rate": 0.00023574879227053139,
+      "loss": 1.2256,
       "step": 3320
     },
     {
+      "epoch": 5.808695652173913,
+      "grad_norm": 3.9201064109802246,
+      "learning_rate": 0.00023391304347826088,
+      "loss": 1.238,
       "step": 3340
     },
     {
+      "epoch": 5.843478260869565,
+      "grad_norm": 3.725389242172241,
+      "learning_rate": 0.0002319806763285024,
+      "loss": 1.1706,
       "step": 3360
     },
     {
+      "epoch": 5.878260869565217,
+      "grad_norm": 3.5844123363494873,
+      "learning_rate": 0.00023004830917874397,
+      "loss": 1.1644,
       "step": 3380
     },
     {
+      "epoch": 5.913043478260869,
+      "grad_norm": 3.79936146736145,
+      "learning_rate": 0.00022821256038647343,
+      "loss": 1.2256,
       "step": 3400
     },
     {
+      "epoch": 5.947826086956522,
+      "grad_norm": 3.5947725772857666,
+      "learning_rate": 0.00022628019323671497,
+      "loss": 1.2488,
       "step": 3420
     },
     {
+      "epoch": 5.982608695652174,
+      "grad_norm": NaN,
+      "learning_rate": 0.00022444444444444446,
+      "loss": 1.1418,
       "step": 3440
     },
     {
+      "epoch": 6.0,
+      "eval_accuracy": 0.9757901815736382,
+      "eval_loss": 0.5840117335319519,
+      "eval_runtime": 97.2696,
+      "eval_samples_per_second": 15.287,
+      "eval_steps_per_second": 15.287,
+      "step": 3450
+    },
+    {
+      "epoch": 6.017391304347826,
+      "grad_norm": 3.5959298610687256,
+      "learning_rate": 0.00022260869565217392,
+      "loss": 1.1254,
       "step": 3460
     },
     {
+      "epoch": 6.052173913043478,
+      "grad_norm": 3.9623775482177734,
+      "learning_rate": 0.00022067632850241545,
+      "loss": 1.0343,
       "step": 3480
     },
     {
+      "epoch": 6.086956521739131,
+      "grad_norm": 3.735102415084839,
+      "learning_rate": 0.00021874396135265702,
+      "loss": 1.0348,
       "step": 3500
     },
     {
+      "epoch": 6.121739130434783,
+      "grad_norm": 3.4255013465881348,
+      "learning_rate": 0.00021681159420289855,
+      "loss": 0.9796,
       "step": 3520
     },
     {
+      "epoch": 6.156521739130435,
+      "grad_norm": 3.981841564178467,
+      "learning_rate": 0.00021497584541062804,
+      "loss": 0.9865,
       "step": 3540
     },
     {
+      "epoch": 6.191304347826087,
+      "grad_norm": 3.9057116508483887,
+      "learning_rate": 0.00021314009661835748,
+      "loss": 1.0054,
       "step": 3560
     },
     {
+      "epoch": 6.226086956521739,
+      "grad_norm": 3.626560688018799,
+      "learning_rate": 0.00021120772946859904,
+      "loss": 1.0012,
       "step": 3580
     },
     {
+      "epoch": 6.260869565217392,
+      "grad_norm": 3.687683582305908,
+      "learning_rate": 0.0002093719806763285,
+      "loss": 1.0129,
       "step": 3600
     },
     {
+      "epoch": 6.2956521739130435,
+      "grad_norm": 3.8632826805114746,
+      "learning_rate": 0.00020763285024154592,
+      "loss": 0.9333,
       "step": 3620
     },
     {
+      "epoch": 6.3304347826086955,
+      "grad_norm": 4.089422702789307,
+      "learning_rate": 0.0002058937198067633,
+      "loss": 1.0259,
       "step": 3640
     },
     {
+      "epoch": 6.3652173913043475,
+      "grad_norm": 4.261268615722656,
+      "learning_rate": 0.00020415458937198067,
+      "loss": 1.0184,
       "step": 3660
     },
     {
+      "epoch": 6.4,
+      "grad_norm": 2.3901586532592773,
+      "learning_rate": 0.0002026086956521739,
+      "loss": 1.0293,
       "step": 3680
     },
     {
+      "epoch": 6.434782608695652,
+      "grad_norm": 2.233633518218994,
+      "learning_rate": 0.00020067632850241546,
+      "loss": 1.0026,
       "step": 3700
     },
     {
+      "epoch": 6.469565217391304,
+      "grad_norm": 2.049773693084717,
+      "learning_rate": 0.00019893719806763285,
+      "loss": 1.0426,
       "step": 3720
     },
     {
+      "epoch": 6.504347826086956,
+      "grad_norm": 2.21939754486084,
+      "learning_rate": 0.0001970048309178744,
+      "loss": 1.0324,
       "step": 3740
     },
     {
+      "epoch": 6.539130434782608,
+      "grad_norm": 2.2138895988464355,
+      "learning_rate": 0.00019516908212560387,
+      "loss": 1.0666,
       "step": 3760
     },
     {
+      "epoch": 6.573913043478261,
+      "grad_norm": 1.9186855554580688,
+      "learning_rate": 0.0001932367149758454,
+      "loss": 1.0724,
       "step": 3780
     },
     {
+      "epoch": 6.608695652173913,
+      "grad_norm": 1.302451729774475,
+      "learning_rate": 0.00019159420289855073,
+      "loss": 1.0867,
       "step": 3800
     },
     {
+      "epoch": 6.643478260869565,
+      "grad_norm": 1.1770459413528442,
+      "learning_rate": 0.00018975845410628022,
+      "loss": 1.0659,
       "step": 3820
     },
     {
+      "epoch": 6.678260869565217,
+      "grad_norm": 0.2651650309562683,
+      "learning_rate": 0.0001881159420289855,
+      "loss": 1.0494,
       "step": 3840
     },
     {
+      "epoch": 6.71304347826087,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001867632850241546,
+      "loss": 1.0464,
       "step": 3860
     },
     {
+      "epoch": 6.747826086956522,
+      "grad_norm": 0.0,
+      "learning_rate": 0.000185024154589372,
+      "loss": 1.0457,
       "step": 3880
     },
     {
+      "epoch": 6.782608695652174,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00018328502415458937,
+      "loss": 0.9815,
       "step": 3900
     },
     {
+      "epoch": 6.817391304347826,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001816425120772947,
+      "loss": 1.0094,
       "step": 3920
     },
     {
+      "epoch": 6.852173913043478,
+      "grad_norm": NaN,
+      "learning_rate": 0.00018028985507246377,
+      "loss": 1.0023,
       "step": 3940
     },
     {
+      "epoch": 6.886956521739131,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00017893719806763288,
+      "loss": 1.0278,
       "step": 3960
     },
     {
+      "epoch": 6.921739130434783,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001771014492753623,
+      "loss": 1.0123,
       "step": 3980
     },
     {
+      "epoch": 6.956521739130435,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00017565217391304346,
+      "loss": 1.0774,
       "step": 4000
     },
     {
+      "epoch": 6.9913043478260875,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00017391304347826088,
+      "loss": 1.0484,
       "step": 4020
     },
     {
+      "epoch": 7.0,
+      "eval_accuracy": 0.9737726967047747,
+      "eval_loss": 0.5780686736106873,
+      "eval_runtime": 118.8154,
+      "eval_samples_per_second": 12.515,
+      "eval_steps_per_second": 12.515,
+      "step": 4025
+    },
+    {
+      "epoch": 7.026086956521739,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001723671497584541,
+      "loss": 0.9799,
       "step": 4040
     },
     {
+      "epoch": 7.060869565217391,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00017091787439613525,
+      "loss": 0.9588,
       "step": 4060
     },
     {
+      "epoch": 7.095652173913043,
+      "grad_norm": NaN,
+      "learning_rate": 0.00016966183574879226,
+      "loss": 0.9421,
       "step": 4080
     },
     {
+      "epoch": 7.130434782608695,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00016782608695652175,
+      "loss": 0.9551,
       "step": 4100
     },
     {
+      "epoch": 7.165217391304348,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00016618357487922704,
+      "loss": 0.9622,
       "step": 4120
     },
     {
+      "epoch": 7.2,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00016444444444444446,
+      "loss": 0.9712,
       "step": 4140
     },
     {
+      "epoch": 7.234782608695652,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00016299516908212561,
+      "loss": 0.9834,
       "step": 4160
     },
     {
+      "epoch": 7.269565217391304,
+      "grad_norm": NaN,
+      "learning_rate": 0.00016135265700483093,
+      "loss": 0.9968,
       "step": 4180
     },
     {
+      "epoch": 7.304347826086957,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00015961352657004833,
+      "loss": 0.956,
       "step": 4200
     },
     {
+      "epoch": 7.339130434782609,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00015806763285024155,
+      "loss": 0.8981,
       "step": 4220
     },
     {
+      "epoch": 7.373913043478261,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00015642512077294684,
+      "loss": 0.9515,
       "step": 4240
     },
     {
+      "epoch": 7.408695652173913,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001548792270531401,
+      "loss": 0.9535,
       "step": 4260
     },
     {
+      "epoch": 7.443478260869565,
+      "grad_norm": NaN,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.9646,
       "step": 4280
     },
     {
+      "epoch": 7.478260869565218,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00015140096618357487,
+      "loss": 0.9821,
       "step": 4300
     },
     {
+      "epoch": 7.51304347826087,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00015014492753623188,
+      "loss": 0.9259,
       "step": 4320
     },
     {
+      "epoch": 7.547826086956522,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00014869565217391303,
+      "loss": 0.9494,
       "step": 4340
     },
     {
+      "epoch": 7.582608695652174,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00014714975845410628,
+      "loss": 0.9305,
       "step": 4360
     },
     {
+      "epoch": 7.6173913043478265,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0001455072463768116,
+      "loss": 0.8889,
       "step": 4380
     },
     {
+      "epoch": 7.6521739130434785,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00014396135265700482,
+      "loss": 0.9524,
       "step": 4400
     },
     {
+      "epoch": 7.6869565217391305,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00014231884057971014,
+      "loss": 0.9065,
       "step": 4420
     },
     {
+      "epoch": 7.721739130434782,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00014048309178743963,
+      "loss": 0.9153,
       "step": 4440
     },
     {
+      "epoch": 7.756521739130434,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.6675,
       "step": 4460
     },
     {
+      "epoch": 7.791304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4480
     },
     {
+      "epoch": 7.826086956521739,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4500
     },
     {
+      "epoch": 7.860869565217391,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4520
     },
     {
+      "epoch": 7.895652173913043,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4540
     },
     {
+      "epoch": 7.930434782608696,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4560
     },
     {
+      "epoch": 7.965217391304348,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4580
     },
     {
+      "epoch": 8.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.0006724949562878278,
+      "eval_loss": NaN,
+      "eval_runtime": 129.6238,
+      "eval_samples_per_second": 11.472,
+      "eval_steps_per_second": 11.472,
       "step": 4600
     },
     {
+      "epoch": 8.034782608695652,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4620
     },
     {
+      "epoch": 8.069565217391304,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4640
     },
     {
+      "epoch": 8.104347826086956,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4660
     },
     {
+      "epoch": 8.139130434782608,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4680
     },
     {
+      "epoch": 8.173913043478262,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4700
     },
     {
+      "epoch": 8.208695652173914,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4720
     },
     {
+      "epoch": 8.243478260869566,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4740
     },
     {
+      "epoch": 8.278260869565218,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4760
     },
     {
+      "epoch": 8.31304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4780
     },
     {
+      "epoch": 8.347826086956522,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4800
     },
     {
+      "epoch": 8.382608695652173,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4820
     },
     {
+      "epoch": 8.417391304347825,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4840
     },
     {
+      "epoch": 8.452173913043477,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4860
     },
     {
+      "epoch": 8.486956521739131,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4880
     },
     {
+      "epoch": 8.521739130434783,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4900
     },
     {
+      "epoch": 8.556521739130435,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4920
     },
     {
+      "epoch": 8.591304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4940
     },
     {
+      "epoch": 8.626086956521739,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4960
     },
     {
+      "epoch": 8.660869565217391,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 4980
     },
     {
+      "epoch": 8.695652173913043,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5000
     },
     {
+      "epoch": 8.730434782608695,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5020
     },
     {
+      "epoch": 8.765217391304347,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5040
     },
     {
+      "epoch": 8.8,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5060
     },
     {
+      "epoch": 8.834782608695653,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5080
     },
     {
+      "epoch": 8.869565217391305,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5100
     },
     {
+      "epoch": 8.904347826086957,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5120
     },
     {
+      "epoch": 8.939130434782609,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5140
     },
     {
+      "epoch": 8.97391304347826,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5160
     },
     {
+      "epoch": 9.0,
+      "eval_accuracy": 0.0006724949562878278,
+      "eval_loss": NaN,
+      "eval_runtime": 117.1288,
+      "eval_samples_per_second": 12.695,
+      "eval_steps_per_second": 12.695,
+      "step": 5175
+    },
+    {
+      "epoch": 9.008695652173913,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5180
     },
     {
+      "epoch": 9.043478260869565,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5200
     },
     {
+      "epoch": 9.078260869565218,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
       "step": 5220
     },
+    {
+      "epoch": 9.11304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5240
+    },
+    {
+      "epoch": 9.147826086956522,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5260
+    },
+    {
+      "epoch": 9.182608695652174,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5280
+    },
+    {
+      "epoch": 9.217391304347826,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5300
+    },
+    {
+      "epoch": 9.252173913043478,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5320
+    },
+    {
+      "epoch": 9.28695652173913,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5340
+    },
+    {
+      "epoch": 9.321739130434782,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5360
+    },
+    {
+      "epoch": 9.356521739130434,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5380
+    },
+    {
+      "epoch": 9.391304347826088,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 9.42608695652174,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5420
+    },
+    {
+      "epoch": 9.460869565217392,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5440
+    },
+    {
+      "epoch": 9.495652173913044,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5460
+    },
+    {
+      "epoch": 9.530434782608696,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5480
+    },
+    {
+      "epoch": 9.565217391304348,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5500
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5520
+    },
+    {
+      "epoch": 9.634782608695652,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5540
+    },
+    {
+      "epoch": 9.669565217391304,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5560
+    },
+    {
+      "epoch": 9.704347826086957,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5580
+    },
+    {
+      "epoch": 9.73913043478261,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 9.773913043478261,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5620
+    },
+    {
+      "epoch": 9.808695652173913,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5640
+    },
+    {
+      "epoch": 9.843478260869565,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5660
+    },
+    {
+      "epoch": 9.878260869565217,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5680
+    },
+    {
+      "epoch": 9.91304347826087,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5700
+    },
+    {
+      "epoch": 9.947826086956521,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5720
+    },
+    {
+      "epoch": 9.982608695652173,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001403864734299517,
+      "loss": 0.0,
+      "step": 5740
+    },
     {
       "epoch": 10.0,
+      "eval_accuracy": 0.0006724949562878278,
+      "eval_loss": NaN,
+      "eval_runtime": 103.3199,
+      "eval_samples_per_second": 14.392,
+      "eval_steps_per_second": 14.392,
+      "step": 5750
     },
     {
       "epoch": 10.0,
+      "step": 5750,
+      "total_flos": 2.7398100529152e+18,
+      "train_loss": 2.9414075751926587,
+      "train_runtime": 59857.6179,
+      "train_samples_per_second": 24.584,
+      "train_steps_per_second": 0.096
     }
   ],
   "logging_steps": 20,
+  "max_steps": 5750,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 2.7398100529152e+18,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null