LuigiJoseph committed on Mar 23

Commit

4c365ac

verified ·

1 Parent(s): 152ec27

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +14 -0
checkpoint-11948/config.json +41 -0
checkpoint-11948/generation_config.json +16 -0
checkpoint-11948/model.safetensors +3 -0
checkpoint-11948/optimizer.pt +3 -0
checkpoint-11948/rng_state.pth +3 -0
checkpoint-11948/scaler.pt +3 -0
checkpoint-11948/scheduler.pt +3 -0
checkpoint-11948/source.spm +3 -0
checkpoint-11948/special_tokens_map.json +23 -0
checkpoint-11948/target.spm +3 -0
checkpoint-11948/tokenizer_config.json +40 -0
checkpoint-11948/trainer_state.json +1731 -0
checkpoint-11948/training_args.bin +3 -0
checkpoint-11948/vocab.json +0 -0
checkpoint-14935/config.json +41 -0
checkpoint-14935/generation_config.json +16 -0
checkpoint-14935/model.safetensors +3 -0
checkpoint-14935/optimizer.pt +3 -0
checkpoint-14935/rng_state.pth +3 -0
checkpoint-14935/scaler.pt +3 -0
checkpoint-14935/scheduler.pt +3 -0
checkpoint-14935/source.spm +3 -0
checkpoint-14935/special_tokens_map.json +23 -0
checkpoint-14935/target.spm +3 -0
checkpoint-14935/tokenizer_config.json +40 -0
checkpoint-14935/trainer_state.json +2159 -0
checkpoint-14935/training_args.bin +3 -0
checkpoint-14935/vocab.json +0 -0
checkpoint-17922/config.json +41 -0
checkpoint-17922/generation_config.json +16 -0
checkpoint-17922/model.safetensors +3 -0
checkpoint-17922/optimizer.pt +3 -0
checkpoint-17922/rng_state.pth +3 -0
checkpoint-17922/scaler.pt +3 -0
checkpoint-17922/scheduler.pt +3 -0
checkpoint-17922/source.spm +3 -0
checkpoint-17922/special_tokens_map.json +23 -0
checkpoint-17922/target.spm +3 -0
checkpoint-17922/tokenizer_config.json +40 -0
checkpoint-17922/trainer_state.json +2587 -0
checkpoint-17922/training_args.bin +3 -0
checkpoint-17922/vocab.json +0 -0
checkpoint-2987/config.json +41 -0
checkpoint-2987/generation_config.json +16 -0
checkpoint-2987/model.safetensors +3 -0
checkpoint-2987/optimizer.pt +3 -0
checkpoint-2987/rng_state.pth +3 -0
checkpoint-2987/scaler.pt +3 -0
checkpoint-2987/scheduler.pt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-11948/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-11948/target.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-14935/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-14935/target.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-17922/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-17922/target.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-2987/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-2987/target.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-5974/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-5974/target.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-8961/source.spm filter=lfs diff=lfs merge=lfs -text
+checkpoint-8961/target.spm filter=lfs diff=lfs merge=lfs -text
+source.spm filter=lfs diff=lfs merge=lfs -text
+target.spm filter=lfs diff=lfs merge=lfs -text

checkpoint-11948/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "ckartal/english-to-turkish-finetuned-model",
+  "activation_dropout": 0.0,
+  "activation_function": "swish",
+  "architectures": [
+    "MarianMTModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 512,
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 59993,
+  "decoder_vocab_size": 59994,
+  "dropout": 0.1,
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": null,
+  "max_position_embeddings": 512,
+  "model_type": "marian",
+  "normalize_embedding": false,
+  "num_beams": null,
+  "num_hidden_layers": 6,
+  "pad_token_id": 59993,
+  "scale_embedding": true,
+  "share_encoder_decoder_embeddings": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 59994
+}

checkpoint-11948/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "bad_words_ids": [
+    [
+      59993
+    ]
+  ],
+  "bos_token_id": 0,
+  "decoder_start_token_id": 59993,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "max_length": 512,
+  "num_beams": 6,
+  "pad_token_id": 59993,
+  "renormalize_logits": true,
+  "transformers_version": "4.49.0"
+}

checkpoint-11948/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51d1dd137adffe19c3b054ce6fde4add2d4588c755def2d90103ef25413d9ff3
+size 299690728

checkpoint-11948/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fae7f2689749c5d7511fbc91404d7932e246d3bcacbb83d035467ed24c5573f
+size 599054970

checkpoint-11948/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e98dff2b36b15ed0da9adbed2868493be995c81afcb89ad8f263069c503c6599
+size 14244

checkpoint-11948/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f824a254b9714a4704b979751121752508a64954c9977c2312faa340ea20ba0
+size 988

checkpoint-11948/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de83e7c49986c79a6b5bc283e32a2889ea6bca6ea6f90dbe4cbd62bdef7dcccd
+size 1064

checkpoint-11948/source.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98eb24f0995a9d5f7cb0fb628c474628b1d2284615e881e857d062c0b651ce10
+size 793920

checkpoint-11948/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-11948/target.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45cc6000ed513cdca8f80739087fbcbf9933dc50c9ae36c319c9670882f72e1b
+size 837876

checkpoint-11948/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59993": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "return_tensors": "pt",
+  "separate_vocabs": false,
+  "source_lang": "eng",
+  "sp_model_kwargs": {},
+  "target_lang": "tur",
+  "tokenizer_class": "MarianTokenizer",
+  "unk_token": "<unk>"
+}

checkpoint-11948/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1731 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 11948,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016739203213927016,
+      "grad_norm": 0.439177542924881,
+      "learning_rate": 4.987445597589555e-05,
+      "loss": 1.5222,
+      "step": 50
+    },
+    {
+      "epoch": 0.03347840642785403,
+      "grad_norm": 0.4248828887939453,
+      "learning_rate": 4.973496261577949e-05,
+      "loss": 0.1842,
+      "step": 100
+    },
+    {
+      "epoch": 0.05021760964178105,
+      "grad_norm": 0.3019009232521057,
+      "learning_rate": 4.9595469255663436e-05,
+      "loss": 0.1471,
+      "step": 150
+    },
+    {
+      "epoch": 0.06695681285570806,
+      "grad_norm": 0.2518245577812195,
+      "learning_rate": 4.9455975895547376e-05,
+      "loss": 0.1306,
+      "step": 200
+    },
+    {
+      "epoch": 0.08369601606963509,
+      "grad_norm": 0.3660012185573578,
+      "learning_rate": 4.931648253543131e-05,
+      "loss": 0.1276,
+      "step": 250
+    },
+    {
+      "epoch": 0.1004352192835621,
+      "grad_norm": 0.32854148745536804,
+      "learning_rate": 4.917698917531526e-05,
+      "loss": 0.1048,
+      "step": 300
+    },
+    {
+      "epoch": 0.11717442249748912,
+      "grad_norm": 0.24879515171051025,
+      "learning_rate": 4.90374958151992e-05,
+      "loss": 0.1054,
+      "step": 350
+    },
+    {
+      "epoch": 0.13391362571141613,
+      "grad_norm": 0.36416563391685486,
+      "learning_rate": 4.889800245508314e-05,
+      "loss": 0.0985,
+      "step": 400
+    },
+    {
+      "epoch": 0.15065282892534315,
+      "grad_norm": 0.33641186356544495,
+      "learning_rate": 4.875850909496708e-05,
+      "loss": 0.1044,
+      "step": 450
+    },
+    {
+      "epoch": 0.16739203213927017,
+      "grad_norm": 0.32909244298934937,
+      "learning_rate": 4.861901573485103e-05,
+      "loss": 0.1089,
+      "step": 500
+    },
+    {
+      "epoch": 0.1841312353531972,
+      "grad_norm": 0.36060285568237305,
+      "learning_rate": 4.847952237473497e-05,
+      "loss": 0.09,
+      "step": 550
+    },
+    {
+      "epoch": 0.2008704385671242,
+      "grad_norm": 0.2510785758495331,
+      "learning_rate": 4.83400290146189e-05,
+      "loss": 0.0884,
+      "step": 600
+    },
+    {
+      "epoch": 0.21760964178105122,
+      "grad_norm": 0.22478719055652618,
+      "learning_rate": 4.820053565450285e-05,
+      "loss": 0.0866,
+      "step": 650
+    },
+    {
+      "epoch": 0.23434884499497824,
+      "grad_norm": 0.37321263551712036,
+      "learning_rate": 4.806104229438679e-05,
+      "loss": 0.0884,
+      "step": 700
+    },
+    {
+      "epoch": 0.25108804820890523,
+      "grad_norm": 0.2660929262638092,
+      "learning_rate": 4.792154893427073e-05,
+      "loss": 0.0819,
+      "step": 750
+    },
+    {
+      "epoch": 0.26782725142283226,
+      "grad_norm": 0.2338525801897049,
+      "learning_rate": 4.778205557415467e-05,
+      "loss": 0.0845,
+      "step": 800
+    },
+    {
+      "epoch": 0.2845664546367593,
+      "grad_norm": 0.308557391166687,
+      "learning_rate": 4.764256221403862e-05,
+      "loss": 0.0815,
+      "step": 850
+    },
+    {
+      "epoch": 0.3013056578506863,
+      "grad_norm": 0.27098262310028076,
+      "learning_rate": 4.750306885392255e-05,
+      "loss": 0.0833,
+      "step": 900
+    },
+    {
+      "epoch": 0.3180448610646133,
+      "grad_norm": 0.23054952919483185,
+      "learning_rate": 4.736357549380649e-05,
+      "loss": 0.0806,
+      "step": 950
+    },
+    {
+      "epoch": 0.33478406427854035,
+      "grad_norm": 0.21355900168418884,
+      "learning_rate": 4.722408213369044e-05,
+      "loss": 0.073,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3515232674924674,
+      "grad_norm": 0.20395708084106445,
+      "learning_rate": 4.708458877357438e-05,
+      "loss": 0.0775,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3682624707063944,
+      "grad_norm": 0.21063613891601562,
+      "learning_rate": 4.694509541345832e-05,
+      "loss": 0.0789,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3850016739203214,
+      "grad_norm": 0.20589284598827362,
+      "learning_rate": 4.680560205334226e-05,
+      "loss": 0.0809,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4017408771342484,
+      "grad_norm": 0.27975228428840637,
+      "learning_rate": 4.666610869322621e-05,
+      "loss": 0.078,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184800803481754,
+      "grad_norm": 0.2529745399951935,
+      "learning_rate": 4.6526615333110144e-05,
+      "loss": 0.0704,
+      "step": 1250
+    },
+    {
+      "epoch": 0.43521928356210243,
+      "grad_norm": 0.2205154448747635,
+      "learning_rate": 4.6387121972994084e-05,
+      "loss": 0.0733,
+      "step": 1300
+    },
+    {
+      "epoch": 0.45195848677602946,
+      "grad_norm": 0.2254629135131836,
+      "learning_rate": 4.624762861287803e-05,
+      "loss": 0.0751,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4686976899899565,
+      "grad_norm": 0.17614957690238953,
+      "learning_rate": 4.610813525276197e-05,
+      "loss": 0.0747,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4854368932038835,
+      "grad_norm": 0.15940478444099426,
+      "learning_rate": 4.596864189264591e-05,
+      "loss": 0.0698,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5021760964178105,
+      "grad_norm": 0.1869521141052246,
+      "learning_rate": 4.5829148532529854e-05,
+      "loss": 0.0721,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5189152996317375,
+      "grad_norm": 0.36063650250434875,
+      "learning_rate": 4.5689655172413794e-05,
+      "loss": 0.0706,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5356545028456645,
+      "grad_norm": 0.16967014968395233,
+      "learning_rate": 4.5550161812297735e-05,
+      "loss": 0.0759,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5523937060595916,
+      "grad_norm": 0.29293423891067505,
+      "learning_rate": 4.5410668452181676e-05,
+      "loss": 0.0711,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5691329092735186,
+      "grad_norm": 0.3034748136997223,
+      "learning_rate": 4.527117509206562e-05,
+      "loss": 0.067,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5858721124874456,
+      "grad_norm": 0.1974593997001648,
+      "learning_rate": 4.513168173194956e-05,
+      "loss": 0.0701,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6026113157013726,
+      "grad_norm": 0.18101799488067627,
+      "learning_rate": 4.4992188371833505e-05,
+      "loss": 0.0717,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6193505189152997,
+      "grad_norm": 0.14422941207885742,
+      "learning_rate": 4.4852695011717445e-05,
+      "loss": 0.0686,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6360897221292267,
+      "grad_norm": 0.28663551807403564,
+      "learning_rate": 4.4713201651601386e-05,
+      "loss": 0.0646,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6528289253431536,
+      "grad_norm": 0.23879379034042358,
+      "learning_rate": 4.4573708291485327e-05,
+      "loss": 0.0684,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6695681285570807,
+      "grad_norm": 0.21389362215995789,
+      "learning_rate": 4.443421493136927e-05,
+      "loss": 0.066,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6863073317710077,
+      "grad_norm": 0.26841893792152405,
+      "learning_rate": 4.4294721571253215e-05,
+      "loss": 0.0717,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7030465349849347,
+      "grad_norm": 0.240205317735672,
+      "learning_rate": 4.415522821113715e-05,
+      "loss": 0.0697,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7197857381988617,
+      "grad_norm": 0.28098127245903015,
+      "learning_rate": 4.4015734851021096e-05,
+      "loss": 0.0713,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7365249414127888,
+      "grad_norm": 0.23308847844600677,
+      "learning_rate": 4.3876241490905037e-05,
+      "loss": 0.0667,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7532641446267158,
+      "grad_norm": 0.22748568654060364,
+      "learning_rate": 4.373674813078898e-05,
+      "loss": 0.0605,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7700033478406428,
+      "grad_norm": 0.3932187259197235,
+      "learning_rate": 4.359725477067292e-05,
+      "loss": 0.0676,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7867425510545698,
+      "grad_norm": 0.23918767273426056,
+      "learning_rate": 4.345776141055686e-05,
+      "loss": 0.0624,
+      "step": 2350
+    },
+    {
+      "epoch": 0.8034817542684968,
+      "grad_norm": 0.3068426549434662,
+      "learning_rate": 4.33182680504408e-05,
+      "loss": 0.0664,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8202209574824239,
+      "grad_norm": 0.17977873980998993,
+      "learning_rate": 4.317877469032474e-05,
+      "loss": 0.0726,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8369601606963508,
+      "grad_norm": 0.16876642405986786,
+      "learning_rate": 4.303928133020869e-05,
+      "loss": 0.0639,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8536993639102779,
+      "grad_norm": 0.17980250716209412,
+      "learning_rate": 4.289978797009263e-05,
+      "loss": 0.0701,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8704385671242049,
+      "grad_norm": 0.1711459904909134,
+      "learning_rate": 4.276029460997656e-05,
+      "loss": 0.063,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8871777703381319,
+      "grad_norm": 0.443228542804718,
+      "learning_rate": 4.262080124986051e-05,
+      "loss": 0.0675,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9039169735520589,
+      "grad_norm": 0.2098589390516281,
+      "learning_rate": 4.248130788974445e-05,
+      "loss": 0.062,
+      "step": 2700
+    },
+    {
+      "epoch": 0.920656176765986,
+      "grad_norm": 0.3022039234638214,
+      "learning_rate": 4.234181452962839e-05,
+      "loss": 0.07,
+      "step": 2750
+    },
+    {
+      "epoch": 0.937395379979913,
+      "grad_norm": 0.19368910789489746,
+      "learning_rate": 4.220232116951233e-05,
+      "loss": 0.0621,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9541345831938399,
+      "grad_norm": 0.18753108382225037,
+      "learning_rate": 4.206282780939628e-05,
+      "loss": 0.0631,
+      "step": 2850
+    },
+    {
+      "epoch": 0.970873786407767,
+      "grad_norm": 0.15517786145210266,
+      "learning_rate": 4.192333444928022e-05,
+      "loss": 0.0641,
+      "step": 2900
+    },
+    {
+      "epoch": 0.987612989621694,
+      "grad_norm": 0.11765792220830917,
+      "learning_rate": 4.178384108916415e-05,
+      "loss": 0.0612,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.05521286651492119,
+      "eval_runtime": 50.415,
+      "eval_samples_per_second": 236.993,
+      "eval_steps_per_second": 14.817,
+      "step": 2987
+    },
+    {
+      "epoch": 1.004352192835621,
+      "grad_norm": 0.2691793739795685,
+      "learning_rate": 4.16443477290481e-05,
+      "loss": 0.059,
+      "step": 3000
+    },
+    {
+      "epoch": 1.021091396049548,
+      "grad_norm": 0.394694060087204,
+      "learning_rate": 4.150485436893204e-05,
+      "loss": 0.0566,
+      "step": 3050
+    },
+    {
+      "epoch": 1.037830599263475,
+      "grad_norm": 0.19438503682613373,
+      "learning_rate": 4.136536100881598e-05,
+      "loss": 0.0591,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0545698024774022,
+      "grad_norm": 0.21350933611392975,
+      "learning_rate": 4.122586764869992e-05,
+      "loss": 0.0509,
+      "step": 3150
+    },
+    {
+      "epoch": 1.071309005691329,
+      "grad_norm": 0.26747575402259827,
+      "learning_rate": 4.108637428858387e-05,
+      "loss": 0.0589,
+      "step": 3200
+    },
+    {
+      "epoch": 1.088048208905256,
+      "grad_norm": 0.31256961822509766,
+      "learning_rate": 4.0946880928467804e-05,
+      "loss": 0.0602,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1047874121191832,
+      "grad_norm": 0.18631280958652496,
+      "learning_rate": 4.0807387568351745e-05,
+      "loss": 0.0547,
+      "step": 3300
+    },
+    {
+      "epoch": 1.12152661533311,
+      "grad_norm": 0.18677473068237305,
+      "learning_rate": 4.066789420823569e-05,
+      "loss": 0.0543,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1382658185470371,
+      "grad_norm": 0.24535444378852844,
+      "learning_rate": 4.052840084811963e-05,
+      "loss": 0.0583,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1550050217609642,
+      "grad_norm": 0.1752105951309204,
+      "learning_rate": 4.038890748800357e-05,
+      "loss": 0.0504,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1717442249748913,
+      "grad_norm": 0.14743360877037048,
+      "learning_rate": 4.0249414127887514e-05,
+      "loss": 0.055,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1884834281888181,
+      "grad_norm": 0.11535945534706116,
+      "learning_rate": 4.010992076777146e-05,
+      "loss": 0.0552,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2052226314027452,
+      "grad_norm": 0.26563358306884766,
+      "learning_rate": 3.9970427407655395e-05,
+      "loss": 0.0552,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2219618346166723,
+      "grad_norm": 0.15104246139526367,
+      "learning_rate": 3.9830934047539336e-05,
+      "loss": 0.0575,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2387010378305994,
+      "grad_norm": 0.2198421210050583,
+      "learning_rate": 3.9691440687423283e-05,
+      "loss": 0.0567,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2554402410445262,
+      "grad_norm": 0.20177733898162842,
+      "learning_rate": 3.955194732730722e-05,
+      "loss": 0.0556,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2721794442584533,
+      "grad_norm": 0.36604830622673035,
+      "learning_rate": 3.9412453967191165e-05,
+      "loss": 0.0569,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2889186474723804,
+      "grad_norm": 0.18883727490901947,
+      "learning_rate": 3.9272960607075105e-05,
+      "loss": 0.0595,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3056578506863072,
+      "grad_norm": 0.14828617870807648,
+      "learning_rate": 3.9133467246959046e-05,
+      "loss": 0.0548,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3223970539002343,
+      "grad_norm": 0.19220437109470367,
+      "learning_rate": 3.899397388684299e-05,
+      "loss": 0.053,
+      "step": 3950
+    },
+    {
+      "epoch": 1.3391362571141614,
+      "grad_norm": 0.16049669682979584,
+      "learning_rate": 3.885448052672693e-05,
+      "loss": 0.0581,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3558754603280883,
+      "grad_norm": 0.22821515798568726,
+      "learning_rate": 3.8714987166610875e-05,
+      "loss": 0.0518,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3726146635420153,
+      "grad_norm": 0.1879580318927765,
+      "learning_rate": 3.857549380649481e-05,
+      "loss": 0.0574,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3893538667559424,
+      "grad_norm": 0.16026251018047333,
+      "learning_rate": 3.8436000446378756e-05,
+      "loss": 0.063,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4060930699698695,
+      "grad_norm": 0.26868143677711487,
+      "learning_rate": 3.82965070862627e-05,
+      "loss": 0.0571,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4228322731837966,
+      "grad_norm": 0.2529687285423279,
+      "learning_rate": 3.815701372614664e-05,
+      "loss": 0.0528,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4395714763977234,
+      "grad_norm": 0.19138221442699432,
+      "learning_rate": 3.801752036603058e-05,
+      "loss": 0.0584,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4563106796116505,
+      "grad_norm": 0.16359661519527435,
+      "learning_rate": 3.787802700591452e-05,
+      "loss": 0.0539,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4730498828255776,
+      "grad_norm": 0.1373494267463684,
+      "learning_rate": 3.7738533645798466e-05,
+      "loss": 0.0557,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4897890860395044,
+      "grad_norm": 0.15695162117481232,
+      "learning_rate": 3.75990402856824e-05,
+      "loss": 0.0491,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5065282892534315,
+      "grad_norm": 0.18462614715099335,
+      "learning_rate": 3.745954692556635e-05,
+      "loss": 0.0495,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5232674924673586,
+      "grad_norm": 0.27876704931259155,
+      "learning_rate": 3.732005356545029e-05,
+      "loss": 0.0523,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5400066956812855,
+      "grad_norm": 0.30491840839385986,
+      "learning_rate": 3.718056020533423e-05,
+      "loss": 0.0564,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5567458988952128,
+      "grad_norm": 0.18721336126327515,
+      "learning_rate": 3.704106684521817e-05,
+      "loss": 0.0524,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5734851021091396,
+      "grad_norm": 0.21216215193271637,
+      "learning_rate": 3.690157348510211e-05,
+      "loss": 0.0521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5902243053230665,
+      "grad_norm": 0.1368396282196045,
+      "learning_rate": 3.676208012498605e-05,
+      "loss": 0.056,
+      "step": 4750
+    },
+    {
+      "epoch": 1.6069635085369938,
+      "grad_norm": 0.13692086935043335,
+      "learning_rate": 3.662258676486999e-05,
+      "loss": 0.0443,
+      "step": 4800
+    },
+    {
+      "epoch": 1.6237027117509206,
+      "grad_norm": 0.11640128493309021,
+      "learning_rate": 3.648309340475394e-05,
+      "loss": 0.0488,
+      "step": 4850
+    },
+    {
+      "epoch": 1.6404419149648477,
+      "grad_norm": 0.19953882694244385,
+      "learning_rate": 3.634360004463788e-05,
+      "loss": 0.0553,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6571811181787748,
+      "grad_norm": 0.1966984122991562,
+      "learning_rate": 3.6204106684521813e-05,
+      "loss": 0.0536,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6739203213927016,
+      "grad_norm": 0.2324533313512802,
+      "learning_rate": 3.606461332440576e-05,
+      "loss": 0.0493,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6906595246066287,
+      "grad_norm": 0.16217607259750366,
+      "learning_rate": 3.59251199642897e-05,
+      "loss": 0.0503,
+      "step": 5050
+    },
+    {
+      "epoch": 1.7073987278205558,
+      "grad_norm": 0.23949602246284485,
+      "learning_rate": 3.578562660417364e-05,
+      "loss": 0.0556,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.21387897431850433,
+      "learning_rate": 3.564613324405758e-05,
+      "loss": 0.0548,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7408771342484097,
+      "grad_norm": 0.2055111676454544,
+      "learning_rate": 3.550663988394153e-05,
+      "loss": 0.06,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7576163374623368,
+      "grad_norm": 0.20280921459197998,
+      "learning_rate": 3.5367146523825464e-05,
+      "loss": 0.0508,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7743555406762637,
+      "grad_norm": 0.14165103435516357,
+      "learning_rate": 3.5227653163709405e-05,
+      "loss": 0.0581,
+      "step": 5300
+    },
+    {
+      "epoch": 1.791094743890191,
+      "grad_norm": 0.18099863827228546,
+      "learning_rate": 3.508815980359335e-05,
+      "loss": 0.0562,
+      "step": 5350
+    },
+    {
+      "epoch": 1.8078339471041178,
+      "grad_norm": 0.21743184328079224,
+      "learning_rate": 3.494866644347729e-05,
+      "loss": 0.0498,
+      "step": 5400
+    },
+    {
+      "epoch": 1.824573150318045,
+      "grad_norm": 0.20934534072875977,
+      "learning_rate": 3.4809173083361234e-05,
+      "loss": 0.0549,
+      "step": 5450
+    },
+    {
+      "epoch": 1.841312353531972,
+      "grad_norm": 0.1582174152135849,
+      "learning_rate": 3.4669679723245174e-05,
+      "loss": 0.0556,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8580515567458988,
+      "grad_norm": 0.1624903827905655,
+      "learning_rate": 3.453018636312912e-05,
+      "loss": 0.0516,
+      "step": 5550
+    },
+    {
+      "epoch": 1.874790759959826,
+      "grad_norm": 0.16255798935890198,
+      "learning_rate": 3.4390693003013056e-05,
+      "loss": 0.0542,
+      "step": 5600
+    },
+    {
+      "epoch": 1.891529963173753,
+      "grad_norm": 0.1269742250442505,
+      "learning_rate": 3.4251199642896996e-05,
+      "loss": 0.0565,
+      "step": 5650
+    },
+    {
+      "epoch": 1.9082691663876798,
+      "grad_norm": 0.15966229140758514,
+      "learning_rate": 3.4111706282780944e-05,
+      "loss": 0.0538,
+      "step": 5700
+    },
+    {
+      "epoch": 1.925008369601607,
+      "grad_norm": 0.21506330370903015,
+      "learning_rate": 3.3972212922664884e-05,
+      "loss": 0.0505,
+      "step": 5750
+    },
+    {
+      "epoch": 1.941747572815534,
+      "grad_norm": 0.2145415097475052,
+      "learning_rate": 3.3832719562548825e-05,
+      "loss": 0.0521,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9584867760294609,
+      "grad_norm": 0.10960496962070465,
+      "learning_rate": 3.3693226202432766e-05,
+      "loss": 0.0513,
+      "step": 5850
+    },
+    {
+      "epoch": 1.9752259792433882,
+      "grad_norm": 0.13635843992233276,
+      "learning_rate": 3.355373284231671e-05,
+      "loss": 0.0499,
+      "step": 5900
+    },
+    {
+      "epoch": 1.991965182457315,
+      "grad_norm": 0.1542210429906845,
+      "learning_rate": 3.341423948220065e-05,
+      "loss": 0.0556,
+      "step": 5950
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.04946442320942879,
+      "eval_runtime": 55.6114,
+      "eval_samples_per_second": 214.848,
+      "eval_steps_per_second": 13.432,
+      "step": 5974
+    },
+    {
+      "epoch": 2.008704385671242,
+      "grad_norm": 0.1718842089176178,
+      "learning_rate": 3.327474612208459e-05,
+      "loss": 0.0503,
+      "step": 6000
+    },
+    {
+      "epoch": 2.025443588885169,
+      "grad_norm": 0.1528020203113556,
+      "learning_rate": 3.3135252761968535e-05,
+      "loss": 0.0479,
+      "step": 6050
+    },
+    {
+      "epoch": 2.042182792099096,
+      "grad_norm": 0.19148772954940796,
+      "learning_rate": 3.299575940185247e-05,
+      "loss": 0.0443,
+      "step": 6100
+    },
+    {
+      "epoch": 2.058921995313023,
+      "grad_norm": 0.18125496804714203,
+      "learning_rate": 3.2856266041736416e-05,
+      "loss": 0.0473,
+      "step": 6150
+    },
+    {
+      "epoch": 2.07566119852695,
+      "grad_norm": 0.20772996544837952,
+      "learning_rate": 3.271677268162036e-05,
+      "loss": 0.0539,
+      "step": 6200
+    },
+    {
+      "epoch": 2.092400401740877,
+      "grad_norm": 0.2518468201160431,
+      "learning_rate": 3.25772793215043e-05,
+      "loss": 0.0468,
+      "step": 6250
+    },
+    {
+      "epoch": 2.1091396049548043,
+      "grad_norm": 0.1350301206111908,
+      "learning_rate": 3.243778596138824e-05,
+      "loss": 0.0431,
+      "step": 6300
+    },
+    {
+      "epoch": 2.125878808168731,
+      "grad_norm": 0.19141735136508942,
+      "learning_rate": 3.229829260127218e-05,
+      "loss": 0.048,
+      "step": 6350
+    },
+    {
+      "epoch": 2.142618011382658,
+      "grad_norm": 0.2404586374759674,
+      "learning_rate": 3.2158799241156126e-05,
+      "loss": 0.0516,
+      "step": 6400
+    },
+    {
+      "epoch": 2.1593572145965854,
+      "grad_norm": 0.21710112690925598,
+      "learning_rate": 3.201930588104006e-05,
+      "loss": 0.0472,
+      "step": 6450
+    },
+    {
+      "epoch": 2.176096417810512,
+      "grad_norm": 0.14395031332969666,
+      "learning_rate": 3.187981252092401e-05,
+      "loss": 0.0438,
+      "step": 6500
+    },
+    {
+      "epoch": 2.192835621024439,
+      "grad_norm": 0.20882932841777802,
+      "learning_rate": 3.174031916080795e-05,
+      "loss": 0.0488,
+      "step": 6550
+    },
+    {
+      "epoch": 2.2095748242383664,
+      "grad_norm": 0.13824905455112457,
+      "learning_rate": 3.160082580069189e-05,
+      "loss": 0.0464,
+      "step": 6600
+    },
+    {
+      "epoch": 2.2263140274522932,
+      "grad_norm": 0.1783577799797058,
+      "learning_rate": 3.146133244057583e-05,
+      "loss": 0.0459,
+      "step": 6650
+    },
+    {
+      "epoch": 2.24305323066622,
+      "grad_norm": 0.22147531807422638,
+      "learning_rate": 3.132183908045977e-05,
+      "loss": 0.0476,
+      "step": 6700
+    },
+    {
+      "epoch": 2.2597924338801474,
+      "grad_norm": 0.17393821477890015,
+      "learning_rate": 3.118234572034371e-05,
+      "loss": 0.0436,
+      "step": 6750
+    },
+    {
+      "epoch": 2.2765316370940742,
+      "grad_norm": 0.15850785374641418,
+      "learning_rate": 3.104285236022765e-05,
+      "loss": 0.0476,
+      "step": 6800
+    },
+    {
+      "epoch": 2.2932708403080015,
+      "grad_norm": 0.16232182085514069,
+      "learning_rate": 3.09033590001116e-05,
+      "loss": 0.0473,
+      "step": 6850
+    },
+    {
+      "epoch": 2.3100100435219284,
+      "grad_norm": 0.1816001981496811,
+      "learning_rate": 3.076386563999554e-05,
+      "loss": 0.0427,
+      "step": 6900
+    },
+    {
+      "epoch": 2.3267492467358553,
+      "grad_norm": 0.13417834043502808,
+      "learning_rate": 3.062437227987948e-05,
+      "loss": 0.0448,
+      "step": 6950
+    },
+    {
+      "epoch": 2.3434884499497826,
+      "grad_norm": 0.12576530873775482,
+      "learning_rate": 3.048487891976342e-05,
+      "loss": 0.0453,
+      "step": 7000
+    },
+    {
+      "epoch": 2.3602276531637094,
+      "grad_norm": 0.33120718598365784,
+      "learning_rate": 3.0345385559647362e-05,
+      "loss": 0.0462,
+      "step": 7050
+    },
+    {
+      "epoch": 2.3769668563776363,
+      "grad_norm": 0.22310969233512878,
+      "learning_rate": 3.0205892199531306e-05,
+      "loss": 0.0475,
+      "step": 7100
+    },
+    {
+      "epoch": 2.3937060595915636,
+      "grad_norm": 0.18150626122951508,
+      "learning_rate": 3.0066398839415243e-05,
+      "loss": 0.0489,
+      "step": 7150
+    },
+    {
+      "epoch": 2.4104452628054904,
+      "grad_norm": 0.28730452060699463,
+      "learning_rate": 2.9926905479299187e-05,
+      "loss": 0.0536,
+      "step": 7200
+    },
+    {
+      "epoch": 2.4271844660194173,
+      "grad_norm": 0.1918480098247528,
+      "learning_rate": 2.9787412119183128e-05,
+      "loss": 0.0426,
+      "step": 7250
+    },
+    {
+      "epoch": 2.4439236692333446,
+      "grad_norm": 0.16158398985862732,
+      "learning_rate": 2.964791875906707e-05,
+      "loss": 0.0458,
+      "step": 7300
+    },
+    {
+      "epoch": 2.4606628724472714,
+      "grad_norm": 0.27141231298446655,
+      "learning_rate": 2.9508425398951012e-05,
+      "loss": 0.0454,
+      "step": 7350
+    },
+    {
+      "epoch": 2.4774020756611987,
+      "grad_norm": 0.1777345836162567,
+      "learning_rate": 2.936893203883495e-05,
+      "loss": 0.0435,
+      "step": 7400
+    },
+    {
+      "epoch": 2.4941412788751256,
+      "grad_norm": 0.14735421538352966,
+      "learning_rate": 2.9229438678718897e-05,
+      "loss": 0.0489,
+      "step": 7450
+    },
+    {
+      "epoch": 2.5108804820890525,
+      "grad_norm": 0.1486055999994278,
+      "learning_rate": 2.9089945318602834e-05,
+      "loss": 0.0477,
+      "step": 7500
+    },
+    {
+      "epoch": 2.5276196853029793,
+      "grad_norm": 0.17078754305839539,
+      "learning_rate": 2.895045195848678e-05,
+      "loss": 0.0444,
+      "step": 7550
+    },
+    {
+      "epoch": 2.5443588885169066,
+      "grad_norm": 0.19276435673236847,
+      "learning_rate": 2.881095859837072e-05,
+      "loss": 0.0486,
+      "step": 7600
+    },
+    {
+      "epoch": 2.5610980917308335,
+      "grad_norm": 0.21209606528282166,
+      "learning_rate": 2.8671465238254656e-05,
+      "loss": 0.0497,
+      "step": 7650
+    },
+    {
+      "epoch": 2.5778372949447608,
+      "grad_norm": 0.21018877625465393,
+      "learning_rate": 2.8531971878138604e-05,
+      "loss": 0.0441,
+      "step": 7700
+    },
+    {
+      "epoch": 2.5945764981586876,
+      "grad_norm": 0.15666617453098297,
+      "learning_rate": 2.839247851802254e-05,
+      "loss": 0.0467,
+      "step": 7750
+    },
+    {
+      "epoch": 2.6113157013726145,
+      "grad_norm": 0.1940685212612152,
+      "learning_rate": 2.8252985157906485e-05,
+      "loss": 0.0523,
+      "step": 7800
+    },
+    {
+      "epoch": 2.628054904586542,
+      "grad_norm": 0.28480586409568787,
+      "learning_rate": 2.8113491797790426e-05,
+      "loss": 0.0481,
+      "step": 7850
+    },
+    {
+      "epoch": 2.6447941078004686,
+      "grad_norm": 0.2223973125219345,
+      "learning_rate": 2.797399843767437e-05,
+      "loss": 0.0432,
+      "step": 7900
+    },
+    {
+      "epoch": 2.661533311014396,
+      "grad_norm": 0.15986157953739166,
+      "learning_rate": 2.783450507755831e-05,
+      "loss": 0.0454,
+      "step": 7950
+    },
+    {
+      "epoch": 2.678272514228323,
+      "grad_norm": 0.1384258270263672,
+      "learning_rate": 2.7695011717442248e-05,
+      "loss": 0.0477,
+      "step": 8000
+    },
+    {
+      "epoch": 2.6950117174422497,
+      "grad_norm": 0.1721869707107544,
+      "learning_rate": 2.7555518357326192e-05,
+      "loss": 0.0453,
+      "step": 8050
+    },
+    {
+      "epoch": 2.7117509206561765,
+      "grad_norm": 0.20737840235233307,
+      "learning_rate": 2.7416024997210132e-05,
+      "loss": 0.0504,
+      "step": 8100
+    },
+    {
+      "epoch": 2.728490123870104,
+      "grad_norm": 0.18823584914207458,
+      "learning_rate": 2.7276531637094077e-05,
+      "loss": 0.0453,
+      "step": 8150
+    },
+    {
+      "epoch": 2.7452293270840307,
+      "grad_norm": 0.13201962411403656,
+      "learning_rate": 2.7137038276978017e-05,
+      "loss": 0.0433,
+      "step": 8200
+    },
+    {
+      "epoch": 2.761968530297958,
+      "grad_norm": 0.1443973183631897,
+      "learning_rate": 2.699754491686196e-05,
+      "loss": 0.0486,
+      "step": 8250
+    },
+    {
+      "epoch": 2.778707733511885,
+      "grad_norm": 0.29314514994621277,
+      "learning_rate": 2.68580515567459e-05,
+      "loss": 0.05,
+      "step": 8300
+    },
+    {
+      "epoch": 2.7954469367258117,
+      "grad_norm": 0.14852124452590942,
+      "learning_rate": 2.671855819662984e-05,
+      "loss": 0.0495,
+      "step": 8350
+    },
+    {
+      "epoch": 2.812186139939739,
+      "grad_norm": 0.19024662673473358,
+      "learning_rate": 2.6579064836513783e-05,
+      "loss": 0.0508,
+      "step": 8400
+    },
+    {
+      "epoch": 2.828925343153666,
+      "grad_norm": 0.1745578795671463,
+      "learning_rate": 2.6439571476397724e-05,
+      "loss": 0.0443,
+      "step": 8450
+    },
+    {
+      "epoch": 2.845664546367593,
+      "grad_norm": 0.18390017747879028,
+      "learning_rate": 2.6300078116281668e-05,
+      "loss": 0.0468,
+      "step": 8500
+    },
+    {
+      "epoch": 2.86240374958152,
+      "grad_norm": 0.22483347356319427,
+      "learning_rate": 2.616058475616561e-05,
+      "loss": 0.0467,
+      "step": 8550
+    },
+    {
+      "epoch": 2.879142952795447,
+      "grad_norm": 0.18160563707351685,
+      "learning_rate": 2.6021091396049553e-05,
+      "loss": 0.0441,
+      "step": 8600
+    },
+    {
+      "epoch": 2.8958821560093737,
+      "grad_norm": 0.13408955931663513,
+      "learning_rate": 2.588159803593349e-05,
+      "loss": 0.0446,
+      "step": 8650
+    },
+    {
+      "epoch": 2.912621359223301,
+      "grad_norm": 0.16038326919078827,
+      "learning_rate": 2.574210467581743e-05,
+      "loss": 0.0456,
+      "step": 8700
+    },
+    {
+      "epoch": 2.929360562437228,
+      "grad_norm": 0.22738413512706757,
+      "learning_rate": 2.5602611315701375e-05,
+      "loss": 0.0479,
+      "step": 8750
+    },
+    {
+      "epoch": 2.946099765651155,
+      "grad_norm": 0.20327210426330566,
+      "learning_rate": 2.5463117955585315e-05,
+      "loss": 0.0511,
+      "step": 8800
+    },
+    {
+      "epoch": 2.962838968865082,
+      "grad_norm": 0.15756353735923767,
+      "learning_rate": 2.532362459546926e-05,
+      "loss": 0.0426,
+      "step": 8850
+    },
+    {
+      "epoch": 2.979578172079009,
+      "grad_norm": 0.1305045783519745,
+      "learning_rate": 2.5184131235353197e-05,
+      "loss": 0.0442,
+      "step": 8900
+    },
+    {
+      "epoch": 2.996317375292936,
+      "grad_norm": 0.1610562801361084,
+      "learning_rate": 2.5044637875237144e-05,
+      "loss": 0.0467,
+      "step": 8950
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.04702676460146904,
+      "eval_runtime": 52.8989,
+      "eval_samples_per_second": 225.865,
+      "eval_steps_per_second": 14.121,
+      "step": 8961
+    },
+    {
+      "epoch": 3.013056578506863,
+      "grad_norm": 0.2042045295238495,
+      "learning_rate": 2.490514451512108e-05,
+      "loss": 0.0457,
+      "step": 9000
+    },
+    {
+      "epoch": 3.02979578172079,
+      "grad_norm": 0.27092283964157104,
+      "learning_rate": 2.4765651155005022e-05,
+      "loss": 0.0437,
+      "step": 9050
+    },
+    {
+      "epoch": 3.046534984934717,
+      "grad_norm": 0.18729308247566223,
+      "learning_rate": 2.4626157794888966e-05,
+      "loss": 0.039,
+      "step": 9100
+    },
+    {
+      "epoch": 3.063274188148644,
+      "grad_norm": 0.18845289945602417,
+      "learning_rate": 2.4486664434772903e-05,
+      "loss": 0.0422,
+      "step": 9150
+    },
+    {
+      "epoch": 3.0800133913625714,
+      "grad_norm": 0.17593874037265778,
+      "learning_rate": 2.4347171074656847e-05,
+      "loss": 0.0384,
+      "step": 9200
+    },
+    {
+      "epoch": 3.096752594576498,
+      "grad_norm": 0.23149123787879944,
+      "learning_rate": 2.4207677714540788e-05,
+      "loss": 0.0443,
+      "step": 9250
+    },
+    {
+      "epoch": 3.113491797790425,
+      "grad_norm": 0.1968075931072235,
+      "learning_rate": 2.4068184354424732e-05,
+      "loss": 0.0402,
+      "step": 9300
+    },
+    {
+      "epoch": 3.1302310010043524,
+      "grad_norm": 0.16893354058265686,
+      "learning_rate": 2.3928690994308673e-05,
+      "loss": 0.043,
+      "step": 9350
+    },
+    {
+      "epoch": 3.146970204218279,
+      "grad_norm": 0.161103293299675,
+      "learning_rate": 2.3789197634192613e-05,
+      "loss": 0.0414,
+      "step": 9400
+    },
+    {
+      "epoch": 3.163709407432206,
+      "grad_norm": 0.22712625563144684,
+      "learning_rate": 2.3649704274076557e-05,
+      "loss": 0.0426,
+      "step": 9450
+    },
+    {
+      "epoch": 3.1804486106461334,
+      "grad_norm": 0.12811464071273804,
+      "learning_rate": 2.3510210913960495e-05,
+      "loss": 0.0406,
+      "step": 9500
+    },
+    {
+      "epoch": 3.1971878138600602,
+      "grad_norm": 0.16762731969356537,
+      "learning_rate": 2.337071755384444e-05,
+      "loss": 0.0417,
+      "step": 9550
+    },
+    {
+      "epoch": 3.213927017073987,
+      "grad_norm": 0.14003124833106995,
+      "learning_rate": 2.323122419372838e-05,
+      "loss": 0.0453,
+      "step": 9600
+    },
+    {
+      "epoch": 3.2306662202879144,
+      "grad_norm": 0.16891060769557953,
+      "learning_rate": 2.309173083361232e-05,
+      "loss": 0.0445,
+      "step": 9650
+    },
+    {
+      "epoch": 3.2474054235018412,
+      "grad_norm": 0.16900208592414856,
+      "learning_rate": 2.2952237473496264e-05,
+      "loss": 0.039,
+      "step": 9700
+    },
+    {
+      "epoch": 3.264144626715768,
+      "grad_norm": 0.17154955863952637,
+      "learning_rate": 2.2812744113380205e-05,
+      "loss": 0.0403,
+      "step": 9750
+    },
+    {
+      "epoch": 3.2808838299296954,
+      "grad_norm": 0.1620296835899353,
+      "learning_rate": 2.2673250753264145e-05,
+      "loss": 0.0406,
+      "step": 9800
+    },
+    {
+      "epoch": 3.2976230331436223,
+      "grad_norm": 0.14487063884735107,
+      "learning_rate": 2.2533757393148086e-05,
+      "loss": 0.0467,
+      "step": 9850
+    },
+    {
+      "epoch": 3.3143622363575496,
+      "grad_norm": 0.13799156248569489,
+      "learning_rate": 2.2394264033032027e-05,
+      "loss": 0.0433,
+      "step": 9900
+    },
+    {
+      "epoch": 3.3311014395714764,
+      "grad_norm": 0.1507265418767929,
+      "learning_rate": 2.225477067291597e-05,
+      "loss": 0.0446,
+      "step": 9950
+    },
+    {
+      "epoch": 3.3478406427854033,
+      "grad_norm": 0.1605840027332306,
+      "learning_rate": 2.211527731279991e-05,
+      "loss": 0.0415,
+      "step": 10000
+    },
+    {
+      "epoch": 3.3645798459993306,
+      "grad_norm": 0.11910756677389145,
+      "learning_rate": 2.1975783952683855e-05,
+      "loss": 0.0453,
+      "step": 10050
+    },
+    {
+      "epoch": 3.3813190492132574,
+      "grad_norm": 0.15115414559841156,
+      "learning_rate": 2.1836290592567796e-05,
+      "loss": 0.0395,
+      "step": 10100
+    },
+    {
+      "epoch": 3.3980582524271843,
+      "grad_norm": 0.2044568508863449,
+      "learning_rate": 2.1696797232451737e-05,
+      "loss": 0.0473,
+      "step": 10150
+    },
+    {
+      "epoch": 3.4147974556411116,
+      "grad_norm": 0.18123552203178406,
+      "learning_rate": 2.1557303872335677e-05,
+      "loss": 0.0411,
+      "step": 10200
+    },
+    {
+      "epoch": 3.4315366588550384,
+      "grad_norm": 0.6018120646476746,
+      "learning_rate": 2.1417810512219618e-05,
+      "loss": 0.0413,
+      "step": 10250
+    },
+    {
+      "epoch": 3.4482758620689653,
+      "grad_norm": 0.22490189969539642,
+      "learning_rate": 2.1278317152103562e-05,
+      "loss": 0.0437,
+      "step": 10300
+    },
+    {
+      "epoch": 3.4650150652828926,
+      "grad_norm": 0.1784990429878235,
+      "learning_rate": 2.1138823791987503e-05,
+      "loss": 0.0453,
+      "step": 10350
+    },
+    {
+      "epoch": 3.4817542684968195,
+      "grad_norm": 0.15248402953147888,
+      "learning_rate": 2.0999330431871443e-05,
+      "loss": 0.0456,
+      "step": 10400
+    },
+    {
+      "epoch": 3.4984934717107468,
+      "grad_norm": 0.15146291255950928,
+      "learning_rate": 2.0859837071755387e-05,
+      "loss": 0.0393,
+      "step": 10450
+    },
+    {
+      "epoch": 3.5152326749246736,
+      "grad_norm": 0.1662750393152237,
+      "learning_rate": 2.0720343711639325e-05,
+      "loss": 0.0408,
+      "step": 10500
+    },
+    {
+      "epoch": 3.5319718781386005,
+      "grad_norm": 0.1244506984949112,
+      "learning_rate": 2.058085035152327e-05,
+      "loss": 0.0426,
+      "step": 10550
+    },
+    {
+      "epoch": 3.5487110813525278,
+      "grad_norm": 0.1180344969034195,
+      "learning_rate": 2.044135699140721e-05,
+      "loss": 0.0434,
+      "step": 10600
+    },
+    {
+      "epoch": 3.5654502845664546,
+      "grad_norm": 0.15951013565063477,
+      "learning_rate": 2.030186363129115e-05,
+      "loss": 0.0387,
+      "step": 10650
+    },
+    {
+      "epoch": 3.582189487780382,
+      "grad_norm": 0.16064217686653137,
+      "learning_rate": 2.0162370271175094e-05,
+      "loss": 0.0445,
+      "step": 10700
+    },
+    {
+      "epoch": 3.598928690994309,
+      "grad_norm": 0.17813698947429657,
+      "learning_rate": 2.0022876911059035e-05,
+      "loss": 0.0407,
+      "step": 10750
+    },
+    {
+      "epoch": 3.6156678942082356,
+      "grad_norm": 0.1256450116634369,
+      "learning_rate": 1.988338355094298e-05,
+      "loss": 0.0462,
+      "step": 10800
+    },
+    {
+      "epoch": 3.6324070974221625,
+      "grad_norm": 0.14016403257846832,
+      "learning_rate": 1.9743890190826916e-05,
+      "loss": 0.0456,
+      "step": 10850
+    },
+    {
+      "epoch": 3.64914630063609,
+      "grad_norm": 0.1396850347518921,
+      "learning_rate": 1.9604396830710857e-05,
+      "loss": 0.0403,
+      "step": 10900
+    },
+    {
+      "epoch": 3.6658855038500167,
+      "grad_norm": 0.17943057417869568,
+      "learning_rate": 1.94649034705948e-05,
+      "loss": 0.0417,
+      "step": 10950
+    },
+    {
+      "epoch": 3.682624707063944,
+      "grad_norm": 0.14947953820228577,
+      "learning_rate": 1.932541011047874e-05,
+      "loss": 0.0421,
+      "step": 11000
+    },
+    {
+      "epoch": 3.699363910277871,
+      "grad_norm": 0.12628613412380219,
+      "learning_rate": 1.9185916750362685e-05,
+      "loss": 0.0435,
+      "step": 11050
+    },
+    {
+      "epoch": 3.7161031134917977,
+      "grad_norm": 0.2205984890460968,
+      "learning_rate": 1.9046423390246626e-05,
+      "loss": 0.0396,
+      "step": 11100
+    },
+    {
+      "epoch": 3.732842316705725,
+      "grad_norm": 0.13236357271671295,
+      "learning_rate": 1.8906930030130567e-05,
+      "loss": 0.0405,
+      "step": 11150
+    },
+    {
+      "epoch": 3.749581519919652,
+      "grad_norm": 0.15023528039455414,
+      "learning_rate": 1.8767436670014507e-05,
+      "loss": 0.0434,
+      "step": 11200
+    },
+    {
+      "epoch": 3.7663207231335787,
+      "grad_norm": 0.1427326649427414,
+      "learning_rate": 1.8627943309898448e-05,
+      "loss": 0.0437,
+      "step": 11250
+    },
+    {
+      "epoch": 3.783059926347506,
+      "grad_norm": 0.1890624761581421,
+      "learning_rate": 1.8488449949782392e-05,
+      "loss": 0.0408,
+      "step": 11300
+    },
+    {
+      "epoch": 3.799799129561433,
+      "grad_norm": 0.27970972657203674,
+      "learning_rate": 1.8348956589666333e-05,
+      "loss": 0.0441,
+      "step": 11350
+    },
+    {
+      "epoch": 3.8165383327753597,
+      "grad_norm": 0.12823455035686493,
+      "learning_rate": 1.8209463229550273e-05,
+      "loss": 0.0412,
+      "step": 11400
+    },
+    {
+      "epoch": 3.833277535989287,
+      "grad_norm": 0.1442965269088745,
+      "learning_rate": 1.8069969869434218e-05,
+      "loss": 0.0416,
+      "step": 11450
+    },
+    {
+      "epoch": 3.850016739203214,
+      "grad_norm": 0.13739417493343353,
+      "learning_rate": 1.7930476509318158e-05,
+      "loss": 0.0397,
+      "step": 11500
+    },
+    {
+      "epoch": 3.866755942417141,
+      "grad_norm": 0.16616705060005188,
+      "learning_rate": 1.77909831492021e-05,
+      "loss": 0.0413,
+      "step": 11550
+    },
+    {
+      "epoch": 3.883495145631068,
+      "grad_norm": 0.23060384392738342,
+      "learning_rate": 1.765148978908604e-05,
+      "loss": 0.0447,
+      "step": 11600
+    },
+    {
+      "epoch": 3.900234348844995,
+      "grad_norm": 0.2936810553073883,
+      "learning_rate": 1.751199642896998e-05,
+      "loss": 0.0409,
+      "step": 11650
+    },
+    {
+      "epoch": 3.9169735520589217,
+      "grad_norm": 0.17367126047611237,
+      "learning_rate": 1.7372503068853924e-05,
+      "loss": 0.0411,
+      "step": 11700
+    },
+    {
+      "epoch": 3.933712755272849,
+      "grad_norm": 0.14550547301769257,
+      "learning_rate": 1.7233009708737865e-05,
+      "loss": 0.0445,
+      "step": 11750
+    },
+    {
+      "epoch": 3.950451958486776,
+      "grad_norm": 0.13322454690933228,
+      "learning_rate": 1.709351634862181e-05,
+      "loss": 0.0444,
+      "step": 11800
+    },
+    {
+      "epoch": 3.967191161700703,
+      "grad_norm": 0.13606959581375122,
+      "learning_rate": 1.6954022988505746e-05,
+      "loss": 0.041,
+      "step": 11850
+    },
+    {
+      "epoch": 3.98393036491463,
+      "grad_norm": 0.14227426052093506,
+      "learning_rate": 1.681452962838969e-05,
+      "loss": 0.0412,
+      "step": 11900
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.045825306326150894,
+      "eval_runtime": 48.668,
+      "eval_samples_per_second": 245.5,
+      "eval_steps_per_second": 15.349,
+      "step": 11948
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 17922,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.5921130506223616e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-11948/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37dd546b69fb60d8deb15a8b88e40b23e367c0e9f5a053ea3ae7c730b3874f2e
+size 5304

checkpoint-11948/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-14935/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "ckartal/english-to-turkish-finetuned-model",
+  "activation_dropout": 0.0,
+  "activation_function": "swish",
+  "architectures": [
+    "MarianMTModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 512,
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 59993,
+  "decoder_vocab_size": 59994,
+  "dropout": 0.1,
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": null,
+  "max_position_embeddings": 512,
+  "model_type": "marian",
+  "normalize_embedding": false,
+  "num_beams": null,
+  "num_hidden_layers": 6,
+  "pad_token_id": 59993,
+  "scale_embedding": true,
+  "share_encoder_decoder_embeddings": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 59994
+}

checkpoint-14935/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "bad_words_ids": [
+    [
+      59993
+    ]
+  ],
+  "bos_token_id": 0,
+  "decoder_start_token_id": 59993,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "max_length": 512,
+  "num_beams": 6,
+  "pad_token_id": 59993,
+  "renormalize_logits": true,
+  "transformers_version": "4.49.0"
+}

checkpoint-14935/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10b7619757ec37001c328bb68f33749e3183758158980f8c3a5c346dcb866279
+size 299690728

checkpoint-14935/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ade4432f6ff505cb413003c71d8fa4ceedcadd0040b992e0df392dbd10889c3d
+size 599054970

checkpoint-14935/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f721aaec70579761bcfa3d2d4441178be84c6f3dcc5f6d933e54b21fe8e6cc2
+size 14244

checkpoint-14935/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36a823fbff578c8a82c44346b99397ed351d5e6783ee511d8e35285b13133caf
+size 988

checkpoint-14935/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f321649b2aa5ecd287cb6a64d13837fccb0d3045d069aabc11f66246e4800051
+size 1064

checkpoint-14935/source.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98eb24f0995a9d5f7cb0fb628c474628b1d2284615e881e857d062c0b651ce10
+size 793920

checkpoint-14935/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-14935/target.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45cc6000ed513cdca8f80739087fbcbf9933dc50c9ae36c319c9670882f72e1b
+size 837876

checkpoint-14935/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59993": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "return_tensors": "pt",
+  "separate_vocabs": false,
+  "source_lang": "eng",
+  "sp_model_kwargs": {},
+  "target_lang": "tur",
+  "tokenizer_class": "MarianTokenizer",
+  "unk_token": "<unk>"
+}

checkpoint-14935/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2159 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 14935,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016739203213927016,
+      "grad_norm": 0.439177542924881,
+      "learning_rate": 4.987445597589555e-05,
+      "loss": 1.5222,
+      "step": 50
+    },
+    {
+      "epoch": 0.03347840642785403,
+      "grad_norm": 0.4248828887939453,
+      "learning_rate": 4.973496261577949e-05,
+      "loss": 0.1842,
+      "step": 100
+    },
+    {
+      "epoch": 0.05021760964178105,
+      "grad_norm": 0.3019009232521057,
+      "learning_rate": 4.9595469255663436e-05,
+      "loss": 0.1471,
+      "step": 150
+    },
+    {
+      "epoch": 0.06695681285570806,
+      "grad_norm": 0.2518245577812195,
+      "learning_rate": 4.9455975895547376e-05,
+      "loss": 0.1306,
+      "step": 200
+    },
+    {
+      "epoch": 0.08369601606963509,
+      "grad_norm": 0.3660012185573578,
+      "learning_rate": 4.931648253543131e-05,
+      "loss": 0.1276,
+      "step": 250
+    },
+    {
+      "epoch": 0.1004352192835621,
+      "grad_norm": 0.32854148745536804,
+      "learning_rate": 4.917698917531526e-05,
+      "loss": 0.1048,
+      "step": 300
+    },
+    {
+      "epoch": 0.11717442249748912,
+      "grad_norm": 0.24879515171051025,
+      "learning_rate": 4.90374958151992e-05,
+      "loss": 0.1054,
+      "step": 350
+    },
+    {
+      "epoch": 0.13391362571141613,
+      "grad_norm": 0.36416563391685486,
+      "learning_rate": 4.889800245508314e-05,
+      "loss": 0.0985,
+      "step": 400
+    },
+    {
+      "epoch": 0.15065282892534315,
+      "grad_norm": 0.33641186356544495,
+      "learning_rate": 4.875850909496708e-05,
+      "loss": 0.1044,
+      "step": 450
+    },
+    {
+      "epoch": 0.16739203213927017,
+      "grad_norm": 0.32909244298934937,
+      "learning_rate": 4.861901573485103e-05,
+      "loss": 0.1089,
+      "step": 500
+    },
+    {
+      "epoch": 0.1841312353531972,
+      "grad_norm": 0.36060285568237305,
+      "learning_rate": 4.847952237473497e-05,
+      "loss": 0.09,
+      "step": 550
+    },
+    {
+      "epoch": 0.2008704385671242,
+      "grad_norm": 0.2510785758495331,
+      "learning_rate": 4.83400290146189e-05,
+      "loss": 0.0884,
+      "step": 600
+    },
+    {
+      "epoch": 0.21760964178105122,
+      "grad_norm": 0.22478719055652618,
+      "learning_rate": 4.820053565450285e-05,
+      "loss": 0.0866,
+      "step": 650
+    },
+    {
+      "epoch": 0.23434884499497824,
+      "grad_norm": 0.37321263551712036,
+      "learning_rate": 4.806104229438679e-05,
+      "loss": 0.0884,
+      "step": 700
+    },
+    {
+      "epoch": 0.25108804820890523,
+      "grad_norm": 0.2660929262638092,
+      "learning_rate": 4.792154893427073e-05,
+      "loss": 0.0819,
+      "step": 750
+    },
+    {
+      "epoch": 0.26782725142283226,
+      "grad_norm": 0.2338525801897049,
+      "learning_rate": 4.778205557415467e-05,
+      "loss": 0.0845,
+      "step": 800
+    },
+    {
+      "epoch": 0.2845664546367593,
+      "grad_norm": 0.308557391166687,
+      "learning_rate": 4.764256221403862e-05,
+      "loss": 0.0815,
+      "step": 850
+    },
+    {
+      "epoch": 0.3013056578506863,
+      "grad_norm": 0.27098262310028076,
+      "learning_rate": 4.750306885392255e-05,
+      "loss": 0.0833,
+      "step": 900
+    },
+    {
+      "epoch": 0.3180448610646133,
+      "grad_norm": 0.23054952919483185,
+      "learning_rate": 4.736357549380649e-05,
+      "loss": 0.0806,
+      "step": 950
+    },
+    {
+      "epoch": 0.33478406427854035,
+      "grad_norm": 0.21355900168418884,
+      "learning_rate": 4.722408213369044e-05,
+      "loss": 0.073,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3515232674924674,
+      "grad_norm": 0.20395708084106445,
+      "learning_rate": 4.708458877357438e-05,
+      "loss": 0.0775,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3682624707063944,
+      "grad_norm": 0.21063613891601562,
+      "learning_rate": 4.694509541345832e-05,
+      "loss": 0.0789,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3850016739203214,
+      "grad_norm": 0.20589284598827362,
+      "learning_rate": 4.680560205334226e-05,
+      "loss": 0.0809,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4017408771342484,
+      "grad_norm": 0.27975228428840637,
+      "learning_rate": 4.666610869322621e-05,
+      "loss": 0.078,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184800803481754,
+      "grad_norm": 0.2529745399951935,
+      "learning_rate": 4.6526615333110144e-05,
+      "loss": 0.0704,
+      "step": 1250
+    },
+    {
+      "epoch": 0.43521928356210243,
+      "grad_norm": 0.2205154448747635,
+      "learning_rate": 4.6387121972994084e-05,
+      "loss": 0.0733,
+      "step": 1300
+    },
+    {
+      "epoch": 0.45195848677602946,
+      "grad_norm": 0.2254629135131836,
+      "learning_rate": 4.624762861287803e-05,
+      "loss": 0.0751,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4686976899899565,
+      "grad_norm": 0.17614957690238953,
+      "learning_rate": 4.610813525276197e-05,
+      "loss": 0.0747,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4854368932038835,
+      "grad_norm": 0.15940478444099426,
+      "learning_rate": 4.596864189264591e-05,
+      "loss": 0.0698,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5021760964178105,
+      "grad_norm": 0.1869521141052246,
+      "learning_rate": 4.5829148532529854e-05,
+      "loss": 0.0721,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5189152996317375,
+      "grad_norm": 0.36063650250434875,
+      "learning_rate": 4.5689655172413794e-05,
+      "loss": 0.0706,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5356545028456645,
+      "grad_norm": 0.16967014968395233,
+      "learning_rate": 4.5550161812297735e-05,
+      "loss": 0.0759,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5523937060595916,
+      "grad_norm": 0.29293423891067505,
+      "learning_rate": 4.5410668452181676e-05,
+      "loss": 0.0711,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5691329092735186,
+      "grad_norm": 0.3034748136997223,
+      "learning_rate": 4.527117509206562e-05,
+      "loss": 0.067,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5858721124874456,
+      "grad_norm": 0.1974593997001648,
+      "learning_rate": 4.513168173194956e-05,
+      "loss": 0.0701,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6026113157013726,
+      "grad_norm": 0.18101799488067627,
+      "learning_rate": 4.4992188371833505e-05,
+      "loss": 0.0717,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6193505189152997,
+      "grad_norm": 0.14422941207885742,
+      "learning_rate": 4.4852695011717445e-05,
+      "loss": 0.0686,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6360897221292267,
+      "grad_norm": 0.28663551807403564,
+      "learning_rate": 4.4713201651601386e-05,
+      "loss": 0.0646,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6528289253431536,
+      "grad_norm": 0.23879379034042358,
+      "learning_rate": 4.4573708291485327e-05,
+      "loss": 0.0684,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6695681285570807,
+      "grad_norm": 0.21389362215995789,
+      "learning_rate": 4.443421493136927e-05,
+      "loss": 0.066,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6863073317710077,
+      "grad_norm": 0.26841893792152405,
+      "learning_rate": 4.4294721571253215e-05,
+      "loss": 0.0717,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7030465349849347,
+      "grad_norm": 0.240205317735672,
+      "learning_rate": 4.415522821113715e-05,
+      "loss": 0.0697,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7197857381988617,
+      "grad_norm": 0.28098127245903015,
+      "learning_rate": 4.4015734851021096e-05,
+      "loss": 0.0713,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7365249414127888,
+      "grad_norm": 0.23308847844600677,
+      "learning_rate": 4.3876241490905037e-05,
+      "loss": 0.0667,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7532641446267158,
+      "grad_norm": 0.22748568654060364,
+      "learning_rate": 4.373674813078898e-05,
+      "loss": 0.0605,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7700033478406428,
+      "grad_norm": 0.3932187259197235,
+      "learning_rate": 4.359725477067292e-05,
+      "loss": 0.0676,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7867425510545698,
+      "grad_norm": 0.23918767273426056,
+      "learning_rate": 4.345776141055686e-05,
+      "loss": 0.0624,
+      "step": 2350
+    },
+    {
+      "epoch": 0.8034817542684968,
+      "grad_norm": 0.3068426549434662,
+      "learning_rate": 4.33182680504408e-05,
+      "loss": 0.0664,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8202209574824239,
+      "grad_norm": 0.17977873980998993,
+      "learning_rate": 4.317877469032474e-05,
+      "loss": 0.0726,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8369601606963508,
+      "grad_norm": 0.16876642405986786,
+      "learning_rate": 4.303928133020869e-05,
+      "loss": 0.0639,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8536993639102779,
+      "grad_norm": 0.17980250716209412,
+      "learning_rate": 4.289978797009263e-05,
+      "loss": 0.0701,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8704385671242049,
+      "grad_norm": 0.1711459904909134,
+      "learning_rate": 4.276029460997656e-05,
+      "loss": 0.063,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8871777703381319,
+      "grad_norm": 0.443228542804718,
+      "learning_rate": 4.262080124986051e-05,
+      "loss": 0.0675,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9039169735520589,
+      "grad_norm": 0.2098589390516281,
+      "learning_rate": 4.248130788974445e-05,
+      "loss": 0.062,
+      "step": 2700
+    },
+    {
+      "epoch": 0.920656176765986,
+      "grad_norm": 0.3022039234638214,
+      "learning_rate": 4.234181452962839e-05,
+      "loss": 0.07,
+      "step": 2750
+    },
+    {
+      "epoch": 0.937395379979913,
+      "grad_norm": 0.19368910789489746,
+      "learning_rate": 4.220232116951233e-05,
+      "loss": 0.0621,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9541345831938399,
+      "grad_norm": 0.18753108382225037,
+      "learning_rate": 4.206282780939628e-05,
+      "loss": 0.0631,
+      "step": 2850
+    },
+    {
+      "epoch": 0.970873786407767,
+      "grad_norm": 0.15517786145210266,
+      "learning_rate": 4.192333444928022e-05,
+      "loss": 0.0641,
+      "step": 2900
+    },
+    {
+      "epoch": 0.987612989621694,
+      "grad_norm": 0.11765792220830917,
+      "learning_rate": 4.178384108916415e-05,
+      "loss": 0.0612,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.05521286651492119,
+      "eval_runtime": 50.415,
+      "eval_samples_per_second": 236.993,
+      "eval_steps_per_second": 14.817,
+      "step": 2987
+    },
+    {
+      "epoch": 1.004352192835621,
+      "grad_norm": 0.2691793739795685,
+      "learning_rate": 4.16443477290481e-05,
+      "loss": 0.059,
+      "step": 3000
+    },
+    {
+      "epoch": 1.021091396049548,
+      "grad_norm": 0.394694060087204,
+      "learning_rate": 4.150485436893204e-05,
+      "loss": 0.0566,
+      "step": 3050
+    },
+    {
+      "epoch": 1.037830599263475,
+      "grad_norm": 0.19438503682613373,
+      "learning_rate": 4.136536100881598e-05,
+      "loss": 0.0591,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0545698024774022,
+      "grad_norm": 0.21350933611392975,
+      "learning_rate": 4.122586764869992e-05,
+      "loss": 0.0509,
+      "step": 3150
+    },
+    {
+      "epoch": 1.071309005691329,
+      "grad_norm": 0.26747575402259827,
+      "learning_rate": 4.108637428858387e-05,
+      "loss": 0.0589,
+      "step": 3200
+    },
+    {
+      "epoch": 1.088048208905256,
+      "grad_norm": 0.31256961822509766,
+      "learning_rate": 4.0946880928467804e-05,
+      "loss": 0.0602,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1047874121191832,
+      "grad_norm": 0.18631280958652496,
+      "learning_rate": 4.0807387568351745e-05,
+      "loss": 0.0547,
+      "step": 3300
+    },
+    {
+      "epoch": 1.12152661533311,
+      "grad_norm": 0.18677473068237305,
+      "learning_rate": 4.066789420823569e-05,
+      "loss": 0.0543,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1382658185470371,
+      "grad_norm": 0.24535444378852844,
+      "learning_rate": 4.052840084811963e-05,
+      "loss": 0.0583,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1550050217609642,
+      "grad_norm": 0.1752105951309204,
+      "learning_rate": 4.038890748800357e-05,
+      "loss": 0.0504,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1717442249748913,
+      "grad_norm": 0.14743360877037048,
+      "learning_rate": 4.0249414127887514e-05,
+      "loss": 0.055,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1884834281888181,
+      "grad_norm": 0.11535945534706116,
+      "learning_rate": 4.010992076777146e-05,
+      "loss": 0.0552,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2052226314027452,
+      "grad_norm": 0.26563358306884766,
+      "learning_rate": 3.9970427407655395e-05,
+      "loss": 0.0552,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2219618346166723,
+      "grad_norm": 0.15104246139526367,
+      "learning_rate": 3.9830934047539336e-05,
+      "loss": 0.0575,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2387010378305994,
+      "grad_norm": 0.2198421210050583,
+      "learning_rate": 3.9691440687423283e-05,
+      "loss": 0.0567,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2554402410445262,
+      "grad_norm": 0.20177733898162842,
+      "learning_rate": 3.955194732730722e-05,
+      "loss": 0.0556,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2721794442584533,
+      "grad_norm": 0.36604830622673035,
+      "learning_rate": 3.9412453967191165e-05,
+      "loss": 0.0569,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2889186474723804,
+      "grad_norm": 0.18883727490901947,
+      "learning_rate": 3.9272960607075105e-05,
+      "loss": 0.0595,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3056578506863072,
+      "grad_norm": 0.14828617870807648,
+      "learning_rate": 3.9133467246959046e-05,
+      "loss": 0.0548,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3223970539002343,
+      "grad_norm": 0.19220437109470367,
+      "learning_rate": 3.899397388684299e-05,
+      "loss": 0.053,
+      "step": 3950
+    },
+    {
+      "epoch": 1.3391362571141614,
+      "grad_norm": 0.16049669682979584,
+      "learning_rate": 3.885448052672693e-05,
+      "loss": 0.0581,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3558754603280883,
+      "grad_norm": 0.22821515798568726,
+      "learning_rate": 3.8714987166610875e-05,
+      "loss": 0.0518,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3726146635420153,
+      "grad_norm": 0.1879580318927765,
+      "learning_rate": 3.857549380649481e-05,
+      "loss": 0.0574,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3893538667559424,
+      "grad_norm": 0.16026251018047333,
+      "learning_rate": 3.8436000446378756e-05,
+      "loss": 0.063,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4060930699698695,
+      "grad_norm": 0.26868143677711487,
+      "learning_rate": 3.82965070862627e-05,
+      "loss": 0.0571,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4228322731837966,
+      "grad_norm": 0.2529687285423279,
+      "learning_rate": 3.815701372614664e-05,
+      "loss": 0.0528,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4395714763977234,
+      "grad_norm": 0.19138221442699432,
+      "learning_rate": 3.801752036603058e-05,
+      "loss": 0.0584,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4563106796116505,
+      "grad_norm": 0.16359661519527435,
+      "learning_rate": 3.787802700591452e-05,
+      "loss": 0.0539,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4730498828255776,
+      "grad_norm": 0.1373494267463684,
+      "learning_rate": 3.7738533645798466e-05,
+      "loss": 0.0557,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4897890860395044,
+      "grad_norm": 0.15695162117481232,
+      "learning_rate": 3.75990402856824e-05,
+      "loss": 0.0491,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5065282892534315,
+      "grad_norm": 0.18462614715099335,
+      "learning_rate": 3.745954692556635e-05,
+      "loss": 0.0495,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5232674924673586,
+      "grad_norm": 0.27876704931259155,
+      "learning_rate": 3.732005356545029e-05,
+      "loss": 0.0523,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5400066956812855,
+      "grad_norm": 0.30491840839385986,
+      "learning_rate": 3.718056020533423e-05,
+      "loss": 0.0564,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5567458988952128,
+      "grad_norm": 0.18721336126327515,
+      "learning_rate": 3.704106684521817e-05,
+      "loss": 0.0524,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5734851021091396,
+      "grad_norm": 0.21216215193271637,
+      "learning_rate": 3.690157348510211e-05,
+      "loss": 0.0521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5902243053230665,
+      "grad_norm": 0.1368396282196045,
+      "learning_rate": 3.676208012498605e-05,
+      "loss": 0.056,
+      "step": 4750
+    },
+    {
+      "epoch": 1.6069635085369938,
+      "grad_norm": 0.13692086935043335,
+      "learning_rate": 3.662258676486999e-05,
+      "loss": 0.0443,
+      "step": 4800
+    },
+    {
+      "epoch": 1.6237027117509206,
+      "grad_norm": 0.11640128493309021,
+      "learning_rate": 3.648309340475394e-05,
+      "loss": 0.0488,
+      "step": 4850
+    },
+    {
+      "epoch": 1.6404419149648477,
+      "grad_norm": 0.19953882694244385,
+      "learning_rate": 3.634360004463788e-05,
+      "loss": 0.0553,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6571811181787748,
+      "grad_norm": 0.1966984122991562,
+      "learning_rate": 3.6204106684521813e-05,
+      "loss": 0.0536,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6739203213927016,
+      "grad_norm": 0.2324533313512802,
+      "learning_rate": 3.606461332440576e-05,
+      "loss": 0.0493,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6906595246066287,
+      "grad_norm": 0.16217607259750366,
+      "learning_rate": 3.59251199642897e-05,
+      "loss": 0.0503,
+      "step": 5050
+    },
+    {
+      "epoch": 1.7073987278205558,
+      "grad_norm": 0.23949602246284485,
+      "learning_rate": 3.578562660417364e-05,
+      "loss": 0.0556,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.21387897431850433,
+      "learning_rate": 3.564613324405758e-05,
+      "loss": 0.0548,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7408771342484097,
+      "grad_norm": 0.2055111676454544,
+      "learning_rate": 3.550663988394153e-05,
+      "loss": 0.06,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7576163374623368,
+      "grad_norm": 0.20280921459197998,
+      "learning_rate": 3.5367146523825464e-05,
+      "loss": 0.0508,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7743555406762637,
+      "grad_norm": 0.14165103435516357,
+      "learning_rate": 3.5227653163709405e-05,
+      "loss": 0.0581,
+      "step": 5300
+    },
+    {
+      "epoch": 1.791094743890191,
+      "grad_norm": 0.18099863827228546,
+      "learning_rate": 3.508815980359335e-05,
+      "loss": 0.0562,
+      "step": 5350
+    },
+    {
+      "epoch": 1.8078339471041178,
+      "grad_norm": 0.21743184328079224,
+      "learning_rate": 3.494866644347729e-05,
+      "loss": 0.0498,
+      "step": 5400
+    },
+    {
+      "epoch": 1.824573150318045,
+      "grad_norm": 0.20934534072875977,
+      "learning_rate": 3.4809173083361234e-05,
+      "loss": 0.0549,
+      "step": 5450
+    },
+    {
+      "epoch": 1.841312353531972,
+      "grad_norm": 0.1582174152135849,
+      "learning_rate": 3.4669679723245174e-05,
+      "loss": 0.0556,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8580515567458988,
+      "grad_norm": 0.1624903827905655,
+      "learning_rate": 3.453018636312912e-05,
+      "loss": 0.0516,
+      "step": 5550
+    },
+    {
+      "epoch": 1.874790759959826,
+      "grad_norm": 0.16255798935890198,
+      "learning_rate": 3.4390693003013056e-05,
+      "loss": 0.0542,
+      "step": 5600
+    },
+    {
+      "epoch": 1.891529963173753,
+      "grad_norm": 0.1269742250442505,
+      "learning_rate": 3.4251199642896996e-05,
+      "loss": 0.0565,
+      "step": 5650
+    },
+    {
+      "epoch": 1.9082691663876798,
+      "grad_norm": 0.15966229140758514,
+      "learning_rate": 3.4111706282780944e-05,
+      "loss": 0.0538,
+      "step": 5700
+    },
+    {
+      "epoch": 1.925008369601607,
+      "grad_norm": 0.21506330370903015,
+      "learning_rate": 3.3972212922664884e-05,
+      "loss": 0.0505,
+      "step": 5750
+    },
+    {
+      "epoch": 1.941747572815534,
+      "grad_norm": 0.2145415097475052,
+      "learning_rate": 3.3832719562548825e-05,
+      "loss": 0.0521,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9584867760294609,
+      "grad_norm": 0.10960496962070465,
+      "learning_rate": 3.3693226202432766e-05,
+      "loss": 0.0513,
+      "step": 5850
+    },
+    {
+      "epoch": 1.9752259792433882,
+      "grad_norm": 0.13635843992233276,
+      "learning_rate": 3.355373284231671e-05,
+      "loss": 0.0499,
+      "step": 5900
+    },
+    {
+      "epoch": 1.991965182457315,
+      "grad_norm": 0.1542210429906845,
+      "learning_rate": 3.341423948220065e-05,
+      "loss": 0.0556,
+      "step": 5950
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.04946442320942879,
+      "eval_runtime": 55.6114,
+      "eval_samples_per_second": 214.848,
+      "eval_steps_per_second": 13.432,
+      "step": 5974
+    },
+    {
+      "epoch": 2.008704385671242,
+      "grad_norm": 0.1718842089176178,
+      "learning_rate": 3.327474612208459e-05,
+      "loss": 0.0503,
+      "step": 6000
+    },
+    {
+      "epoch": 2.025443588885169,
+      "grad_norm": 0.1528020203113556,
+      "learning_rate": 3.3135252761968535e-05,
+      "loss": 0.0479,
+      "step": 6050
+    },
+    {
+      "epoch": 2.042182792099096,
+      "grad_norm": 0.19148772954940796,
+      "learning_rate": 3.299575940185247e-05,
+      "loss": 0.0443,
+      "step": 6100
+    },
+    {
+      "epoch": 2.058921995313023,
+      "grad_norm": 0.18125496804714203,
+      "learning_rate": 3.2856266041736416e-05,
+      "loss": 0.0473,
+      "step": 6150
+    },
+    {
+      "epoch": 2.07566119852695,
+      "grad_norm": 0.20772996544837952,
+      "learning_rate": 3.271677268162036e-05,
+      "loss": 0.0539,
+      "step": 6200
+    },
+    {
+      "epoch": 2.092400401740877,
+      "grad_norm": 0.2518468201160431,
+      "learning_rate": 3.25772793215043e-05,
+      "loss": 0.0468,
+      "step": 6250
+    },
+    {
+      "epoch": 2.1091396049548043,
+      "grad_norm": 0.1350301206111908,
+      "learning_rate": 3.243778596138824e-05,
+      "loss": 0.0431,
+      "step": 6300
+    },
+    {
+      "epoch": 2.125878808168731,
+      "grad_norm": 0.19141735136508942,
+      "learning_rate": 3.229829260127218e-05,
+      "loss": 0.048,
+      "step": 6350
+    },
+    {
+      "epoch": 2.142618011382658,
+      "grad_norm": 0.2404586374759674,
+      "learning_rate": 3.2158799241156126e-05,
+      "loss": 0.0516,
+      "step": 6400
+    },
+    {
+      "epoch": 2.1593572145965854,
+      "grad_norm": 0.21710112690925598,
+      "learning_rate": 3.201930588104006e-05,
+      "loss": 0.0472,
+      "step": 6450
+    },
+    {
+      "epoch": 2.176096417810512,
+      "grad_norm": 0.14395031332969666,
+      "learning_rate": 3.187981252092401e-05,
+      "loss": 0.0438,
+      "step": 6500
+    },
+    {
+      "epoch": 2.192835621024439,
+      "grad_norm": 0.20882932841777802,
+      "learning_rate": 3.174031916080795e-05,
+      "loss": 0.0488,
+      "step": 6550
+    },
+    {
+      "epoch": 2.2095748242383664,
+      "grad_norm": 0.13824905455112457,
+      "learning_rate": 3.160082580069189e-05,
+      "loss": 0.0464,
+      "step": 6600
+    },
+    {
+      "epoch": 2.2263140274522932,
+      "grad_norm": 0.1783577799797058,
+      "learning_rate": 3.146133244057583e-05,
+      "loss": 0.0459,
+      "step": 6650
+    },
+    {
+      "epoch": 2.24305323066622,
+      "grad_norm": 0.22147531807422638,
+      "learning_rate": 3.132183908045977e-05,
+      "loss": 0.0476,
+      "step": 6700
+    },
+    {
+      "epoch": 2.2597924338801474,
+      "grad_norm": 0.17393821477890015,
+      "learning_rate": 3.118234572034371e-05,
+      "loss": 0.0436,
+      "step": 6750
+    },
+    {
+      "epoch": 2.2765316370940742,
+      "grad_norm": 0.15850785374641418,
+      "learning_rate": 3.104285236022765e-05,
+      "loss": 0.0476,
+      "step": 6800
+    },
+    {
+      "epoch": 2.2932708403080015,
+      "grad_norm": 0.16232182085514069,
+      "learning_rate": 3.09033590001116e-05,
+      "loss": 0.0473,
+      "step": 6850
+    },
+    {
+      "epoch": 2.3100100435219284,
+      "grad_norm": 0.1816001981496811,
+      "learning_rate": 3.076386563999554e-05,
+      "loss": 0.0427,
+      "step": 6900
+    },
+    {
+      "epoch": 2.3267492467358553,
+      "grad_norm": 0.13417834043502808,
+      "learning_rate": 3.062437227987948e-05,
+      "loss": 0.0448,
+      "step": 6950
+    },
+    {
+      "epoch": 2.3434884499497826,
+      "grad_norm": 0.12576530873775482,
+      "learning_rate": 3.048487891976342e-05,
+      "loss": 0.0453,
+      "step": 7000
+    },
+    {
+      "epoch": 2.3602276531637094,
+      "grad_norm": 0.33120718598365784,
+      "learning_rate": 3.0345385559647362e-05,
+      "loss": 0.0462,
+      "step": 7050
+    },
+    {
+      "epoch": 2.3769668563776363,
+      "grad_norm": 0.22310969233512878,
+      "learning_rate": 3.0205892199531306e-05,
+      "loss": 0.0475,
+      "step": 7100
+    },
+    {
+      "epoch": 2.3937060595915636,
+      "grad_norm": 0.18150626122951508,
+      "learning_rate": 3.0066398839415243e-05,
+      "loss": 0.0489,
+      "step": 7150
+    },
+    {
+      "epoch": 2.4104452628054904,
+      "grad_norm": 0.28730452060699463,
+      "learning_rate": 2.9926905479299187e-05,
+      "loss": 0.0536,
+      "step": 7200
+    },
+    {
+      "epoch": 2.4271844660194173,
+      "grad_norm": 0.1918480098247528,
+      "learning_rate": 2.9787412119183128e-05,
+      "loss": 0.0426,
+      "step": 7250
+    },
+    {
+      "epoch": 2.4439236692333446,
+      "grad_norm": 0.16158398985862732,
+      "learning_rate": 2.964791875906707e-05,
+      "loss": 0.0458,
+      "step": 7300
+    },
+    {
+      "epoch": 2.4606628724472714,
+      "grad_norm": 0.27141231298446655,
+      "learning_rate": 2.9508425398951012e-05,
+      "loss": 0.0454,
+      "step": 7350
+    },
+    {
+      "epoch": 2.4774020756611987,
+      "grad_norm": 0.1777345836162567,
+      "learning_rate": 2.936893203883495e-05,
+      "loss": 0.0435,
+      "step": 7400
+    },
+    {
+      "epoch": 2.4941412788751256,
+      "grad_norm": 0.14735421538352966,
+      "learning_rate": 2.9229438678718897e-05,
+      "loss": 0.0489,
+      "step": 7450
+    },
+    {
+      "epoch": 2.5108804820890525,
+      "grad_norm": 0.1486055999994278,
+      "learning_rate": 2.9089945318602834e-05,
+      "loss": 0.0477,
+      "step": 7500
+    },
+    {
+      "epoch": 2.5276196853029793,
+      "grad_norm": 0.17078754305839539,
+      "learning_rate": 2.895045195848678e-05,
+      "loss": 0.0444,
+      "step": 7550
+    },
+    {
+      "epoch": 2.5443588885169066,
+      "grad_norm": 0.19276435673236847,
+      "learning_rate": 2.881095859837072e-05,
+      "loss": 0.0486,
+      "step": 7600
+    },
+    {
+      "epoch": 2.5610980917308335,
+      "grad_norm": 0.21209606528282166,
+      "learning_rate": 2.8671465238254656e-05,
+      "loss": 0.0497,
+      "step": 7650
+    },
+    {
+      "epoch": 2.5778372949447608,
+      "grad_norm": 0.21018877625465393,
+      "learning_rate": 2.8531971878138604e-05,
+      "loss": 0.0441,
+      "step": 7700
+    },
+    {
+      "epoch": 2.5945764981586876,
+      "grad_norm": 0.15666617453098297,
+      "learning_rate": 2.839247851802254e-05,
+      "loss": 0.0467,
+      "step": 7750
+    },
+    {
+      "epoch": 2.6113157013726145,
+      "grad_norm": 0.1940685212612152,
+      "learning_rate": 2.8252985157906485e-05,
+      "loss": 0.0523,
+      "step": 7800
+    },
+    {
+      "epoch": 2.628054904586542,
+      "grad_norm": 0.28480586409568787,
+      "learning_rate": 2.8113491797790426e-05,
+      "loss": 0.0481,
+      "step": 7850
+    },
+    {
+      "epoch": 2.6447941078004686,
+      "grad_norm": 0.2223973125219345,
+      "learning_rate": 2.797399843767437e-05,
+      "loss": 0.0432,
+      "step": 7900
+    },
+    {
+      "epoch": 2.661533311014396,
+      "grad_norm": 0.15986157953739166,
+      "learning_rate": 2.783450507755831e-05,
+      "loss": 0.0454,
+      "step": 7950
+    },
+    {
+      "epoch": 2.678272514228323,
+      "grad_norm": 0.1384258270263672,
+      "learning_rate": 2.7695011717442248e-05,
+      "loss": 0.0477,
+      "step": 8000
+    },
+    {
+      "epoch": 2.6950117174422497,
+      "grad_norm": 0.1721869707107544,
+      "learning_rate": 2.7555518357326192e-05,
+      "loss": 0.0453,
+      "step": 8050
+    },
+    {
+      "epoch": 2.7117509206561765,
+      "grad_norm": 0.20737840235233307,
+      "learning_rate": 2.7416024997210132e-05,
+      "loss": 0.0504,
+      "step": 8100
+    },
+    {
+      "epoch": 2.728490123870104,
+      "grad_norm": 0.18823584914207458,
+      "learning_rate": 2.7276531637094077e-05,
+      "loss": 0.0453,
+      "step": 8150
+    },
+    {
+      "epoch": 2.7452293270840307,
+      "grad_norm": 0.13201962411403656,
+      "learning_rate": 2.7137038276978017e-05,
+      "loss": 0.0433,
+      "step": 8200
+    },
+    {
+      "epoch": 2.761968530297958,
+      "grad_norm": 0.1443973183631897,
+      "learning_rate": 2.699754491686196e-05,
+      "loss": 0.0486,
+      "step": 8250
+    },
+    {
+      "epoch": 2.778707733511885,
+      "grad_norm": 0.29314514994621277,
+      "learning_rate": 2.68580515567459e-05,
+      "loss": 0.05,
+      "step": 8300
+    },
+    {
+      "epoch": 2.7954469367258117,
+      "grad_norm": 0.14852124452590942,
+      "learning_rate": 2.671855819662984e-05,
+      "loss": 0.0495,
+      "step": 8350
+    },
+    {
+      "epoch": 2.812186139939739,
+      "grad_norm": 0.19024662673473358,
+      "learning_rate": 2.6579064836513783e-05,
+      "loss": 0.0508,
+      "step": 8400
+    },
+    {
+      "epoch": 2.828925343153666,
+      "grad_norm": 0.1745578795671463,
+      "learning_rate": 2.6439571476397724e-05,
+      "loss": 0.0443,
+      "step": 8450
+    },
+    {
+      "epoch": 2.845664546367593,
+      "grad_norm": 0.18390017747879028,
+      "learning_rate": 2.6300078116281668e-05,
+      "loss": 0.0468,
+      "step": 8500
+    },
+    {
+      "epoch": 2.86240374958152,
+      "grad_norm": 0.22483347356319427,
+      "learning_rate": 2.616058475616561e-05,
+      "loss": 0.0467,
+      "step": 8550
+    },
+    {
+      "epoch": 2.879142952795447,
+      "grad_norm": 0.18160563707351685,
+      "learning_rate": 2.6021091396049553e-05,
+      "loss": 0.0441,
+      "step": 8600
+    },
+    {
+      "epoch": 2.8958821560093737,
+      "grad_norm": 0.13408955931663513,
+      "learning_rate": 2.588159803593349e-05,
+      "loss": 0.0446,
+      "step": 8650
+    },
+    {
+      "epoch": 2.912621359223301,
+      "grad_norm": 0.16038326919078827,
+      "learning_rate": 2.574210467581743e-05,
+      "loss": 0.0456,
+      "step": 8700
+    },
+    {
+      "epoch": 2.929360562437228,
+      "grad_norm": 0.22738413512706757,
+      "learning_rate": 2.5602611315701375e-05,
+      "loss": 0.0479,
+      "step": 8750
+    },
+    {
+      "epoch": 2.946099765651155,
+      "grad_norm": 0.20327210426330566,
+      "learning_rate": 2.5463117955585315e-05,
+      "loss": 0.0511,
+      "step": 8800
+    },
+    {
+      "epoch": 2.962838968865082,
+      "grad_norm": 0.15756353735923767,
+      "learning_rate": 2.532362459546926e-05,
+      "loss": 0.0426,
+      "step": 8850
+    },
+    {
+      "epoch": 2.979578172079009,
+      "grad_norm": 0.1305045783519745,
+      "learning_rate": 2.5184131235353197e-05,
+      "loss": 0.0442,
+      "step": 8900
+    },
+    {
+      "epoch": 2.996317375292936,
+      "grad_norm": 0.1610562801361084,
+      "learning_rate": 2.5044637875237144e-05,
+      "loss": 0.0467,
+      "step": 8950
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.04702676460146904,
+      "eval_runtime": 52.8989,
+      "eval_samples_per_second": 225.865,
+      "eval_steps_per_second": 14.121,
+      "step": 8961
+    },
+    {
+      "epoch": 3.013056578506863,
+      "grad_norm": 0.2042045295238495,
+      "learning_rate": 2.490514451512108e-05,
+      "loss": 0.0457,
+      "step": 9000
+    },
+    {
+      "epoch": 3.02979578172079,
+      "grad_norm": 0.27092283964157104,
+      "learning_rate": 2.4765651155005022e-05,
+      "loss": 0.0437,
+      "step": 9050
+    },
+    {
+      "epoch": 3.046534984934717,
+      "grad_norm": 0.18729308247566223,
+      "learning_rate": 2.4626157794888966e-05,
+      "loss": 0.039,
+      "step": 9100
+    },
+    {
+      "epoch": 3.063274188148644,
+      "grad_norm": 0.18845289945602417,
+      "learning_rate": 2.4486664434772903e-05,
+      "loss": 0.0422,
+      "step": 9150
+    },
+    {
+      "epoch": 3.0800133913625714,
+      "grad_norm": 0.17593874037265778,
+      "learning_rate": 2.4347171074656847e-05,
+      "loss": 0.0384,
+      "step": 9200
+    },
+    {
+      "epoch": 3.096752594576498,
+      "grad_norm": 0.23149123787879944,
+      "learning_rate": 2.4207677714540788e-05,
+      "loss": 0.0443,
+      "step": 9250
+    },
+    {
+      "epoch": 3.113491797790425,
+      "grad_norm": 0.1968075931072235,
+      "learning_rate": 2.4068184354424732e-05,
+      "loss": 0.0402,
+      "step": 9300
+    },
+    {
+      "epoch": 3.1302310010043524,
+      "grad_norm": 0.16893354058265686,
+      "learning_rate": 2.3928690994308673e-05,
+      "loss": 0.043,
+      "step": 9350
+    },
+    {
+      "epoch": 3.146970204218279,
+      "grad_norm": 0.161103293299675,
+      "learning_rate": 2.3789197634192613e-05,
+      "loss": 0.0414,
+      "step": 9400
+    },
+    {
+      "epoch": 3.163709407432206,
+      "grad_norm": 0.22712625563144684,
+      "learning_rate": 2.3649704274076557e-05,
+      "loss": 0.0426,
+      "step": 9450
+    },
+    {
+      "epoch": 3.1804486106461334,
+      "grad_norm": 0.12811464071273804,
+      "learning_rate": 2.3510210913960495e-05,
+      "loss": 0.0406,
+      "step": 9500
+    },
+    {
+      "epoch": 3.1971878138600602,
+      "grad_norm": 0.16762731969356537,
+      "learning_rate": 2.337071755384444e-05,
+      "loss": 0.0417,
+      "step": 9550
+    },
+    {
+      "epoch": 3.213927017073987,
+      "grad_norm": 0.14003124833106995,
+      "learning_rate": 2.323122419372838e-05,
+      "loss": 0.0453,
+      "step": 9600
+    },
+    {
+      "epoch": 3.2306662202879144,
+      "grad_norm": 0.16891060769557953,
+      "learning_rate": 2.309173083361232e-05,
+      "loss": 0.0445,
+      "step": 9650
+    },
+    {
+      "epoch": 3.2474054235018412,
+      "grad_norm": 0.16900208592414856,
+      "learning_rate": 2.2952237473496264e-05,
+      "loss": 0.039,
+      "step": 9700
+    },
+    {
+      "epoch": 3.264144626715768,
+      "grad_norm": 0.17154955863952637,
+      "learning_rate": 2.2812744113380205e-05,
+      "loss": 0.0403,
+      "step": 9750
+    },
+    {
+      "epoch": 3.2808838299296954,
+      "grad_norm": 0.1620296835899353,
+      "learning_rate": 2.2673250753264145e-05,
+      "loss": 0.0406,
+      "step": 9800
+    },
+    {
+      "epoch": 3.2976230331436223,
+      "grad_norm": 0.14487063884735107,
+      "learning_rate": 2.2533757393148086e-05,
+      "loss": 0.0467,
+      "step": 9850
+    },
+    {
+      "epoch": 3.3143622363575496,
+      "grad_norm": 0.13799156248569489,
+      "learning_rate": 2.2394264033032027e-05,
+      "loss": 0.0433,
+      "step": 9900
+    },
+    {
+      "epoch": 3.3311014395714764,
+      "grad_norm": 0.1507265418767929,
+      "learning_rate": 2.225477067291597e-05,
+      "loss": 0.0446,
+      "step": 9950
+    },
+    {
+      "epoch": 3.3478406427854033,
+      "grad_norm": 0.1605840027332306,
+      "learning_rate": 2.211527731279991e-05,
+      "loss": 0.0415,
+      "step": 10000
+    },
+    {
+      "epoch": 3.3645798459993306,
+      "grad_norm": 0.11910756677389145,
+      "learning_rate": 2.1975783952683855e-05,
+      "loss": 0.0453,
+      "step": 10050
+    },
+    {
+      "epoch": 3.3813190492132574,
+      "grad_norm": 0.15115414559841156,
+      "learning_rate": 2.1836290592567796e-05,
+      "loss": 0.0395,
+      "step": 10100
+    },
+    {
+      "epoch": 3.3980582524271843,
+      "grad_norm": 0.2044568508863449,
+      "learning_rate": 2.1696797232451737e-05,
+      "loss": 0.0473,
+      "step": 10150
+    },
+    {
+      "epoch": 3.4147974556411116,
+      "grad_norm": 0.18123552203178406,
+      "learning_rate": 2.1557303872335677e-05,
+      "loss": 0.0411,
+      "step": 10200
+    },
+    {
+      "epoch": 3.4315366588550384,
+      "grad_norm": 0.6018120646476746,
+      "learning_rate": 2.1417810512219618e-05,
+      "loss": 0.0413,
+      "step": 10250
+    },
+    {
+      "epoch": 3.4482758620689653,
+      "grad_norm": 0.22490189969539642,
+      "learning_rate": 2.1278317152103562e-05,
+      "loss": 0.0437,
+      "step": 10300
+    },
+    {
+      "epoch": 3.4650150652828926,
+      "grad_norm": 0.1784990429878235,
+      "learning_rate": 2.1138823791987503e-05,
+      "loss": 0.0453,
+      "step": 10350
+    },
+    {
+      "epoch": 3.4817542684968195,
+      "grad_norm": 0.15248402953147888,
+      "learning_rate": 2.0999330431871443e-05,
+      "loss": 0.0456,
+      "step": 10400
+    },
+    {
+      "epoch": 3.4984934717107468,
+      "grad_norm": 0.15146291255950928,
+      "learning_rate": 2.0859837071755387e-05,
+      "loss": 0.0393,
+      "step": 10450
+    },
+    {
+      "epoch": 3.5152326749246736,
+      "grad_norm": 0.1662750393152237,
+      "learning_rate": 2.0720343711639325e-05,
+      "loss": 0.0408,
+      "step": 10500
+    },
+    {
+      "epoch": 3.5319718781386005,
+      "grad_norm": 0.1244506984949112,
+      "learning_rate": 2.058085035152327e-05,
+      "loss": 0.0426,
+      "step": 10550
+    },
+    {
+      "epoch": 3.5487110813525278,
+      "grad_norm": 0.1180344969034195,
+      "learning_rate": 2.044135699140721e-05,
+      "loss": 0.0434,
+      "step": 10600
+    },
+    {
+      "epoch": 3.5654502845664546,
+      "grad_norm": 0.15951013565063477,
+      "learning_rate": 2.030186363129115e-05,
+      "loss": 0.0387,
+      "step": 10650
+    },
+    {
+      "epoch": 3.582189487780382,
+      "grad_norm": 0.16064217686653137,
+      "learning_rate": 2.0162370271175094e-05,
+      "loss": 0.0445,
+      "step": 10700
+    },
+    {
+      "epoch": 3.598928690994309,
+      "grad_norm": 0.17813698947429657,
+      "learning_rate": 2.0022876911059035e-05,
+      "loss": 0.0407,
+      "step": 10750
+    },
+    {
+      "epoch": 3.6156678942082356,
+      "grad_norm": 0.1256450116634369,
+      "learning_rate": 1.988338355094298e-05,
+      "loss": 0.0462,
+      "step": 10800
+    },
+    {
+      "epoch": 3.6324070974221625,
+      "grad_norm": 0.14016403257846832,
+      "learning_rate": 1.9743890190826916e-05,
+      "loss": 0.0456,
+      "step": 10850
+    },
+    {
+      "epoch": 3.64914630063609,
+      "grad_norm": 0.1396850347518921,
+      "learning_rate": 1.9604396830710857e-05,
+      "loss": 0.0403,
+      "step": 10900
+    },
+    {
+      "epoch": 3.6658855038500167,
+      "grad_norm": 0.17943057417869568,
+      "learning_rate": 1.94649034705948e-05,
+      "loss": 0.0417,
+      "step": 10950
+    },
+    {
+      "epoch": 3.682624707063944,
+      "grad_norm": 0.14947953820228577,
+      "learning_rate": 1.932541011047874e-05,
+      "loss": 0.0421,
+      "step": 11000
+    },
+    {
+      "epoch": 3.699363910277871,
+      "grad_norm": 0.12628613412380219,
+      "learning_rate": 1.9185916750362685e-05,
+      "loss": 0.0435,
+      "step": 11050
+    },
+    {
+      "epoch": 3.7161031134917977,
+      "grad_norm": 0.2205984890460968,
+      "learning_rate": 1.9046423390246626e-05,
+      "loss": 0.0396,
+      "step": 11100
+    },
+    {
+      "epoch": 3.732842316705725,
+      "grad_norm": 0.13236357271671295,
+      "learning_rate": 1.8906930030130567e-05,
+      "loss": 0.0405,
+      "step": 11150
+    },
+    {
+      "epoch": 3.749581519919652,
+      "grad_norm": 0.15023528039455414,
+      "learning_rate": 1.8767436670014507e-05,
+      "loss": 0.0434,
+      "step": 11200
+    },
+    {
+      "epoch": 3.7663207231335787,
+      "grad_norm": 0.1427326649427414,
+      "learning_rate": 1.8627943309898448e-05,
+      "loss": 0.0437,
+      "step": 11250
+    },
+    {
+      "epoch": 3.783059926347506,
+      "grad_norm": 0.1890624761581421,
+      "learning_rate": 1.8488449949782392e-05,
+      "loss": 0.0408,
+      "step": 11300
+    },
+    {
+      "epoch": 3.799799129561433,
+      "grad_norm": 0.27970972657203674,
+      "learning_rate": 1.8348956589666333e-05,
+      "loss": 0.0441,
+      "step": 11350
+    },
+    {
+      "epoch": 3.8165383327753597,
+      "grad_norm": 0.12823455035686493,
+      "learning_rate": 1.8209463229550273e-05,
+      "loss": 0.0412,
+      "step": 11400
+    },
+    {
+      "epoch": 3.833277535989287,
+      "grad_norm": 0.1442965269088745,
+      "learning_rate": 1.8069969869434218e-05,
+      "loss": 0.0416,
+      "step": 11450
+    },
+    {
+      "epoch": 3.850016739203214,
+      "grad_norm": 0.13739417493343353,
+      "learning_rate": 1.7930476509318158e-05,
+      "loss": 0.0397,
+      "step": 11500
+    },
+    {
+      "epoch": 3.866755942417141,
+      "grad_norm": 0.16616705060005188,
+      "learning_rate": 1.77909831492021e-05,
+      "loss": 0.0413,
+      "step": 11550
+    },
+    {
+      "epoch": 3.883495145631068,
+      "grad_norm": 0.23060384392738342,
+      "learning_rate": 1.765148978908604e-05,
+      "loss": 0.0447,
+      "step": 11600
+    },
+    {
+      "epoch": 3.900234348844995,
+      "grad_norm": 0.2936810553073883,
+      "learning_rate": 1.751199642896998e-05,
+      "loss": 0.0409,
+      "step": 11650
+    },
+    {
+      "epoch": 3.9169735520589217,
+      "grad_norm": 0.17367126047611237,
+      "learning_rate": 1.7372503068853924e-05,
+      "loss": 0.0411,
+      "step": 11700
+    },
+    {
+      "epoch": 3.933712755272849,
+      "grad_norm": 0.14550547301769257,
+      "learning_rate": 1.7233009708737865e-05,
+      "loss": 0.0445,
+      "step": 11750
+    },
+    {
+      "epoch": 3.950451958486776,
+      "grad_norm": 0.13322454690933228,
+      "learning_rate": 1.709351634862181e-05,
+      "loss": 0.0444,
+      "step": 11800
+    },
+    {
+      "epoch": 3.967191161700703,
+      "grad_norm": 0.13606959581375122,
+      "learning_rate": 1.6954022988505746e-05,
+      "loss": 0.041,
+      "step": 11850
+    },
+    {
+      "epoch": 3.98393036491463,
+      "grad_norm": 0.14227426052093506,
+      "learning_rate": 1.681452962838969e-05,
+      "loss": 0.0412,
+      "step": 11900
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.045825306326150894,
+      "eval_runtime": 48.668,
+      "eval_samples_per_second": 245.5,
+      "eval_steps_per_second": 15.349,
+      "step": 11948
+    },
+    {
+      "epoch": 4.000669568128557,
+      "grad_norm": 0.14429377019405365,
+      "learning_rate": 1.667503626827363e-05,
+      "loss": 0.0398,
+      "step": 11950
+    },
+    {
+      "epoch": 4.017408771342484,
+      "grad_norm": 0.13842210173606873,
+      "learning_rate": 1.653554290815757e-05,
+      "loss": 0.0365,
+      "step": 12000
+    },
+    {
+      "epoch": 4.0341479745564115,
+      "grad_norm": 0.2168322205543518,
+      "learning_rate": 1.6396049548041516e-05,
+      "loss": 0.0395,
+      "step": 12050
+    },
+    {
+      "epoch": 4.050887177770338,
+      "grad_norm": 0.1534542292356491,
+      "learning_rate": 1.6256556187925456e-05,
+      "loss": 0.0377,
+      "step": 12100
+    },
+    {
+      "epoch": 4.067626380984265,
+      "grad_norm": 0.13067083060741425,
+      "learning_rate": 1.6117062827809397e-05,
+      "loss": 0.042,
+      "step": 12150
+    },
+    {
+      "epoch": 4.084365584198192,
+      "grad_norm": 0.13599875569343567,
+      "learning_rate": 1.5977569467693338e-05,
+      "loss": 0.0403,
+      "step": 12200
+    },
+    {
+      "epoch": 4.101104787412119,
+      "grad_norm": 0.23121419548988342,
+      "learning_rate": 1.5838076107577278e-05,
+      "loss": 0.0406,
+      "step": 12250
+    },
+    {
+      "epoch": 4.117843990626046,
+      "grad_norm": 0.15491856634616852,
+      "learning_rate": 1.5698582747461222e-05,
+      "loss": 0.0381,
+      "step": 12300
+    },
+    {
+      "epoch": 4.1345831938399735,
+      "grad_norm": 0.13168101012706757,
+      "learning_rate": 1.5559089387345163e-05,
+      "loss": 0.0353,
+      "step": 12350
+    },
+    {
+      "epoch": 4.1513223970539,
+      "grad_norm": 0.1694163829088211,
+      "learning_rate": 1.5419596027229104e-05,
+      "loss": 0.04,
+      "step": 12400
+    },
+    {
+      "epoch": 4.168061600267827,
+      "grad_norm": 0.17727704346179962,
+      "learning_rate": 1.5280102667113048e-05,
+      "loss": 0.0364,
+      "step": 12450
+    },
+    {
+      "epoch": 4.184800803481754,
+      "grad_norm": 0.18948902189731598,
+      "learning_rate": 1.5140609306996988e-05,
+      "loss": 0.0372,
+      "step": 12500
+    },
+    {
+      "epoch": 4.201540006695681,
+      "grad_norm": 0.2130223959684372,
+      "learning_rate": 1.5001115946880929e-05,
+      "loss": 0.0398,
+      "step": 12550
+    },
+    {
+      "epoch": 4.218279209909609,
+      "grad_norm": 0.15893810987472534,
+      "learning_rate": 1.486162258676487e-05,
+      "loss": 0.0387,
+      "step": 12600
+    },
+    {
+      "epoch": 4.2350184131235356,
+      "grad_norm": 0.19536694884300232,
+      "learning_rate": 1.4722129226648812e-05,
+      "loss": 0.041,
+      "step": 12650
+    },
+    {
+      "epoch": 4.251757616337462,
+      "grad_norm": 0.16755405068397522,
+      "learning_rate": 1.4582635866532754e-05,
+      "loss": 0.0367,
+      "step": 12700
+    },
+    {
+      "epoch": 4.268496819551389,
+      "grad_norm": 0.14820334315299988,
+      "learning_rate": 1.4443142506416697e-05,
+      "loss": 0.0382,
+      "step": 12750
+    },
+    {
+      "epoch": 4.285236022765316,
+      "grad_norm": 0.21781007945537567,
+      "learning_rate": 1.4303649146300637e-05,
+      "loss": 0.0398,
+      "step": 12800
+    },
+    {
+      "epoch": 4.301975225979243,
+      "grad_norm": 0.17162373661994934,
+      "learning_rate": 1.4164155786184576e-05,
+      "loss": 0.042,
+      "step": 12850
+    },
+    {
+      "epoch": 4.318714429193171,
+      "grad_norm": 0.13765838742256165,
+      "learning_rate": 1.4024662426068519e-05,
+      "loss": 0.0378,
+      "step": 12900
+    },
+    {
+      "epoch": 4.335453632407098,
+      "grad_norm": 0.22830092906951904,
+      "learning_rate": 1.3885169065952461e-05,
+      "loss": 0.0387,
+      "step": 12950
+    },
+    {
+      "epoch": 4.352192835621024,
+      "grad_norm": 0.1682949960231781,
+      "learning_rate": 1.3745675705836403e-05,
+      "loss": 0.0408,
+      "step": 13000
+    },
+    {
+      "epoch": 4.368932038834951,
+      "grad_norm": 0.16153910756111145,
+      "learning_rate": 1.3606182345720344e-05,
+      "loss": 0.0405,
+      "step": 13050
+    },
+    {
+      "epoch": 4.385671242048878,
+      "grad_norm": 0.18771956861019135,
+      "learning_rate": 1.3466688985604286e-05,
+      "loss": 0.0427,
+      "step": 13100
+    },
+    {
+      "epoch": 4.402410445262806,
+      "grad_norm": 0.11162823438644409,
+      "learning_rate": 1.3327195625488229e-05,
+      "loss": 0.0399,
+      "step": 13150
+    },
+    {
+      "epoch": 4.419149648476733,
+      "grad_norm": 0.14466647803783417,
+      "learning_rate": 1.3187702265372168e-05,
+      "loss": 0.0401,
+      "step": 13200
+    },
+    {
+      "epoch": 4.43588885169066,
+      "grad_norm": 0.14834430813789368,
+      "learning_rate": 1.304820890525611e-05,
+      "loss": 0.0386,
+      "step": 13250
+    },
+    {
+      "epoch": 4.4526280549045865,
+      "grad_norm": 0.20324522256851196,
+      "learning_rate": 1.2908715545140052e-05,
+      "loss": 0.0379,
+      "step": 13300
+    },
+    {
+      "epoch": 4.469367258118513,
+      "grad_norm": 0.12625128030776978,
+      "learning_rate": 1.2769222185023993e-05,
+      "loss": 0.0403,
+      "step": 13350
+    },
+    {
+      "epoch": 4.48610646133244,
+      "grad_norm": 0.11018920689821243,
+      "learning_rate": 1.2629728824907935e-05,
+      "loss": 0.0383,
+      "step": 13400
+    },
+    {
+      "epoch": 4.502845664546368,
+      "grad_norm": 0.18631067872047424,
+      "learning_rate": 1.2490235464791876e-05,
+      "loss": 0.0424,
+      "step": 13450
+    },
+    {
+      "epoch": 4.519584867760295,
+      "grad_norm": 0.28578242659568787,
+      "learning_rate": 1.2350742104675818e-05,
+      "loss": 0.0385,
+      "step": 13500
+    },
+    {
+      "epoch": 4.536324070974222,
+      "grad_norm": 0.1431867629289627,
+      "learning_rate": 1.2211248744559759e-05,
+      "loss": 0.038,
+      "step": 13550
+    },
+    {
+      "epoch": 4.5530632741881485,
+      "grad_norm": 0.12731611728668213,
+      "learning_rate": 1.2071755384443701e-05,
+      "loss": 0.0389,
+      "step": 13600
+    },
+    {
+      "epoch": 4.569802477402075,
+      "grad_norm": 0.20958903431892395,
+      "learning_rate": 1.1932262024327642e-05,
+      "loss": 0.0383,
+      "step": 13650
+    },
+    {
+      "epoch": 4.586541680616003,
+      "grad_norm": 0.21877717971801758,
+      "learning_rate": 1.1792768664211584e-05,
+      "loss": 0.0399,
+      "step": 13700
+    },
+    {
+      "epoch": 4.60328088382993,
+      "grad_norm": 0.1719764918088913,
+      "learning_rate": 1.1653275304095527e-05,
+      "loss": 0.0355,
+      "step": 13750
+    },
+    {
+      "epoch": 4.620020087043857,
+      "grad_norm": 0.13044840097427368,
+      "learning_rate": 1.1513781943979467e-05,
+      "loss": 0.0386,
+      "step": 13800
+    },
+    {
+      "epoch": 4.636759290257784,
+      "grad_norm": 0.20552796125411987,
+      "learning_rate": 1.1374288583863408e-05,
+      "loss": 0.04,
+      "step": 13850
+    },
+    {
+      "epoch": 4.6534984934717105,
+      "grad_norm": 0.15806210041046143,
+      "learning_rate": 1.123479522374735e-05,
+      "loss": 0.0381,
+      "step": 13900
+    },
+    {
+      "epoch": 4.670237696685637,
+      "grad_norm": 0.2317190170288086,
+      "learning_rate": 1.1095301863631291e-05,
+      "loss": 0.0394,
+      "step": 13950
+    },
+    {
+      "epoch": 4.686976899899565,
+      "grad_norm": 0.16183790564537048,
+      "learning_rate": 1.0955808503515233e-05,
+      "loss": 0.0392,
+      "step": 14000
+    },
+    {
+      "epoch": 4.703716103113492,
+      "grad_norm": 0.14107303321361542,
+      "learning_rate": 1.0816315143399176e-05,
+      "loss": 0.039,
+      "step": 14050
+    },
+    {
+      "epoch": 4.720455306327419,
+      "grad_norm": 0.21340009570121765,
+      "learning_rate": 1.0676821783283116e-05,
+      "loss": 0.04,
+      "step": 14100
+    },
+    {
+      "epoch": 4.737194509541346,
+      "grad_norm": 0.1754944771528244,
+      "learning_rate": 1.0537328423167057e-05,
+      "loss": 0.0371,
+      "step": 14150
+    },
+    {
+      "epoch": 4.7539337127552725,
+      "grad_norm": 0.13280175626277924,
+      "learning_rate": 1.0397835063051e-05,
+      "loss": 0.0358,
+      "step": 14200
+    },
+    {
+      "epoch": 4.7706729159692,
+      "grad_norm": 0.15039420127868652,
+      "learning_rate": 1.0258341702934942e-05,
+      "loss": 0.0438,
+      "step": 14250
+    },
+    {
+      "epoch": 4.787412119183127,
+      "grad_norm": 0.30474409461021423,
+      "learning_rate": 1.0118848342818882e-05,
+      "loss": 0.0377,
+      "step": 14300
+    },
+    {
+      "epoch": 4.804151322397054,
+      "grad_norm": 0.2012936919927597,
+      "learning_rate": 9.979354982702823e-06,
+      "loss": 0.0417,
+      "step": 14350
+    },
+    {
+      "epoch": 4.820890525610981,
+      "grad_norm": 0.192657932639122,
+      "learning_rate": 9.839861622586765e-06,
+      "loss": 0.0392,
+      "step": 14400
+    },
+    {
+      "epoch": 4.837629728824908,
+      "grad_norm": 0.16141368448734283,
+      "learning_rate": 9.700368262470706e-06,
+      "loss": 0.0378,
+      "step": 14450
+    },
+    {
+      "epoch": 4.854368932038835,
+      "grad_norm": 0.19473744928836823,
+      "learning_rate": 9.560874902354648e-06,
+      "loss": 0.0368,
+      "step": 14500
+    },
+    {
+      "epoch": 4.871108135252762,
+      "grad_norm": 0.15624327957630157,
+      "learning_rate": 9.42138154223859e-06,
+      "loss": 0.0416,
+      "step": 14550
+    },
+    {
+      "epoch": 4.887847338466689,
+      "grad_norm": 0.1572103202342987,
+      "learning_rate": 9.281888182122531e-06,
+      "loss": 0.0383,
+      "step": 14600
+    },
+    {
+      "epoch": 4.904586541680616,
+      "grad_norm": 0.15121281147003174,
+      "learning_rate": 9.142394822006472e-06,
+      "loss": 0.0405,
+      "step": 14650
+    },
+    {
+      "epoch": 4.921325744894543,
+      "grad_norm": 0.1739313155412674,
+      "learning_rate": 9.002901461890415e-06,
+      "loss": 0.0389,
+      "step": 14700
+    },
+    {
+      "epoch": 4.93806494810847,
+      "grad_norm": 0.12826618552207947,
+      "learning_rate": 8.863408101774357e-06,
+      "loss": 0.0385,
+      "step": 14750
+    },
+    {
+      "epoch": 4.9548041513223975,
+      "grad_norm": 0.11419858038425446,
+      "learning_rate": 8.723914741658298e-06,
+      "loss": 0.0387,
+      "step": 14800
+    },
+    {
+      "epoch": 4.971543354536324,
+      "grad_norm": 0.18640589714050293,
+      "learning_rate": 8.584421381542238e-06,
+      "loss": 0.0376,
+      "step": 14850
+    },
+    {
+      "epoch": 4.988282557750251,
+      "grad_norm": 0.18249401450157166,
+      "learning_rate": 8.44492802142618e-06,
+      "loss": 0.041,
+      "step": 14900
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.045213617384433746,
+      "eval_runtime": 49.0497,
+      "eval_samples_per_second": 243.589,
+      "eval_steps_per_second": 15.229,
+      "step": 14935
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 17922,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.240141313277952e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-14935/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37dd546b69fb60d8deb15a8b88e40b23e367c0e9f5a053ea3ae7c730b3874f2e
+size 5304

checkpoint-14935/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-17922/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "ckartal/english-to-turkish-finetuned-model",
+  "activation_dropout": 0.0,
+  "activation_function": "swish",
+  "architectures": [
+    "MarianMTModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 512,
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 59993,
+  "decoder_vocab_size": 59994,
+  "dropout": 0.1,
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": null,
+  "max_position_embeddings": 512,
+  "model_type": "marian",
+  "normalize_embedding": false,
+  "num_beams": null,
+  "num_hidden_layers": 6,
+  "pad_token_id": 59993,
+  "scale_embedding": true,
+  "share_encoder_decoder_embeddings": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 59994
+}

checkpoint-17922/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "bad_words_ids": [
+    [
+      59993
+    ]
+  ],
+  "bos_token_id": 0,
+  "decoder_start_token_id": 59993,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "max_length": 512,
+  "num_beams": 6,
+  "pad_token_id": 59993,
+  "renormalize_logits": true,
+  "transformers_version": "4.49.0"
+}

checkpoint-17922/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f997b0d1f9ca9a397fb73218159940280ff32f3cbdca06c9d17a7f7d3d66fd6
+size 299690728

checkpoint-17922/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06b55ff1b42484ee0be4850e41eb3646e2b79718e1fa5063f2d0b28829c5a60c
+size 599054970

checkpoint-17922/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ced61baf7d63e0cafc1cd86b7b3b037ca4c3dbe957b8fabdf5fca21030341962
+size 14244

checkpoint-17922/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6dcca174cef295bb501b3b96815eb3658373aa4e458716e9734213c4c12acfc
+size 988

checkpoint-17922/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a21237281603ebbc5900a536864616af817483454cb0f49aefdabd07813fd8a8
+size 1064

checkpoint-17922/source.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98eb24f0995a9d5f7cb0fb628c474628b1d2284615e881e857d062c0b651ce10
+size 793920

checkpoint-17922/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-17922/target.spm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45cc6000ed513cdca8f80739087fbcbf9933dc50c9ae36c319c9670882f72e1b
+size 837876

checkpoint-17922/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59993": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "return_tensors": "pt",
+  "separate_vocabs": false,
+  "source_lang": "eng",
+  "sp_model_kwargs": {},
+  "target_lang": "tur",
+  "tokenizer_class": "MarianTokenizer",
+  "unk_token": "<unk>"
+}

checkpoint-17922/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2587 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 6.0,
+  "eval_steps": 500,
+  "global_step": 17922,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016739203213927016,
+      "grad_norm": 0.439177542924881,
+      "learning_rate": 4.987445597589555e-05,
+      "loss": 1.5222,
+      "step": 50
+    },
+    {
+      "epoch": 0.03347840642785403,
+      "grad_norm": 0.4248828887939453,
+      "learning_rate": 4.973496261577949e-05,
+      "loss": 0.1842,
+      "step": 100
+    },
+    {
+      "epoch": 0.05021760964178105,
+      "grad_norm": 0.3019009232521057,
+      "learning_rate": 4.9595469255663436e-05,
+      "loss": 0.1471,
+      "step": 150
+    },
+    {
+      "epoch": 0.06695681285570806,
+      "grad_norm": 0.2518245577812195,
+      "learning_rate": 4.9455975895547376e-05,
+      "loss": 0.1306,
+      "step": 200
+    },
+    {
+      "epoch": 0.08369601606963509,
+      "grad_norm": 0.3660012185573578,
+      "learning_rate": 4.931648253543131e-05,
+      "loss": 0.1276,
+      "step": 250
+    },
+    {
+      "epoch": 0.1004352192835621,
+      "grad_norm": 0.32854148745536804,
+      "learning_rate": 4.917698917531526e-05,
+      "loss": 0.1048,
+      "step": 300
+    },
+    {
+      "epoch": 0.11717442249748912,
+      "grad_norm": 0.24879515171051025,
+      "learning_rate": 4.90374958151992e-05,
+      "loss": 0.1054,
+      "step": 350
+    },
+    {
+      "epoch": 0.13391362571141613,
+      "grad_norm": 0.36416563391685486,
+      "learning_rate": 4.889800245508314e-05,
+      "loss": 0.0985,
+      "step": 400
+    },
+    {
+      "epoch": 0.15065282892534315,
+      "grad_norm": 0.33641186356544495,
+      "learning_rate": 4.875850909496708e-05,
+      "loss": 0.1044,
+      "step": 450
+    },
+    {
+      "epoch": 0.16739203213927017,
+      "grad_norm": 0.32909244298934937,
+      "learning_rate": 4.861901573485103e-05,
+      "loss": 0.1089,
+      "step": 500
+    },
+    {
+      "epoch": 0.1841312353531972,
+      "grad_norm": 0.36060285568237305,
+      "learning_rate": 4.847952237473497e-05,
+      "loss": 0.09,
+      "step": 550
+    },
+    {
+      "epoch": 0.2008704385671242,
+      "grad_norm": 0.2510785758495331,
+      "learning_rate": 4.83400290146189e-05,
+      "loss": 0.0884,
+      "step": 600
+    },
+    {
+      "epoch": 0.21760964178105122,
+      "grad_norm": 0.22478719055652618,
+      "learning_rate": 4.820053565450285e-05,
+      "loss": 0.0866,
+      "step": 650
+    },
+    {
+      "epoch": 0.23434884499497824,
+      "grad_norm": 0.37321263551712036,
+      "learning_rate": 4.806104229438679e-05,
+      "loss": 0.0884,
+      "step": 700
+    },
+    {
+      "epoch": 0.25108804820890523,
+      "grad_norm": 0.2660929262638092,
+      "learning_rate": 4.792154893427073e-05,
+      "loss": 0.0819,
+      "step": 750
+    },
+    {
+      "epoch": 0.26782725142283226,
+      "grad_norm": 0.2338525801897049,
+      "learning_rate": 4.778205557415467e-05,
+      "loss": 0.0845,
+      "step": 800
+    },
+    {
+      "epoch": 0.2845664546367593,
+      "grad_norm": 0.308557391166687,
+      "learning_rate": 4.764256221403862e-05,
+      "loss": 0.0815,
+      "step": 850
+    },
+    {
+      "epoch": 0.3013056578506863,
+      "grad_norm": 0.27098262310028076,
+      "learning_rate": 4.750306885392255e-05,
+      "loss": 0.0833,
+      "step": 900
+    },
+    {
+      "epoch": 0.3180448610646133,
+      "grad_norm": 0.23054952919483185,
+      "learning_rate": 4.736357549380649e-05,
+      "loss": 0.0806,
+      "step": 950
+    },
+    {
+      "epoch": 0.33478406427854035,
+      "grad_norm": 0.21355900168418884,
+      "learning_rate": 4.722408213369044e-05,
+      "loss": 0.073,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3515232674924674,
+      "grad_norm": 0.20395708084106445,
+      "learning_rate": 4.708458877357438e-05,
+      "loss": 0.0775,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3682624707063944,
+      "grad_norm": 0.21063613891601562,
+      "learning_rate": 4.694509541345832e-05,
+      "loss": 0.0789,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3850016739203214,
+      "grad_norm": 0.20589284598827362,
+      "learning_rate": 4.680560205334226e-05,
+      "loss": 0.0809,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4017408771342484,
+      "grad_norm": 0.27975228428840637,
+      "learning_rate": 4.666610869322621e-05,
+      "loss": 0.078,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184800803481754,
+      "grad_norm": 0.2529745399951935,
+      "learning_rate": 4.6526615333110144e-05,
+      "loss": 0.0704,
+      "step": 1250
+    },
+    {
+      "epoch": 0.43521928356210243,
+      "grad_norm": 0.2205154448747635,
+      "learning_rate": 4.6387121972994084e-05,
+      "loss": 0.0733,
+      "step": 1300
+    },
+    {
+      "epoch": 0.45195848677602946,
+      "grad_norm": 0.2254629135131836,
+      "learning_rate": 4.624762861287803e-05,
+      "loss": 0.0751,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4686976899899565,
+      "grad_norm": 0.17614957690238953,
+      "learning_rate": 4.610813525276197e-05,
+      "loss": 0.0747,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4854368932038835,
+      "grad_norm": 0.15940478444099426,
+      "learning_rate": 4.596864189264591e-05,
+      "loss": 0.0698,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5021760964178105,
+      "grad_norm": 0.1869521141052246,
+      "learning_rate": 4.5829148532529854e-05,
+      "loss": 0.0721,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5189152996317375,
+      "grad_norm": 0.36063650250434875,
+      "learning_rate": 4.5689655172413794e-05,
+      "loss": 0.0706,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5356545028456645,
+      "grad_norm": 0.16967014968395233,
+      "learning_rate": 4.5550161812297735e-05,
+      "loss": 0.0759,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5523937060595916,
+      "grad_norm": 0.29293423891067505,
+      "learning_rate": 4.5410668452181676e-05,
+      "loss": 0.0711,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5691329092735186,
+      "grad_norm": 0.3034748136997223,
+      "learning_rate": 4.527117509206562e-05,
+      "loss": 0.067,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5858721124874456,
+      "grad_norm": 0.1974593997001648,
+      "learning_rate": 4.513168173194956e-05,
+      "loss": 0.0701,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6026113157013726,
+      "grad_norm": 0.18101799488067627,
+      "learning_rate": 4.4992188371833505e-05,
+      "loss": 0.0717,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6193505189152997,
+      "grad_norm": 0.14422941207885742,
+      "learning_rate": 4.4852695011717445e-05,
+      "loss": 0.0686,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6360897221292267,
+      "grad_norm": 0.28663551807403564,
+      "learning_rate": 4.4713201651601386e-05,
+      "loss": 0.0646,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6528289253431536,
+      "grad_norm": 0.23879379034042358,
+      "learning_rate": 4.4573708291485327e-05,
+      "loss": 0.0684,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6695681285570807,
+      "grad_norm": 0.21389362215995789,
+      "learning_rate": 4.443421493136927e-05,
+      "loss": 0.066,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6863073317710077,
+      "grad_norm": 0.26841893792152405,
+      "learning_rate": 4.4294721571253215e-05,
+      "loss": 0.0717,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7030465349849347,
+      "grad_norm": 0.240205317735672,
+      "learning_rate": 4.415522821113715e-05,
+      "loss": 0.0697,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7197857381988617,
+      "grad_norm": 0.28098127245903015,
+      "learning_rate": 4.4015734851021096e-05,
+      "loss": 0.0713,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7365249414127888,
+      "grad_norm": 0.23308847844600677,
+      "learning_rate": 4.3876241490905037e-05,
+      "loss": 0.0667,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7532641446267158,
+      "grad_norm": 0.22748568654060364,
+      "learning_rate": 4.373674813078898e-05,
+      "loss": 0.0605,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7700033478406428,
+      "grad_norm": 0.3932187259197235,
+      "learning_rate": 4.359725477067292e-05,
+      "loss": 0.0676,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7867425510545698,
+      "grad_norm": 0.23918767273426056,
+      "learning_rate": 4.345776141055686e-05,
+      "loss": 0.0624,
+      "step": 2350
+    },
+    {
+      "epoch": 0.8034817542684968,
+      "grad_norm": 0.3068426549434662,
+      "learning_rate": 4.33182680504408e-05,
+      "loss": 0.0664,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8202209574824239,
+      "grad_norm": 0.17977873980998993,
+      "learning_rate": 4.317877469032474e-05,
+      "loss": 0.0726,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8369601606963508,
+      "grad_norm": 0.16876642405986786,
+      "learning_rate": 4.303928133020869e-05,
+      "loss": 0.0639,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8536993639102779,
+      "grad_norm": 0.17980250716209412,
+      "learning_rate": 4.289978797009263e-05,
+      "loss": 0.0701,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8704385671242049,
+      "grad_norm": 0.1711459904909134,
+      "learning_rate": 4.276029460997656e-05,
+      "loss": 0.063,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8871777703381319,
+      "grad_norm": 0.443228542804718,
+      "learning_rate": 4.262080124986051e-05,
+      "loss": 0.0675,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9039169735520589,
+      "grad_norm": 0.2098589390516281,
+      "learning_rate": 4.248130788974445e-05,
+      "loss": 0.062,
+      "step": 2700
+    },
+    {
+      "epoch": 0.920656176765986,
+      "grad_norm": 0.3022039234638214,
+      "learning_rate": 4.234181452962839e-05,
+      "loss": 0.07,
+      "step": 2750
+    },
+    {
+      "epoch": 0.937395379979913,
+      "grad_norm": 0.19368910789489746,
+      "learning_rate": 4.220232116951233e-05,
+      "loss": 0.0621,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9541345831938399,
+      "grad_norm": 0.18753108382225037,
+      "learning_rate": 4.206282780939628e-05,
+      "loss": 0.0631,
+      "step": 2850
+    },
+    {
+      "epoch": 0.970873786407767,
+      "grad_norm": 0.15517786145210266,
+      "learning_rate": 4.192333444928022e-05,
+      "loss": 0.0641,
+      "step": 2900
+    },
+    {
+      "epoch": 0.987612989621694,
+      "grad_norm": 0.11765792220830917,
+      "learning_rate": 4.178384108916415e-05,
+      "loss": 0.0612,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.05521286651492119,
+      "eval_runtime": 50.415,
+      "eval_samples_per_second": 236.993,
+      "eval_steps_per_second": 14.817,
+      "step": 2987
+    },
+    {
+      "epoch": 1.004352192835621,
+      "grad_norm": 0.2691793739795685,
+      "learning_rate": 4.16443477290481e-05,
+      "loss": 0.059,
+      "step": 3000
+    },
+    {
+      "epoch": 1.021091396049548,
+      "grad_norm": 0.394694060087204,
+      "learning_rate": 4.150485436893204e-05,
+      "loss": 0.0566,
+      "step": 3050
+    },
+    {
+      "epoch": 1.037830599263475,
+      "grad_norm": 0.19438503682613373,
+      "learning_rate": 4.136536100881598e-05,
+      "loss": 0.0591,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0545698024774022,
+      "grad_norm": 0.21350933611392975,
+      "learning_rate": 4.122586764869992e-05,
+      "loss": 0.0509,
+      "step": 3150
+    },
+    {
+      "epoch": 1.071309005691329,
+      "grad_norm": 0.26747575402259827,
+      "learning_rate": 4.108637428858387e-05,
+      "loss": 0.0589,
+      "step": 3200
+    },
+    {
+      "epoch": 1.088048208905256,
+      "grad_norm": 0.31256961822509766,
+      "learning_rate": 4.0946880928467804e-05,
+      "loss": 0.0602,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1047874121191832,
+      "grad_norm": 0.18631280958652496,
+      "learning_rate": 4.0807387568351745e-05,
+      "loss": 0.0547,
+      "step": 3300
+    },
+    {
+      "epoch": 1.12152661533311,
+      "grad_norm": 0.18677473068237305,
+      "learning_rate": 4.066789420823569e-05,
+      "loss": 0.0543,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1382658185470371,
+      "grad_norm": 0.24535444378852844,
+      "learning_rate": 4.052840084811963e-05,
+      "loss": 0.0583,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1550050217609642,
+      "grad_norm": 0.1752105951309204,
+      "learning_rate": 4.038890748800357e-05,
+      "loss": 0.0504,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1717442249748913,
+      "grad_norm": 0.14743360877037048,
+      "learning_rate": 4.0249414127887514e-05,
+      "loss": 0.055,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1884834281888181,
+      "grad_norm": 0.11535945534706116,
+      "learning_rate": 4.010992076777146e-05,
+      "loss": 0.0552,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2052226314027452,
+      "grad_norm": 0.26563358306884766,
+      "learning_rate": 3.9970427407655395e-05,
+      "loss": 0.0552,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2219618346166723,
+      "grad_norm": 0.15104246139526367,
+      "learning_rate": 3.9830934047539336e-05,
+      "loss": 0.0575,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2387010378305994,
+      "grad_norm": 0.2198421210050583,
+      "learning_rate": 3.9691440687423283e-05,
+      "loss": 0.0567,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2554402410445262,
+      "grad_norm": 0.20177733898162842,
+      "learning_rate": 3.955194732730722e-05,
+      "loss": 0.0556,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2721794442584533,
+      "grad_norm": 0.36604830622673035,
+      "learning_rate": 3.9412453967191165e-05,
+      "loss": 0.0569,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2889186474723804,
+      "grad_norm": 0.18883727490901947,
+      "learning_rate": 3.9272960607075105e-05,
+      "loss": 0.0595,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3056578506863072,
+      "grad_norm": 0.14828617870807648,
+      "learning_rate": 3.9133467246959046e-05,
+      "loss": 0.0548,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3223970539002343,
+      "grad_norm": 0.19220437109470367,
+      "learning_rate": 3.899397388684299e-05,
+      "loss": 0.053,
+      "step": 3950
+    },
+    {
+      "epoch": 1.3391362571141614,
+      "grad_norm": 0.16049669682979584,
+      "learning_rate": 3.885448052672693e-05,
+      "loss": 0.0581,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3558754603280883,
+      "grad_norm": 0.22821515798568726,
+      "learning_rate": 3.8714987166610875e-05,
+      "loss": 0.0518,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3726146635420153,
+      "grad_norm": 0.1879580318927765,
+      "learning_rate": 3.857549380649481e-05,
+      "loss": 0.0574,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3893538667559424,
+      "grad_norm": 0.16026251018047333,
+      "learning_rate": 3.8436000446378756e-05,
+      "loss": 0.063,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4060930699698695,
+      "grad_norm": 0.26868143677711487,
+      "learning_rate": 3.82965070862627e-05,
+      "loss": 0.0571,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4228322731837966,
+      "grad_norm": 0.2529687285423279,
+      "learning_rate": 3.815701372614664e-05,
+      "loss": 0.0528,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4395714763977234,
+      "grad_norm": 0.19138221442699432,
+      "learning_rate": 3.801752036603058e-05,
+      "loss": 0.0584,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4563106796116505,
+      "grad_norm": 0.16359661519527435,
+      "learning_rate": 3.787802700591452e-05,
+      "loss": 0.0539,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4730498828255776,
+      "grad_norm": 0.1373494267463684,
+      "learning_rate": 3.7738533645798466e-05,
+      "loss": 0.0557,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4897890860395044,
+      "grad_norm": 0.15695162117481232,
+      "learning_rate": 3.75990402856824e-05,
+      "loss": 0.0491,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5065282892534315,
+      "grad_norm": 0.18462614715099335,
+      "learning_rate": 3.745954692556635e-05,
+      "loss": 0.0495,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5232674924673586,
+      "grad_norm": 0.27876704931259155,
+      "learning_rate": 3.732005356545029e-05,
+      "loss": 0.0523,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5400066956812855,
+      "grad_norm": 0.30491840839385986,
+      "learning_rate": 3.718056020533423e-05,
+      "loss": 0.0564,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5567458988952128,
+      "grad_norm": 0.18721336126327515,
+      "learning_rate": 3.704106684521817e-05,
+      "loss": 0.0524,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5734851021091396,
+      "grad_norm": 0.21216215193271637,
+      "learning_rate": 3.690157348510211e-05,
+      "loss": 0.0521,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5902243053230665,
+      "grad_norm": 0.1368396282196045,
+      "learning_rate": 3.676208012498605e-05,
+      "loss": 0.056,
+      "step": 4750
+    },
+    {
+      "epoch": 1.6069635085369938,
+      "grad_norm": 0.13692086935043335,
+      "learning_rate": 3.662258676486999e-05,
+      "loss": 0.0443,
+      "step": 4800
+    },
+    {
+      "epoch": 1.6237027117509206,
+      "grad_norm": 0.11640128493309021,
+      "learning_rate": 3.648309340475394e-05,
+      "loss": 0.0488,
+      "step": 4850
+    },
+    {
+      "epoch": 1.6404419149648477,
+      "grad_norm": 0.19953882694244385,
+      "learning_rate": 3.634360004463788e-05,
+      "loss": 0.0553,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6571811181787748,
+      "grad_norm": 0.1966984122991562,
+      "learning_rate": 3.6204106684521813e-05,
+      "loss": 0.0536,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6739203213927016,
+      "grad_norm": 0.2324533313512802,
+      "learning_rate": 3.606461332440576e-05,
+      "loss": 0.0493,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6906595246066287,
+      "grad_norm": 0.16217607259750366,
+      "learning_rate": 3.59251199642897e-05,
+      "loss": 0.0503,
+      "step": 5050
+    },
+    {
+      "epoch": 1.7073987278205558,
+      "grad_norm": 0.23949602246284485,
+      "learning_rate": 3.578562660417364e-05,
+      "loss": 0.0556,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.21387897431850433,
+      "learning_rate": 3.564613324405758e-05,
+      "loss": 0.0548,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7408771342484097,
+      "grad_norm": 0.2055111676454544,
+      "learning_rate": 3.550663988394153e-05,
+      "loss": 0.06,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7576163374623368,
+      "grad_norm": 0.20280921459197998,
+      "learning_rate": 3.5367146523825464e-05,
+      "loss": 0.0508,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7743555406762637,
+      "grad_norm": 0.14165103435516357,
+      "learning_rate": 3.5227653163709405e-05,
+      "loss": 0.0581,
+      "step": 5300
+    },
+    {
+      "epoch": 1.791094743890191,
+      "grad_norm": 0.18099863827228546,
+      "learning_rate": 3.508815980359335e-05,
+      "loss": 0.0562,
+      "step": 5350
+    },
+    {
+      "epoch": 1.8078339471041178,
+      "grad_norm": 0.21743184328079224,
+      "learning_rate": 3.494866644347729e-05,
+      "loss": 0.0498,
+      "step": 5400
+    },
+    {
+      "epoch": 1.824573150318045,
+      "grad_norm": 0.20934534072875977,
+      "learning_rate": 3.4809173083361234e-05,
+      "loss": 0.0549,
+      "step": 5450
+    },
+    {
+      "epoch": 1.841312353531972,
+      "grad_norm": 0.1582174152135849,
+      "learning_rate": 3.4669679723245174e-05,
+      "loss": 0.0556,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8580515567458988,
+      "grad_norm": 0.1624903827905655,
+      "learning_rate": 3.453018636312912e-05,
+      "loss": 0.0516,
+      "step": 5550
+    },
+    {
+      "epoch": 1.874790759959826,
+      "grad_norm": 0.16255798935890198,
+      "learning_rate": 3.4390693003013056e-05,
+      "loss": 0.0542,
+      "step": 5600
+    },
+    {
+      "epoch": 1.891529963173753,
+      "grad_norm": 0.1269742250442505,
+      "learning_rate": 3.4251199642896996e-05,
+      "loss": 0.0565,
+      "step": 5650
+    },
+    {
+      "epoch": 1.9082691663876798,
+      "grad_norm": 0.15966229140758514,
+      "learning_rate": 3.4111706282780944e-05,
+      "loss": 0.0538,
+      "step": 5700
+    },
+    {
+      "epoch": 1.925008369601607,
+      "grad_norm": 0.21506330370903015,
+      "learning_rate": 3.3972212922664884e-05,
+      "loss": 0.0505,
+      "step": 5750
+    },
+    {
+      "epoch": 1.941747572815534,
+      "grad_norm": 0.2145415097475052,
+      "learning_rate": 3.3832719562548825e-05,
+      "loss": 0.0521,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9584867760294609,
+      "grad_norm": 0.10960496962070465,
+      "learning_rate": 3.3693226202432766e-05,
+      "loss": 0.0513,
+      "step": 5850
+    },
+    {
+      "epoch": 1.9752259792433882,
+      "grad_norm": 0.13635843992233276,
+      "learning_rate": 3.355373284231671e-05,
+      "loss": 0.0499,
+      "step": 5900
+    },
+    {
+      "epoch": 1.991965182457315,
+      "grad_norm": 0.1542210429906845,
+      "learning_rate": 3.341423948220065e-05,
+      "loss": 0.0556,
+      "step": 5950
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.04946442320942879,
+      "eval_runtime": 55.6114,
+      "eval_samples_per_second": 214.848,
+      "eval_steps_per_second": 13.432,
+      "step": 5974
+    },
+    {
+      "epoch": 2.008704385671242,
+      "grad_norm": 0.1718842089176178,
+      "learning_rate": 3.327474612208459e-05,
+      "loss": 0.0503,
+      "step": 6000
+    },
+    {
+      "epoch": 2.025443588885169,
+      "grad_norm": 0.1528020203113556,
+      "learning_rate": 3.3135252761968535e-05,
+      "loss": 0.0479,
+      "step": 6050
+    },
+    {
+      "epoch": 2.042182792099096,
+      "grad_norm": 0.19148772954940796,
+      "learning_rate": 3.299575940185247e-05,
+      "loss": 0.0443,
+      "step": 6100
+    },
+    {
+      "epoch": 2.058921995313023,
+      "grad_norm": 0.18125496804714203,
+      "learning_rate": 3.2856266041736416e-05,
+      "loss": 0.0473,
+      "step": 6150
+    },
+    {
+      "epoch": 2.07566119852695,
+      "grad_norm": 0.20772996544837952,
+      "learning_rate": 3.271677268162036e-05,
+      "loss": 0.0539,
+      "step": 6200
+    },
+    {
+      "epoch": 2.092400401740877,
+      "grad_norm": 0.2518468201160431,
+      "learning_rate": 3.25772793215043e-05,
+      "loss": 0.0468,
+      "step": 6250
+    },
+    {
+      "epoch": 2.1091396049548043,
+      "grad_norm": 0.1350301206111908,
+      "learning_rate": 3.243778596138824e-05,
+      "loss": 0.0431,
+      "step": 6300
+    },
+    {
+      "epoch": 2.125878808168731,
+      "grad_norm": 0.19141735136508942,
+      "learning_rate": 3.229829260127218e-05,
+      "loss": 0.048,
+      "step": 6350
+    },
+    {
+      "epoch": 2.142618011382658,
+      "grad_norm": 0.2404586374759674,
+      "learning_rate": 3.2158799241156126e-05,
+      "loss": 0.0516,
+      "step": 6400
+    },
+    {
+      "epoch": 2.1593572145965854,
+      "grad_norm": 0.21710112690925598,
+      "learning_rate": 3.201930588104006e-05,
+      "loss": 0.0472,
+      "step": 6450
+    },
+    {
+      "epoch": 2.176096417810512,
+      "grad_norm": 0.14395031332969666,
+      "learning_rate": 3.187981252092401e-05,
+      "loss": 0.0438,
+      "step": 6500
+    },
+    {
+      "epoch": 2.192835621024439,
+      "grad_norm": 0.20882932841777802,
+      "learning_rate": 3.174031916080795e-05,
+      "loss": 0.0488,
+      "step": 6550
+    },
+    {
+      "epoch": 2.2095748242383664,
+      "grad_norm": 0.13824905455112457,
+      "learning_rate": 3.160082580069189e-05,
+      "loss": 0.0464,
+      "step": 6600
+    },
+    {
+      "epoch": 2.2263140274522932,
+      "grad_norm": 0.1783577799797058,
+      "learning_rate": 3.146133244057583e-05,
+      "loss": 0.0459,
+      "step": 6650
+    },
+    {
+      "epoch": 2.24305323066622,
+      "grad_norm": 0.22147531807422638,
+      "learning_rate": 3.132183908045977e-05,
+      "loss": 0.0476,
+      "step": 6700
+    },
+    {
+      "epoch": 2.2597924338801474,
+      "grad_norm": 0.17393821477890015,
+      "learning_rate": 3.118234572034371e-05,
+      "loss": 0.0436,
+      "step": 6750
+    },
+    {
+      "epoch": 2.2765316370940742,
+      "grad_norm": 0.15850785374641418,
+      "learning_rate": 3.104285236022765e-05,
+      "loss": 0.0476,
+      "step": 6800
+    },
+    {
+      "epoch": 2.2932708403080015,
+      "grad_norm": 0.16232182085514069,
+      "learning_rate": 3.09033590001116e-05,
+      "loss": 0.0473,
+      "step": 6850
+    },
+    {
+      "epoch": 2.3100100435219284,
+      "grad_norm": 0.1816001981496811,
+      "learning_rate": 3.076386563999554e-05,
+      "loss": 0.0427,
+      "step": 6900
+    },
+    {
+      "epoch": 2.3267492467358553,
+      "grad_norm": 0.13417834043502808,
+      "learning_rate": 3.062437227987948e-05,
+      "loss": 0.0448,
+      "step": 6950
+    },
+    {
+      "epoch": 2.3434884499497826,
+      "grad_norm": 0.12576530873775482,
+      "learning_rate": 3.048487891976342e-05,
+      "loss": 0.0453,
+      "step": 7000
+    },
+    {
+      "epoch": 2.3602276531637094,
+      "grad_norm": 0.33120718598365784,
+      "learning_rate": 3.0345385559647362e-05,
+      "loss": 0.0462,
+      "step": 7050
+    },
+    {
+      "epoch": 2.3769668563776363,
+      "grad_norm": 0.22310969233512878,
+      "learning_rate": 3.0205892199531306e-05,
+      "loss": 0.0475,
+      "step": 7100
+    },
+    {
+      "epoch": 2.3937060595915636,
+      "grad_norm": 0.18150626122951508,
+      "learning_rate": 3.0066398839415243e-05,
+      "loss": 0.0489,
+      "step": 7150
+    },
+    {
+      "epoch": 2.4104452628054904,
+      "grad_norm": 0.28730452060699463,
+      "learning_rate": 2.9926905479299187e-05,
+      "loss": 0.0536,
+      "step": 7200
+    },
+    {
+      "epoch": 2.4271844660194173,
+      "grad_norm": 0.1918480098247528,
+      "learning_rate": 2.9787412119183128e-05,
+      "loss": 0.0426,
+      "step": 7250
+    },
+    {
+      "epoch": 2.4439236692333446,
+      "grad_norm": 0.16158398985862732,
+      "learning_rate": 2.964791875906707e-05,
+      "loss": 0.0458,
+      "step": 7300
+    },
+    {
+      "epoch": 2.4606628724472714,
+      "grad_norm": 0.27141231298446655,
+      "learning_rate": 2.9508425398951012e-05,
+      "loss": 0.0454,
+      "step": 7350
+    },
+    {
+      "epoch": 2.4774020756611987,
+      "grad_norm": 0.1777345836162567,
+      "learning_rate": 2.936893203883495e-05,
+      "loss": 0.0435,
+      "step": 7400
+    },
+    {
+      "epoch": 2.4941412788751256,
+      "grad_norm": 0.14735421538352966,
+      "learning_rate": 2.9229438678718897e-05,
+      "loss": 0.0489,
+      "step": 7450
+    },
+    {
+      "epoch": 2.5108804820890525,
+      "grad_norm": 0.1486055999994278,
+      "learning_rate": 2.9089945318602834e-05,
+      "loss": 0.0477,
+      "step": 7500
+    },
+    {
+      "epoch": 2.5276196853029793,
+      "grad_norm": 0.17078754305839539,
+      "learning_rate": 2.895045195848678e-05,
+      "loss": 0.0444,
+      "step": 7550
+    },
+    {
+      "epoch": 2.5443588885169066,
+      "grad_norm": 0.19276435673236847,
+      "learning_rate": 2.881095859837072e-05,
+      "loss": 0.0486,
+      "step": 7600
+    },
+    {
+      "epoch": 2.5610980917308335,
+      "grad_norm": 0.21209606528282166,
+      "learning_rate": 2.8671465238254656e-05,
+      "loss": 0.0497,
+      "step": 7650
+    },
+    {
+      "epoch": 2.5778372949447608,
+      "grad_norm": 0.21018877625465393,
+      "learning_rate": 2.8531971878138604e-05,
+      "loss": 0.0441,
+      "step": 7700
+    },
+    {
+      "epoch": 2.5945764981586876,
+      "grad_norm": 0.15666617453098297,
+      "learning_rate": 2.839247851802254e-05,
+      "loss": 0.0467,
+      "step": 7750
+    },
+    {
+      "epoch": 2.6113157013726145,
+      "grad_norm": 0.1940685212612152,
+      "learning_rate": 2.8252985157906485e-05,
+      "loss": 0.0523,
+      "step": 7800
+    },
+    {
+      "epoch": 2.628054904586542,
+      "grad_norm": 0.28480586409568787,
+      "learning_rate": 2.8113491797790426e-05,
+      "loss": 0.0481,
+      "step": 7850
+    },
+    {
+      "epoch": 2.6447941078004686,
+      "grad_norm": 0.2223973125219345,
+      "learning_rate": 2.797399843767437e-05,
+      "loss": 0.0432,
+      "step": 7900
+    },
+    {
+      "epoch": 2.661533311014396,
+      "grad_norm": 0.15986157953739166,
+      "learning_rate": 2.783450507755831e-05,
+      "loss": 0.0454,
+      "step": 7950
+    },
+    {
+      "epoch": 2.678272514228323,
+      "grad_norm": 0.1384258270263672,
+      "learning_rate": 2.7695011717442248e-05,
+      "loss": 0.0477,
+      "step": 8000
+    },
+    {
+      "epoch": 2.6950117174422497,
+      "grad_norm": 0.1721869707107544,
+      "learning_rate": 2.7555518357326192e-05,
+      "loss": 0.0453,
+      "step": 8050
+    },
+    {
+      "epoch": 2.7117509206561765,
+      "grad_norm": 0.20737840235233307,
+      "learning_rate": 2.7416024997210132e-05,
+      "loss": 0.0504,
+      "step": 8100
+    },
+    {
+      "epoch": 2.728490123870104,
+      "grad_norm": 0.18823584914207458,
+      "learning_rate": 2.7276531637094077e-05,
+      "loss": 0.0453,
+      "step": 8150
+    },
+    {
+      "epoch": 2.7452293270840307,
+      "grad_norm": 0.13201962411403656,
+      "learning_rate": 2.7137038276978017e-05,
+      "loss": 0.0433,
+      "step": 8200
+    },
+    {
+      "epoch": 2.761968530297958,
+      "grad_norm": 0.1443973183631897,
+      "learning_rate": 2.699754491686196e-05,
+      "loss": 0.0486,
+      "step": 8250
+    },
+    {
+      "epoch": 2.778707733511885,
+      "grad_norm": 0.29314514994621277,
+      "learning_rate": 2.68580515567459e-05,
+      "loss": 0.05,
+      "step": 8300
+    },
+    {
+      "epoch": 2.7954469367258117,
+      "grad_norm": 0.14852124452590942,
+      "learning_rate": 2.671855819662984e-05,
+      "loss": 0.0495,
+      "step": 8350
+    },
+    {
+      "epoch": 2.812186139939739,
+      "grad_norm": 0.19024662673473358,
+      "learning_rate": 2.6579064836513783e-05,
+      "loss": 0.0508,
+      "step": 8400
+    },
+    {
+      "epoch": 2.828925343153666,
+      "grad_norm": 0.1745578795671463,
+      "learning_rate": 2.6439571476397724e-05,
+      "loss": 0.0443,
+      "step": 8450
+    },
+    {
+      "epoch": 2.845664546367593,
+      "grad_norm": 0.18390017747879028,
+      "learning_rate": 2.6300078116281668e-05,
+      "loss": 0.0468,
+      "step": 8500
+    },
+    {
+      "epoch": 2.86240374958152,
+      "grad_norm": 0.22483347356319427,
+      "learning_rate": 2.616058475616561e-05,
+      "loss": 0.0467,
+      "step": 8550
+    },
+    {
+      "epoch": 2.879142952795447,
+      "grad_norm": 0.18160563707351685,
+      "learning_rate": 2.6021091396049553e-05,
+      "loss": 0.0441,
+      "step": 8600
+    },
+    {
+      "epoch": 2.8958821560093737,
+      "grad_norm": 0.13408955931663513,
+      "learning_rate": 2.588159803593349e-05,
+      "loss": 0.0446,
+      "step": 8650
+    },
+    {
+      "epoch": 2.912621359223301,
+      "grad_norm": 0.16038326919078827,
+      "learning_rate": 2.574210467581743e-05,
+      "loss": 0.0456,
+      "step": 8700
+    },
+    {
+      "epoch": 2.929360562437228,
+      "grad_norm": 0.22738413512706757,
+      "learning_rate": 2.5602611315701375e-05,
+      "loss": 0.0479,
+      "step": 8750
+    },
+    {
+      "epoch": 2.946099765651155,
+      "grad_norm": 0.20327210426330566,
+      "learning_rate": 2.5463117955585315e-05,
+      "loss": 0.0511,
+      "step": 8800
+    },
+    {
+      "epoch": 2.962838968865082,
+      "grad_norm": 0.15756353735923767,
+      "learning_rate": 2.532362459546926e-05,
+      "loss": 0.0426,
+      "step": 8850
+    },
+    {
+      "epoch": 2.979578172079009,
+      "grad_norm": 0.1305045783519745,
+      "learning_rate": 2.5184131235353197e-05,
+      "loss": 0.0442,
+      "step": 8900
+    },
+    {
+      "epoch": 2.996317375292936,
+      "grad_norm": 0.1610562801361084,
+      "learning_rate": 2.5044637875237144e-05,
+      "loss": 0.0467,
+      "step": 8950
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.04702676460146904,
+      "eval_runtime": 52.8989,
+      "eval_samples_per_second": 225.865,
+      "eval_steps_per_second": 14.121,
+      "step": 8961
+    },
+    {
+      "epoch": 3.013056578506863,
+      "grad_norm": 0.2042045295238495,
+      "learning_rate": 2.490514451512108e-05,
+      "loss": 0.0457,
+      "step": 9000
+    },
+    {
+      "epoch": 3.02979578172079,
+      "grad_norm": 0.27092283964157104,
+      "learning_rate": 2.4765651155005022e-05,
+      "loss": 0.0437,
+      "step": 9050
+    },
+    {
+      "epoch": 3.046534984934717,
+      "grad_norm": 0.18729308247566223,
+      "learning_rate": 2.4626157794888966e-05,
+      "loss": 0.039,
+      "step": 9100
+    },
+    {
+      "epoch": 3.063274188148644,
+      "grad_norm": 0.18845289945602417,
+      "learning_rate": 2.4486664434772903e-05,
+      "loss": 0.0422,
+      "step": 9150
+    },
+    {
+      "epoch": 3.0800133913625714,
+      "grad_norm": 0.17593874037265778,
+      "learning_rate": 2.4347171074656847e-05,
+      "loss": 0.0384,
+      "step": 9200
+    },
+    {
+      "epoch": 3.096752594576498,
+      "grad_norm": 0.23149123787879944,
+      "learning_rate": 2.4207677714540788e-05,
+      "loss": 0.0443,
+      "step": 9250
+    },
+    {
+      "epoch": 3.113491797790425,
+      "grad_norm": 0.1968075931072235,
+      "learning_rate": 2.4068184354424732e-05,
+      "loss": 0.0402,
+      "step": 9300
+    },
+    {
+      "epoch": 3.1302310010043524,
+      "grad_norm": 0.16893354058265686,
+      "learning_rate": 2.3928690994308673e-05,
+      "loss": 0.043,
+      "step": 9350
+    },
+    {
+      "epoch": 3.146970204218279,
+      "grad_norm": 0.161103293299675,
+      "learning_rate": 2.3789197634192613e-05,
+      "loss": 0.0414,
+      "step": 9400
+    },
+    {
+      "epoch": 3.163709407432206,
+      "grad_norm": 0.22712625563144684,
+      "learning_rate": 2.3649704274076557e-05,
+      "loss": 0.0426,
+      "step": 9450
+    },
+    {
+      "epoch": 3.1804486106461334,
+      "grad_norm": 0.12811464071273804,
+      "learning_rate": 2.3510210913960495e-05,
+      "loss": 0.0406,
+      "step": 9500
+    },
+    {
+      "epoch": 3.1971878138600602,
+      "grad_norm": 0.16762731969356537,
+      "learning_rate": 2.337071755384444e-05,
+      "loss": 0.0417,
+      "step": 9550
+    },
+    {
+      "epoch": 3.213927017073987,
+      "grad_norm": 0.14003124833106995,
+      "learning_rate": 2.323122419372838e-05,
+      "loss": 0.0453,
+      "step": 9600
+    },
+    {
+      "epoch": 3.2306662202879144,
+      "grad_norm": 0.16891060769557953,
+      "learning_rate": 2.309173083361232e-05,
+      "loss": 0.0445,
+      "step": 9650
+    },
+    {
+      "epoch": 3.2474054235018412,
+      "grad_norm": 0.16900208592414856,
+      "learning_rate": 2.2952237473496264e-05,
+      "loss": 0.039,
+      "step": 9700
+    },
+    {
+      "epoch": 3.264144626715768,
+      "grad_norm": 0.17154955863952637,
+      "learning_rate": 2.2812744113380205e-05,
+      "loss": 0.0403,
+      "step": 9750
+    },
+    {
+      "epoch": 3.2808838299296954,
+      "grad_norm": 0.1620296835899353,
+      "learning_rate": 2.2673250753264145e-05,
+      "loss": 0.0406,
+      "step": 9800
+    },
+    {
+      "epoch": 3.2976230331436223,
+      "grad_norm": 0.14487063884735107,
+      "learning_rate": 2.2533757393148086e-05,
+      "loss": 0.0467,
+      "step": 9850
+    },
+    {
+      "epoch": 3.3143622363575496,
+      "grad_norm": 0.13799156248569489,
+      "learning_rate": 2.2394264033032027e-05,
+      "loss": 0.0433,
+      "step": 9900
+    },
+    {
+      "epoch": 3.3311014395714764,
+      "grad_norm": 0.1507265418767929,
+      "learning_rate": 2.225477067291597e-05,
+      "loss": 0.0446,
+      "step": 9950
+    },
+    {
+      "epoch": 3.3478406427854033,
+      "grad_norm": 0.1605840027332306,
+      "learning_rate": 2.211527731279991e-05,
+      "loss": 0.0415,
+      "step": 10000
+    },
+    {
+      "epoch": 3.3645798459993306,
+      "grad_norm": 0.11910756677389145,
+      "learning_rate": 2.1975783952683855e-05,
+      "loss": 0.0453,
+      "step": 10050
+    },
+    {
+      "epoch": 3.3813190492132574,
+      "grad_norm": 0.15115414559841156,
+      "learning_rate": 2.1836290592567796e-05,
+      "loss": 0.0395,
+      "step": 10100
+    },
+    {
+      "epoch": 3.3980582524271843,
+      "grad_norm": 0.2044568508863449,
+      "learning_rate": 2.1696797232451737e-05,
+      "loss": 0.0473,
+      "step": 10150
+    },
+    {
+      "epoch": 3.4147974556411116,
+      "grad_norm": 0.18123552203178406,
+      "learning_rate": 2.1557303872335677e-05,
+      "loss": 0.0411,
+      "step": 10200
+    },
+    {
+      "epoch": 3.4315366588550384,
+      "grad_norm": 0.6018120646476746,
+      "learning_rate": 2.1417810512219618e-05,
+      "loss": 0.0413,
+      "step": 10250
+    },
+    {
+      "epoch": 3.4482758620689653,
+      "grad_norm": 0.22490189969539642,
+      "learning_rate": 2.1278317152103562e-05,
+      "loss": 0.0437,
+      "step": 10300
+    },
+    {
+      "epoch": 3.4650150652828926,
+      "grad_norm": 0.1784990429878235,
+      "learning_rate": 2.1138823791987503e-05,
+      "loss": 0.0453,
+      "step": 10350
+    },
+    {
+      "epoch": 3.4817542684968195,
+      "grad_norm": 0.15248402953147888,
+      "learning_rate": 2.0999330431871443e-05,
+      "loss": 0.0456,
+      "step": 10400
+    },
+    {
+      "epoch": 3.4984934717107468,
+      "grad_norm": 0.15146291255950928,
+      "learning_rate": 2.0859837071755387e-05,
+      "loss": 0.0393,
+      "step": 10450
+    },
+    {
+      "epoch": 3.5152326749246736,
+      "grad_norm": 0.1662750393152237,
+      "learning_rate": 2.0720343711639325e-05,
+      "loss": 0.0408,
+      "step": 10500
+    },
+    {
+      "epoch": 3.5319718781386005,
+      "grad_norm": 0.1244506984949112,
+      "learning_rate": 2.058085035152327e-05,
+      "loss": 0.0426,
+      "step": 10550
+    },
+    {
+      "epoch": 3.5487110813525278,
+      "grad_norm": 0.1180344969034195,
+      "learning_rate": 2.044135699140721e-05,
+      "loss": 0.0434,
+      "step": 10600
+    },
+    {
+      "epoch": 3.5654502845664546,
+      "grad_norm": 0.15951013565063477,
+      "learning_rate": 2.030186363129115e-05,
+      "loss": 0.0387,
+      "step": 10650
+    },
+    {
+      "epoch": 3.582189487780382,
+      "grad_norm": 0.16064217686653137,
+      "learning_rate": 2.0162370271175094e-05,
+      "loss": 0.0445,
+      "step": 10700
+    },
+    {
+      "epoch": 3.598928690994309,
+      "grad_norm": 0.17813698947429657,
+      "learning_rate": 2.0022876911059035e-05,
+      "loss": 0.0407,
+      "step": 10750
+    },
+    {
+      "epoch": 3.6156678942082356,
+      "grad_norm": 0.1256450116634369,
+      "learning_rate": 1.988338355094298e-05,
+      "loss": 0.0462,
+      "step": 10800
+    },
+    {
+      "epoch": 3.6324070974221625,
+      "grad_norm": 0.14016403257846832,
+      "learning_rate": 1.9743890190826916e-05,
+      "loss": 0.0456,
+      "step": 10850
+    },
+    {
+      "epoch": 3.64914630063609,
+      "grad_norm": 0.1396850347518921,
+      "learning_rate": 1.9604396830710857e-05,
+      "loss": 0.0403,
+      "step": 10900
+    },
+    {
+      "epoch": 3.6658855038500167,
+      "grad_norm": 0.17943057417869568,
+      "learning_rate": 1.94649034705948e-05,
+      "loss": 0.0417,
+      "step": 10950
+    },
+    {
+      "epoch": 3.682624707063944,
+      "grad_norm": 0.14947953820228577,
+      "learning_rate": 1.932541011047874e-05,
+      "loss": 0.0421,
+      "step": 11000
+    },
+    {
+      "epoch": 3.699363910277871,
+      "grad_norm": 0.12628613412380219,
+      "learning_rate": 1.9185916750362685e-05,
+      "loss": 0.0435,
+      "step": 11050
+    },
+    {
+      "epoch": 3.7161031134917977,
+      "grad_norm": 0.2205984890460968,
+      "learning_rate": 1.9046423390246626e-05,
+      "loss": 0.0396,
+      "step": 11100
+    },
+    {
+      "epoch": 3.732842316705725,
+      "grad_norm": 0.13236357271671295,
+      "learning_rate": 1.8906930030130567e-05,
+      "loss": 0.0405,
+      "step": 11150
+    },
+    {
+      "epoch": 3.749581519919652,
+      "grad_norm": 0.15023528039455414,
+      "learning_rate": 1.8767436670014507e-05,
+      "loss": 0.0434,
+      "step": 11200
+    },
+    {
+      "epoch": 3.7663207231335787,
+      "grad_norm": 0.1427326649427414,
+      "learning_rate": 1.8627943309898448e-05,
+      "loss": 0.0437,
+      "step": 11250
+    },
+    {
+      "epoch": 3.783059926347506,
+      "grad_norm": 0.1890624761581421,
+      "learning_rate": 1.8488449949782392e-05,
+      "loss": 0.0408,
+      "step": 11300
+    },
+    {
+      "epoch": 3.799799129561433,
+      "grad_norm": 0.27970972657203674,
+      "learning_rate": 1.8348956589666333e-05,
+      "loss": 0.0441,
+      "step": 11350
+    },
+    {
+      "epoch": 3.8165383327753597,
+      "grad_norm": 0.12823455035686493,
+      "learning_rate": 1.8209463229550273e-05,
+      "loss": 0.0412,
+      "step": 11400
+    },
+    {
+      "epoch": 3.833277535989287,
+      "grad_norm": 0.1442965269088745,
+      "learning_rate": 1.8069969869434218e-05,
+      "loss": 0.0416,
+      "step": 11450
+    },
+    {
+      "epoch": 3.850016739203214,
+      "grad_norm": 0.13739417493343353,
+      "learning_rate": 1.7930476509318158e-05,
+      "loss": 0.0397,
+      "step": 11500
+    },
+    {
+      "epoch": 3.866755942417141,
+      "grad_norm": 0.16616705060005188,
+      "learning_rate": 1.77909831492021e-05,
+      "loss": 0.0413,
+      "step": 11550
+    },
+    {
+      "epoch": 3.883495145631068,
+      "grad_norm": 0.23060384392738342,
+      "learning_rate": 1.765148978908604e-05,
+      "loss": 0.0447,
+      "step": 11600
+    },
+    {
+      "epoch": 3.900234348844995,
+      "grad_norm": 0.2936810553073883,
+      "learning_rate": 1.751199642896998e-05,
+      "loss": 0.0409,
+      "step": 11650
+    },
+    {
+      "epoch": 3.9169735520589217,
+      "grad_norm": 0.17367126047611237,
+      "learning_rate": 1.7372503068853924e-05,
+      "loss": 0.0411,
+      "step": 11700
+    },
+    {
+      "epoch": 3.933712755272849,
+      "grad_norm": 0.14550547301769257,
+      "learning_rate": 1.7233009708737865e-05,
+      "loss": 0.0445,
+      "step": 11750
+    },
+    {
+      "epoch": 3.950451958486776,
+      "grad_norm": 0.13322454690933228,
+      "learning_rate": 1.709351634862181e-05,
+      "loss": 0.0444,
+      "step": 11800
+    },
+    {
+      "epoch": 3.967191161700703,
+      "grad_norm": 0.13606959581375122,
+      "learning_rate": 1.6954022988505746e-05,
+      "loss": 0.041,
+      "step": 11850
+    },
+    {
+      "epoch": 3.98393036491463,
+      "grad_norm": 0.14227426052093506,
+      "learning_rate": 1.681452962838969e-05,
+      "loss": 0.0412,
+      "step": 11900
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.045825306326150894,
+      "eval_runtime": 48.668,
+      "eval_samples_per_second": 245.5,
+      "eval_steps_per_second": 15.349,
+      "step": 11948
+    },
+    {
+      "epoch": 4.000669568128557,
+      "grad_norm": 0.14429377019405365,
+      "learning_rate": 1.667503626827363e-05,
+      "loss": 0.0398,
+      "step": 11950
+    },
+    {
+      "epoch": 4.017408771342484,
+      "grad_norm": 0.13842210173606873,
+      "learning_rate": 1.653554290815757e-05,
+      "loss": 0.0365,
+      "step": 12000
+    },
+    {
+      "epoch": 4.0341479745564115,
+      "grad_norm": 0.2168322205543518,
+      "learning_rate": 1.6396049548041516e-05,
+      "loss": 0.0395,
+      "step": 12050
+    },
+    {
+      "epoch": 4.050887177770338,
+      "grad_norm": 0.1534542292356491,
+      "learning_rate": 1.6256556187925456e-05,
+      "loss": 0.0377,
+      "step": 12100
+    },
+    {
+      "epoch": 4.067626380984265,
+      "grad_norm": 0.13067083060741425,
+      "learning_rate": 1.6117062827809397e-05,
+      "loss": 0.042,
+      "step": 12150
+    },
+    {
+      "epoch": 4.084365584198192,
+      "grad_norm": 0.13599875569343567,
+      "learning_rate": 1.5977569467693338e-05,
+      "loss": 0.0403,
+      "step": 12200
+    },
+    {
+      "epoch": 4.101104787412119,
+      "grad_norm": 0.23121419548988342,
+      "learning_rate": 1.5838076107577278e-05,
+      "loss": 0.0406,
+      "step": 12250
+    },
+    {
+      "epoch": 4.117843990626046,
+      "grad_norm": 0.15491856634616852,
+      "learning_rate": 1.5698582747461222e-05,
+      "loss": 0.0381,
+      "step": 12300
+    },
+    {
+      "epoch": 4.1345831938399735,
+      "grad_norm": 0.13168101012706757,
+      "learning_rate": 1.5559089387345163e-05,
+      "loss": 0.0353,
+      "step": 12350
+    },
+    {
+      "epoch": 4.1513223970539,
+      "grad_norm": 0.1694163829088211,
+      "learning_rate": 1.5419596027229104e-05,
+      "loss": 0.04,
+      "step": 12400
+    },
+    {
+      "epoch": 4.168061600267827,
+      "grad_norm": 0.17727704346179962,
+      "learning_rate": 1.5280102667113048e-05,
+      "loss": 0.0364,
+      "step": 12450
+    },
+    {
+      "epoch": 4.184800803481754,
+      "grad_norm": 0.18948902189731598,
+      "learning_rate": 1.5140609306996988e-05,
+      "loss": 0.0372,
+      "step": 12500
+    },
+    {
+      "epoch": 4.201540006695681,
+      "grad_norm": 0.2130223959684372,
+      "learning_rate": 1.5001115946880929e-05,
+      "loss": 0.0398,
+      "step": 12550
+    },
+    {
+      "epoch": 4.218279209909609,
+      "grad_norm": 0.15893810987472534,
+      "learning_rate": 1.486162258676487e-05,
+      "loss": 0.0387,
+      "step": 12600
+    },
+    {
+      "epoch": 4.2350184131235356,
+      "grad_norm": 0.19536694884300232,
+      "learning_rate": 1.4722129226648812e-05,
+      "loss": 0.041,
+      "step": 12650
+    },
+    {
+      "epoch": 4.251757616337462,
+      "grad_norm": 0.16755405068397522,
+      "learning_rate": 1.4582635866532754e-05,
+      "loss": 0.0367,
+      "step": 12700
+    },
+    {
+      "epoch": 4.268496819551389,
+      "grad_norm": 0.14820334315299988,
+      "learning_rate": 1.4443142506416697e-05,
+      "loss": 0.0382,
+      "step": 12750
+    },
+    {
+      "epoch": 4.285236022765316,
+      "grad_norm": 0.21781007945537567,
+      "learning_rate": 1.4303649146300637e-05,
+      "loss": 0.0398,
+      "step": 12800
+    },
+    {
+      "epoch": 4.301975225979243,
+      "grad_norm": 0.17162373661994934,
+      "learning_rate": 1.4164155786184576e-05,
+      "loss": 0.042,
+      "step": 12850
+    },
+    {
+      "epoch": 4.318714429193171,
+      "grad_norm": 0.13765838742256165,
+      "learning_rate": 1.4024662426068519e-05,
+      "loss": 0.0378,
+      "step": 12900
+    },
+    {
+      "epoch": 4.335453632407098,
+      "grad_norm": 0.22830092906951904,
+      "learning_rate": 1.3885169065952461e-05,
+      "loss": 0.0387,
+      "step": 12950
+    },
+    {
+      "epoch": 4.352192835621024,
+      "grad_norm": 0.1682949960231781,
+      "learning_rate": 1.3745675705836403e-05,
+      "loss": 0.0408,
+      "step": 13000
+    },
+    {
+      "epoch": 4.368932038834951,
+      "grad_norm": 0.16153910756111145,
+      "learning_rate": 1.3606182345720344e-05,
+      "loss": 0.0405,
+      "step": 13050
+    },
+    {
+      "epoch": 4.385671242048878,
+      "grad_norm": 0.18771956861019135,
+      "learning_rate": 1.3466688985604286e-05,
+      "loss": 0.0427,
+      "step": 13100
+    },
+    {
+      "epoch": 4.402410445262806,
+      "grad_norm": 0.11162823438644409,
+      "learning_rate": 1.3327195625488229e-05,
+      "loss": 0.0399,
+      "step": 13150
+    },
+    {
+      "epoch": 4.419149648476733,
+      "grad_norm": 0.14466647803783417,
+      "learning_rate": 1.3187702265372168e-05,
+      "loss": 0.0401,
+      "step": 13200
+    },
+    {
+      "epoch": 4.43588885169066,
+      "grad_norm": 0.14834430813789368,
+      "learning_rate": 1.304820890525611e-05,
+      "loss": 0.0386,
+      "step": 13250
+    },
+    {
+      "epoch": 4.4526280549045865,
+      "grad_norm": 0.20324522256851196,
+      "learning_rate": 1.2908715545140052e-05,
+      "loss": 0.0379,
+      "step": 13300
+    },
+    {
+      "epoch": 4.469367258118513,
+      "grad_norm": 0.12625128030776978,
+      "learning_rate": 1.2769222185023993e-05,
+      "loss": 0.0403,
+      "step": 13350
+    },
+    {
+      "epoch": 4.48610646133244,
+      "grad_norm": 0.11018920689821243,
+      "learning_rate": 1.2629728824907935e-05,
+      "loss": 0.0383,
+      "step": 13400
+    },
+    {
+      "epoch": 4.502845664546368,
+      "grad_norm": 0.18631067872047424,
+      "learning_rate": 1.2490235464791876e-05,
+      "loss": 0.0424,
+      "step": 13450
+    },
+    {
+      "epoch": 4.519584867760295,
+      "grad_norm": 0.28578242659568787,
+      "learning_rate": 1.2350742104675818e-05,
+      "loss": 0.0385,
+      "step": 13500
+    },
+    {
+      "epoch": 4.536324070974222,
+      "grad_norm": 0.1431867629289627,
+      "learning_rate": 1.2211248744559759e-05,
+      "loss": 0.038,
+      "step": 13550
+    },
+    {
+      "epoch": 4.5530632741881485,
+      "grad_norm": 0.12731611728668213,
+      "learning_rate": 1.2071755384443701e-05,
+      "loss": 0.0389,
+      "step": 13600
+    },
+    {
+      "epoch": 4.569802477402075,
+      "grad_norm": 0.20958903431892395,
+      "learning_rate": 1.1932262024327642e-05,
+      "loss": 0.0383,
+      "step": 13650
+    },
+    {
+      "epoch": 4.586541680616003,
+      "grad_norm": 0.21877717971801758,
+      "learning_rate": 1.1792768664211584e-05,
+      "loss": 0.0399,
+      "step": 13700
+    },
+    {
+      "epoch": 4.60328088382993,
+      "grad_norm": 0.1719764918088913,
+      "learning_rate": 1.1653275304095527e-05,
+      "loss": 0.0355,
+      "step": 13750
+    },
+    {
+      "epoch": 4.620020087043857,
+      "grad_norm": 0.13044840097427368,
+      "learning_rate": 1.1513781943979467e-05,
+      "loss": 0.0386,
+      "step": 13800
+    },
+    {
+      "epoch": 4.636759290257784,
+      "grad_norm": 0.20552796125411987,
+      "learning_rate": 1.1374288583863408e-05,
+      "loss": 0.04,
+      "step": 13850
+    },
+    {
+      "epoch": 4.6534984934717105,
+      "grad_norm": 0.15806210041046143,
+      "learning_rate": 1.123479522374735e-05,
+      "loss": 0.0381,
+      "step": 13900
+    },
+    {
+      "epoch": 4.670237696685637,
+      "grad_norm": 0.2317190170288086,
+      "learning_rate": 1.1095301863631291e-05,
+      "loss": 0.0394,
+      "step": 13950
+    },
+    {
+      "epoch": 4.686976899899565,
+      "grad_norm": 0.16183790564537048,
+      "learning_rate": 1.0955808503515233e-05,
+      "loss": 0.0392,
+      "step": 14000
+    },
+    {
+      "epoch": 4.703716103113492,
+      "grad_norm": 0.14107303321361542,
+      "learning_rate": 1.0816315143399176e-05,
+      "loss": 0.039,
+      "step": 14050
+    },
+    {
+      "epoch": 4.720455306327419,
+      "grad_norm": 0.21340009570121765,
+      "learning_rate": 1.0676821783283116e-05,
+      "loss": 0.04,
+      "step": 14100
+    },
+    {
+      "epoch": 4.737194509541346,
+      "grad_norm": 0.1754944771528244,
+      "learning_rate": 1.0537328423167057e-05,
+      "loss": 0.0371,
+      "step": 14150
+    },
+    {
+      "epoch": 4.7539337127552725,
+      "grad_norm": 0.13280175626277924,
+      "learning_rate": 1.0397835063051e-05,
+      "loss": 0.0358,
+      "step": 14200
+    },
+    {
+      "epoch": 4.7706729159692,
+      "grad_norm": 0.15039420127868652,
+      "learning_rate": 1.0258341702934942e-05,
+      "loss": 0.0438,
+      "step": 14250
+    },
+    {
+      "epoch": 4.787412119183127,
+      "grad_norm": 0.30474409461021423,
+      "learning_rate": 1.0118848342818882e-05,
+      "loss": 0.0377,
+      "step": 14300
+    },
+    {
+      "epoch": 4.804151322397054,
+      "grad_norm": 0.2012936919927597,
+      "learning_rate": 9.979354982702823e-06,
+      "loss": 0.0417,
+      "step": 14350
+    },
+    {
+      "epoch": 4.820890525610981,
+      "grad_norm": 0.192657932639122,
+      "learning_rate": 9.839861622586765e-06,
+      "loss": 0.0392,
+      "step": 14400
+    },
+    {
+      "epoch": 4.837629728824908,
+      "grad_norm": 0.16141368448734283,
+      "learning_rate": 9.700368262470706e-06,
+      "loss": 0.0378,
+      "step": 14450
+    },
+    {
+      "epoch": 4.854368932038835,
+      "grad_norm": 0.19473744928836823,
+      "learning_rate": 9.560874902354648e-06,
+      "loss": 0.0368,
+      "step": 14500
+    },
+    {
+      "epoch": 4.871108135252762,
+      "grad_norm": 0.15624327957630157,
+      "learning_rate": 9.42138154223859e-06,
+      "loss": 0.0416,
+      "step": 14550
+    },
+    {
+      "epoch": 4.887847338466689,
+      "grad_norm": 0.1572103202342987,
+      "learning_rate": 9.281888182122531e-06,
+      "loss": 0.0383,
+      "step": 14600
+    },
+    {
+      "epoch": 4.904586541680616,
+      "grad_norm": 0.15121281147003174,
+      "learning_rate": 9.142394822006472e-06,
+      "loss": 0.0405,
+      "step": 14650
+    },
+    {
+      "epoch": 4.921325744894543,
+      "grad_norm": 0.1739313155412674,
+      "learning_rate": 9.002901461890415e-06,
+      "loss": 0.0389,
+      "step": 14700
+    },
+    {
+      "epoch": 4.93806494810847,
+      "grad_norm": 0.12826618552207947,
+      "learning_rate": 8.863408101774357e-06,
+      "loss": 0.0385,
+      "step": 14750
+    },
+    {
+      "epoch": 4.9548041513223975,
+      "grad_norm": 0.11419858038425446,
+      "learning_rate": 8.723914741658298e-06,
+      "loss": 0.0387,
+      "step": 14800
+    },
+    {
+      "epoch": 4.971543354536324,
+      "grad_norm": 0.18640589714050293,
+      "learning_rate": 8.584421381542238e-06,
+      "loss": 0.0376,
+      "step": 14850
+    },
+    {
+      "epoch": 4.988282557750251,
+      "grad_norm": 0.18249401450157166,
+      "learning_rate": 8.44492802142618e-06,
+      "loss": 0.041,
+      "step": 14900
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.045213617384433746,
+      "eval_runtime": 49.0497,
+      "eval_samples_per_second": 243.589,
+      "eval_steps_per_second": 15.229,
+      "step": 14935
+    },
+    {
+      "epoch": 5.005021760964178,
+      "grad_norm": 0.25643524527549744,
+      "learning_rate": 8.305434661310121e-06,
+      "loss": 0.0403,
+      "step": 14950
+    },
+    {
+      "epoch": 5.021760964178105,
+      "grad_norm": 0.16471606492996216,
+      "learning_rate": 8.165941301194064e-06,
+      "loss": 0.0374,
+      "step": 15000
+    },
+    {
+      "epoch": 5.038500167392032,
+      "grad_norm": 0.14938130974769592,
+      "learning_rate": 8.026447941078006e-06,
+      "loss": 0.0363,
+      "step": 15050
+    },
+    {
+      "epoch": 5.0552393706059595,
+      "grad_norm": 0.13239839673042297,
+      "learning_rate": 7.886954580961947e-06,
+      "loss": 0.037,
+      "step": 15100
+    },
+    {
+      "epoch": 5.071978573819886,
+      "grad_norm": 0.17447875440120697,
+      "learning_rate": 7.747461220845887e-06,
+      "loss": 0.0374,
+      "step": 15150
+    },
+    {
+      "epoch": 5.088717777033813,
+      "grad_norm": 0.15902550518512726,
+      "learning_rate": 7.6079678607298296e-06,
+      "loss": 0.0361,
+      "step": 15200
+    },
+    {
+      "epoch": 5.10545698024774,
+      "grad_norm": 0.22053466737270355,
+      "learning_rate": 7.468474500613772e-06,
+      "loss": 0.0381,
+      "step": 15250
+    },
+    {
+      "epoch": 5.122196183461667,
+      "grad_norm": 0.1754140853881836,
+      "learning_rate": 7.328981140497712e-06,
+      "loss": 0.0327,
+      "step": 15300
+    },
+    {
+      "epoch": 5.138935386675595,
+      "grad_norm": 0.15227694809436798,
+      "learning_rate": 7.189487780381654e-06,
+      "loss": 0.0343,
+      "step": 15350
+    },
+    {
+      "epoch": 5.1556745898895215,
+      "grad_norm": 0.1644590198993683,
+      "learning_rate": 7.049994420265596e-06,
+      "loss": 0.0373,
+      "step": 15400
+    },
+    {
+      "epoch": 5.172413793103448,
+      "grad_norm": 0.13261474668979645,
+      "learning_rate": 6.910501060149536e-06,
+      "loss": 0.0354,
+      "step": 15450
+    },
+    {
+      "epoch": 5.189152996317375,
+      "grad_norm": 0.16326355934143066,
+      "learning_rate": 6.771007700033479e-06,
+      "loss": 0.0368,
+      "step": 15500
+    },
+    {
+      "epoch": 5.205892199531302,
+      "grad_norm": 0.21868841350078583,
+      "learning_rate": 6.63151433991742e-06,
+      "loss": 0.0384,
+      "step": 15550
+    },
+    {
+      "epoch": 5.222631402745229,
+      "grad_norm": 0.1279917061328888,
+      "learning_rate": 6.4920209798013624e-06,
+      "loss": 0.037,
+      "step": 15600
+    },
+    {
+      "epoch": 5.239370605959157,
+      "grad_norm": 0.14255809783935547,
+      "learning_rate": 6.352527619685303e-06,
+      "loss": 0.0345,
+      "step": 15650
+    },
+    {
+      "epoch": 5.256109809173084,
+      "grad_norm": 0.15950387716293335,
+      "learning_rate": 6.213034259569245e-06,
+      "loss": 0.0389,
+      "step": 15700
+    },
+    {
+      "epoch": 5.27284901238701,
+      "grad_norm": 0.1789381355047226,
+      "learning_rate": 6.073540899453186e-06,
+      "loss": 0.0368,
+      "step": 15750
+    },
+    {
+      "epoch": 5.289588215600937,
+      "grad_norm": 0.17775952816009521,
+      "learning_rate": 5.934047539337128e-06,
+      "loss": 0.0367,
+      "step": 15800
+    },
+    {
+      "epoch": 5.306327418814864,
+      "grad_norm": 0.16045907139778137,
+      "learning_rate": 5.794554179221069e-06,
+      "loss": 0.0363,
+      "step": 15850
+    },
+    {
+      "epoch": 5.323066622028792,
+      "grad_norm": 0.14778949320316315,
+      "learning_rate": 5.6550608191050115e-06,
+      "loss": 0.038,
+      "step": 15900
+    },
+    {
+      "epoch": 5.339805825242719,
+      "grad_norm": 0.13788272440433502,
+      "learning_rate": 5.515567458988952e-06,
+      "loss": 0.0336,
+      "step": 15950
+    },
+    {
+      "epoch": 5.356545028456646,
+      "grad_norm": 0.14058952033519745,
+      "learning_rate": 5.376074098872894e-06,
+      "loss": 0.0371,
+      "step": 16000
+    },
+    {
+      "epoch": 5.3732842316705725,
+      "grad_norm": 0.13372714817523956,
+      "learning_rate": 5.236580738756835e-06,
+      "loss": 0.0388,
+      "step": 16050
+    },
+    {
+      "epoch": 5.390023434884499,
+      "grad_norm": 0.09582552313804626,
+      "learning_rate": 5.097087378640777e-06,
+      "loss": 0.0345,
+      "step": 16100
+    },
+    {
+      "epoch": 5.406762638098426,
+      "grad_norm": 0.14247213304042816,
+      "learning_rate": 4.957594018524719e-06,
+      "loss": 0.0348,
+      "step": 16150
+    },
+    {
+      "epoch": 5.423501841312354,
+      "grad_norm": 0.18378828465938568,
+      "learning_rate": 4.81810065840866e-06,
+      "loss": 0.036,
+      "step": 16200
+    },
+    {
+      "epoch": 5.440241044526281,
+      "grad_norm": 0.14933009445667267,
+      "learning_rate": 4.678607298292601e-06,
+      "loss": 0.0385,
+      "step": 16250
+    },
+    {
+      "epoch": 5.456980247740208,
+      "grad_norm": 0.14775556325912476,
+      "learning_rate": 4.5391139381765435e-06,
+      "loss": 0.0381,
+      "step": 16300
+    },
+    {
+      "epoch": 5.4737194509541345,
+      "grad_norm": 0.17397841811180115,
+      "learning_rate": 4.399620578060484e-06,
+      "loss": 0.0355,
+      "step": 16350
+    },
+    {
+      "epoch": 5.490458654168061,
+      "grad_norm": 0.15232603251934052,
+      "learning_rate": 4.2601272179444265e-06,
+      "loss": 0.0363,
+      "step": 16400
+    },
+    {
+      "epoch": 5.507197857381989,
+      "grad_norm": 0.1729612499475479,
+      "learning_rate": 4.120633857828367e-06,
+      "loss": 0.0356,
+      "step": 16450
+    },
+    {
+      "epoch": 5.523937060595916,
+      "grad_norm": 0.12964119017124176,
+      "learning_rate": 3.9811404977123095e-06,
+      "loss": 0.0393,
+      "step": 16500
+    },
+    {
+      "epoch": 5.540676263809843,
+      "grad_norm": 0.201249361038208,
+      "learning_rate": 3.841647137596251e-06,
+      "loss": 0.0368,
+      "step": 16550
+    },
+    {
+      "epoch": 5.55741546702377,
+      "grad_norm": 0.14805611968040466,
+      "learning_rate": 3.7021537774801917e-06,
+      "loss": 0.0338,
+      "step": 16600
+    },
+    {
+      "epoch": 5.5741546702376965,
+      "grad_norm": 0.11392233520746231,
+      "learning_rate": 3.5626604173641336e-06,
+      "loss": 0.0351,
+      "step": 16650
+    },
+    {
+      "epoch": 5.590893873451623,
+      "grad_norm": 0.13584397733211517,
+      "learning_rate": 3.423167057248075e-06,
+      "loss": 0.0373,
+      "step": 16700
+    },
+    {
+      "epoch": 5.60763307666555,
+      "grad_norm": 0.17985470592975616,
+      "learning_rate": 3.283673697132017e-06,
+      "loss": 0.0355,
+      "step": 16750
+    },
+    {
+      "epoch": 5.624372279879478,
+      "grad_norm": 0.18598899245262146,
+      "learning_rate": 3.144180337015958e-06,
+      "loss": 0.0409,
+      "step": 16800
+    },
+    {
+      "epoch": 5.641111483093405,
+      "grad_norm": 0.17507706582546234,
+      "learning_rate": 3.0046869768998996e-06,
+      "loss": 0.0354,
+      "step": 16850
+    },
+    {
+      "epoch": 5.657850686307332,
+      "grad_norm": 0.15566672384738922,
+      "learning_rate": 2.865193616783841e-06,
+      "loss": 0.0347,
+      "step": 16900
+    },
+    {
+      "epoch": 5.6745898895212585,
+      "grad_norm": 0.15541379153728485,
+      "learning_rate": 2.7257002566677827e-06,
+      "loss": 0.0398,
+      "step": 16950
+    },
+    {
+      "epoch": 5.691329092735186,
+      "grad_norm": 0.20108754932880402,
+      "learning_rate": 2.586206896551724e-06,
+      "loss": 0.0358,
+      "step": 17000
+    },
+    {
+      "epoch": 5.708068295949113,
+      "grad_norm": 0.1303117424249649,
+      "learning_rate": 2.4467135364356657e-06,
+      "loss": 0.043,
+      "step": 17050
+    },
+    {
+      "epoch": 5.72480749916304,
+      "grad_norm": 0.14489831030368805,
+      "learning_rate": 2.3072201763196076e-06,
+      "loss": 0.0377,
+      "step": 17100
+    },
+    {
+      "epoch": 5.741546702376967,
+      "grad_norm": 0.10605516284704208,
+      "learning_rate": 2.167726816203549e-06,
+      "loss": 0.0348,
+      "step": 17150
+    },
+    {
+      "epoch": 5.758285905590894,
+      "grad_norm": 0.29746726155281067,
+      "learning_rate": 2.02823345608749e-06,
+      "loss": 0.0359,
+      "step": 17200
+    },
+    {
+      "epoch": 5.775025108804821,
+      "grad_norm": 0.14607931673526764,
+      "learning_rate": 1.888740095971432e-06,
+      "loss": 0.0361,
+      "step": 17250
+    },
+    {
+      "epoch": 5.791764312018747,
+      "grad_norm": 0.12281953543424606,
+      "learning_rate": 1.7492467358553734e-06,
+      "loss": 0.0377,
+      "step": 17300
+    },
+    {
+      "epoch": 5.808503515232675,
+      "grad_norm": 0.19870831072330475,
+      "learning_rate": 1.609753375739315e-06,
+      "loss": 0.0383,
+      "step": 17350
+    },
+    {
+      "epoch": 5.825242718446602,
+      "grad_norm": 0.13006938993930817,
+      "learning_rate": 1.4702600156232564e-06,
+      "loss": 0.0381,
+      "step": 17400
+    },
+    {
+      "epoch": 5.841981921660529,
+      "grad_norm": 0.12237653881311417,
+      "learning_rate": 1.330766655507198e-06,
+      "loss": 0.0375,
+      "step": 17450
+    },
+    {
+      "epoch": 5.858721124874456,
+      "grad_norm": 0.17220979928970337,
+      "learning_rate": 1.1912732953911394e-06,
+      "loss": 0.0365,
+      "step": 17500
+    },
+    {
+      "epoch": 5.8754603280883835,
+      "grad_norm": 0.18653325736522675,
+      "learning_rate": 1.051779935275081e-06,
+      "loss": 0.0383,
+      "step": 17550
+    },
+    {
+      "epoch": 5.89219953130231,
+      "grad_norm": 0.11963806301355362,
+      "learning_rate": 9.122865751590225e-07,
+      "loss": 0.0399,
+      "step": 17600
+    },
+    {
+      "epoch": 5.908938734516237,
+      "grad_norm": 0.26904088258743286,
+      "learning_rate": 7.72793215042964e-07,
+      "loss": 0.0405,
+      "step": 17650
+    },
+    {
+      "epoch": 5.925677937730164,
+      "grad_norm": 0.149702250957489,
+      "learning_rate": 6.332998549269055e-07,
+      "loss": 0.0358,
+      "step": 17700
+    },
+    {
+      "epoch": 5.942417140944091,
+      "grad_norm": 0.15700772404670715,
+      "learning_rate": 4.93806494810847e-07,
+      "loss": 0.0374,
+      "step": 17750
+    },
+    {
+      "epoch": 5.959156344158018,
+      "grad_norm": 0.20573197305202484,
+      "learning_rate": 3.5431313469478856e-07,
+      "loss": 0.0362,
+      "step": 17800
+    },
+    {
+      "epoch": 5.975895547371945,
+      "grad_norm": 0.11808827519416809,
+      "learning_rate": 2.1481977457873006e-07,
+      "loss": 0.0371,
+      "step": 17850
+    },
+    {
+      "epoch": 5.992634750585872,
+      "grad_norm": 0.1634693145751953,
+      "learning_rate": 7.532641446267158e-08,
+      "loss": 0.0365,
+      "step": 17900
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.045013878494501114,
+      "eval_runtime": 49.7786,
+      "eval_samples_per_second": 240.023,
+      "eval_steps_per_second": 15.006,
+      "step": 17922
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 17922,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.888169575933542e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-17922/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37dd546b69fb60d8deb15a8b88e40b23e367c0e9f5a053ea3ae7c730b3874f2e
+size 5304

checkpoint-17922/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2987/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "ckartal/english-to-turkish-finetuned-model",
+  "activation_dropout": 0.0,
+  "activation_function": "swish",
+  "architectures": [
+    "MarianMTModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 512,
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 59993,
+  "decoder_vocab_size": 59994,
+  "dropout": 0.1,
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": null,
+  "max_position_embeddings": 512,
+  "model_type": "marian",
+  "normalize_embedding": false,
+  "num_beams": null,
+  "num_hidden_layers": 6,
+  "pad_token_id": 59993,
+  "scale_embedding": true,
+  "share_encoder_decoder_embeddings": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 59994
+}

checkpoint-2987/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "bad_words_ids": [
+    [
+      59993
+    ]
+  ],
+  "bos_token_id": 0,
+  "decoder_start_token_id": 59993,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "max_length": 512,
+  "num_beams": 6,
+  "pad_token_id": 59993,
+  "renormalize_logits": true,
+  "transformers_version": "4.49.0"
+}

checkpoint-2987/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:772b21519870015edcff8e2db9a6a4448060eeeb8178debe04f9b0dd8da0ceff
+size 299690728

checkpoint-2987/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06b8b97b6f892dac0c6a91fa2167f3739d8cd231960f03b5887658f9fbe99aa4
+size 599054970

checkpoint-2987/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9ec85fbd9238c9947c10323eb559f724ad5ae320e21f57a8df756ef2b058ef4
+size 14244

checkpoint-2987/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73f80cf9a0e45ee7096ade8909076162b7d66cc8713162644286a9a6524a58c6
+size 988

checkpoint-2987/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48275315da074348c5e3aa2b9e9ce7e52122fa39adbd74e2b655d4d79629ace5
+size 1064