ewre324 commited on Jan 30

Commit

92b39f0

verified ·

1 Parent(s): 06df5ec

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

checkpoint-1000/added_tokens.json +4 -0
checkpoint-1000/config.json +27 -0
checkpoint-1000/generation_config.json +6 -0
checkpoint-1000/model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/special_tokens_map.json +24 -0
checkpoint-1000/tokenizer.json +0 -0
checkpoint-1000/tokenizer.model +3 -0
checkpoint-1000/tokenizer_config.json +61 -0
checkpoint-1000/trainer_state.json +1473 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1500/added_tokens.json +4 -0
checkpoint-1500/config.json +27 -0
checkpoint-1500/generation_config.json +6 -0
checkpoint-1500/model.safetensors +3 -0
checkpoint-1500/optimizer.pt +3 -0
checkpoint-1500/rng_state.pth +3 -0
checkpoint-1500/scheduler.pt +3 -0
checkpoint-1500/special_tokens_map.json +24 -0
checkpoint-1500/tokenizer.json +0 -0
checkpoint-1500/tokenizer.model +3 -0
checkpoint-1500/tokenizer_config.json +61 -0
checkpoint-1500/trainer_state.json +2189 -0
checkpoint-1500/training_args.bin +3 -0
checkpoint-2000/added_tokens.json +4 -0
checkpoint-2000/config.json +27 -0
checkpoint-2000/generation_config.json +6 -0
checkpoint-2000/model.safetensors +3 -0
checkpoint-2000/optimizer.pt +3 -0
checkpoint-2000/rng_state.pth +3 -0
checkpoint-2000/scheduler.pt +3 -0
checkpoint-2000/special_tokens_map.json +24 -0
checkpoint-2000/tokenizer.json +0 -0
checkpoint-2000/tokenizer.model +3 -0
checkpoint-2000/tokenizer_config.json +61 -0
checkpoint-2000/trainer_state.json +2913 -0
checkpoint-2000/training_args.bin +3 -0
checkpoint-500/added_tokens.json +4 -0
checkpoint-500/config.json +27 -0
checkpoint-500/generation_config.json +6 -0
checkpoint-500/model.safetensors +3 -0
checkpoint-500/optimizer.pt +3 -0
checkpoint-500/rng_state.pth +3 -0
checkpoint-500/scheduler.pt +3 -0
checkpoint-500/special_tokens_map.json +24 -0
checkpoint-500/tokenizer.json +0 -0
checkpoint-500/tokenizer.model +3 -0
checkpoint-500/tokenizer_config.json +61 -0

checkpoint-1000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|im_end|>": 32000,
+  "<|im_start|>": 32001
+}

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "Felladrin/Minueza-32M-Chat",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "head_dim": 26,
+  "hidden_act": "silu",
+  "hidden_size": 312,
+  "initializer_range": 0.02,
+  "intermediate_size": 1092,
+  "max_position_embeddings": 2048,
+  "model_type": "mistral",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 10,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 1024,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": false,
+  "vocab_size": 32002
+}

checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "transformers_version": "4.48.1"
+}

checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02563fdb17f2e6d99f27e3e09ff134d70ab636e0790c26fcfd0bdb41eb8eb72b
+size 131181272

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18e4b35653ac0ef3b4691a5b8b02753852872f1c718997792735d5aa55ac1830
+size 262419258

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
+size 14244

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:753b60ceb48341188690448f803646e87b0db50bdbf0da140937469ffbf4f610
+size 1064

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1473 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2408187838651415,
+  "eval_steps": 200,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0012040939193257074,
+      "grad_norm": 4.470886707305908,
+      "learning_rate": 1.9975915221579964e-05,
+      "loss": 2.6688,
+      "step": 5
+    },
+    {
+      "epoch": 0.002408187838651415,
+      "grad_norm": 4.067092418670654,
+      "learning_rate": 1.9951830443159926e-05,
+      "loss": 2.2509,
+      "step": 10
+    },
+    {
+      "epoch": 0.003612281757977122,
+      "grad_norm": 4.133108615875244,
+      "learning_rate": 1.9927745664739885e-05,
+      "loss": 2.3732,
+      "step": 15
+    },
+    {
+      "epoch": 0.00481637567730283,
+      "grad_norm": 3.4890763759613037,
+      "learning_rate": 1.9903660886319848e-05,
+      "loss": 2.3632,
+      "step": 20
+    },
+    {
+      "epoch": 0.006020469596628537,
+      "grad_norm": 4.1045308113098145,
+      "learning_rate": 1.987957610789981e-05,
+      "loss": 2.5203,
+      "step": 25
+    },
+    {
+      "epoch": 0.007224563515954244,
+      "grad_norm": 4.26784086227417,
+      "learning_rate": 1.985549132947977e-05,
+      "loss": 2.3349,
+      "step": 30
+    },
+    {
+      "epoch": 0.008428657435279952,
+      "grad_norm": 4.144766330718994,
+      "learning_rate": 1.983140655105973e-05,
+      "loss": 2.4106,
+      "step": 35
+    },
+    {
+      "epoch": 0.00963275135460566,
+      "grad_norm": 3.9538934230804443,
+      "learning_rate": 1.9807321772639694e-05,
+      "loss": 2.3192,
+      "step": 40
+    },
+    {
+      "epoch": 0.010836845273931367,
+      "grad_norm": 3.9219865798950195,
+      "learning_rate": 1.9783236994219656e-05,
+      "loss": 2.2012,
+      "step": 45
+    },
+    {
+      "epoch": 0.012040939193257074,
+      "grad_norm": 3.391493797302246,
+      "learning_rate": 1.9759152215799615e-05,
+      "loss": 2.4064,
+      "step": 50
+    },
+    {
+      "epoch": 0.013245033112582781,
+      "grad_norm": 4.393350124359131,
+      "learning_rate": 1.9735067437379577e-05,
+      "loss": 2.1513,
+      "step": 55
+    },
+    {
+      "epoch": 0.014449127031908489,
+      "grad_norm": 3.6243207454681396,
+      "learning_rate": 1.971098265895954e-05,
+      "loss": 2.2788,
+      "step": 60
+    },
+    {
+      "epoch": 0.015653220951234198,
+      "grad_norm": 3.642468214035034,
+      "learning_rate": 1.9686897880539502e-05,
+      "loss": 2.2244,
+      "step": 65
+    },
+    {
+      "epoch": 0.016857314870559904,
+      "grad_norm": 4.0894670486450195,
+      "learning_rate": 1.966281310211946e-05,
+      "loss": 2.3622,
+      "step": 70
+    },
+    {
+      "epoch": 0.018061408789885613,
+      "grad_norm": 4.033819198608398,
+      "learning_rate": 1.9638728323699423e-05,
+      "loss": 2.3803,
+      "step": 75
+    },
+    {
+      "epoch": 0.01926550270921132,
+      "grad_norm": 3.535987138748169,
+      "learning_rate": 1.9614643545279385e-05,
+      "loss": 2.2161,
+      "step": 80
+    },
+    {
+      "epoch": 0.020469596628537028,
+      "grad_norm": 3.541868209838867,
+      "learning_rate": 1.9590558766859348e-05,
+      "loss": 2.1721,
+      "step": 85
+    },
+    {
+      "epoch": 0.021673690547862733,
+      "grad_norm": 4.147072792053223,
+      "learning_rate": 1.9566473988439307e-05,
+      "loss": 2.3239,
+      "step": 90
+    },
+    {
+      "epoch": 0.022877784467188442,
+      "grad_norm": 3.4949986934661865,
+      "learning_rate": 1.954238921001927e-05,
+      "loss": 2.265,
+      "step": 95
+    },
+    {
+      "epoch": 0.024081878386514148,
+      "grad_norm": 3.793950319290161,
+      "learning_rate": 1.951830443159923e-05,
+      "loss": 2.152,
+      "step": 100
+    },
+    {
+      "epoch": 0.025285972305839857,
+      "grad_norm": 3.9355053901672363,
+      "learning_rate": 1.949421965317919e-05,
+      "loss": 2.2534,
+      "step": 105
+    },
+    {
+      "epoch": 0.026490066225165563,
+      "grad_norm": 3.255175828933716,
+      "learning_rate": 1.9470134874759156e-05,
+      "loss": 2.2971,
+      "step": 110
+    },
+    {
+      "epoch": 0.027694160144491272,
+      "grad_norm": 3.650298595428467,
+      "learning_rate": 1.9446050096339115e-05,
+      "loss": 2.1228,
+      "step": 115
+    },
+    {
+      "epoch": 0.028898254063816978,
+      "grad_norm": 3.1906814575195312,
+      "learning_rate": 1.9421965317919077e-05,
+      "loss": 2.0995,
+      "step": 120
+    },
+    {
+      "epoch": 0.030102347983142687,
+      "grad_norm": 3.8122494220733643,
+      "learning_rate": 1.939788053949904e-05,
+      "loss": 2.1651,
+      "step": 125
+    },
+    {
+      "epoch": 0.031306441902468396,
+      "grad_norm": 3.8269336223602295,
+      "learning_rate": 1.9373795761079e-05,
+      "loss": 2.1731,
+      "step": 130
+    },
+    {
+      "epoch": 0.0325105358217941,
+      "grad_norm": 3.75238037109375,
+      "learning_rate": 1.934971098265896e-05,
+      "loss": 2.3071,
+      "step": 135
+    },
+    {
+      "epoch": 0.03371462974111981,
+      "grad_norm": 3.538330078125,
+      "learning_rate": 1.9325626204238923e-05,
+      "loss": 2.3015,
+      "step": 140
+    },
+    {
+      "epoch": 0.034918723660445516,
+      "grad_norm": 3.497131586074829,
+      "learning_rate": 1.9301541425818882e-05,
+      "loss": 2.128,
+      "step": 145
+    },
+    {
+      "epoch": 0.036122817579771226,
+      "grad_norm": 3.6173276901245117,
+      "learning_rate": 1.9277456647398845e-05,
+      "loss": 2.1792,
+      "step": 150
+    },
+    {
+      "epoch": 0.03732691149909693,
+      "grad_norm": 3.2987892627716064,
+      "learning_rate": 1.9253371868978807e-05,
+      "loss": 2.0246,
+      "step": 155
+    },
+    {
+      "epoch": 0.03853100541842264,
+      "grad_norm": 3.1787831783294678,
+      "learning_rate": 1.922928709055877e-05,
+      "loss": 2.2108,
+      "step": 160
+    },
+    {
+      "epoch": 0.039735099337748346,
+      "grad_norm": 3.5422236919403076,
+      "learning_rate": 1.920520231213873e-05,
+      "loss": 2.1738,
+      "step": 165
+    },
+    {
+      "epoch": 0.040939193257074055,
+      "grad_norm": 3.7987539768218994,
+      "learning_rate": 1.918111753371869e-05,
+      "loss": 2.1161,
+      "step": 170
+    },
+    {
+      "epoch": 0.04214328717639976,
+      "grad_norm": 3.2058522701263428,
+      "learning_rate": 1.9157032755298653e-05,
+      "loss": 2.0808,
+      "step": 175
+    },
+    {
+      "epoch": 0.04334738109572547,
+      "grad_norm": 3.00519061088562,
+      "learning_rate": 1.9132947976878615e-05,
+      "loss": 2.1412,
+      "step": 180
+    },
+    {
+      "epoch": 0.044551475015051176,
+      "grad_norm": 3.4471330642700195,
+      "learning_rate": 1.9108863198458578e-05,
+      "loss": 2.1695,
+      "step": 185
+    },
+    {
+      "epoch": 0.045755568934376885,
+      "grad_norm": 3.394496440887451,
+      "learning_rate": 1.9084778420038536e-05,
+      "loss": 1.9532,
+      "step": 190
+    },
+    {
+      "epoch": 0.04695966285370259,
+      "grad_norm": 3.03004789352417,
+      "learning_rate": 1.90606936416185e-05,
+      "loss": 2.0659,
+      "step": 195
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "grad_norm": 3.4260365962982178,
+      "learning_rate": 1.903660886319846e-05,
+      "loss": 2.0792,
+      "step": 200
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "eval_loss": 2.1430513858795166,
+      "eval_runtime": 16.4051,
+      "eval_samples_per_second": 6.096,
+      "eval_steps_per_second": 0.792,
+      "step": 200
+    },
+    {
+      "epoch": 0.049367850692354005,
+      "grad_norm": 4.670680999755859,
+      "learning_rate": 1.901252408477842e-05,
+      "loss": 2.0952,
+      "step": 205
+    },
+    {
+      "epoch": 0.050571944611679714,
+      "grad_norm": 3.510042667388916,
+      "learning_rate": 1.8988439306358382e-05,
+      "loss": 2.195,
+      "step": 210
+    },
+    {
+      "epoch": 0.05177603853100542,
+      "grad_norm": 3.0459847450256348,
+      "learning_rate": 1.8964354527938345e-05,
+      "loss": 2.2117,
+      "step": 215
+    },
+    {
+      "epoch": 0.052980132450331126,
+      "grad_norm": 4.36016321182251,
+      "learning_rate": 1.8940269749518304e-05,
+      "loss": 2.1191,
+      "step": 220
+    },
+    {
+      "epoch": 0.054184226369656835,
+      "grad_norm": 3.0498242378234863,
+      "learning_rate": 1.891618497109827e-05,
+      "loss": 2.0838,
+      "step": 225
+    },
+    {
+      "epoch": 0.055388320288982544,
+      "grad_norm": 3.218038558959961,
+      "learning_rate": 1.889210019267823e-05,
+      "loss": 2.1118,
+      "step": 230
+    },
+    {
+      "epoch": 0.056592414208308246,
+      "grad_norm": 3.3144683837890625,
+      "learning_rate": 1.886801541425819e-05,
+      "loss": 2.2176,
+      "step": 235
+    },
+    {
+      "epoch": 0.057796508127633955,
+      "grad_norm": 3.2364652156829834,
+      "learning_rate": 1.8843930635838153e-05,
+      "loss": 2.112,
+      "step": 240
+    },
+    {
+      "epoch": 0.059000602046959665,
+      "grad_norm": 3.291278839111328,
+      "learning_rate": 1.8819845857418112e-05,
+      "loss": 2.144,
+      "step": 245
+    },
+    {
+      "epoch": 0.060204695966285374,
+      "grad_norm": 3.65297794342041,
+      "learning_rate": 1.8795761078998074e-05,
+      "loss": 2.2597,
+      "step": 250
+    },
+    {
+      "epoch": 0.061408789885611076,
+      "grad_norm": 3.2321982383728027,
+      "learning_rate": 1.8771676300578037e-05,
+      "loss": 2.1618,
+      "step": 255
+    },
+    {
+      "epoch": 0.06261288380493679,
+      "grad_norm": 3.352842330932617,
+      "learning_rate": 1.8747591522158e-05,
+      "loss": 2.006,
+      "step": 260
+    },
+    {
+      "epoch": 0.0638169777242625,
+      "grad_norm": 3.5657215118408203,
+      "learning_rate": 1.8723506743737958e-05,
+      "loss": 2.2253,
+      "step": 265
+    },
+    {
+      "epoch": 0.0650210716435882,
+      "grad_norm": 3.060060739517212,
+      "learning_rate": 1.869942196531792e-05,
+      "loss": 2.1187,
+      "step": 270
+    },
+    {
+      "epoch": 0.06622516556291391,
+      "grad_norm": 3.473719835281372,
+      "learning_rate": 1.8675337186897883e-05,
+      "loss": 2.0299,
+      "step": 275
+    },
+    {
+      "epoch": 0.06742925948223961,
+      "grad_norm": 3.1167919635772705,
+      "learning_rate": 1.8651252408477845e-05,
+      "loss": 2.0381,
+      "step": 280
+    },
+    {
+      "epoch": 0.06863335340156532,
+      "grad_norm": 3.815816640853882,
+      "learning_rate": 1.8627167630057804e-05,
+      "loss": 2.1624,
+      "step": 285
+    },
+    {
+      "epoch": 0.06983744732089103,
+      "grad_norm": 3.2820959091186523,
+      "learning_rate": 1.8603082851637766e-05,
+      "loss": 2.0819,
+      "step": 290
+    },
+    {
+      "epoch": 0.07104154124021674,
+      "grad_norm": 3.568885087966919,
+      "learning_rate": 1.857899807321773e-05,
+      "loss": 2.0749,
+      "step": 295
+    },
+    {
+      "epoch": 0.07224563515954245,
+      "grad_norm": 3.424076795578003,
+      "learning_rate": 1.855491329479769e-05,
+      "loss": 2.129,
+      "step": 300
+    },
+    {
+      "epoch": 0.07344972907886815,
+      "grad_norm": 3.2800493240356445,
+      "learning_rate": 1.853082851637765e-05,
+      "loss": 2.2067,
+      "step": 305
+    },
+    {
+      "epoch": 0.07465382299819386,
+      "grad_norm": 3.487868547439575,
+      "learning_rate": 1.8506743737957612e-05,
+      "loss": 2.124,
+      "step": 310
+    },
+    {
+      "epoch": 0.07585791691751957,
+      "grad_norm": 3.3999245166778564,
+      "learning_rate": 1.8482658959537575e-05,
+      "loss": 1.9888,
+      "step": 315
+    },
+    {
+      "epoch": 0.07706201083684527,
+      "grad_norm": 3.973482370376587,
+      "learning_rate": 1.8458574181117533e-05,
+      "loss": 2.0592,
+      "step": 320
+    },
+    {
+      "epoch": 0.07826610475617098,
+      "grad_norm": 2.9601657390594482,
+      "learning_rate": 1.8434489402697496e-05,
+      "loss": 2.1022,
+      "step": 325
+    },
+    {
+      "epoch": 0.07947019867549669,
+      "grad_norm": 3.260118246078491,
+      "learning_rate": 1.8410404624277458e-05,
+      "loss": 1.9763,
+      "step": 330
+    },
+    {
+      "epoch": 0.0806742925948224,
+      "grad_norm": 3.509838819503784,
+      "learning_rate": 1.838631984585742e-05,
+      "loss": 2.0284,
+      "step": 335
+    },
+    {
+      "epoch": 0.08187838651414811,
+      "grad_norm": 4.363494396209717,
+      "learning_rate": 1.8362235067437383e-05,
+      "loss": 2.0479,
+      "step": 340
+    },
+    {
+      "epoch": 0.08308248043347381,
+      "grad_norm": 3.2578630447387695,
+      "learning_rate": 1.8338150289017342e-05,
+      "loss": 2.0488,
+      "step": 345
+    },
+    {
+      "epoch": 0.08428657435279951,
+      "grad_norm": 3.2846531867980957,
+      "learning_rate": 1.8314065510597304e-05,
+      "loss": 2.0876,
+      "step": 350
+    },
+    {
+      "epoch": 0.08549066827212523,
+      "grad_norm": 3.3275203704833984,
+      "learning_rate": 1.8289980732177266e-05,
+      "loss": 2.0564,
+      "step": 355
+    },
+    {
+      "epoch": 0.08669476219145093,
+      "grad_norm": 3.1368625164031982,
+      "learning_rate": 1.8265895953757225e-05,
+      "loss": 2.1533,
+      "step": 360
+    },
+    {
+      "epoch": 0.08789885611077664,
+      "grad_norm": 3.3824191093444824,
+      "learning_rate": 1.8241811175337188e-05,
+      "loss": 2.1821,
+      "step": 365
+    },
+    {
+      "epoch": 0.08910295003010235,
+      "grad_norm": 3.5150134563446045,
+      "learning_rate": 1.821772639691715e-05,
+      "loss": 2.0292,
+      "step": 370
+    },
+    {
+      "epoch": 0.09030704394942805,
+      "grad_norm": 3.421921730041504,
+      "learning_rate": 1.8193641618497112e-05,
+      "loss": 1.9862,
+      "step": 375
+    },
+    {
+      "epoch": 0.09151113786875377,
+      "grad_norm": 3.616887092590332,
+      "learning_rate": 1.8169556840077075e-05,
+      "loss": 2.0158,
+      "step": 380
+    },
+    {
+      "epoch": 0.09271523178807947,
+      "grad_norm": 5.063056945800781,
+      "learning_rate": 1.8145472061657034e-05,
+      "loss": 2.0579,
+      "step": 385
+    },
+    {
+      "epoch": 0.09391932570740517,
+      "grad_norm": 3.5242559909820557,
+      "learning_rate": 1.8121387283236996e-05,
+      "loss": 2.0272,
+      "step": 390
+    },
+    {
+      "epoch": 0.09512341962673089,
+      "grad_norm": 3.2852962017059326,
+      "learning_rate": 1.809730250481696e-05,
+      "loss": 2.077,
+      "step": 395
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "grad_norm": 3.710927963256836,
+      "learning_rate": 1.8073217726396917e-05,
+      "loss": 2.1271,
+      "step": 400
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "eval_loss": 2.0655810832977295,
+      "eval_runtime": 16.3755,
+      "eval_samples_per_second": 6.107,
+      "eval_steps_per_second": 0.794,
+      "step": 400
+    },
+    {
+      "epoch": 0.0975316074653823,
+      "grad_norm": 3.5019216537475586,
+      "learning_rate": 1.804913294797688e-05,
+      "loss": 2.1081,
+      "step": 405
+    },
+    {
+      "epoch": 0.09873570138470801,
+      "grad_norm": 3.5533690452575684,
+      "learning_rate": 1.8025048169556842e-05,
+      "loss": 2.0751,
+      "step": 410
+    },
+    {
+      "epoch": 0.09993979530403371,
+      "grad_norm": 3.4970240592956543,
+      "learning_rate": 1.8000963391136804e-05,
+      "loss": 2.066,
+      "step": 415
+    },
+    {
+      "epoch": 0.10114388922335943,
+      "grad_norm": 3.0926427841186523,
+      "learning_rate": 1.7976878612716763e-05,
+      "loss": 2.0516,
+      "step": 420
+    },
+    {
+      "epoch": 0.10234798314268513,
+      "grad_norm": 3.747452974319458,
+      "learning_rate": 1.7952793834296726e-05,
+      "loss": 2.0721,
+      "step": 425
+    },
+    {
+      "epoch": 0.10355207706201083,
+      "grad_norm": 3.3113677501678467,
+      "learning_rate": 1.7928709055876688e-05,
+      "loss": 2.1527,
+      "step": 430
+    },
+    {
+      "epoch": 0.10475617098133655,
+      "grad_norm": 3.357912063598633,
+      "learning_rate": 1.7904624277456647e-05,
+      "loss": 2.0113,
+      "step": 435
+    },
+    {
+      "epoch": 0.10596026490066225,
+      "grad_norm": 3.023893356323242,
+      "learning_rate": 1.7880539499036613e-05,
+      "loss": 2.1332,
+      "step": 440
+    },
+    {
+      "epoch": 0.10716435881998795,
+      "grad_norm": 3.3027355670928955,
+      "learning_rate": 1.785645472061657e-05,
+      "loss": 1.9699,
+      "step": 445
+    },
+    {
+      "epoch": 0.10836845273931367,
+      "grad_norm": 5.3524932861328125,
+      "learning_rate": 1.7832369942196534e-05,
+      "loss": 2.0182,
+      "step": 450
+    },
+    {
+      "epoch": 0.10957254665863937,
+      "grad_norm": 3.200258731842041,
+      "learning_rate": 1.7808285163776496e-05,
+      "loss": 2.007,
+      "step": 455
+    },
+    {
+      "epoch": 0.11077664057796509,
+      "grad_norm": 3.286268949508667,
+      "learning_rate": 1.7784200385356455e-05,
+      "loss": 2.0907,
+      "step": 460
+    },
+    {
+      "epoch": 0.11198073449729079,
+      "grad_norm": 3.15291428565979,
+      "learning_rate": 1.7760115606936417e-05,
+      "loss": 2.0468,
+      "step": 465
+    },
+    {
+      "epoch": 0.11318482841661649,
+      "grad_norm": 3.3798069953918457,
+      "learning_rate": 1.773603082851638e-05,
+      "loss": 1.9927,
+      "step": 470
+    },
+    {
+      "epoch": 0.11438892233594221,
+      "grad_norm": 3.4220967292785645,
+      "learning_rate": 1.771194605009634e-05,
+      "loss": 2.1326,
+      "step": 475
+    },
+    {
+      "epoch": 0.11559301625526791,
+      "grad_norm": 3.379628896713257,
+      "learning_rate": 1.76878612716763e-05,
+      "loss": 1.9202,
+      "step": 480
+    },
+    {
+      "epoch": 0.11679711017459361,
+      "grad_norm": 3.3020846843719482,
+      "learning_rate": 1.7663776493256263e-05,
+      "loss": 2.1176,
+      "step": 485
+    },
+    {
+      "epoch": 0.11800120409391933,
+      "grad_norm": 3.2711665630340576,
+      "learning_rate": 1.7639691714836226e-05,
+      "loss": 2.0865,
+      "step": 490
+    },
+    {
+      "epoch": 0.11920529801324503,
+      "grad_norm": 3.239253520965576,
+      "learning_rate": 1.7615606936416188e-05,
+      "loss": 1.9284,
+      "step": 495
+    },
+    {
+      "epoch": 0.12040939193257075,
+      "grad_norm": 3.4960460662841797,
+      "learning_rate": 1.7591522157996147e-05,
+      "loss": 2.0088,
+      "step": 500
+    },
+    {
+      "epoch": 0.12161348585189645,
+      "grad_norm": 3.337407350540161,
+      "learning_rate": 1.756743737957611e-05,
+      "loss": 1.9687,
+      "step": 505
+    },
+    {
+      "epoch": 0.12281757977122215,
+      "grad_norm": 3.534827709197998,
+      "learning_rate": 1.754335260115607e-05,
+      "loss": 2.0273,
+      "step": 510
+    },
+    {
+      "epoch": 0.12402167369054787,
+      "grad_norm": 3.6207938194274902,
+      "learning_rate": 1.7519267822736034e-05,
+      "loss": 2.0458,
+      "step": 515
+    },
+    {
+      "epoch": 0.12522576760987358,
+      "grad_norm": 3.396012544631958,
+      "learning_rate": 1.7495183044315993e-05,
+      "loss": 1.9185,
+      "step": 520
+    },
+    {
+      "epoch": 0.12642986152919927,
+      "grad_norm": 3.001236915588379,
+      "learning_rate": 1.7471098265895955e-05,
+      "loss": 1.9407,
+      "step": 525
+    },
+    {
+      "epoch": 0.127633955448525,
+      "grad_norm": 3.1318376064300537,
+      "learning_rate": 1.7447013487475918e-05,
+      "loss": 1.8984,
+      "step": 530
+    },
+    {
+      "epoch": 0.1288380493678507,
+      "grad_norm": 3.4541585445404053,
+      "learning_rate": 1.7422928709055877e-05,
+      "loss": 1.8846,
+      "step": 535
+    },
+    {
+      "epoch": 0.1300421432871764,
+      "grad_norm": 3.311082363128662,
+      "learning_rate": 1.739884393063584e-05,
+      "loss": 2.0015,
+      "step": 540
+    },
+    {
+      "epoch": 0.1312462372065021,
+      "grad_norm": 3.2366561889648438,
+      "learning_rate": 1.73747591522158e-05,
+      "loss": 2.0176,
+      "step": 545
+    },
+    {
+      "epoch": 0.13245033112582782,
+      "grad_norm": 3.123307943344116,
+      "learning_rate": 1.735067437379576e-05,
+      "loss": 1.9731,
+      "step": 550
+    },
+    {
+      "epoch": 0.1336544250451535,
+      "grad_norm": 3.776921033859253,
+      "learning_rate": 1.7326589595375726e-05,
+      "loss": 2.0484,
+      "step": 555
+    },
+    {
+      "epoch": 0.13485851896447923,
+      "grad_norm": 2.959716796875,
+      "learning_rate": 1.7302504816955685e-05,
+      "loss": 1.9689,
+      "step": 560
+    },
+    {
+      "epoch": 0.13606261288380495,
+      "grad_norm": 3.527384042739868,
+      "learning_rate": 1.7278420038535647e-05,
+      "loss": 1.9488,
+      "step": 565
+    },
+    {
+      "epoch": 0.13726670680313063,
+      "grad_norm": 3.0703189373016357,
+      "learning_rate": 1.725433526011561e-05,
+      "loss": 2.1226,
+      "step": 570
+    },
+    {
+      "epoch": 0.13847080072245635,
+      "grad_norm": 3.1028363704681396,
+      "learning_rate": 1.723025048169557e-05,
+      "loss": 1.8966,
+      "step": 575
+    },
+    {
+      "epoch": 0.13967489464178207,
+      "grad_norm": 3.340517044067383,
+      "learning_rate": 1.720616570327553e-05,
+      "loss": 2.2156,
+      "step": 580
+    },
+    {
+      "epoch": 0.14087898856110775,
+      "grad_norm": 3.2740213871002197,
+      "learning_rate": 1.7182080924855493e-05,
+      "loss": 1.9445,
+      "step": 585
+    },
+    {
+      "epoch": 0.14208308248043347,
+      "grad_norm": 3.240690231323242,
+      "learning_rate": 1.7157996146435455e-05,
+      "loss": 2.0295,
+      "step": 590
+    },
+    {
+      "epoch": 0.1432871763997592,
+      "grad_norm": 3.821340799331665,
+      "learning_rate": 1.7133911368015418e-05,
+      "loss": 2.1401,
+      "step": 595
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "grad_norm": 3.103550910949707,
+      "learning_rate": 1.7109826589595377e-05,
+      "loss": 2.1164,
+      "step": 600
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "eval_loss": 2.017059564590454,
+      "eval_runtime": 16.4324,
+      "eval_samples_per_second": 6.086,
+      "eval_steps_per_second": 0.791,
+      "step": 600
+    },
+    {
+      "epoch": 0.1456953642384106,
+      "grad_norm": 3.4971117973327637,
+      "learning_rate": 1.708574181117534e-05,
+      "loss": 1.9864,
+      "step": 605
+    },
+    {
+      "epoch": 0.1468994581577363,
+      "grad_norm": 3.324803590774536,
+      "learning_rate": 1.70616570327553e-05,
+      "loss": 2.0402,
+      "step": 610
+    },
+    {
+      "epoch": 0.14810355207706202,
+      "grad_norm": 3.302614450454712,
+      "learning_rate": 1.703757225433526e-05,
+      "loss": 1.9494,
+      "step": 615
+    },
+    {
+      "epoch": 0.1493076459963877,
+      "grad_norm": 3.3090734481811523,
+      "learning_rate": 1.7013487475915223e-05,
+      "loss": 2.0748,
+      "step": 620
+    },
+    {
+      "epoch": 0.15051173991571343,
+      "grad_norm": 3.559049129486084,
+      "learning_rate": 1.6989402697495185e-05,
+      "loss": 1.9038,
+      "step": 625
+    },
+    {
+      "epoch": 0.15171583383503914,
+      "grad_norm": 3.5149178504943848,
+      "learning_rate": 1.6965317919075147e-05,
+      "loss": 2.0261,
+      "step": 630
+    },
+    {
+      "epoch": 0.15291992775436483,
+      "grad_norm": 3.835693120956421,
+      "learning_rate": 1.6941233140655106e-05,
+      "loss": 1.9453,
+      "step": 635
+    },
+    {
+      "epoch": 0.15412402167369055,
+      "grad_norm": 3.521132469177246,
+      "learning_rate": 1.691714836223507e-05,
+      "loss": 2.0138,
+      "step": 640
+    },
+    {
+      "epoch": 0.15532811559301626,
+      "grad_norm": 3.2369840145111084,
+      "learning_rate": 1.689306358381503e-05,
+      "loss": 2.0285,
+      "step": 645
+    },
+    {
+      "epoch": 0.15653220951234195,
+      "grad_norm": 3.1592392921447754,
+      "learning_rate": 1.686897880539499e-05,
+      "loss": 1.9912,
+      "step": 650
+    },
+    {
+      "epoch": 0.15773630343166767,
+      "grad_norm": 3.2069106101989746,
+      "learning_rate": 1.6844894026974952e-05,
+      "loss": 2.0159,
+      "step": 655
+    },
+    {
+      "epoch": 0.15894039735099338,
+      "grad_norm": 3.318230390548706,
+      "learning_rate": 1.6820809248554915e-05,
+      "loss": 2.0412,
+      "step": 660
+    },
+    {
+      "epoch": 0.16014449127031907,
+      "grad_norm": 3.549443244934082,
+      "learning_rate": 1.6796724470134877e-05,
+      "loss": 2.0014,
+      "step": 665
+    },
+    {
+      "epoch": 0.1613485851896448,
+      "grad_norm": 3.32999324798584,
+      "learning_rate": 1.677263969171484e-05,
+      "loss": 2.0303,
+      "step": 670
+    },
+    {
+      "epoch": 0.1625526791089705,
+      "grad_norm": 3.262946367263794,
+      "learning_rate": 1.6748554913294798e-05,
+      "loss": 1.9883,
+      "step": 675
+    },
+    {
+      "epoch": 0.16375677302829622,
+      "grad_norm": 3.484685182571411,
+      "learning_rate": 1.672447013487476e-05,
+      "loss": 1.9695,
+      "step": 680
+    },
+    {
+      "epoch": 0.1649608669476219,
+      "grad_norm": 3.4177358150482178,
+      "learning_rate": 1.6700385356454723e-05,
+      "loss": 2.0088,
+      "step": 685
+    },
+    {
+      "epoch": 0.16616496086694763,
+      "grad_norm": 3.447498321533203,
+      "learning_rate": 1.6676300578034682e-05,
+      "loss": 2.0813,
+      "step": 690
+    },
+    {
+      "epoch": 0.16736905478627334,
+      "grad_norm": 3.152740240097046,
+      "learning_rate": 1.6652215799614644e-05,
+      "loss": 1.9988,
+      "step": 695
+    },
+    {
+      "epoch": 0.16857314870559903,
+      "grad_norm": 3.8948824405670166,
+      "learning_rate": 1.6628131021194607e-05,
+      "loss": 2.0801,
+      "step": 700
+    },
+    {
+      "epoch": 0.16977724262492475,
+      "grad_norm": 3.81358003616333,
+      "learning_rate": 1.660404624277457e-05,
+      "loss": 1.944,
+      "step": 705
+    },
+    {
+      "epoch": 0.17098133654425046,
+      "grad_norm": 2.980236053466797,
+      "learning_rate": 1.657996146435453e-05,
+      "loss": 1.9151,
+      "step": 710
+    },
+    {
+      "epoch": 0.17218543046357615,
+      "grad_norm": 3.041680335998535,
+      "learning_rate": 1.655587668593449e-05,
+      "loss": 1.9486,
+      "step": 715
+    },
+    {
+      "epoch": 0.17338952438290187,
+      "grad_norm": 2.898974657058716,
+      "learning_rate": 1.6531791907514452e-05,
+      "loss": 2.1119,
+      "step": 720
+    },
+    {
+      "epoch": 0.17459361830222758,
+      "grad_norm": 3.161224603652954,
+      "learning_rate": 1.6507707129094415e-05,
+      "loss": 2.036,
+      "step": 725
+    },
+    {
+      "epoch": 0.17579771222155327,
+      "grad_norm": 3.2449426651000977,
+      "learning_rate": 1.6483622350674374e-05,
+      "loss": 2.0635,
+      "step": 730
+    },
+    {
+      "epoch": 0.177001806140879,
+      "grad_norm": 3.2805328369140625,
+      "learning_rate": 1.6459537572254336e-05,
+      "loss": 1.8022,
+      "step": 735
+    },
+    {
+      "epoch": 0.1782059000602047,
+      "grad_norm": 3.491149663925171,
+      "learning_rate": 1.64354527938343e-05,
+      "loss": 1.9832,
+      "step": 740
+    },
+    {
+      "epoch": 0.1794099939795304,
+      "grad_norm": 3.423267126083374,
+      "learning_rate": 1.641136801541426e-05,
+      "loss": 1.9574,
+      "step": 745
+    },
+    {
+      "epoch": 0.1806140878988561,
+      "grad_norm": 3.1914217472076416,
+      "learning_rate": 1.638728323699422e-05,
+      "loss": 1.9283,
+      "step": 750
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 3.2903149127960205,
+      "learning_rate": 1.6363198458574182e-05,
+      "loss": 2.1174,
+      "step": 755
+    },
+    {
+      "epoch": 0.18302227573750754,
+      "grad_norm": 3.113159656524658,
+      "learning_rate": 1.6339113680154144e-05,
+      "loss": 1.8794,
+      "step": 760
+    },
+    {
+      "epoch": 0.18422636965683323,
+      "grad_norm": 3.261596918106079,
+      "learning_rate": 1.6315028901734103e-05,
+      "loss": 2.0853,
+      "step": 765
+    },
+    {
+      "epoch": 0.18543046357615894,
+      "grad_norm": 2.9525296688079834,
+      "learning_rate": 1.629094412331407e-05,
+      "loss": 1.8622,
+      "step": 770
+    },
+    {
+      "epoch": 0.18663455749548466,
+      "grad_norm": 3.2103638648986816,
+      "learning_rate": 1.6266859344894028e-05,
+      "loss": 2.0536,
+      "step": 775
+    },
+    {
+      "epoch": 0.18783865141481035,
+      "grad_norm": 3.5312676429748535,
+      "learning_rate": 1.624277456647399e-05,
+      "loss": 1.9387,
+      "step": 780
+    },
+    {
+      "epoch": 0.18904274533413606,
+      "grad_norm": 3.277223825454712,
+      "learning_rate": 1.6218689788053953e-05,
+      "loss": 1.9804,
+      "step": 785
+    },
+    {
+      "epoch": 0.19024683925346178,
+      "grad_norm": 3.207287549972534,
+      "learning_rate": 1.619460500963391e-05,
+      "loss": 1.8782,
+      "step": 790
+    },
+    {
+      "epoch": 0.19145093317278747,
+      "grad_norm": 3.401834487915039,
+      "learning_rate": 1.6170520231213874e-05,
+      "loss": 1.9383,
+      "step": 795
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "grad_norm": 3.5186078548431396,
+      "learning_rate": 1.6146435452793836e-05,
+      "loss": 2.0963,
+      "step": 800
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "eval_loss": 1.979533076286316,
+      "eval_runtime": 16.4348,
+      "eval_samples_per_second": 6.085,
+      "eval_steps_per_second": 0.791,
+      "step": 800
+    },
+    {
+      "epoch": 0.1938591210114389,
+      "grad_norm": 3.0080161094665527,
+      "learning_rate": 1.6122350674373795e-05,
+      "loss": 1.951,
+      "step": 805
+    },
+    {
+      "epoch": 0.1950632149307646,
+      "grad_norm": 3.124155044555664,
+      "learning_rate": 1.6098265895953758e-05,
+      "loss": 1.8663,
+      "step": 810
+    },
+    {
+      "epoch": 0.1962673088500903,
+      "grad_norm": 3.6262383460998535,
+      "learning_rate": 1.607418111753372e-05,
+      "loss": 1.9478,
+      "step": 815
+    },
+    {
+      "epoch": 0.19747140276941602,
+      "grad_norm": 3.3047947883605957,
+      "learning_rate": 1.6050096339113682e-05,
+      "loss": 1.9203,
+      "step": 820
+    },
+    {
+      "epoch": 0.1986754966887417,
+      "grad_norm": 3.0261447429656982,
+      "learning_rate": 1.6026011560693645e-05,
+      "loss": 1.8988,
+      "step": 825
+    },
+    {
+      "epoch": 0.19987959060806743,
+      "grad_norm": 4.233884334564209,
+      "learning_rate": 1.6001926782273604e-05,
+      "loss": 2.0327,
+      "step": 830
+    },
+    {
+      "epoch": 0.20108368452739314,
+      "grad_norm": 2.9169118404388428,
+      "learning_rate": 1.5977842003853566e-05,
+      "loss": 1.9264,
+      "step": 835
+    },
+    {
+      "epoch": 0.20228777844671886,
+      "grad_norm": 3.0078282356262207,
+      "learning_rate": 1.5953757225433528e-05,
+      "loss": 1.8821,
+      "step": 840
+    },
+    {
+      "epoch": 0.20349187236604455,
+      "grad_norm": 3.4188835620880127,
+      "learning_rate": 1.592967244701349e-05,
+      "loss": 1.933,
+      "step": 845
+    },
+    {
+      "epoch": 0.20469596628537026,
+      "grad_norm": 4.739987850189209,
+      "learning_rate": 1.590558766859345e-05,
+      "loss": 1.9182,
+      "step": 850
+    },
+    {
+      "epoch": 0.20590006020469598,
+      "grad_norm": 3.1810977458953857,
+      "learning_rate": 1.5881502890173412e-05,
+      "loss": 1.984,
+      "step": 855
+    },
+    {
+      "epoch": 0.20710415412402167,
+      "grad_norm": 3.174739360809326,
+      "learning_rate": 1.5857418111753374e-05,
+      "loss": 1.7719,
+      "step": 860
+    },
+    {
+      "epoch": 0.20830824804334738,
+      "grad_norm": 3.379767656326294,
+      "learning_rate": 1.5833333333333333e-05,
+      "loss": 1.9481,
+      "step": 865
+    },
+    {
+      "epoch": 0.2095123419626731,
+      "grad_norm": 3.3487260341644287,
+      "learning_rate": 1.5809248554913295e-05,
+      "loss": 1.9416,
+      "step": 870
+    },
+    {
+      "epoch": 0.2107164358819988,
+      "grad_norm": 3.4879958629608154,
+      "learning_rate": 1.5785163776493258e-05,
+      "loss": 2.0463,
+      "step": 875
+    },
+    {
+      "epoch": 0.2119205298013245,
+      "grad_norm": 3.2338194847106934,
+      "learning_rate": 1.5761078998073217e-05,
+      "loss": 1.9441,
+      "step": 880
+    },
+    {
+      "epoch": 0.21312462372065022,
+      "grad_norm": 3.122405529022217,
+      "learning_rate": 1.5736994219653182e-05,
+      "loss": 2.0644,
+      "step": 885
+    },
+    {
+      "epoch": 0.2143287176399759,
+      "grad_norm": 3.0773510932922363,
+      "learning_rate": 1.571290944123314e-05,
+      "loss": 1.9415,
+      "step": 890
+    },
+    {
+      "epoch": 0.21553281155930162,
+      "grad_norm": 3.004040241241455,
+      "learning_rate": 1.5688824662813104e-05,
+      "loss": 1.9305,
+      "step": 895
+    },
+    {
+      "epoch": 0.21673690547862734,
+      "grad_norm": 3.547109603881836,
+      "learning_rate": 1.5664739884393066e-05,
+      "loss": 2.1088,
+      "step": 900
+    },
+    {
+      "epoch": 0.21794099939795303,
+      "grad_norm": 3.1982204914093018,
+      "learning_rate": 1.5640655105973025e-05,
+      "loss": 1.8667,
+      "step": 905
+    },
+    {
+      "epoch": 0.21914509331727874,
+      "grad_norm": 3.381781578063965,
+      "learning_rate": 1.5616570327552987e-05,
+      "loss": 1.9783,
+      "step": 910
+    },
+    {
+      "epoch": 0.22034918723660446,
+      "grad_norm": 2.9775896072387695,
+      "learning_rate": 1.559248554913295e-05,
+      "loss": 2.0211,
+      "step": 915
+    },
+    {
+      "epoch": 0.22155328115593018,
+      "grad_norm": 2.864551067352295,
+      "learning_rate": 1.5568400770712912e-05,
+      "loss": 1.8579,
+      "step": 920
+    },
+    {
+      "epoch": 0.22275737507525586,
+      "grad_norm": 3.0532050132751465,
+      "learning_rate": 1.5544315992292874e-05,
+      "loss": 1.9398,
+      "step": 925
+    },
+    {
+      "epoch": 0.22396146899458158,
+      "grad_norm": 2.859631061553955,
+      "learning_rate": 1.5520231213872833e-05,
+      "loss": 1.8625,
+      "step": 930
+    },
+    {
+      "epoch": 0.2251655629139073,
+      "grad_norm": 3.1373536586761475,
+      "learning_rate": 1.5496146435452796e-05,
+      "loss": 1.9003,
+      "step": 935
+    },
+    {
+      "epoch": 0.22636965683323299,
+      "grad_norm": 3.3248465061187744,
+      "learning_rate": 1.5472061657032758e-05,
+      "loss": 2.0517,
+      "step": 940
+    },
+    {
+      "epoch": 0.2275737507525587,
+      "grad_norm": 3.5990936756134033,
+      "learning_rate": 1.5447976878612717e-05,
+      "loss": 1.8975,
+      "step": 945
+    },
+    {
+      "epoch": 0.22877784467188442,
+      "grad_norm": 3.4047725200653076,
+      "learning_rate": 1.542389210019268e-05,
+      "loss": 1.9786,
+      "step": 950
+    },
+    {
+      "epoch": 0.2299819385912101,
+      "grad_norm": 3.3326022624969482,
+      "learning_rate": 1.539980732177264e-05,
+      "loss": 1.7848,
+      "step": 955
+    },
+    {
+      "epoch": 0.23118603251053582,
+      "grad_norm": 3.2942848205566406,
+      "learning_rate": 1.5375722543352604e-05,
+      "loss": 1.8549,
+      "step": 960
+    },
+    {
+      "epoch": 0.23239012642986154,
+      "grad_norm": 3.4602601528167725,
+      "learning_rate": 1.5351637764932563e-05,
+      "loss": 1.917,
+      "step": 965
+    },
+    {
+      "epoch": 0.23359422034918723,
+      "grad_norm": 3.591327428817749,
+      "learning_rate": 1.5327552986512525e-05,
+      "loss": 1.9457,
+      "step": 970
+    },
+    {
+      "epoch": 0.23479831426851294,
+      "grad_norm": 3.215808868408203,
+      "learning_rate": 1.5303468208092487e-05,
+      "loss": 1.9261,
+      "step": 975
+    },
+    {
+      "epoch": 0.23600240818783866,
+      "grad_norm": 3.5032927989959717,
+      "learning_rate": 1.5279383429672446e-05,
+      "loss": 1.967,
+      "step": 980
+    },
+    {
+      "epoch": 0.23720650210716435,
+      "grad_norm": 3.1476144790649414,
+      "learning_rate": 1.525529865125241e-05,
+      "loss": 2.0448,
+      "step": 985
+    },
+    {
+      "epoch": 0.23841059602649006,
+      "grad_norm": 3.046126365661621,
+      "learning_rate": 1.5231213872832371e-05,
+      "loss": 1.9086,
+      "step": 990
+    },
+    {
+      "epoch": 0.23961468994581578,
+      "grad_norm": 3.0403099060058594,
+      "learning_rate": 1.5207129094412332e-05,
+      "loss": 1.9671,
+      "step": 995
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "grad_norm": 3.524573802947998,
+      "learning_rate": 1.5183044315992294e-05,
+      "loss": 1.9532,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "eval_loss": 1.9571956396102905,
+      "eval_runtime": 16.4441,
+      "eval_samples_per_second": 6.081,
+      "eval_steps_per_second": 0.791,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1054167504365664.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd020aa9333054ccbd44f2f69522647a4bdda52101c5b015e38e78115982722b
+size 5816

checkpoint-1500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|im_end|>": 32000,
+  "<|im_start|>": 32001
+}

checkpoint-1500/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "Felladrin/Minueza-32M-Chat",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "head_dim": 26,
+  "hidden_act": "silu",
+  "hidden_size": 312,
+  "initializer_range": 0.02,
+  "intermediate_size": 1092,
+  "max_position_embeddings": 2048,
+  "model_type": "mistral",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 10,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 1024,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": false,
+  "vocab_size": 32002
+}

checkpoint-1500/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "transformers_version": "4.48.1"
+}

checkpoint-1500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae77a3b4589ed3c32499d8a9282916075fcf1985a4e586bdbb041ebba8d7ad6c
+size 131181272

checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a57335810720a4f9d8d5a04bebec0887838484b90101b756da32e3ef233f22a8
+size 262419258

checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9899ccda7f0d8d9511991180b93aab508ce6e8489de708c88ad1188e7e1d90d6
+size 14244

checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7acaaba5bf80cb5d5e1a88c4be3ebca794cd4bacb62bf046245137790e9a740
+size 1064

checkpoint-1500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-1500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2189 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3612281757977122,
+  "eval_steps": 200,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0012040939193257074,
+      "grad_norm": 4.470886707305908,
+      "learning_rate": 1.9975915221579964e-05,
+      "loss": 2.6688,
+      "step": 5
+    },
+    {
+      "epoch": 0.002408187838651415,
+      "grad_norm": 4.067092418670654,
+      "learning_rate": 1.9951830443159926e-05,
+      "loss": 2.2509,
+      "step": 10
+    },
+    {
+      "epoch": 0.003612281757977122,
+      "grad_norm": 4.133108615875244,
+      "learning_rate": 1.9927745664739885e-05,
+      "loss": 2.3732,
+      "step": 15
+    },
+    {
+      "epoch": 0.00481637567730283,
+      "grad_norm": 3.4890763759613037,
+      "learning_rate": 1.9903660886319848e-05,
+      "loss": 2.3632,
+      "step": 20
+    },
+    {
+      "epoch": 0.006020469596628537,
+      "grad_norm": 4.1045308113098145,
+      "learning_rate": 1.987957610789981e-05,
+      "loss": 2.5203,
+      "step": 25
+    },
+    {
+      "epoch": 0.007224563515954244,
+      "grad_norm": 4.26784086227417,
+      "learning_rate": 1.985549132947977e-05,
+      "loss": 2.3349,
+      "step": 30
+    },
+    {
+      "epoch": 0.008428657435279952,
+      "grad_norm": 4.144766330718994,
+      "learning_rate": 1.983140655105973e-05,
+      "loss": 2.4106,
+      "step": 35
+    },
+    {
+      "epoch": 0.00963275135460566,
+      "grad_norm": 3.9538934230804443,
+      "learning_rate": 1.9807321772639694e-05,
+      "loss": 2.3192,
+      "step": 40
+    },
+    {
+      "epoch": 0.010836845273931367,
+      "grad_norm": 3.9219865798950195,
+      "learning_rate": 1.9783236994219656e-05,
+      "loss": 2.2012,
+      "step": 45
+    },
+    {
+      "epoch": 0.012040939193257074,
+      "grad_norm": 3.391493797302246,
+      "learning_rate": 1.9759152215799615e-05,
+      "loss": 2.4064,
+      "step": 50
+    },
+    {
+      "epoch": 0.013245033112582781,
+      "grad_norm": 4.393350124359131,
+      "learning_rate": 1.9735067437379577e-05,
+      "loss": 2.1513,
+      "step": 55
+    },
+    {
+      "epoch": 0.014449127031908489,
+      "grad_norm": 3.6243207454681396,
+      "learning_rate": 1.971098265895954e-05,
+      "loss": 2.2788,
+      "step": 60
+    },
+    {
+      "epoch": 0.015653220951234198,
+      "grad_norm": 3.642468214035034,
+      "learning_rate": 1.9686897880539502e-05,
+      "loss": 2.2244,
+      "step": 65
+    },
+    {
+      "epoch": 0.016857314870559904,
+      "grad_norm": 4.0894670486450195,
+      "learning_rate": 1.966281310211946e-05,
+      "loss": 2.3622,
+      "step": 70
+    },
+    {
+      "epoch": 0.018061408789885613,
+      "grad_norm": 4.033819198608398,
+      "learning_rate": 1.9638728323699423e-05,
+      "loss": 2.3803,
+      "step": 75
+    },
+    {
+      "epoch": 0.01926550270921132,
+      "grad_norm": 3.535987138748169,
+      "learning_rate": 1.9614643545279385e-05,
+      "loss": 2.2161,
+      "step": 80
+    },
+    {
+      "epoch": 0.020469596628537028,
+      "grad_norm": 3.541868209838867,
+      "learning_rate": 1.9590558766859348e-05,
+      "loss": 2.1721,
+      "step": 85
+    },
+    {
+      "epoch": 0.021673690547862733,
+      "grad_norm": 4.147072792053223,
+      "learning_rate": 1.9566473988439307e-05,
+      "loss": 2.3239,
+      "step": 90
+    },
+    {
+      "epoch": 0.022877784467188442,
+      "grad_norm": 3.4949986934661865,
+      "learning_rate": 1.954238921001927e-05,
+      "loss": 2.265,
+      "step": 95
+    },
+    {
+      "epoch": 0.024081878386514148,
+      "grad_norm": 3.793950319290161,
+      "learning_rate": 1.951830443159923e-05,
+      "loss": 2.152,
+      "step": 100
+    },
+    {
+      "epoch": 0.025285972305839857,
+      "grad_norm": 3.9355053901672363,
+      "learning_rate": 1.949421965317919e-05,
+      "loss": 2.2534,
+      "step": 105
+    },
+    {
+      "epoch": 0.026490066225165563,
+      "grad_norm": 3.255175828933716,
+      "learning_rate": 1.9470134874759156e-05,
+      "loss": 2.2971,
+      "step": 110
+    },
+    {
+      "epoch": 0.027694160144491272,
+      "grad_norm": 3.650298595428467,
+      "learning_rate": 1.9446050096339115e-05,
+      "loss": 2.1228,
+      "step": 115
+    },
+    {
+      "epoch": 0.028898254063816978,
+      "grad_norm": 3.1906814575195312,
+      "learning_rate": 1.9421965317919077e-05,
+      "loss": 2.0995,
+      "step": 120
+    },
+    {
+      "epoch": 0.030102347983142687,
+      "grad_norm": 3.8122494220733643,
+      "learning_rate": 1.939788053949904e-05,
+      "loss": 2.1651,
+      "step": 125
+    },
+    {
+      "epoch": 0.031306441902468396,
+      "grad_norm": 3.8269336223602295,
+      "learning_rate": 1.9373795761079e-05,
+      "loss": 2.1731,
+      "step": 130
+    },
+    {
+      "epoch": 0.0325105358217941,
+      "grad_norm": 3.75238037109375,
+      "learning_rate": 1.934971098265896e-05,
+      "loss": 2.3071,
+      "step": 135
+    },
+    {
+      "epoch": 0.03371462974111981,
+      "grad_norm": 3.538330078125,
+      "learning_rate": 1.9325626204238923e-05,
+      "loss": 2.3015,
+      "step": 140
+    },
+    {
+      "epoch": 0.034918723660445516,
+      "grad_norm": 3.497131586074829,
+      "learning_rate": 1.9301541425818882e-05,
+      "loss": 2.128,
+      "step": 145
+    },
+    {
+      "epoch": 0.036122817579771226,
+      "grad_norm": 3.6173276901245117,
+      "learning_rate": 1.9277456647398845e-05,
+      "loss": 2.1792,
+      "step": 150
+    },
+    {
+      "epoch": 0.03732691149909693,
+      "grad_norm": 3.2987892627716064,
+      "learning_rate": 1.9253371868978807e-05,
+      "loss": 2.0246,
+      "step": 155
+    },
+    {
+      "epoch": 0.03853100541842264,
+      "grad_norm": 3.1787831783294678,
+      "learning_rate": 1.922928709055877e-05,
+      "loss": 2.2108,
+      "step": 160
+    },
+    {
+      "epoch": 0.039735099337748346,
+      "grad_norm": 3.5422236919403076,
+      "learning_rate": 1.920520231213873e-05,
+      "loss": 2.1738,
+      "step": 165
+    },
+    {
+      "epoch": 0.040939193257074055,
+      "grad_norm": 3.7987539768218994,
+      "learning_rate": 1.918111753371869e-05,
+      "loss": 2.1161,
+      "step": 170
+    },
+    {
+      "epoch": 0.04214328717639976,
+      "grad_norm": 3.2058522701263428,
+      "learning_rate": 1.9157032755298653e-05,
+      "loss": 2.0808,
+      "step": 175
+    },
+    {
+      "epoch": 0.04334738109572547,
+      "grad_norm": 3.00519061088562,
+      "learning_rate": 1.9132947976878615e-05,
+      "loss": 2.1412,
+      "step": 180
+    },
+    {
+      "epoch": 0.044551475015051176,
+      "grad_norm": 3.4471330642700195,
+      "learning_rate": 1.9108863198458578e-05,
+      "loss": 2.1695,
+      "step": 185
+    },
+    {
+      "epoch": 0.045755568934376885,
+      "grad_norm": 3.394496440887451,
+      "learning_rate": 1.9084778420038536e-05,
+      "loss": 1.9532,
+      "step": 190
+    },
+    {
+      "epoch": 0.04695966285370259,
+      "grad_norm": 3.03004789352417,
+      "learning_rate": 1.90606936416185e-05,
+      "loss": 2.0659,
+      "step": 195
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "grad_norm": 3.4260365962982178,
+      "learning_rate": 1.903660886319846e-05,
+      "loss": 2.0792,
+      "step": 200
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "eval_loss": 2.1430513858795166,
+      "eval_runtime": 16.4051,
+      "eval_samples_per_second": 6.096,
+      "eval_steps_per_second": 0.792,
+      "step": 200
+    },
+    {
+      "epoch": 0.049367850692354005,
+      "grad_norm": 4.670680999755859,
+      "learning_rate": 1.901252408477842e-05,
+      "loss": 2.0952,
+      "step": 205
+    },
+    {
+      "epoch": 0.050571944611679714,
+      "grad_norm": 3.510042667388916,
+      "learning_rate": 1.8988439306358382e-05,
+      "loss": 2.195,
+      "step": 210
+    },
+    {
+      "epoch": 0.05177603853100542,
+      "grad_norm": 3.0459847450256348,
+      "learning_rate": 1.8964354527938345e-05,
+      "loss": 2.2117,
+      "step": 215
+    },
+    {
+      "epoch": 0.052980132450331126,
+      "grad_norm": 4.36016321182251,
+      "learning_rate": 1.8940269749518304e-05,
+      "loss": 2.1191,
+      "step": 220
+    },
+    {
+      "epoch": 0.054184226369656835,
+      "grad_norm": 3.0498242378234863,
+      "learning_rate": 1.891618497109827e-05,
+      "loss": 2.0838,
+      "step": 225
+    },
+    {
+      "epoch": 0.055388320288982544,
+      "grad_norm": 3.218038558959961,
+      "learning_rate": 1.889210019267823e-05,
+      "loss": 2.1118,
+      "step": 230
+    },
+    {
+      "epoch": 0.056592414208308246,
+      "grad_norm": 3.3144683837890625,
+      "learning_rate": 1.886801541425819e-05,
+      "loss": 2.2176,
+      "step": 235
+    },
+    {
+      "epoch": 0.057796508127633955,
+      "grad_norm": 3.2364652156829834,
+      "learning_rate": 1.8843930635838153e-05,
+      "loss": 2.112,
+      "step": 240
+    },
+    {
+      "epoch": 0.059000602046959665,
+      "grad_norm": 3.291278839111328,
+      "learning_rate": 1.8819845857418112e-05,
+      "loss": 2.144,
+      "step": 245
+    },
+    {
+      "epoch": 0.060204695966285374,
+      "grad_norm": 3.65297794342041,
+      "learning_rate": 1.8795761078998074e-05,
+      "loss": 2.2597,
+      "step": 250
+    },
+    {
+      "epoch": 0.061408789885611076,
+      "grad_norm": 3.2321982383728027,
+      "learning_rate": 1.8771676300578037e-05,
+      "loss": 2.1618,
+      "step": 255
+    },
+    {
+      "epoch": 0.06261288380493679,
+      "grad_norm": 3.352842330932617,
+      "learning_rate": 1.8747591522158e-05,
+      "loss": 2.006,
+      "step": 260
+    },
+    {
+      "epoch": 0.0638169777242625,
+      "grad_norm": 3.5657215118408203,
+      "learning_rate": 1.8723506743737958e-05,
+      "loss": 2.2253,
+      "step": 265
+    },
+    {
+      "epoch": 0.0650210716435882,
+      "grad_norm": 3.060060739517212,
+      "learning_rate": 1.869942196531792e-05,
+      "loss": 2.1187,
+      "step": 270
+    },
+    {
+      "epoch": 0.06622516556291391,
+      "grad_norm": 3.473719835281372,
+      "learning_rate": 1.8675337186897883e-05,
+      "loss": 2.0299,
+      "step": 275
+    },
+    {
+      "epoch": 0.06742925948223961,
+      "grad_norm": 3.1167919635772705,
+      "learning_rate": 1.8651252408477845e-05,
+      "loss": 2.0381,
+      "step": 280
+    },
+    {
+      "epoch": 0.06863335340156532,
+      "grad_norm": 3.815816640853882,
+      "learning_rate": 1.8627167630057804e-05,
+      "loss": 2.1624,
+      "step": 285
+    },
+    {
+      "epoch": 0.06983744732089103,
+      "grad_norm": 3.2820959091186523,
+      "learning_rate": 1.8603082851637766e-05,
+      "loss": 2.0819,
+      "step": 290
+    },
+    {
+      "epoch": 0.07104154124021674,
+      "grad_norm": 3.568885087966919,
+      "learning_rate": 1.857899807321773e-05,
+      "loss": 2.0749,
+      "step": 295
+    },
+    {
+      "epoch": 0.07224563515954245,
+      "grad_norm": 3.424076795578003,
+      "learning_rate": 1.855491329479769e-05,
+      "loss": 2.129,
+      "step": 300
+    },
+    {
+      "epoch": 0.07344972907886815,
+      "grad_norm": 3.2800493240356445,
+      "learning_rate": 1.853082851637765e-05,
+      "loss": 2.2067,
+      "step": 305
+    },
+    {
+      "epoch": 0.07465382299819386,
+      "grad_norm": 3.487868547439575,
+      "learning_rate": 1.8506743737957612e-05,
+      "loss": 2.124,
+      "step": 310
+    },
+    {
+      "epoch": 0.07585791691751957,
+      "grad_norm": 3.3999245166778564,
+      "learning_rate": 1.8482658959537575e-05,
+      "loss": 1.9888,
+      "step": 315
+    },
+    {
+      "epoch": 0.07706201083684527,
+      "grad_norm": 3.973482370376587,
+      "learning_rate": 1.8458574181117533e-05,
+      "loss": 2.0592,
+      "step": 320
+    },
+    {
+      "epoch": 0.07826610475617098,
+      "grad_norm": 2.9601657390594482,
+      "learning_rate": 1.8434489402697496e-05,
+      "loss": 2.1022,
+      "step": 325
+    },
+    {
+      "epoch": 0.07947019867549669,
+      "grad_norm": 3.260118246078491,
+      "learning_rate": 1.8410404624277458e-05,
+      "loss": 1.9763,
+      "step": 330
+    },
+    {
+      "epoch": 0.0806742925948224,
+      "grad_norm": 3.509838819503784,
+      "learning_rate": 1.838631984585742e-05,
+      "loss": 2.0284,
+      "step": 335
+    },
+    {
+      "epoch": 0.08187838651414811,
+      "grad_norm": 4.363494396209717,
+      "learning_rate": 1.8362235067437383e-05,
+      "loss": 2.0479,
+      "step": 340
+    },
+    {
+      "epoch": 0.08308248043347381,
+      "grad_norm": 3.2578630447387695,
+      "learning_rate": 1.8338150289017342e-05,
+      "loss": 2.0488,
+      "step": 345
+    },
+    {
+      "epoch": 0.08428657435279951,
+      "grad_norm": 3.2846531867980957,
+      "learning_rate": 1.8314065510597304e-05,
+      "loss": 2.0876,
+      "step": 350
+    },
+    {
+      "epoch": 0.08549066827212523,
+      "grad_norm": 3.3275203704833984,
+      "learning_rate": 1.8289980732177266e-05,
+      "loss": 2.0564,
+      "step": 355
+    },
+    {
+      "epoch": 0.08669476219145093,
+      "grad_norm": 3.1368625164031982,
+      "learning_rate": 1.8265895953757225e-05,
+      "loss": 2.1533,
+      "step": 360
+    },
+    {
+      "epoch": 0.08789885611077664,
+      "grad_norm": 3.3824191093444824,
+      "learning_rate": 1.8241811175337188e-05,
+      "loss": 2.1821,
+      "step": 365
+    },
+    {
+      "epoch": 0.08910295003010235,
+      "grad_norm": 3.5150134563446045,
+      "learning_rate": 1.821772639691715e-05,
+      "loss": 2.0292,
+      "step": 370
+    },
+    {
+      "epoch": 0.09030704394942805,
+      "grad_norm": 3.421921730041504,
+      "learning_rate": 1.8193641618497112e-05,
+      "loss": 1.9862,
+      "step": 375
+    },
+    {
+      "epoch": 0.09151113786875377,
+      "grad_norm": 3.616887092590332,
+      "learning_rate": 1.8169556840077075e-05,
+      "loss": 2.0158,
+      "step": 380
+    },
+    {
+      "epoch": 0.09271523178807947,
+      "grad_norm": 5.063056945800781,
+      "learning_rate": 1.8145472061657034e-05,
+      "loss": 2.0579,
+      "step": 385
+    },
+    {
+      "epoch": 0.09391932570740517,
+      "grad_norm": 3.5242559909820557,
+      "learning_rate": 1.8121387283236996e-05,
+      "loss": 2.0272,
+      "step": 390
+    },
+    {
+      "epoch": 0.09512341962673089,
+      "grad_norm": 3.2852962017059326,
+      "learning_rate": 1.809730250481696e-05,
+      "loss": 2.077,
+      "step": 395
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "grad_norm": 3.710927963256836,
+      "learning_rate": 1.8073217726396917e-05,
+      "loss": 2.1271,
+      "step": 400
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "eval_loss": 2.0655810832977295,
+      "eval_runtime": 16.3755,
+      "eval_samples_per_second": 6.107,
+      "eval_steps_per_second": 0.794,
+      "step": 400
+    },
+    {
+      "epoch": 0.0975316074653823,
+      "grad_norm": 3.5019216537475586,
+      "learning_rate": 1.804913294797688e-05,
+      "loss": 2.1081,
+      "step": 405
+    },
+    {
+      "epoch": 0.09873570138470801,
+      "grad_norm": 3.5533690452575684,
+      "learning_rate": 1.8025048169556842e-05,
+      "loss": 2.0751,
+      "step": 410
+    },
+    {
+      "epoch": 0.09993979530403371,
+      "grad_norm": 3.4970240592956543,
+      "learning_rate": 1.8000963391136804e-05,
+      "loss": 2.066,
+      "step": 415
+    },
+    {
+      "epoch": 0.10114388922335943,
+      "grad_norm": 3.0926427841186523,
+      "learning_rate": 1.7976878612716763e-05,
+      "loss": 2.0516,
+      "step": 420
+    },
+    {
+      "epoch": 0.10234798314268513,
+      "grad_norm": 3.747452974319458,
+      "learning_rate": 1.7952793834296726e-05,
+      "loss": 2.0721,
+      "step": 425
+    },
+    {
+      "epoch": 0.10355207706201083,
+      "grad_norm": 3.3113677501678467,
+      "learning_rate": 1.7928709055876688e-05,
+      "loss": 2.1527,
+      "step": 430
+    },
+    {
+      "epoch": 0.10475617098133655,
+      "grad_norm": 3.357912063598633,
+      "learning_rate": 1.7904624277456647e-05,
+      "loss": 2.0113,
+      "step": 435
+    },
+    {
+      "epoch": 0.10596026490066225,
+      "grad_norm": 3.023893356323242,
+      "learning_rate": 1.7880539499036613e-05,
+      "loss": 2.1332,
+      "step": 440
+    },
+    {
+      "epoch": 0.10716435881998795,
+      "grad_norm": 3.3027355670928955,
+      "learning_rate": 1.785645472061657e-05,
+      "loss": 1.9699,
+      "step": 445
+    },
+    {
+      "epoch": 0.10836845273931367,
+      "grad_norm": 5.3524932861328125,
+      "learning_rate": 1.7832369942196534e-05,
+      "loss": 2.0182,
+      "step": 450
+    },
+    {
+      "epoch": 0.10957254665863937,
+      "grad_norm": 3.200258731842041,
+      "learning_rate": 1.7808285163776496e-05,
+      "loss": 2.007,
+      "step": 455
+    },
+    {
+      "epoch": 0.11077664057796509,
+      "grad_norm": 3.286268949508667,
+      "learning_rate": 1.7784200385356455e-05,
+      "loss": 2.0907,
+      "step": 460
+    },
+    {
+      "epoch": 0.11198073449729079,
+      "grad_norm": 3.15291428565979,
+      "learning_rate": 1.7760115606936417e-05,
+      "loss": 2.0468,
+      "step": 465
+    },
+    {
+      "epoch": 0.11318482841661649,
+      "grad_norm": 3.3798069953918457,
+      "learning_rate": 1.773603082851638e-05,
+      "loss": 1.9927,
+      "step": 470
+    },
+    {
+      "epoch": 0.11438892233594221,
+      "grad_norm": 3.4220967292785645,
+      "learning_rate": 1.771194605009634e-05,
+      "loss": 2.1326,
+      "step": 475
+    },
+    {
+      "epoch": 0.11559301625526791,
+      "grad_norm": 3.379628896713257,
+      "learning_rate": 1.76878612716763e-05,
+      "loss": 1.9202,
+      "step": 480
+    },
+    {
+      "epoch": 0.11679711017459361,
+      "grad_norm": 3.3020846843719482,
+      "learning_rate": 1.7663776493256263e-05,
+      "loss": 2.1176,
+      "step": 485
+    },
+    {
+      "epoch": 0.11800120409391933,
+      "grad_norm": 3.2711665630340576,
+      "learning_rate": 1.7639691714836226e-05,
+      "loss": 2.0865,
+      "step": 490
+    },
+    {
+      "epoch": 0.11920529801324503,
+      "grad_norm": 3.239253520965576,
+      "learning_rate": 1.7615606936416188e-05,
+      "loss": 1.9284,
+      "step": 495
+    },
+    {
+      "epoch": 0.12040939193257075,
+      "grad_norm": 3.4960460662841797,
+      "learning_rate": 1.7591522157996147e-05,
+      "loss": 2.0088,
+      "step": 500
+    },
+    {
+      "epoch": 0.12161348585189645,
+      "grad_norm": 3.337407350540161,
+      "learning_rate": 1.756743737957611e-05,
+      "loss": 1.9687,
+      "step": 505
+    },
+    {
+      "epoch": 0.12281757977122215,
+      "grad_norm": 3.534827709197998,
+      "learning_rate": 1.754335260115607e-05,
+      "loss": 2.0273,
+      "step": 510
+    },
+    {
+      "epoch": 0.12402167369054787,
+      "grad_norm": 3.6207938194274902,
+      "learning_rate": 1.7519267822736034e-05,
+      "loss": 2.0458,
+      "step": 515
+    },
+    {
+      "epoch": 0.12522576760987358,
+      "grad_norm": 3.396012544631958,
+      "learning_rate": 1.7495183044315993e-05,
+      "loss": 1.9185,
+      "step": 520
+    },
+    {
+      "epoch": 0.12642986152919927,
+      "grad_norm": 3.001236915588379,
+      "learning_rate": 1.7471098265895955e-05,
+      "loss": 1.9407,
+      "step": 525
+    },
+    {
+      "epoch": 0.127633955448525,
+      "grad_norm": 3.1318376064300537,
+      "learning_rate": 1.7447013487475918e-05,
+      "loss": 1.8984,
+      "step": 530
+    },
+    {
+      "epoch": 0.1288380493678507,
+      "grad_norm": 3.4541585445404053,
+      "learning_rate": 1.7422928709055877e-05,
+      "loss": 1.8846,
+      "step": 535
+    },
+    {
+      "epoch": 0.1300421432871764,
+      "grad_norm": 3.311082363128662,
+      "learning_rate": 1.739884393063584e-05,
+      "loss": 2.0015,
+      "step": 540
+    },
+    {
+      "epoch": 0.1312462372065021,
+      "grad_norm": 3.2366561889648438,
+      "learning_rate": 1.73747591522158e-05,
+      "loss": 2.0176,
+      "step": 545
+    },
+    {
+      "epoch": 0.13245033112582782,
+      "grad_norm": 3.123307943344116,
+      "learning_rate": 1.735067437379576e-05,
+      "loss": 1.9731,
+      "step": 550
+    },
+    {
+      "epoch": 0.1336544250451535,
+      "grad_norm": 3.776921033859253,
+      "learning_rate": 1.7326589595375726e-05,
+      "loss": 2.0484,
+      "step": 555
+    },
+    {
+      "epoch": 0.13485851896447923,
+      "grad_norm": 2.959716796875,
+      "learning_rate": 1.7302504816955685e-05,
+      "loss": 1.9689,
+      "step": 560
+    },
+    {
+      "epoch": 0.13606261288380495,
+      "grad_norm": 3.527384042739868,
+      "learning_rate": 1.7278420038535647e-05,
+      "loss": 1.9488,
+      "step": 565
+    },
+    {
+      "epoch": 0.13726670680313063,
+      "grad_norm": 3.0703189373016357,
+      "learning_rate": 1.725433526011561e-05,
+      "loss": 2.1226,
+      "step": 570
+    },
+    {
+      "epoch": 0.13847080072245635,
+      "grad_norm": 3.1028363704681396,
+      "learning_rate": 1.723025048169557e-05,
+      "loss": 1.8966,
+      "step": 575
+    },
+    {
+      "epoch": 0.13967489464178207,
+      "grad_norm": 3.340517044067383,
+      "learning_rate": 1.720616570327553e-05,
+      "loss": 2.2156,
+      "step": 580
+    },
+    {
+      "epoch": 0.14087898856110775,
+      "grad_norm": 3.2740213871002197,
+      "learning_rate": 1.7182080924855493e-05,
+      "loss": 1.9445,
+      "step": 585
+    },
+    {
+      "epoch": 0.14208308248043347,
+      "grad_norm": 3.240690231323242,
+      "learning_rate": 1.7157996146435455e-05,
+      "loss": 2.0295,
+      "step": 590
+    },
+    {
+      "epoch": 0.1432871763997592,
+      "grad_norm": 3.821340799331665,
+      "learning_rate": 1.7133911368015418e-05,
+      "loss": 2.1401,
+      "step": 595
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "grad_norm": 3.103550910949707,
+      "learning_rate": 1.7109826589595377e-05,
+      "loss": 2.1164,
+      "step": 600
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "eval_loss": 2.017059564590454,
+      "eval_runtime": 16.4324,
+      "eval_samples_per_second": 6.086,
+      "eval_steps_per_second": 0.791,
+      "step": 600
+    },
+    {
+      "epoch": 0.1456953642384106,
+      "grad_norm": 3.4971117973327637,
+      "learning_rate": 1.708574181117534e-05,
+      "loss": 1.9864,
+      "step": 605
+    },
+    {
+      "epoch": 0.1468994581577363,
+      "grad_norm": 3.324803590774536,
+      "learning_rate": 1.70616570327553e-05,
+      "loss": 2.0402,
+      "step": 610
+    },
+    {
+      "epoch": 0.14810355207706202,
+      "grad_norm": 3.302614450454712,
+      "learning_rate": 1.703757225433526e-05,
+      "loss": 1.9494,
+      "step": 615
+    },
+    {
+      "epoch": 0.1493076459963877,
+      "grad_norm": 3.3090734481811523,
+      "learning_rate": 1.7013487475915223e-05,
+      "loss": 2.0748,
+      "step": 620
+    },
+    {
+      "epoch": 0.15051173991571343,
+      "grad_norm": 3.559049129486084,
+      "learning_rate": 1.6989402697495185e-05,
+      "loss": 1.9038,
+      "step": 625
+    },
+    {
+      "epoch": 0.15171583383503914,
+      "grad_norm": 3.5149178504943848,
+      "learning_rate": 1.6965317919075147e-05,
+      "loss": 2.0261,
+      "step": 630
+    },
+    {
+      "epoch": 0.15291992775436483,
+      "grad_norm": 3.835693120956421,
+      "learning_rate": 1.6941233140655106e-05,
+      "loss": 1.9453,
+      "step": 635
+    },
+    {
+      "epoch": 0.15412402167369055,
+      "grad_norm": 3.521132469177246,
+      "learning_rate": 1.691714836223507e-05,
+      "loss": 2.0138,
+      "step": 640
+    },
+    {
+      "epoch": 0.15532811559301626,
+      "grad_norm": 3.2369840145111084,
+      "learning_rate": 1.689306358381503e-05,
+      "loss": 2.0285,
+      "step": 645
+    },
+    {
+      "epoch": 0.15653220951234195,
+      "grad_norm": 3.1592392921447754,
+      "learning_rate": 1.686897880539499e-05,
+      "loss": 1.9912,
+      "step": 650
+    },
+    {
+      "epoch": 0.15773630343166767,
+      "grad_norm": 3.2069106101989746,
+      "learning_rate": 1.6844894026974952e-05,
+      "loss": 2.0159,
+      "step": 655
+    },
+    {
+      "epoch": 0.15894039735099338,
+      "grad_norm": 3.318230390548706,
+      "learning_rate": 1.6820809248554915e-05,
+      "loss": 2.0412,
+      "step": 660
+    },
+    {
+      "epoch": 0.16014449127031907,
+      "grad_norm": 3.549443244934082,
+      "learning_rate": 1.6796724470134877e-05,
+      "loss": 2.0014,
+      "step": 665
+    },
+    {
+      "epoch": 0.1613485851896448,
+      "grad_norm": 3.32999324798584,
+      "learning_rate": 1.677263969171484e-05,
+      "loss": 2.0303,
+      "step": 670
+    },
+    {
+      "epoch": 0.1625526791089705,
+      "grad_norm": 3.262946367263794,
+      "learning_rate": 1.6748554913294798e-05,
+      "loss": 1.9883,
+      "step": 675
+    },
+    {
+      "epoch": 0.16375677302829622,
+      "grad_norm": 3.484685182571411,
+      "learning_rate": 1.672447013487476e-05,
+      "loss": 1.9695,
+      "step": 680
+    },
+    {
+      "epoch": 0.1649608669476219,
+      "grad_norm": 3.4177358150482178,
+      "learning_rate": 1.6700385356454723e-05,
+      "loss": 2.0088,
+      "step": 685
+    },
+    {
+      "epoch": 0.16616496086694763,
+      "grad_norm": 3.447498321533203,
+      "learning_rate": 1.6676300578034682e-05,
+      "loss": 2.0813,
+      "step": 690
+    },
+    {
+      "epoch": 0.16736905478627334,
+      "grad_norm": 3.152740240097046,
+      "learning_rate": 1.6652215799614644e-05,
+      "loss": 1.9988,
+      "step": 695
+    },
+    {
+      "epoch": 0.16857314870559903,
+      "grad_norm": 3.8948824405670166,
+      "learning_rate": 1.6628131021194607e-05,
+      "loss": 2.0801,
+      "step": 700
+    },
+    {
+      "epoch": 0.16977724262492475,
+      "grad_norm": 3.81358003616333,
+      "learning_rate": 1.660404624277457e-05,
+      "loss": 1.944,
+      "step": 705
+    },
+    {
+      "epoch": 0.17098133654425046,
+      "grad_norm": 2.980236053466797,
+      "learning_rate": 1.657996146435453e-05,
+      "loss": 1.9151,
+      "step": 710
+    },
+    {
+      "epoch": 0.17218543046357615,
+      "grad_norm": 3.041680335998535,
+      "learning_rate": 1.655587668593449e-05,
+      "loss": 1.9486,
+      "step": 715
+    },
+    {
+      "epoch": 0.17338952438290187,
+      "grad_norm": 2.898974657058716,
+      "learning_rate": 1.6531791907514452e-05,
+      "loss": 2.1119,
+      "step": 720
+    },
+    {
+      "epoch": 0.17459361830222758,
+      "grad_norm": 3.161224603652954,
+      "learning_rate": 1.6507707129094415e-05,
+      "loss": 2.036,
+      "step": 725
+    },
+    {
+      "epoch": 0.17579771222155327,
+      "grad_norm": 3.2449426651000977,
+      "learning_rate": 1.6483622350674374e-05,
+      "loss": 2.0635,
+      "step": 730
+    },
+    {
+      "epoch": 0.177001806140879,
+      "grad_norm": 3.2805328369140625,
+      "learning_rate": 1.6459537572254336e-05,
+      "loss": 1.8022,
+      "step": 735
+    },
+    {
+      "epoch": 0.1782059000602047,
+      "grad_norm": 3.491149663925171,
+      "learning_rate": 1.64354527938343e-05,
+      "loss": 1.9832,
+      "step": 740
+    },
+    {
+      "epoch": 0.1794099939795304,
+      "grad_norm": 3.423267126083374,
+      "learning_rate": 1.641136801541426e-05,
+      "loss": 1.9574,
+      "step": 745
+    },
+    {
+      "epoch": 0.1806140878988561,
+      "grad_norm": 3.1914217472076416,
+      "learning_rate": 1.638728323699422e-05,
+      "loss": 1.9283,
+      "step": 750
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 3.2903149127960205,
+      "learning_rate": 1.6363198458574182e-05,
+      "loss": 2.1174,
+      "step": 755
+    },
+    {
+      "epoch": 0.18302227573750754,
+      "grad_norm": 3.113159656524658,
+      "learning_rate": 1.6339113680154144e-05,
+      "loss": 1.8794,
+      "step": 760
+    },
+    {
+      "epoch": 0.18422636965683323,
+      "grad_norm": 3.261596918106079,
+      "learning_rate": 1.6315028901734103e-05,
+      "loss": 2.0853,
+      "step": 765
+    },
+    {
+      "epoch": 0.18543046357615894,
+      "grad_norm": 2.9525296688079834,
+      "learning_rate": 1.629094412331407e-05,
+      "loss": 1.8622,
+      "step": 770
+    },
+    {
+      "epoch": 0.18663455749548466,
+      "grad_norm": 3.2103638648986816,
+      "learning_rate": 1.6266859344894028e-05,
+      "loss": 2.0536,
+      "step": 775
+    },
+    {
+      "epoch": 0.18783865141481035,
+      "grad_norm": 3.5312676429748535,
+      "learning_rate": 1.624277456647399e-05,
+      "loss": 1.9387,
+      "step": 780
+    },
+    {
+      "epoch": 0.18904274533413606,
+      "grad_norm": 3.277223825454712,
+      "learning_rate": 1.6218689788053953e-05,
+      "loss": 1.9804,
+      "step": 785
+    },
+    {
+      "epoch": 0.19024683925346178,
+      "grad_norm": 3.207287549972534,
+      "learning_rate": 1.619460500963391e-05,
+      "loss": 1.8782,
+      "step": 790
+    },
+    {
+      "epoch": 0.19145093317278747,
+      "grad_norm": 3.401834487915039,
+      "learning_rate": 1.6170520231213874e-05,
+      "loss": 1.9383,
+      "step": 795
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "grad_norm": 3.5186078548431396,
+      "learning_rate": 1.6146435452793836e-05,
+      "loss": 2.0963,
+      "step": 800
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "eval_loss": 1.979533076286316,
+      "eval_runtime": 16.4348,
+      "eval_samples_per_second": 6.085,
+      "eval_steps_per_second": 0.791,
+      "step": 800
+    },
+    {
+      "epoch": 0.1938591210114389,
+      "grad_norm": 3.0080161094665527,
+      "learning_rate": 1.6122350674373795e-05,
+      "loss": 1.951,
+      "step": 805
+    },
+    {
+      "epoch": 0.1950632149307646,
+      "grad_norm": 3.124155044555664,
+      "learning_rate": 1.6098265895953758e-05,
+      "loss": 1.8663,
+      "step": 810
+    },
+    {
+      "epoch": 0.1962673088500903,
+      "grad_norm": 3.6262383460998535,
+      "learning_rate": 1.607418111753372e-05,
+      "loss": 1.9478,
+      "step": 815
+    },
+    {
+      "epoch": 0.19747140276941602,
+      "grad_norm": 3.3047947883605957,
+      "learning_rate": 1.6050096339113682e-05,
+      "loss": 1.9203,
+      "step": 820
+    },
+    {
+      "epoch": 0.1986754966887417,
+      "grad_norm": 3.0261447429656982,
+      "learning_rate": 1.6026011560693645e-05,
+      "loss": 1.8988,
+      "step": 825
+    },
+    {
+      "epoch": 0.19987959060806743,
+      "grad_norm": 4.233884334564209,
+      "learning_rate": 1.6001926782273604e-05,
+      "loss": 2.0327,
+      "step": 830
+    },
+    {
+      "epoch": 0.20108368452739314,
+      "grad_norm": 2.9169118404388428,
+      "learning_rate": 1.5977842003853566e-05,
+      "loss": 1.9264,
+      "step": 835
+    },
+    {
+      "epoch": 0.20228777844671886,
+      "grad_norm": 3.0078282356262207,
+      "learning_rate": 1.5953757225433528e-05,
+      "loss": 1.8821,
+      "step": 840
+    },
+    {
+      "epoch": 0.20349187236604455,
+      "grad_norm": 3.4188835620880127,
+      "learning_rate": 1.592967244701349e-05,
+      "loss": 1.933,
+      "step": 845
+    },
+    {
+      "epoch": 0.20469596628537026,
+      "grad_norm": 4.739987850189209,
+      "learning_rate": 1.590558766859345e-05,
+      "loss": 1.9182,
+      "step": 850
+    },
+    {
+      "epoch": 0.20590006020469598,
+      "grad_norm": 3.1810977458953857,
+      "learning_rate": 1.5881502890173412e-05,
+      "loss": 1.984,
+      "step": 855
+    },
+    {
+      "epoch": 0.20710415412402167,
+      "grad_norm": 3.174739360809326,
+      "learning_rate": 1.5857418111753374e-05,
+      "loss": 1.7719,
+      "step": 860
+    },
+    {
+      "epoch": 0.20830824804334738,
+      "grad_norm": 3.379767656326294,
+      "learning_rate": 1.5833333333333333e-05,
+      "loss": 1.9481,
+      "step": 865
+    },
+    {
+      "epoch": 0.2095123419626731,
+      "grad_norm": 3.3487260341644287,
+      "learning_rate": 1.5809248554913295e-05,
+      "loss": 1.9416,
+      "step": 870
+    },
+    {
+      "epoch": 0.2107164358819988,
+      "grad_norm": 3.4879958629608154,
+      "learning_rate": 1.5785163776493258e-05,
+      "loss": 2.0463,
+      "step": 875
+    },
+    {
+      "epoch": 0.2119205298013245,
+      "grad_norm": 3.2338194847106934,
+      "learning_rate": 1.5761078998073217e-05,
+      "loss": 1.9441,
+      "step": 880
+    },
+    {
+      "epoch": 0.21312462372065022,
+      "grad_norm": 3.122405529022217,
+      "learning_rate": 1.5736994219653182e-05,
+      "loss": 2.0644,
+      "step": 885
+    },
+    {
+      "epoch": 0.2143287176399759,
+      "grad_norm": 3.0773510932922363,
+      "learning_rate": 1.571290944123314e-05,
+      "loss": 1.9415,
+      "step": 890
+    },
+    {
+      "epoch": 0.21553281155930162,
+      "grad_norm": 3.004040241241455,
+      "learning_rate": 1.5688824662813104e-05,
+      "loss": 1.9305,
+      "step": 895
+    },
+    {
+      "epoch": 0.21673690547862734,
+      "grad_norm": 3.547109603881836,
+      "learning_rate": 1.5664739884393066e-05,
+      "loss": 2.1088,
+      "step": 900
+    },
+    {
+      "epoch": 0.21794099939795303,
+      "grad_norm": 3.1982204914093018,
+      "learning_rate": 1.5640655105973025e-05,
+      "loss": 1.8667,
+      "step": 905
+    },
+    {
+      "epoch": 0.21914509331727874,
+      "grad_norm": 3.381781578063965,
+      "learning_rate": 1.5616570327552987e-05,
+      "loss": 1.9783,
+      "step": 910
+    },
+    {
+      "epoch": 0.22034918723660446,
+      "grad_norm": 2.9775896072387695,
+      "learning_rate": 1.559248554913295e-05,
+      "loss": 2.0211,
+      "step": 915
+    },
+    {
+      "epoch": 0.22155328115593018,
+      "grad_norm": 2.864551067352295,
+      "learning_rate": 1.5568400770712912e-05,
+      "loss": 1.8579,
+      "step": 920
+    },
+    {
+      "epoch": 0.22275737507525586,
+      "grad_norm": 3.0532050132751465,
+      "learning_rate": 1.5544315992292874e-05,
+      "loss": 1.9398,
+      "step": 925
+    },
+    {
+      "epoch": 0.22396146899458158,
+      "grad_norm": 2.859631061553955,
+      "learning_rate": 1.5520231213872833e-05,
+      "loss": 1.8625,
+      "step": 930
+    },
+    {
+      "epoch": 0.2251655629139073,
+      "grad_norm": 3.1373536586761475,
+      "learning_rate": 1.5496146435452796e-05,
+      "loss": 1.9003,
+      "step": 935
+    },
+    {
+      "epoch": 0.22636965683323299,
+      "grad_norm": 3.3248465061187744,
+      "learning_rate": 1.5472061657032758e-05,
+      "loss": 2.0517,
+      "step": 940
+    },
+    {
+      "epoch": 0.2275737507525587,
+      "grad_norm": 3.5990936756134033,
+      "learning_rate": 1.5447976878612717e-05,
+      "loss": 1.8975,
+      "step": 945
+    },
+    {
+      "epoch": 0.22877784467188442,
+      "grad_norm": 3.4047725200653076,
+      "learning_rate": 1.542389210019268e-05,
+      "loss": 1.9786,
+      "step": 950
+    },
+    {
+      "epoch": 0.2299819385912101,
+      "grad_norm": 3.3326022624969482,
+      "learning_rate": 1.539980732177264e-05,
+      "loss": 1.7848,
+      "step": 955
+    },
+    {
+      "epoch": 0.23118603251053582,
+      "grad_norm": 3.2942848205566406,
+      "learning_rate": 1.5375722543352604e-05,
+      "loss": 1.8549,
+      "step": 960
+    },
+    {
+      "epoch": 0.23239012642986154,
+      "grad_norm": 3.4602601528167725,
+      "learning_rate": 1.5351637764932563e-05,
+      "loss": 1.917,
+      "step": 965
+    },
+    {
+      "epoch": 0.23359422034918723,
+      "grad_norm": 3.591327428817749,
+      "learning_rate": 1.5327552986512525e-05,
+      "loss": 1.9457,
+      "step": 970
+    },
+    {
+      "epoch": 0.23479831426851294,
+      "grad_norm": 3.215808868408203,
+      "learning_rate": 1.5303468208092487e-05,
+      "loss": 1.9261,
+      "step": 975
+    },
+    {
+      "epoch": 0.23600240818783866,
+      "grad_norm": 3.5032927989959717,
+      "learning_rate": 1.5279383429672446e-05,
+      "loss": 1.967,
+      "step": 980
+    },
+    {
+      "epoch": 0.23720650210716435,
+      "grad_norm": 3.1476144790649414,
+      "learning_rate": 1.525529865125241e-05,
+      "loss": 2.0448,
+      "step": 985
+    },
+    {
+      "epoch": 0.23841059602649006,
+      "grad_norm": 3.046126365661621,
+      "learning_rate": 1.5231213872832371e-05,
+      "loss": 1.9086,
+      "step": 990
+    },
+    {
+      "epoch": 0.23961468994581578,
+      "grad_norm": 3.0403099060058594,
+      "learning_rate": 1.5207129094412332e-05,
+      "loss": 1.9671,
+      "step": 995
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "grad_norm": 3.524573802947998,
+      "learning_rate": 1.5183044315992294e-05,
+      "loss": 1.9532,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "eval_loss": 1.9571956396102905,
+      "eval_runtime": 16.4441,
+      "eval_samples_per_second": 6.081,
+      "eval_steps_per_second": 0.791,
+      "step": 1000
+    },
+    {
+      "epoch": 0.24202287778446718,
+      "grad_norm": 3.518422842025757,
+      "learning_rate": 1.5158959537572255e-05,
+      "loss": 1.9851,
+      "step": 1005
+    },
+    {
+      "epoch": 0.2432269717037929,
+      "grad_norm": 3.0476677417755127,
+      "learning_rate": 1.5134874759152215e-05,
+      "loss": 2.0102,
+      "step": 1010
+    },
+    {
+      "epoch": 0.24443106562311862,
+      "grad_norm": 2.7093968391418457,
+      "learning_rate": 1.511078998073218e-05,
+      "loss": 1.8704,
+      "step": 1015
+    },
+    {
+      "epoch": 0.2456351595424443,
+      "grad_norm": 2.8607966899871826,
+      "learning_rate": 1.508670520231214e-05,
+      "loss": 1.9948,
+      "step": 1020
+    },
+    {
+      "epoch": 0.24683925346177002,
+      "grad_norm": 3.224783182144165,
+      "learning_rate": 1.50626204238921e-05,
+      "loss": 1.9097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.24804334738109574,
+      "grad_norm": 3.199155807495117,
+      "learning_rate": 1.5038535645472063e-05,
+      "loss": 2.0205,
+      "step": 1030
+    },
+    {
+      "epoch": 0.24924744130042142,
+      "grad_norm": 3.12842059135437,
+      "learning_rate": 1.5014450867052024e-05,
+      "loss": 1.8666,
+      "step": 1035
+    },
+    {
+      "epoch": 0.25045153521974717,
+      "grad_norm": 3.7331671714782715,
+      "learning_rate": 1.4990366088631986e-05,
+      "loss": 1.913,
+      "step": 1040
+    },
+    {
+      "epoch": 0.25165562913907286,
+      "grad_norm": 3.0594935417175293,
+      "learning_rate": 1.4966281310211948e-05,
+      "loss": 2.0016,
+      "step": 1045
+    },
+    {
+      "epoch": 0.25285972305839854,
+      "grad_norm": 3.166200637817383,
+      "learning_rate": 1.4942196531791909e-05,
+      "loss": 1.9481,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2540638169777243,
+      "grad_norm": 3.08423113822937,
+      "learning_rate": 1.4918111753371871e-05,
+      "loss": 1.9168,
+      "step": 1055
+    },
+    {
+      "epoch": 0.25526791089705,
+      "grad_norm": 3.7309155464172363,
+      "learning_rate": 1.4894026974951832e-05,
+      "loss": 1.9698,
+      "step": 1060
+    },
+    {
+      "epoch": 0.25647200481637566,
+      "grad_norm": 3.5642640590667725,
+      "learning_rate": 1.4869942196531793e-05,
+      "loss": 2.0914,
+      "step": 1065
+    },
+    {
+      "epoch": 0.2576760987357014,
+      "grad_norm": 3.201970338821411,
+      "learning_rate": 1.4845857418111755e-05,
+      "loss": 1.9358,
+      "step": 1070
+    },
+    {
+      "epoch": 0.2588801926550271,
+      "grad_norm": 3.19600510597229,
+      "learning_rate": 1.4821772639691716e-05,
+      "loss": 1.9028,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2600842865743528,
+      "grad_norm": 2.8048558235168457,
+      "learning_rate": 1.4797687861271676e-05,
+      "loss": 1.9104,
+      "step": 1080
+    },
+    {
+      "epoch": 0.26128838049367853,
+      "grad_norm": 3.3447604179382324,
+      "learning_rate": 1.477360308285164e-05,
+      "loss": 1.9683,
+      "step": 1085
+    },
+    {
+      "epoch": 0.2624924744130042,
+      "grad_norm": 3.2456166744232178,
+      "learning_rate": 1.4749518304431601e-05,
+      "loss": 1.9469,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2636965683323299,
+      "grad_norm": 4.318637847900391,
+      "learning_rate": 1.4725433526011562e-05,
+      "loss": 1.9199,
+      "step": 1095
+    },
+    {
+      "epoch": 0.26490066225165565,
+      "grad_norm": 3.258910655975342,
+      "learning_rate": 1.4701348747591524e-05,
+      "loss": 1.9996,
+      "step": 1100
+    },
+    {
+      "epoch": 0.26610475617098134,
+      "grad_norm": 3.2944164276123047,
+      "learning_rate": 1.4677263969171484e-05,
+      "loss": 1.9373,
+      "step": 1105
+    },
+    {
+      "epoch": 0.267308850090307,
+      "grad_norm": 2.938908815383911,
+      "learning_rate": 1.4653179190751445e-05,
+      "loss": 1.8301,
+      "step": 1110
+    },
+    {
+      "epoch": 0.26851294400963277,
+      "grad_norm": 3.38433575630188,
+      "learning_rate": 1.4629094412331407e-05,
+      "loss": 2.0615,
+      "step": 1115
+    },
+    {
+      "epoch": 0.26971703792895846,
+      "grad_norm": 2.9752516746520996,
+      "learning_rate": 1.4605009633911368e-05,
+      "loss": 2.0132,
+      "step": 1120
+    },
+    {
+      "epoch": 0.27092113184828415,
+      "grad_norm": 2.9489858150482178,
+      "learning_rate": 1.458092485549133e-05,
+      "loss": 1.9319,
+      "step": 1125
+    },
+    {
+      "epoch": 0.2721252257676099,
+      "grad_norm": 3.231663465499878,
+      "learning_rate": 1.4556840077071293e-05,
+      "loss": 1.8924,
+      "step": 1130
+    },
+    {
+      "epoch": 0.2733293196869356,
+      "grad_norm": 3.2028298377990723,
+      "learning_rate": 1.4532755298651253e-05,
+      "loss": 1.9836,
+      "step": 1135
+    },
+    {
+      "epoch": 0.27453341360626127,
+      "grad_norm": 3.348498582839966,
+      "learning_rate": 1.4508670520231216e-05,
+      "loss": 2.0908,
+      "step": 1140
+    },
+    {
+      "epoch": 0.275737507525587,
+      "grad_norm": 3.1415324211120605,
+      "learning_rate": 1.4484585741811176e-05,
+      "loss": 2.043,
+      "step": 1145
+    },
+    {
+      "epoch": 0.2769416014449127,
+      "grad_norm": 2.9878766536712646,
+      "learning_rate": 1.4460500963391137e-05,
+      "loss": 1.9536,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2781456953642384,
+      "grad_norm": 3.9880380630493164,
+      "learning_rate": 1.4436416184971101e-05,
+      "loss": 1.7441,
+      "step": 1155
+    },
+    {
+      "epoch": 0.27934978928356413,
+      "grad_norm": 3.2457761764526367,
+      "learning_rate": 1.4412331406551062e-05,
+      "loss": 2.0117,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2805538832028898,
+      "grad_norm": 2.9482262134552,
+      "learning_rate": 1.4388246628131022e-05,
+      "loss": 1.9502,
+      "step": 1165
+    },
+    {
+      "epoch": 0.2817579771222155,
+      "grad_norm": 3.3882107734680176,
+      "learning_rate": 1.4364161849710985e-05,
+      "loss": 1.9861,
+      "step": 1170
+    },
+    {
+      "epoch": 0.28296207104154125,
+      "grad_norm": 3.379577398300171,
+      "learning_rate": 1.4340077071290945e-05,
+      "loss": 2.0115,
+      "step": 1175
+    },
+    {
+      "epoch": 0.28416616496086694,
+      "grad_norm": 3.02996826171875,
+      "learning_rate": 1.4315992292870906e-05,
+      "loss": 1.9103,
+      "step": 1180
+    },
+    {
+      "epoch": 0.28537025888019263,
+      "grad_norm": 2.9293081760406494,
+      "learning_rate": 1.4291907514450868e-05,
+      "loss": 1.9432,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2865743527995184,
+      "grad_norm": 3.3768506050109863,
+      "learning_rate": 1.4267822736030829e-05,
+      "loss": 1.8372,
+      "step": 1190
+    },
+    {
+      "epoch": 0.28777844671884406,
+      "grad_norm": 3.0163660049438477,
+      "learning_rate": 1.424373795761079e-05,
+      "loss": 1.7784,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2889825406381698,
+      "grad_norm": 3.591684103012085,
+      "learning_rate": 1.4219653179190754e-05,
+      "loss": 1.9951,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2889825406381698,
+      "eval_loss": 1.9316246509552002,
+      "eval_runtime": 16.3758,
+      "eval_samples_per_second": 6.107,
+      "eval_steps_per_second": 0.794,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2901866345574955,
+      "grad_norm": 3.3553686141967773,
+      "learning_rate": 1.4195568400770714e-05,
+      "loss": 1.8822,
+      "step": 1205
+    },
+    {
+      "epoch": 0.2913907284768212,
+      "grad_norm": 3.1767303943634033,
+      "learning_rate": 1.4171483622350675e-05,
+      "loss": 1.9386,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2925948223961469,
+      "grad_norm": 3.6845219135284424,
+      "learning_rate": 1.4147398843930637e-05,
+      "loss": 1.8512,
+      "step": 1215
+    },
+    {
+      "epoch": 0.2937989163154726,
+      "grad_norm": 3.189955949783325,
+      "learning_rate": 1.4123314065510598e-05,
+      "loss": 1.9804,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2950030102347983,
+      "grad_norm": 3.7629036903381348,
+      "learning_rate": 1.4099229287090558e-05,
+      "loss": 1.9759,
+      "step": 1225
+    },
+    {
+      "epoch": 0.29620710415412405,
+      "grad_norm": 3.6975581645965576,
+      "learning_rate": 1.4075144508670523e-05,
+      "loss": 2.0106,
+      "step": 1230
+    },
+    {
+      "epoch": 0.29741119807344973,
+      "grad_norm": 3.1638119220733643,
+      "learning_rate": 1.4051059730250483e-05,
+      "loss": 1.9846,
+      "step": 1235
+    },
+    {
+      "epoch": 0.2986152919927754,
+      "grad_norm": 3.2830159664154053,
+      "learning_rate": 1.4026974951830444e-05,
+      "loss": 2.0757,
+      "step": 1240
+    },
+    {
+      "epoch": 0.29981938591210117,
+      "grad_norm": 3.1656594276428223,
+      "learning_rate": 1.4002890173410406e-05,
+      "loss": 1.9831,
+      "step": 1245
+    },
+    {
+      "epoch": 0.30102347983142685,
+      "grad_norm": 3.056269407272339,
+      "learning_rate": 1.3978805394990367e-05,
+      "loss": 1.897,
+      "step": 1250
+    },
+    {
+      "epoch": 0.30222757375075254,
+      "grad_norm": 3.1246907711029053,
+      "learning_rate": 1.3954720616570329e-05,
+      "loss": 1.9663,
+      "step": 1255
+    },
+    {
+      "epoch": 0.3034316676700783,
+      "grad_norm": 2.964416980743408,
+      "learning_rate": 1.393063583815029e-05,
+      "loss": 2.0476,
+      "step": 1260
+    },
+    {
+      "epoch": 0.304635761589404,
+      "grad_norm": 3.241818904876709,
+      "learning_rate": 1.390655105973025e-05,
+      "loss": 2.0771,
+      "step": 1265
+    },
+    {
+      "epoch": 0.30583985550872966,
+      "grad_norm": 3.29571795463562,
+      "learning_rate": 1.3882466281310214e-05,
+      "loss": 2.0502,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3070439494280554,
+      "grad_norm": 3.6429920196533203,
+      "learning_rate": 1.3858381502890175e-05,
+      "loss": 2.0583,
+      "step": 1275
+    },
+    {
+      "epoch": 0.3082480433473811,
+      "grad_norm": 3.16072416305542,
+      "learning_rate": 1.3834296724470136e-05,
+      "loss": 1.8808,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3094521372667068,
+      "grad_norm": 3.2263102531433105,
+      "learning_rate": 1.3810211946050098e-05,
+      "loss": 1.8824,
+      "step": 1285
+    },
+    {
+      "epoch": 0.3106562311860325,
+      "grad_norm": 3.2608978748321533,
+      "learning_rate": 1.3786127167630059e-05,
+      "loss": 1.8209,
+      "step": 1290
+    },
+    {
+      "epoch": 0.3118603251053582,
+      "grad_norm": 3.1739134788513184,
+      "learning_rate": 1.376204238921002e-05,
+      "loss": 2.0042,
+      "step": 1295
+    },
+    {
+      "epoch": 0.3130644190246839,
+      "grad_norm": 3.8513424396514893,
+      "learning_rate": 1.3737957610789983e-05,
+      "loss": 1.9969,
+      "step": 1300
+    },
+    {
+      "epoch": 0.31426851294400965,
+      "grad_norm": 2.987257957458496,
+      "learning_rate": 1.3713872832369944e-05,
+      "loss": 1.8387,
+      "step": 1305
+    },
+    {
+      "epoch": 0.31547260686333534,
+      "grad_norm": 3.1317367553710938,
+      "learning_rate": 1.3689788053949905e-05,
+      "loss": 2.0304,
+      "step": 1310
+    },
+    {
+      "epoch": 0.316676700782661,
+      "grad_norm": 3.459153413772583,
+      "learning_rate": 1.3665703275529867e-05,
+      "loss": 1.977,
+      "step": 1315
+    },
+    {
+      "epoch": 0.31788079470198677,
+      "grad_norm": 3.1578195095062256,
+      "learning_rate": 1.3641618497109828e-05,
+      "loss": 1.8042,
+      "step": 1320
+    },
+    {
+      "epoch": 0.31908488862131246,
+      "grad_norm": 3.375023126602173,
+      "learning_rate": 1.3617533718689788e-05,
+      "loss": 1.8485,
+      "step": 1325
+    },
+    {
+      "epoch": 0.32028898254063815,
+      "grad_norm": 3.1867401599884033,
+      "learning_rate": 1.359344894026975e-05,
+      "loss": 1.9303,
+      "step": 1330
+    },
+    {
+      "epoch": 0.3214930764599639,
+      "grad_norm": 3.303433418273926,
+      "learning_rate": 1.3569364161849711e-05,
+      "loss": 1.8925,
+      "step": 1335
+    },
+    {
+      "epoch": 0.3226971703792896,
+      "grad_norm": 2.990525960922241,
+      "learning_rate": 1.3545279383429672e-05,
+      "loss": 1.8612,
+      "step": 1340
+    },
+    {
+      "epoch": 0.32390126429861527,
+      "grad_norm": 3.570690155029297,
+      "learning_rate": 1.3521194605009636e-05,
+      "loss": 1.9175,
+      "step": 1345
+    },
+    {
+      "epoch": 0.325105358217941,
+      "grad_norm": 3.3581953048706055,
+      "learning_rate": 1.3497109826589597e-05,
+      "loss": 2.0293,
+      "step": 1350
+    },
+    {
+      "epoch": 0.3263094521372667,
+      "grad_norm": 3.111626148223877,
+      "learning_rate": 1.3473025048169559e-05,
+      "loss": 1.9314,
+      "step": 1355
+    },
+    {
+      "epoch": 0.32751354605659244,
+      "grad_norm": 2.8977465629577637,
+      "learning_rate": 1.344894026974952e-05,
+      "loss": 1.9008,
+      "step": 1360
+    },
+    {
+      "epoch": 0.32871763997591813,
+      "grad_norm": 3.2835822105407715,
+      "learning_rate": 1.342485549132948e-05,
+      "loss": 1.8344,
+      "step": 1365
+    },
+    {
+      "epoch": 0.3299217338952438,
+      "grad_norm": 3.041848659515381,
+      "learning_rate": 1.3400770712909442e-05,
+      "loss": 2.0046,
+      "step": 1370
+    },
+    {
+      "epoch": 0.33112582781456956,
+      "grad_norm": 3.252289295196533,
+      "learning_rate": 1.3376685934489405e-05,
+      "loss": 2.015,
+      "step": 1375
+    },
+    {
+      "epoch": 0.33232992173389525,
+      "grad_norm": 3.0157580375671387,
+      "learning_rate": 1.3352601156069365e-05,
+      "loss": 1.7938,
+      "step": 1380
+    },
+    {
+      "epoch": 0.33353401565322094,
+      "grad_norm": 3.2034637928009033,
+      "learning_rate": 1.3328516377649328e-05,
+      "loss": 1.7438,
+      "step": 1385
+    },
+    {
+      "epoch": 0.3347381095725467,
+      "grad_norm": 2.962069034576416,
+      "learning_rate": 1.3304431599229288e-05,
+      "loss": 1.8915,
+      "step": 1390
+    },
+    {
+      "epoch": 0.33594220349187237,
+      "grad_norm": 3.4406933784484863,
+      "learning_rate": 1.3280346820809249e-05,
+      "loss": 1.7684,
+      "step": 1395
+    },
+    {
+      "epoch": 0.33714629741119806,
+      "grad_norm": 3.531928062438965,
+      "learning_rate": 1.3256262042389211e-05,
+      "loss": 1.8796,
+      "step": 1400
+    },
+    {
+      "epoch": 0.33714629741119806,
+      "eval_loss": 1.9124047756195068,
+      "eval_runtime": 16.4823,
+      "eval_samples_per_second": 6.067,
+      "eval_steps_per_second": 0.789,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3383503913305238,
+      "grad_norm": 3.052105665206909,
+      "learning_rate": 1.3232177263969172e-05,
+      "loss": 1.8796,
+      "step": 1405
+    },
+    {
+      "epoch": 0.3395544852498495,
+      "grad_norm": 3.317544460296631,
+      "learning_rate": 1.3208092485549133e-05,
+      "loss": 1.975,
+      "step": 1410
+    },
+    {
+      "epoch": 0.3407585791691752,
+      "grad_norm": 3.8019814491271973,
+      "learning_rate": 1.3184007707129097e-05,
+      "loss": 1.8014,
+      "step": 1415
+    },
+    {
+      "epoch": 0.3419626730885009,
+      "grad_norm": 3.2202208042144775,
+      "learning_rate": 1.3159922928709057e-05,
+      "loss": 2.0098,
+      "step": 1420
+    },
+    {
+      "epoch": 0.3431667670078266,
+      "grad_norm": 3.2262189388275146,
+      "learning_rate": 1.3135838150289018e-05,
+      "loss": 1.758,
+      "step": 1425
+    },
+    {
+      "epoch": 0.3443708609271523,
+      "grad_norm": 3.03486704826355,
+      "learning_rate": 1.311175337186898e-05,
+      "loss": 1.9254,
+      "step": 1430
+    },
+    {
+      "epoch": 0.34557495484647804,
+      "grad_norm": 3.063431978225708,
+      "learning_rate": 1.3087668593448941e-05,
+      "loss": 1.9091,
+      "step": 1435
+    },
+    {
+      "epoch": 0.34677904876580373,
+      "grad_norm": 3.082839012145996,
+      "learning_rate": 1.3063583815028902e-05,
+      "loss": 1.951,
+      "step": 1440
+    },
+    {
+      "epoch": 0.3479831426851294,
+      "grad_norm": 3.692833662033081,
+      "learning_rate": 1.3039499036608864e-05,
+      "loss": 1.9771,
+      "step": 1445
+    },
+    {
+      "epoch": 0.34918723660445516,
+      "grad_norm": 3.4902002811431885,
+      "learning_rate": 1.3015414258188825e-05,
+      "loss": 1.8954,
+      "step": 1450
+    },
+    {
+      "epoch": 0.35039133052378085,
+      "grad_norm": 2.9592397212982178,
+      "learning_rate": 1.2991329479768787e-05,
+      "loss": 1.8791,
+      "step": 1455
+    },
+    {
+      "epoch": 0.35159542444310654,
+      "grad_norm": 3.3300886154174805,
+      "learning_rate": 1.296724470134875e-05,
+      "loss": 1.9712,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3527995183624323,
+      "grad_norm": 3.206303834915161,
+      "learning_rate": 1.294315992292871e-05,
+      "loss": 1.8155,
+      "step": 1465
+    },
+    {
+      "epoch": 0.354003612281758,
+      "grad_norm": 3.2186410427093506,
+      "learning_rate": 1.2919075144508672e-05,
+      "loss": 1.8583,
+      "step": 1470
+    },
+    {
+      "epoch": 0.35520770620108366,
+      "grad_norm": 3.482147216796875,
+      "learning_rate": 1.2894990366088633e-05,
+      "loss": 1.8233,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3564118001204094,
+      "grad_norm": 3.317288637161255,
+      "learning_rate": 1.2870905587668594e-05,
+      "loss": 2.0094,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3576158940397351,
+      "grad_norm": 2.962334632873535,
+      "learning_rate": 1.2846820809248558e-05,
+      "loss": 2.0041,
+      "step": 1485
+    },
+    {
+      "epoch": 0.3588199879590608,
+      "grad_norm": 3.1497514247894287,
+      "learning_rate": 1.2822736030828518e-05,
+      "loss": 1.8808,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3600240818783865,
+      "grad_norm": 3.3814947605133057,
+      "learning_rate": 1.2798651252408479e-05,
+      "loss": 1.8085,
+      "step": 1495
+    },
+    {
+      "epoch": 0.3612281757977122,
+      "grad_norm": 3.426969051361084,
+      "learning_rate": 1.2774566473988441e-05,
+      "loss": 1.8659,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1582373279990880.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd020aa9333054ccbd44f2f69522647a4bdda52101c5b015e38e78115982722b
+size 5816

checkpoint-2000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|im_end|>": 32000,
+  "<|im_start|>": 32001
+}

checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "Felladrin/Minueza-32M-Chat",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "head_dim": 26,
+  "hidden_act": "silu",
+  "hidden_size": 312,
+  "initializer_range": 0.02,
+  "intermediate_size": 1092,
+  "max_position_embeddings": 2048,
+  "model_type": "mistral",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 10,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 1024,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": false,
+  "vocab_size": 32002
+}

checkpoint-2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "transformers_version": "4.48.1"
+}

checkpoint-2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5e5ae12002ae7acfcc7e9e689ac9f7111109f3f0d398163dc5325ba2036c51a
+size 131181272

checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f456a2495078924ddab825fa63567c5c9efa44e8572a8ffcbdb5235f584babf
+size 262419258

checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5edb34d031c0c2b447f3eaadb401a4c1e7e7e6d8c096e28b7092e01a8bd48c92
+size 14244

checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a6a91dbc2ee0881caca567a1c7d4bda579188bc179e9e0441bb8880b35c4bd4
+size 1064

checkpoint-2000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-2000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2913 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.481637567730283,
+  "eval_steps": 200,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0012040939193257074,
+      "grad_norm": 4.470886707305908,
+      "learning_rate": 1.9975915221579964e-05,
+      "loss": 2.6688,
+      "step": 5
+    },
+    {
+      "epoch": 0.002408187838651415,
+      "grad_norm": 4.067092418670654,
+      "learning_rate": 1.9951830443159926e-05,
+      "loss": 2.2509,
+      "step": 10
+    },
+    {
+      "epoch": 0.003612281757977122,
+      "grad_norm": 4.133108615875244,
+      "learning_rate": 1.9927745664739885e-05,
+      "loss": 2.3732,
+      "step": 15
+    },
+    {
+      "epoch": 0.00481637567730283,
+      "grad_norm": 3.4890763759613037,
+      "learning_rate": 1.9903660886319848e-05,
+      "loss": 2.3632,
+      "step": 20
+    },
+    {
+      "epoch": 0.006020469596628537,
+      "grad_norm": 4.1045308113098145,
+      "learning_rate": 1.987957610789981e-05,
+      "loss": 2.5203,
+      "step": 25
+    },
+    {
+      "epoch": 0.007224563515954244,
+      "grad_norm": 4.26784086227417,
+      "learning_rate": 1.985549132947977e-05,
+      "loss": 2.3349,
+      "step": 30
+    },
+    {
+      "epoch": 0.008428657435279952,
+      "grad_norm": 4.144766330718994,
+      "learning_rate": 1.983140655105973e-05,
+      "loss": 2.4106,
+      "step": 35
+    },
+    {
+      "epoch": 0.00963275135460566,
+      "grad_norm": 3.9538934230804443,
+      "learning_rate": 1.9807321772639694e-05,
+      "loss": 2.3192,
+      "step": 40
+    },
+    {
+      "epoch": 0.010836845273931367,
+      "grad_norm": 3.9219865798950195,
+      "learning_rate": 1.9783236994219656e-05,
+      "loss": 2.2012,
+      "step": 45
+    },
+    {
+      "epoch": 0.012040939193257074,
+      "grad_norm": 3.391493797302246,
+      "learning_rate": 1.9759152215799615e-05,
+      "loss": 2.4064,
+      "step": 50
+    },
+    {
+      "epoch": 0.013245033112582781,
+      "grad_norm": 4.393350124359131,
+      "learning_rate": 1.9735067437379577e-05,
+      "loss": 2.1513,
+      "step": 55
+    },
+    {
+      "epoch": 0.014449127031908489,
+      "grad_norm": 3.6243207454681396,
+      "learning_rate": 1.971098265895954e-05,
+      "loss": 2.2788,
+      "step": 60
+    },
+    {
+      "epoch": 0.015653220951234198,
+      "grad_norm": 3.642468214035034,
+      "learning_rate": 1.9686897880539502e-05,
+      "loss": 2.2244,
+      "step": 65
+    },
+    {
+      "epoch": 0.016857314870559904,
+      "grad_norm": 4.0894670486450195,
+      "learning_rate": 1.966281310211946e-05,
+      "loss": 2.3622,
+      "step": 70
+    },
+    {
+      "epoch": 0.018061408789885613,
+      "grad_norm": 4.033819198608398,
+      "learning_rate": 1.9638728323699423e-05,
+      "loss": 2.3803,
+      "step": 75
+    },
+    {
+      "epoch": 0.01926550270921132,
+      "grad_norm": 3.535987138748169,
+      "learning_rate": 1.9614643545279385e-05,
+      "loss": 2.2161,
+      "step": 80
+    },
+    {
+      "epoch": 0.020469596628537028,
+      "grad_norm": 3.541868209838867,
+      "learning_rate": 1.9590558766859348e-05,
+      "loss": 2.1721,
+      "step": 85
+    },
+    {
+      "epoch": 0.021673690547862733,
+      "grad_norm": 4.147072792053223,
+      "learning_rate": 1.9566473988439307e-05,
+      "loss": 2.3239,
+      "step": 90
+    },
+    {
+      "epoch": 0.022877784467188442,
+      "grad_norm": 3.4949986934661865,
+      "learning_rate": 1.954238921001927e-05,
+      "loss": 2.265,
+      "step": 95
+    },
+    {
+      "epoch": 0.024081878386514148,
+      "grad_norm": 3.793950319290161,
+      "learning_rate": 1.951830443159923e-05,
+      "loss": 2.152,
+      "step": 100
+    },
+    {
+      "epoch": 0.025285972305839857,
+      "grad_norm": 3.9355053901672363,
+      "learning_rate": 1.949421965317919e-05,
+      "loss": 2.2534,
+      "step": 105
+    },
+    {
+      "epoch": 0.026490066225165563,
+      "grad_norm": 3.255175828933716,
+      "learning_rate": 1.9470134874759156e-05,
+      "loss": 2.2971,
+      "step": 110
+    },
+    {
+      "epoch": 0.027694160144491272,
+      "grad_norm": 3.650298595428467,
+      "learning_rate": 1.9446050096339115e-05,
+      "loss": 2.1228,
+      "step": 115
+    },
+    {
+      "epoch": 0.028898254063816978,
+      "grad_norm": 3.1906814575195312,
+      "learning_rate": 1.9421965317919077e-05,
+      "loss": 2.0995,
+      "step": 120
+    },
+    {
+      "epoch": 0.030102347983142687,
+      "grad_norm": 3.8122494220733643,
+      "learning_rate": 1.939788053949904e-05,
+      "loss": 2.1651,
+      "step": 125
+    },
+    {
+      "epoch": 0.031306441902468396,
+      "grad_norm": 3.8269336223602295,
+      "learning_rate": 1.9373795761079e-05,
+      "loss": 2.1731,
+      "step": 130
+    },
+    {
+      "epoch": 0.0325105358217941,
+      "grad_norm": 3.75238037109375,
+      "learning_rate": 1.934971098265896e-05,
+      "loss": 2.3071,
+      "step": 135
+    },
+    {
+      "epoch": 0.03371462974111981,
+      "grad_norm": 3.538330078125,
+      "learning_rate": 1.9325626204238923e-05,
+      "loss": 2.3015,
+      "step": 140
+    },
+    {
+      "epoch": 0.034918723660445516,
+      "grad_norm": 3.497131586074829,
+      "learning_rate": 1.9301541425818882e-05,
+      "loss": 2.128,
+      "step": 145
+    },
+    {
+      "epoch": 0.036122817579771226,
+      "grad_norm": 3.6173276901245117,
+      "learning_rate": 1.9277456647398845e-05,
+      "loss": 2.1792,
+      "step": 150
+    },
+    {
+      "epoch": 0.03732691149909693,
+      "grad_norm": 3.2987892627716064,
+      "learning_rate": 1.9253371868978807e-05,
+      "loss": 2.0246,
+      "step": 155
+    },
+    {
+      "epoch": 0.03853100541842264,
+      "grad_norm": 3.1787831783294678,
+      "learning_rate": 1.922928709055877e-05,
+      "loss": 2.2108,
+      "step": 160
+    },
+    {
+      "epoch": 0.039735099337748346,
+      "grad_norm": 3.5422236919403076,
+      "learning_rate": 1.920520231213873e-05,
+      "loss": 2.1738,
+      "step": 165
+    },
+    {
+      "epoch": 0.040939193257074055,
+      "grad_norm": 3.7987539768218994,
+      "learning_rate": 1.918111753371869e-05,
+      "loss": 2.1161,
+      "step": 170
+    },
+    {
+      "epoch": 0.04214328717639976,
+      "grad_norm": 3.2058522701263428,
+      "learning_rate": 1.9157032755298653e-05,
+      "loss": 2.0808,
+      "step": 175
+    },
+    {
+      "epoch": 0.04334738109572547,
+      "grad_norm": 3.00519061088562,
+      "learning_rate": 1.9132947976878615e-05,
+      "loss": 2.1412,
+      "step": 180
+    },
+    {
+      "epoch": 0.044551475015051176,
+      "grad_norm": 3.4471330642700195,
+      "learning_rate": 1.9108863198458578e-05,
+      "loss": 2.1695,
+      "step": 185
+    },
+    {
+      "epoch": 0.045755568934376885,
+      "grad_norm": 3.394496440887451,
+      "learning_rate": 1.9084778420038536e-05,
+      "loss": 1.9532,
+      "step": 190
+    },
+    {
+      "epoch": 0.04695966285370259,
+      "grad_norm": 3.03004789352417,
+      "learning_rate": 1.90606936416185e-05,
+      "loss": 2.0659,
+      "step": 195
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "grad_norm": 3.4260365962982178,
+      "learning_rate": 1.903660886319846e-05,
+      "loss": 2.0792,
+      "step": 200
+    },
+    {
+      "epoch": 0.048163756773028296,
+      "eval_loss": 2.1430513858795166,
+      "eval_runtime": 16.4051,
+      "eval_samples_per_second": 6.096,
+      "eval_steps_per_second": 0.792,
+      "step": 200
+    },
+    {
+      "epoch": 0.049367850692354005,
+      "grad_norm": 4.670680999755859,
+      "learning_rate": 1.901252408477842e-05,
+      "loss": 2.0952,
+      "step": 205
+    },
+    {
+      "epoch": 0.050571944611679714,
+      "grad_norm": 3.510042667388916,
+      "learning_rate": 1.8988439306358382e-05,
+      "loss": 2.195,
+      "step": 210
+    },
+    {
+      "epoch": 0.05177603853100542,
+      "grad_norm": 3.0459847450256348,
+      "learning_rate": 1.8964354527938345e-05,
+      "loss": 2.2117,
+      "step": 215
+    },
+    {
+      "epoch": 0.052980132450331126,
+      "grad_norm": 4.36016321182251,
+      "learning_rate": 1.8940269749518304e-05,
+      "loss": 2.1191,
+      "step": 220
+    },
+    {
+      "epoch": 0.054184226369656835,
+      "grad_norm": 3.0498242378234863,
+      "learning_rate": 1.891618497109827e-05,
+      "loss": 2.0838,
+      "step": 225
+    },
+    {
+      "epoch": 0.055388320288982544,
+      "grad_norm": 3.218038558959961,
+      "learning_rate": 1.889210019267823e-05,
+      "loss": 2.1118,
+      "step": 230
+    },
+    {
+      "epoch": 0.056592414208308246,
+      "grad_norm": 3.3144683837890625,
+      "learning_rate": 1.886801541425819e-05,
+      "loss": 2.2176,
+      "step": 235
+    },
+    {
+      "epoch": 0.057796508127633955,
+      "grad_norm": 3.2364652156829834,
+      "learning_rate": 1.8843930635838153e-05,
+      "loss": 2.112,
+      "step": 240
+    },
+    {
+      "epoch": 0.059000602046959665,
+      "grad_norm": 3.291278839111328,
+      "learning_rate": 1.8819845857418112e-05,
+      "loss": 2.144,
+      "step": 245
+    },
+    {
+      "epoch": 0.060204695966285374,
+      "grad_norm": 3.65297794342041,
+      "learning_rate": 1.8795761078998074e-05,
+      "loss": 2.2597,
+      "step": 250
+    },
+    {
+      "epoch": 0.061408789885611076,
+      "grad_norm": 3.2321982383728027,
+      "learning_rate": 1.8771676300578037e-05,
+      "loss": 2.1618,
+      "step": 255
+    },
+    {
+      "epoch": 0.06261288380493679,
+      "grad_norm": 3.352842330932617,
+      "learning_rate": 1.8747591522158e-05,
+      "loss": 2.006,
+      "step": 260
+    },
+    {
+      "epoch": 0.0638169777242625,
+      "grad_norm": 3.5657215118408203,
+      "learning_rate": 1.8723506743737958e-05,
+      "loss": 2.2253,
+      "step": 265
+    },
+    {
+      "epoch": 0.0650210716435882,
+      "grad_norm": 3.060060739517212,
+      "learning_rate": 1.869942196531792e-05,
+      "loss": 2.1187,
+      "step": 270
+    },
+    {
+      "epoch": 0.06622516556291391,
+      "grad_norm": 3.473719835281372,
+      "learning_rate": 1.8675337186897883e-05,
+      "loss": 2.0299,
+      "step": 275
+    },
+    {
+      "epoch": 0.06742925948223961,
+      "grad_norm": 3.1167919635772705,
+      "learning_rate": 1.8651252408477845e-05,
+      "loss": 2.0381,
+      "step": 280
+    },
+    {
+      "epoch": 0.06863335340156532,
+      "grad_norm": 3.815816640853882,
+      "learning_rate": 1.8627167630057804e-05,
+      "loss": 2.1624,
+      "step": 285
+    },
+    {
+      "epoch": 0.06983744732089103,
+      "grad_norm": 3.2820959091186523,
+      "learning_rate": 1.8603082851637766e-05,
+      "loss": 2.0819,
+      "step": 290
+    },
+    {
+      "epoch": 0.07104154124021674,
+      "grad_norm": 3.568885087966919,
+      "learning_rate": 1.857899807321773e-05,
+      "loss": 2.0749,
+      "step": 295
+    },
+    {
+      "epoch": 0.07224563515954245,
+      "grad_norm": 3.424076795578003,
+      "learning_rate": 1.855491329479769e-05,
+      "loss": 2.129,
+      "step": 300
+    },
+    {
+      "epoch": 0.07344972907886815,
+      "grad_norm": 3.2800493240356445,
+      "learning_rate": 1.853082851637765e-05,
+      "loss": 2.2067,
+      "step": 305
+    },
+    {
+      "epoch": 0.07465382299819386,
+      "grad_norm": 3.487868547439575,
+      "learning_rate": 1.8506743737957612e-05,
+      "loss": 2.124,
+      "step": 310
+    },
+    {
+      "epoch": 0.07585791691751957,
+      "grad_norm": 3.3999245166778564,
+      "learning_rate": 1.8482658959537575e-05,
+      "loss": 1.9888,
+      "step": 315
+    },
+    {
+      "epoch": 0.07706201083684527,
+      "grad_norm": 3.973482370376587,
+      "learning_rate": 1.8458574181117533e-05,
+      "loss": 2.0592,
+      "step": 320
+    },
+    {
+      "epoch": 0.07826610475617098,
+      "grad_norm": 2.9601657390594482,
+      "learning_rate": 1.8434489402697496e-05,
+      "loss": 2.1022,
+      "step": 325
+    },
+    {
+      "epoch": 0.07947019867549669,
+      "grad_norm": 3.260118246078491,
+      "learning_rate": 1.8410404624277458e-05,
+      "loss": 1.9763,
+      "step": 330
+    },
+    {
+      "epoch": 0.0806742925948224,
+      "grad_norm": 3.509838819503784,
+      "learning_rate": 1.838631984585742e-05,
+      "loss": 2.0284,
+      "step": 335
+    },
+    {
+      "epoch": 0.08187838651414811,
+      "grad_norm": 4.363494396209717,
+      "learning_rate": 1.8362235067437383e-05,
+      "loss": 2.0479,
+      "step": 340
+    },
+    {
+      "epoch": 0.08308248043347381,
+      "grad_norm": 3.2578630447387695,
+      "learning_rate": 1.8338150289017342e-05,
+      "loss": 2.0488,
+      "step": 345
+    },
+    {
+      "epoch": 0.08428657435279951,
+      "grad_norm": 3.2846531867980957,
+      "learning_rate": 1.8314065510597304e-05,
+      "loss": 2.0876,
+      "step": 350
+    },
+    {
+      "epoch": 0.08549066827212523,
+      "grad_norm": 3.3275203704833984,
+      "learning_rate": 1.8289980732177266e-05,
+      "loss": 2.0564,
+      "step": 355
+    },
+    {
+      "epoch": 0.08669476219145093,
+      "grad_norm": 3.1368625164031982,
+      "learning_rate": 1.8265895953757225e-05,
+      "loss": 2.1533,
+      "step": 360
+    },
+    {
+      "epoch": 0.08789885611077664,
+      "grad_norm": 3.3824191093444824,
+      "learning_rate": 1.8241811175337188e-05,
+      "loss": 2.1821,
+      "step": 365
+    },
+    {
+      "epoch": 0.08910295003010235,
+      "grad_norm": 3.5150134563446045,
+      "learning_rate": 1.821772639691715e-05,
+      "loss": 2.0292,
+      "step": 370
+    },
+    {
+      "epoch": 0.09030704394942805,
+      "grad_norm": 3.421921730041504,
+      "learning_rate": 1.8193641618497112e-05,
+      "loss": 1.9862,
+      "step": 375
+    },
+    {
+      "epoch": 0.09151113786875377,
+      "grad_norm": 3.616887092590332,
+      "learning_rate": 1.8169556840077075e-05,
+      "loss": 2.0158,
+      "step": 380
+    },
+    {
+      "epoch": 0.09271523178807947,
+      "grad_norm": 5.063056945800781,
+      "learning_rate": 1.8145472061657034e-05,
+      "loss": 2.0579,
+      "step": 385
+    },
+    {
+      "epoch": 0.09391932570740517,
+      "grad_norm": 3.5242559909820557,
+      "learning_rate": 1.8121387283236996e-05,
+      "loss": 2.0272,
+      "step": 390
+    },
+    {
+      "epoch": 0.09512341962673089,
+      "grad_norm": 3.2852962017059326,
+      "learning_rate": 1.809730250481696e-05,
+      "loss": 2.077,
+      "step": 395
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "grad_norm": 3.710927963256836,
+      "learning_rate": 1.8073217726396917e-05,
+      "loss": 2.1271,
+      "step": 400
+    },
+    {
+      "epoch": 0.09632751354605659,
+      "eval_loss": 2.0655810832977295,
+      "eval_runtime": 16.3755,
+      "eval_samples_per_second": 6.107,
+      "eval_steps_per_second": 0.794,
+      "step": 400
+    },
+    {
+      "epoch": 0.0975316074653823,
+      "grad_norm": 3.5019216537475586,
+      "learning_rate": 1.804913294797688e-05,
+      "loss": 2.1081,
+      "step": 405
+    },
+    {
+      "epoch": 0.09873570138470801,
+      "grad_norm": 3.5533690452575684,
+      "learning_rate": 1.8025048169556842e-05,
+      "loss": 2.0751,
+      "step": 410
+    },
+    {
+      "epoch": 0.09993979530403371,
+      "grad_norm": 3.4970240592956543,
+      "learning_rate": 1.8000963391136804e-05,
+      "loss": 2.066,
+      "step": 415
+    },
+    {
+      "epoch": 0.10114388922335943,
+      "grad_norm": 3.0926427841186523,
+      "learning_rate": 1.7976878612716763e-05,
+      "loss": 2.0516,
+      "step": 420
+    },
+    {
+      "epoch": 0.10234798314268513,
+      "grad_norm": 3.747452974319458,
+      "learning_rate": 1.7952793834296726e-05,
+      "loss": 2.0721,
+      "step": 425
+    },
+    {
+      "epoch": 0.10355207706201083,
+      "grad_norm": 3.3113677501678467,
+      "learning_rate": 1.7928709055876688e-05,
+      "loss": 2.1527,
+      "step": 430
+    },
+    {
+      "epoch": 0.10475617098133655,
+      "grad_norm": 3.357912063598633,
+      "learning_rate": 1.7904624277456647e-05,
+      "loss": 2.0113,
+      "step": 435
+    },
+    {
+      "epoch": 0.10596026490066225,
+      "grad_norm": 3.023893356323242,
+      "learning_rate": 1.7880539499036613e-05,
+      "loss": 2.1332,
+      "step": 440
+    },
+    {
+      "epoch": 0.10716435881998795,
+      "grad_norm": 3.3027355670928955,
+      "learning_rate": 1.785645472061657e-05,
+      "loss": 1.9699,
+      "step": 445
+    },
+    {
+      "epoch": 0.10836845273931367,
+      "grad_norm": 5.3524932861328125,
+      "learning_rate": 1.7832369942196534e-05,
+      "loss": 2.0182,
+      "step": 450
+    },
+    {
+      "epoch": 0.10957254665863937,
+      "grad_norm": 3.200258731842041,
+      "learning_rate": 1.7808285163776496e-05,
+      "loss": 2.007,
+      "step": 455
+    },
+    {
+      "epoch": 0.11077664057796509,
+      "grad_norm": 3.286268949508667,
+      "learning_rate": 1.7784200385356455e-05,
+      "loss": 2.0907,
+      "step": 460
+    },
+    {
+      "epoch": 0.11198073449729079,
+      "grad_norm": 3.15291428565979,
+      "learning_rate": 1.7760115606936417e-05,
+      "loss": 2.0468,
+      "step": 465
+    },
+    {
+      "epoch": 0.11318482841661649,
+      "grad_norm": 3.3798069953918457,
+      "learning_rate": 1.773603082851638e-05,
+      "loss": 1.9927,
+      "step": 470
+    },
+    {
+      "epoch": 0.11438892233594221,
+      "grad_norm": 3.4220967292785645,
+      "learning_rate": 1.771194605009634e-05,
+      "loss": 2.1326,
+      "step": 475
+    },
+    {
+      "epoch": 0.11559301625526791,
+      "grad_norm": 3.379628896713257,
+      "learning_rate": 1.76878612716763e-05,
+      "loss": 1.9202,
+      "step": 480
+    },
+    {
+      "epoch": 0.11679711017459361,
+      "grad_norm": 3.3020846843719482,
+      "learning_rate": 1.7663776493256263e-05,
+      "loss": 2.1176,
+      "step": 485
+    },
+    {
+      "epoch": 0.11800120409391933,
+      "grad_norm": 3.2711665630340576,
+      "learning_rate": 1.7639691714836226e-05,
+      "loss": 2.0865,
+      "step": 490
+    },
+    {
+      "epoch": 0.11920529801324503,
+      "grad_norm": 3.239253520965576,
+      "learning_rate": 1.7615606936416188e-05,
+      "loss": 1.9284,
+      "step": 495
+    },
+    {
+      "epoch": 0.12040939193257075,
+      "grad_norm": 3.4960460662841797,
+      "learning_rate": 1.7591522157996147e-05,
+      "loss": 2.0088,
+      "step": 500
+    },
+    {
+      "epoch": 0.12161348585189645,
+      "grad_norm": 3.337407350540161,
+      "learning_rate": 1.756743737957611e-05,
+      "loss": 1.9687,
+      "step": 505
+    },
+    {
+      "epoch": 0.12281757977122215,
+      "grad_norm": 3.534827709197998,
+      "learning_rate": 1.754335260115607e-05,
+      "loss": 2.0273,
+      "step": 510
+    },
+    {
+      "epoch": 0.12402167369054787,
+      "grad_norm": 3.6207938194274902,
+      "learning_rate": 1.7519267822736034e-05,
+      "loss": 2.0458,
+      "step": 515
+    },
+    {
+      "epoch": 0.12522576760987358,
+      "grad_norm": 3.396012544631958,
+      "learning_rate": 1.7495183044315993e-05,
+      "loss": 1.9185,
+      "step": 520
+    },
+    {
+      "epoch": 0.12642986152919927,
+      "grad_norm": 3.001236915588379,
+      "learning_rate": 1.7471098265895955e-05,
+      "loss": 1.9407,
+      "step": 525
+    },
+    {
+      "epoch": 0.127633955448525,
+      "grad_norm": 3.1318376064300537,
+      "learning_rate": 1.7447013487475918e-05,
+      "loss": 1.8984,
+      "step": 530
+    },
+    {
+      "epoch": 0.1288380493678507,
+      "grad_norm": 3.4541585445404053,
+      "learning_rate": 1.7422928709055877e-05,
+      "loss": 1.8846,
+      "step": 535
+    },
+    {
+      "epoch": 0.1300421432871764,
+      "grad_norm": 3.311082363128662,
+      "learning_rate": 1.739884393063584e-05,
+      "loss": 2.0015,
+      "step": 540
+    },
+    {
+      "epoch": 0.1312462372065021,
+      "grad_norm": 3.2366561889648438,
+      "learning_rate": 1.73747591522158e-05,
+      "loss": 2.0176,
+      "step": 545
+    },
+    {
+      "epoch": 0.13245033112582782,
+      "grad_norm": 3.123307943344116,
+      "learning_rate": 1.735067437379576e-05,
+      "loss": 1.9731,
+      "step": 550
+    },
+    {
+      "epoch": 0.1336544250451535,
+      "grad_norm": 3.776921033859253,
+      "learning_rate": 1.7326589595375726e-05,
+      "loss": 2.0484,
+      "step": 555
+    },
+    {
+      "epoch": 0.13485851896447923,
+      "grad_norm": 2.959716796875,
+      "learning_rate": 1.7302504816955685e-05,
+      "loss": 1.9689,
+      "step": 560
+    },
+    {
+      "epoch": 0.13606261288380495,
+      "grad_norm": 3.527384042739868,
+      "learning_rate": 1.7278420038535647e-05,
+      "loss": 1.9488,
+      "step": 565
+    },
+    {
+      "epoch": 0.13726670680313063,
+      "grad_norm": 3.0703189373016357,
+      "learning_rate": 1.725433526011561e-05,
+      "loss": 2.1226,
+      "step": 570
+    },
+    {
+      "epoch": 0.13847080072245635,
+      "grad_norm": 3.1028363704681396,
+      "learning_rate": 1.723025048169557e-05,
+      "loss": 1.8966,
+      "step": 575
+    },
+    {
+      "epoch": 0.13967489464178207,
+      "grad_norm": 3.340517044067383,
+      "learning_rate": 1.720616570327553e-05,
+      "loss": 2.2156,
+      "step": 580
+    },
+    {
+      "epoch": 0.14087898856110775,
+      "grad_norm": 3.2740213871002197,
+      "learning_rate": 1.7182080924855493e-05,
+      "loss": 1.9445,
+      "step": 585
+    },
+    {
+      "epoch": 0.14208308248043347,
+      "grad_norm": 3.240690231323242,
+      "learning_rate": 1.7157996146435455e-05,
+      "loss": 2.0295,
+      "step": 590
+    },
+    {
+      "epoch": 0.1432871763997592,
+      "grad_norm": 3.821340799331665,
+      "learning_rate": 1.7133911368015418e-05,
+      "loss": 2.1401,
+      "step": 595
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "grad_norm": 3.103550910949707,
+      "learning_rate": 1.7109826589595377e-05,
+      "loss": 2.1164,
+      "step": 600
+    },
+    {
+      "epoch": 0.1444912703190849,
+      "eval_loss": 2.017059564590454,
+      "eval_runtime": 16.4324,
+      "eval_samples_per_second": 6.086,
+      "eval_steps_per_second": 0.791,
+      "step": 600
+    },
+    {
+      "epoch": 0.1456953642384106,
+      "grad_norm": 3.4971117973327637,
+      "learning_rate": 1.708574181117534e-05,
+      "loss": 1.9864,
+      "step": 605
+    },
+    {
+      "epoch": 0.1468994581577363,
+      "grad_norm": 3.324803590774536,
+      "learning_rate": 1.70616570327553e-05,
+      "loss": 2.0402,
+      "step": 610
+    },
+    {
+      "epoch": 0.14810355207706202,
+      "grad_norm": 3.302614450454712,
+      "learning_rate": 1.703757225433526e-05,
+      "loss": 1.9494,
+      "step": 615
+    },
+    {
+      "epoch": 0.1493076459963877,
+      "grad_norm": 3.3090734481811523,
+      "learning_rate": 1.7013487475915223e-05,
+      "loss": 2.0748,
+      "step": 620
+    },
+    {
+      "epoch": 0.15051173991571343,
+      "grad_norm": 3.559049129486084,
+      "learning_rate": 1.6989402697495185e-05,
+      "loss": 1.9038,
+      "step": 625
+    },
+    {
+      "epoch": 0.15171583383503914,
+      "grad_norm": 3.5149178504943848,
+      "learning_rate": 1.6965317919075147e-05,
+      "loss": 2.0261,
+      "step": 630
+    },
+    {
+      "epoch": 0.15291992775436483,
+      "grad_norm": 3.835693120956421,
+      "learning_rate": 1.6941233140655106e-05,
+      "loss": 1.9453,
+      "step": 635
+    },
+    {
+      "epoch": 0.15412402167369055,
+      "grad_norm": 3.521132469177246,
+      "learning_rate": 1.691714836223507e-05,
+      "loss": 2.0138,
+      "step": 640
+    },
+    {
+      "epoch": 0.15532811559301626,
+      "grad_norm": 3.2369840145111084,
+      "learning_rate": 1.689306358381503e-05,
+      "loss": 2.0285,
+      "step": 645
+    },
+    {
+      "epoch": 0.15653220951234195,
+      "grad_norm": 3.1592392921447754,
+      "learning_rate": 1.686897880539499e-05,
+      "loss": 1.9912,
+      "step": 650
+    },
+    {
+      "epoch": 0.15773630343166767,
+      "grad_norm": 3.2069106101989746,
+      "learning_rate": 1.6844894026974952e-05,
+      "loss": 2.0159,
+      "step": 655
+    },
+    {
+      "epoch": 0.15894039735099338,
+      "grad_norm": 3.318230390548706,
+      "learning_rate": 1.6820809248554915e-05,
+      "loss": 2.0412,
+      "step": 660
+    },
+    {
+      "epoch": 0.16014449127031907,
+      "grad_norm": 3.549443244934082,
+      "learning_rate": 1.6796724470134877e-05,
+      "loss": 2.0014,
+      "step": 665
+    },
+    {
+      "epoch": 0.1613485851896448,
+      "grad_norm": 3.32999324798584,
+      "learning_rate": 1.677263969171484e-05,
+      "loss": 2.0303,
+      "step": 670
+    },
+    {
+      "epoch": 0.1625526791089705,
+      "grad_norm": 3.262946367263794,
+      "learning_rate": 1.6748554913294798e-05,
+      "loss": 1.9883,
+      "step": 675
+    },
+    {
+      "epoch": 0.16375677302829622,
+      "grad_norm": 3.484685182571411,
+      "learning_rate": 1.672447013487476e-05,
+      "loss": 1.9695,
+      "step": 680
+    },
+    {
+      "epoch": 0.1649608669476219,
+      "grad_norm": 3.4177358150482178,
+      "learning_rate": 1.6700385356454723e-05,
+      "loss": 2.0088,
+      "step": 685
+    },
+    {
+      "epoch": 0.16616496086694763,
+      "grad_norm": 3.447498321533203,
+      "learning_rate": 1.6676300578034682e-05,
+      "loss": 2.0813,
+      "step": 690
+    },
+    {
+      "epoch": 0.16736905478627334,
+      "grad_norm": 3.152740240097046,
+      "learning_rate": 1.6652215799614644e-05,
+      "loss": 1.9988,
+      "step": 695
+    },
+    {
+      "epoch": 0.16857314870559903,
+      "grad_norm": 3.8948824405670166,
+      "learning_rate": 1.6628131021194607e-05,
+      "loss": 2.0801,
+      "step": 700
+    },
+    {
+      "epoch": 0.16977724262492475,
+      "grad_norm": 3.81358003616333,
+      "learning_rate": 1.660404624277457e-05,
+      "loss": 1.944,
+      "step": 705
+    },
+    {
+      "epoch": 0.17098133654425046,
+      "grad_norm": 2.980236053466797,
+      "learning_rate": 1.657996146435453e-05,
+      "loss": 1.9151,
+      "step": 710
+    },
+    {
+      "epoch": 0.17218543046357615,
+      "grad_norm": 3.041680335998535,
+      "learning_rate": 1.655587668593449e-05,
+      "loss": 1.9486,
+      "step": 715
+    },
+    {
+      "epoch": 0.17338952438290187,
+      "grad_norm": 2.898974657058716,
+      "learning_rate": 1.6531791907514452e-05,
+      "loss": 2.1119,
+      "step": 720
+    },
+    {
+      "epoch": 0.17459361830222758,
+      "grad_norm": 3.161224603652954,
+      "learning_rate": 1.6507707129094415e-05,
+      "loss": 2.036,
+      "step": 725
+    },
+    {
+      "epoch": 0.17579771222155327,
+      "grad_norm": 3.2449426651000977,
+      "learning_rate": 1.6483622350674374e-05,
+      "loss": 2.0635,
+      "step": 730
+    },
+    {
+      "epoch": 0.177001806140879,
+      "grad_norm": 3.2805328369140625,
+      "learning_rate": 1.6459537572254336e-05,
+      "loss": 1.8022,
+      "step": 735
+    },
+    {
+      "epoch": 0.1782059000602047,
+      "grad_norm": 3.491149663925171,
+      "learning_rate": 1.64354527938343e-05,
+      "loss": 1.9832,
+      "step": 740
+    },
+    {
+      "epoch": 0.1794099939795304,
+      "grad_norm": 3.423267126083374,
+      "learning_rate": 1.641136801541426e-05,
+      "loss": 1.9574,
+      "step": 745
+    },
+    {
+      "epoch": 0.1806140878988561,
+      "grad_norm": 3.1914217472076416,
+      "learning_rate": 1.638728323699422e-05,
+      "loss": 1.9283,
+      "step": 750
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 3.2903149127960205,
+      "learning_rate": 1.6363198458574182e-05,
+      "loss": 2.1174,
+      "step": 755
+    },
+    {
+      "epoch": 0.18302227573750754,
+      "grad_norm": 3.113159656524658,
+      "learning_rate": 1.6339113680154144e-05,
+      "loss": 1.8794,
+      "step": 760
+    },
+    {
+      "epoch": 0.18422636965683323,
+      "grad_norm": 3.261596918106079,
+      "learning_rate": 1.6315028901734103e-05,
+      "loss": 2.0853,
+      "step": 765
+    },
+    {
+      "epoch": 0.18543046357615894,
+      "grad_norm": 2.9525296688079834,
+      "learning_rate": 1.629094412331407e-05,
+      "loss": 1.8622,
+      "step": 770
+    },
+    {
+      "epoch": 0.18663455749548466,
+      "grad_norm": 3.2103638648986816,
+      "learning_rate": 1.6266859344894028e-05,
+      "loss": 2.0536,
+      "step": 775
+    },
+    {
+      "epoch": 0.18783865141481035,
+      "grad_norm": 3.5312676429748535,
+      "learning_rate": 1.624277456647399e-05,
+      "loss": 1.9387,
+      "step": 780
+    },
+    {
+      "epoch": 0.18904274533413606,
+      "grad_norm": 3.277223825454712,
+      "learning_rate": 1.6218689788053953e-05,
+      "loss": 1.9804,
+      "step": 785
+    },
+    {
+      "epoch": 0.19024683925346178,
+      "grad_norm": 3.207287549972534,
+      "learning_rate": 1.619460500963391e-05,
+      "loss": 1.8782,
+      "step": 790
+    },
+    {
+      "epoch": 0.19145093317278747,
+      "grad_norm": 3.401834487915039,
+      "learning_rate": 1.6170520231213874e-05,
+      "loss": 1.9383,
+      "step": 795
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "grad_norm": 3.5186078548431396,
+      "learning_rate": 1.6146435452793836e-05,
+      "loss": 2.0963,
+      "step": 800
+    },
+    {
+      "epoch": 0.19265502709211318,
+      "eval_loss": 1.979533076286316,
+      "eval_runtime": 16.4348,
+      "eval_samples_per_second": 6.085,
+      "eval_steps_per_second": 0.791,
+      "step": 800
+    },
+    {
+      "epoch": 0.1938591210114389,
+      "grad_norm": 3.0080161094665527,
+      "learning_rate": 1.6122350674373795e-05,
+      "loss": 1.951,
+      "step": 805
+    },
+    {
+      "epoch": 0.1950632149307646,
+      "grad_norm": 3.124155044555664,
+      "learning_rate": 1.6098265895953758e-05,
+      "loss": 1.8663,
+      "step": 810
+    },
+    {
+      "epoch": 0.1962673088500903,
+      "grad_norm": 3.6262383460998535,
+      "learning_rate": 1.607418111753372e-05,
+      "loss": 1.9478,
+      "step": 815
+    },
+    {
+      "epoch": 0.19747140276941602,
+      "grad_norm": 3.3047947883605957,
+      "learning_rate": 1.6050096339113682e-05,
+      "loss": 1.9203,
+      "step": 820
+    },
+    {
+      "epoch": 0.1986754966887417,
+      "grad_norm": 3.0261447429656982,
+      "learning_rate": 1.6026011560693645e-05,
+      "loss": 1.8988,
+      "step": 825
+    },
+    {
+      "epoch": 0.19987959060806743,
+      "grad_norm": 4.233884334564209,
+      "learning_rate": 1.6001926782273604e-05,
+      "loss": 2.0327,
+      "step": 830
+    },
+    {
+      "epoch": 0.20108368452739314,
+      "grad_norm": 2.9169118404388428,
+      "learning_rate": 1.5977842003853566e-05,
+      "loss": 1.9264,
+      "step": 835
+    },
+    {
+      "epoch": 0.20228777844671886,
+      "grad_norm": 3.0078282356262207,
+      "learning_rate": 1.5953757225433528e-05,
+      "loss": 1.8821,
+      "step": 840
+    },
+    {
+      "epoch": 0.20349187236604455,
+      "grad_norm": 3.4188835620880127,
+      "learning_rate": 1.592967244701349e-05,
+      "loss": 1.933,
+      "step": 845
+    },
+    {
+      "epoch": 0.20469596628537026,
+      "grad_norm": 4.739987850189209,
+      "learning_rate": 1.590558766859345e-05,
+      "loss": 1.9182,
+      "step": 850
+    },
+    {
+      "epoch": 0.20590006020469598,
+      "grad_norm": 3.1810977458953857,
+      "learning_rate": 1.5881502890173412e-05,
+      "loss": 1.984,
+      "step": 855
+    },
+    {
+      "epoch": 0.20710415412402167,
+      "grad_norm": 3.174739360809326,
+      "learning_rate": 1.5857418111753374e-05,
+      "loss": 1.7719,
+      "step": 860
+    },
+    {
+      "epoch": 0.20830824804334738,
+      "grad_norm": 3.379767656326294,
+      "learning_rate": 1.5833333333333333e-05,
+      "loss": 1.9481,
+      "step": 865
+    },
+    {
+      "epoch": 0.2095123419626731,
+      "grad_norm": 3.3487260341644287,
+      "learning_rate": 1.5809248554913295e-05,
+      "loss": 1.9416,
+      "step": 870
+    },
+    {
+      "epoch": 0.2107164358819988,
+      "grad_norm": 3.4879958629608154,
+      "learning_rate": 1.5785163776493258e-05,
+      "loss": 2.0463,
+      "step": 875
+    },
+    {
+      "epoch": 0.2119205298013245,
+      "grad_norm": 3.2338194847106934,
+      "learning_rate": 1.5761078998073217e-05,
+      "loss": 1.9441,
+      "step": 880
+    },
+    {
+      "epoch": 0.21312462372065022,
+      "grad_norm": 3.122405529022217,
+      "learning_rate": 1.5736994219653182e-05,
+      "loss": 2.0644,
+      "step": 885
+    },
+    {
+      "epoch": 0.2143287176399759,
+      "grad_norm": 3.0773510932922363,
+      "learning_rate": 1.571290944123314e-05,
+      "loss": 1.9415,
+      "step": 890
+    },
+    {
+      "epoch": 0.21553281155930162,
+      "grad_norm": 3.004040241241455,
+      "learning_rate": 1.5688824662813104e-05,
+      "loss": 1.9305,
+      "step": 895
+    },
+    {
+      "epoch": 0.21673690547862734,
+      "grad_norm": 3.547109603881836,
+      "learning_rate": 1.5664739884393066e-05,
+      "loss": 2.1088,
+      "step": 900
+    },
+    {
+      "epoch": 0.21794099939795303,
+      "grad_norm": 3.1982204914093018,
+      "learning_rate": 1.5640655105973025e-05,
+      "loss": 1.8667,
+      "step": 905
+    },
+    {
+      "epoch": 0.21914509331727874,
+      "grad_norm": 3.381781578063965,
+      "learning_rate": 1.5616570327552987e-05,
+      "loss": 1.9783,
+      "step": 910
+    },
+    {
+      "epoch": 0.22034918723660446,
+      "grad_norm": 2.9775896072387695,
+      "learning_rate": 1.559248554913295e-05,
+      "loss": 2.0211,
+      "step": 915
+    },
+    {
+      "epoch": 0.22155328115593018,
+      "grad_norm": 2.864551067352295,
+      "learning_rate": 1.5568400770712912e-05,
+      "loss": 1.8579,
+      "step": 920
+    },
+    {
+      "epoch": 0.22275737507525586,
+      "grad_norm": 3.0532050132751465,
+      "learning_rate": 1.5544315992292874e-05,
+      "loss": 1.9398,
+      "step": 925
+    },
+    {
+      "epoch": 0.22396146899458158,
+      "grad_norm": 2.859631061553955,
+      "learning_rate": 1.5520231213872833e-05,
+      "loss": 1.8625,
+      "step": 930
+    },
+    {
+      "epoch": 0.2251655629139073,
+      "grad_norm": 3.1373536586761475,
+      "learning_rate": 1.5496146435452796e-05,
+      "loss": 1.9003,
+      "step": 935
+    },
+    {
+      "epoch": 0.22636965683323299,
+      "grad_norm": 3.3248465061187744,
+      "learning_rate": 1.5472061657032758e-05,
+      "loss": 2.0517,
+      "step": 940
+    },
+    {
+      "epoch": 0.2275737507525587,
+      "grad_norm": 3.5990936756134033,
+      "learning_rate": 1.5447976878612717e-05,
+      "loss": 1.8975,
+      "step": 945
+    },
+    {
+      "epoch": 0.22877784467188442,
+      "grad_norm": 3.4047725200653076,
+      "learning_rate": 1.542389210019268e-05,
+      "loss": 1.9786,
+      "step": 950
+    },
+    {
+      "epoch": 0.2299819385912101,
+      "grad_norm": 3.3326022624969482,
+      "learning_rate": 1.539980732177264e-05,
+      "loss": 1.7848,
+      "step": 955
+    },
+    {
+      "epoch": 0.23118603251053582,
+      "grad_norm": 3.2942848205566406,
+      "learning_rate": 1.5375722543352604e-05,
+      "loss": 1.8549,
+      "step": 960
+    },
+    {
+      "epoch": 0.23239012642986154,
+      "grad_norm": 3.4602601528167725,
+      "learning_rate": 1.5351637764932563e-05,
+      "loss": 1.917,
+      "step": 965
+    },
+    {
+      "epoch": 0.23359422034918723,
+      "grad_norm": 3.591327428817749,
+      "learning_rate": 1.5327552986512525e-05,
+      "loss": 1.9457,
+      "step": 970
+    },
+    {
+      "epoch": 0.23479831426851294,
+      "grad_norm": 3.215808868408203,
+      "learning_rate": 1.5303468208092487e-05,
+      "loss": 1.9261,
+      "step": 975
+    },
+    {
+      "epoch": 0.23600240818783866,
+      "grad_norm": 3.5032927989959717,
+      "learning_rate": 1.5279383429672446e-05,
+      "loss": 1.967,
+      "step": 980
+    },
+    {
+      "epoch": 0.23720650210716435,
+      "grad_norm": 3.1476144790649414,
+      "learning_rate": 1.525529865125241e-05,
+      "loss": 2.0448,
+      "step": 985
+    },
+    {
+      "epoch": 0.23841059602649006,
+      "grad_norm": 3.046126365661621,
+      "learning_rate": 1.5231213872832371e-05,
+      "loss": 1.9086,
+      "step": 990
+    },
+    {
+      "epoch": 0.23961468994581578,
+      "grad_norm": 3.0403099060058594,
+      "learning_rate": 1.5207129094412332e-05,
+      "loss": 1.9671,
+      "step": 995
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "grad_norm": 3.524573802947998,
+      "learning_rate": 1.5183044315992294e-05,
+      "loss": 1.9532,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2408187838651415,
+      "eval_loss": 1.9571956396102905,
+      "eval_runtime": 16.4441,
+      "eval_samples_per_second": 6.081,
+      "eval_steps_per_second": 0.791,
+      "step": 1000
+    },
+    {
+      "epoch": 0.24202287778446718,
+      "grad_norm": 3.518422842025757,
+      "learning_rate": 1.5158959537572255e-05,
+      "loss": 1.9851,
+      "step": 1005
+    },
+    {
+      "epoch": 0.2432269717037929,
+      "grad_norm": 3.0476677417755127,
+      "learning_rate": 1.5134874759152215e-05,
+      "loss": 2.0102,
+      "step": 1010
+    },
+    {
+      "epoch": 0.24443106562311862,
+      "grad_norm": 2.7093968391418457,
+      "learning_rate": 1.511078998073218e-05,
+      "loss": 1.8704,
+      "step": 1015
+    },
+    {
+      "epoch": 0.2456351595424443,
+      "grad_norm": 2.8607966899871826,
+      "learning_rate": 1.508670520231214e-05,
+      "loss": 1.9948,
+      "step": 1020
+    },
+    {
+      "epoch": 0.24683925346177002,
+      "grad_norm": 3.224783182144165,
+      "learning_rate": 1.50626204238921e-05,
+      "loss": 1.9097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.24804334738109574,
+      "grad_norm": 3.199155807495117,
+      "learning_rate": 1.5038535645472063e-05,
+      "loss": 2.0205,
+      "step": 1030
+    },
+    {
+      "epoch": 0.24924744130042142,
+      "grad_norm": 3.12842059135437,
+      "learning_rate": 1.5014450867052024e-05,
+      "loss": 1.8666,
+      "step": 1035
+    },
+    {
+      "epoch": 0.25045153521974717,
+      "grad_norm": 3.7331671714782715,
+      "learning_rate": 1.4990366088631986e-05,
+      "loss": 1.913,
+      "step": 1040
+    },
+    {
+      "epoch": 0.25165562913907286,
+      "grad_norm": 3.0594935417175293,
+      "learning_rate": 1.4966281310211948e-05,
+      "loss": 2.0016,
+      "step": 1045
+    },
+    {
+      "epoch": 0.25285972305839854,
+      "grad_norm": 3.166200637817383,
+      "learning_rate": 1.4942196531791909e-05,
+      "loss": 1.9481,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2540638169777243,
+      "grad_norm": 3.08423113822937,
+      "learning_rate": 1.4918111753371871e-05,
+      "loss": 1.9168,
+      "step": 1055
+    },
+    {
+      "epoch": 0.25526791089705,
+      "grad_norm": 3.7309155464172363,
+      "learning_rate": 1.4894026974951832e-05,
+      "loss": 1.9698,
+      "step": 1060
+    },
+    {
+      "epoch": 0.25647200481637566,
+      "grad_norm": 3.5642640590667725,
+      "learning_rate": 1.4869942196531793e-05,
+      "loss": 2.0914,
+      "step": 1065
+    },
+    {
+      "epoch": 0.2576760987357014,
+      "grad_norm": 3.201970338821411,
+      "learning_rate": 1.4845857418111755e-05,
+      "loss": 1.9358,
+      "step": 1070
+    },
+    {
+      "epoch": 0.2588801926550271,
+      "grad_norm": 3.19600510597229,
+      "learning_rate": 1.4821772639691716e-05,
+      "loss": 1.9028,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2600842865743528,
+      "grad_norm": 2.8048558235168457,
+      "learning_rate": 1.4797687861271676e-05,
+      "loss": 1.9104,
+      "step": 1080
+    },
+    {
+      "epoch": 0.26128838049367853,
+      "grad_norm": 3.3447604179382324,
+      "learning_rate": 1.477360308285164e-05,
+      "loss": 1.9683,
+      "step": 1085
+    },
+    {
+      "epoch": 0.2624924744130042,
+      "grad_norm": 3.2456166744232178,
+      "learning_rate": 1.4749518304431601e-05,
+      "loss": 1.9469,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2636965683323299,
+      "grad_norm": 4.318637847900391,
+      "learning_rate": 1.4725433526011562e-05,
+      "loss": 1.9199,
+      "step": 1095
+    },
+    {
+      "epoch": 0.26490066225165565,
+      "grad_norm": 3.258910655975342,
+      "learning_rate": 1.4701348747591524e-05,
+      "loss": 1.9996,
+      "step": 1100
+    },
+    {
+      "epoch": 0.26610475617098134,
+      "grad_norm": 3.2944164276123047,
+      "learning_rate": 1.4677263969171484e-05,
+      "loss": 1.9373,
+      "step": 1105
+    },
+    {
+      "epoch": 0.267308850090307,
+      "grad_norm": 2.938908815383911,
+      "learning_rate": 1.4653179190751445e-05,
+      "loss": 1.8301,
+      "step": 1110
+    },
+    {
+      "epoch": 0.26851294400963277,
+      "grad_norm": 3.38433575630188,
+      "learning_rate": 1.4629094412331407e-05,
+      "loss": 2.0615,
+      "step": 1115
+    },
+    {
+      "epoch": 0.26971703792895846,
+      "grad_norm": 2.9752516746520996,
+      "learning_rate": 1.4605009633911368e-05,
+      "loss": 2.0132,
+      "step": 1120
+    },
+    {
+      "epoch": 0.27092113184828415,
+      "grad_norm": 2.9489858150482178,
+      "learning_rate": 1.458092485549133e-05,
+      "loss": 1.9319,
+      "step": 1125
+    },
+    {
+      "epoch": 0.2721252257676099,
+      "grad_norm": 3.231663465499878,
+      "learning_rate": 1.4556840077071293e-05,
+      "loss": 1.8924,
+      "step": 1130
+    },
+    {
+      "epoch": 0.2733293196869356,
+      "grad_norm": 3.2028298377990723,
+      "learning_rate": 1.4532755298651253e-05,
+      "loss": 1.9836,
+      "step": 1135
+    },
+    {
+      "epoch": 0.27453341360626127,
+      "grad_norm": 3.348498582839966,
+      "learning_rate": 1.4508670520231216e-05,
+      "loss": 2.0908,
+      "step": 1140
+    },
+    {
+      "epoch": 0.275737507525587,
+      "grad_norm": 3.1415324211120605,
+      "learning_rate": 1.4484585741811176e-05,
+      "loss": 2.043,
+      "step": 1145
+    },
+    {
+      "epoch": 0.2769416014449127,
+      "grad_norm": 2.9878766536712646,
+      "learning_rate": 1.4460500963391137e-05,
+      "loss": 1.9536,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2781456953642384,
+      "grad_norm": 3.9880380630493164,
+      "learning_rate": 1.4436416184971101e-05,
+      "loss": 1.7441,
+      "step": 1155
+    },
+    {
+      "epoch": 0.27934978928356413,
+      "grad_norm": 3.2457761764526367,
+      "learning_rate": 1.4412331406551062e-05,
+      "loss": 2.0117,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2805538832028898,
+      "grad_norm": 2.9482262134552,
+      "learning_rate": 1.4388246628131022e-05,
+      "loss": 1.9502,
+      "step": 1165
+    },
+    {
+      "epoch": 0.2817579771222155,
+      "grad_norm": 3.3882107734680176,
+      "learning_rate": 1.4364161849710985e-05,
+      "loss": 1.9861,
+      "step": 1170
+    },
+    {
+      "epoch": 0.28296207104154125,
+      "grad_norm": 3.379577398300171,
+      "learning_rate": 1.4340077071290945e-05,
+      "loss": 2.0115,
+      "step": 1175
+    },
+    {
+      "epoch": 0.28416616496086694,
+      "grad_norm": 3.02996826171875,
+      "learning_rate": 1.4315992292870906e-05,
+      "loss": 1.9103,
+      "step": 1180
+    },
+    {
+      "epoch": 0.28537025888019263,
+      "grad_norm": 2.9293081760406494,
+      "learning_rate": 1.4291907514450868e-05,
+      "loss": 1.9432,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2865743527995184,
+      "grad_norm": 3.3768506050109863,
+      "learning_rate": 1.4267822736030829e-05,
+      "loss": 1.8372,
+      "step": 1190
+    },
+    {
+      "epoch": 0.28777844671884406,
+      "grad_norm": 3.0163660049438477,
+      "learning_rate": 1.424373795761079e-05,
+      "loss": 1.7784,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2889825406381698,
+      "grad_norm": 3.591684103012085,
+      "learning_rate": 1.4219653179190754e-05,
+      "loss": 1.9951,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2889825406381698,
+      "eval_loss": 1.9316246509552002,
+      "eval_runtime": 16.3758,
+      "eval_samples_per_second": 6.107,
+      "eval_steps_per_second": 0.794,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2901866345574955,
+      "grad_norm": 3.3553686141967773,
+      "learning_rate": 1.4195568400770714e-05,
+      "loss": 1.8822,
+      "step": 1205
+    },
+    {
+      "epoch": 0.2913907284768212,
+      "grad_norm": 3.1767303943634033,
+      "learning_rate": 1.4171483622350675e-05,
+      "loss": 1.9386,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2925948223961469,
+      "grad_norm": 3.6845219135284424,
+      "learning_rate": 1.4147398843930637e-05,
+      "loss": 1.8512,
+      "step": 1215
+    },
+    {
+      "epoch": 0.2937989163154726,
+      "grad_norm": 3.189955949783325,
+      "learning_rate": 1.4123314065510598e-05,
+      "loss": 1.9804,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2950030102347983,
+      "grad_norm": 3.7629036903381348,
+      "learning_rate": 1.4099229287090558e-05,
+      "loss": 1.9759,
+      "step": 1225
+    },
+    {
+      "epoch": 0.29620710415412405,
+      "grad_norm": 3.6975581645965576,
+      "learning_rate": 1.4075144508670523e-05,
+      "loss": 2.0106,
+      "step": 1230
+    },
+    {
+      "epoch": 0.29741119807344973,
+      "grad_norm": 3.1638119220733643,
+      "learning_rate": 1.4051059730250483e-05,
+      "loss": 1.9846,
+      "step": 1235
+    },
+    {
+      "epoch": 0.2986152919927754,
+      "grad_norm": 3.2830159664154053,
+      "learning_rate": 1.4026974951830444e-05,
+      "loss": 2.0757,
+      "step": 1240
+    },
+    {
+      "epoch": 0.29981938591210117,
+      "grad_norm": 3.1656594276428223,
+      "learning_rate": 1.4002890173410406e-05,
+      "loss": 1.9831,
+      "step": 1245
+    },
+    {
+      "epoch": 0.30102347983142685,
+      "grad_norm": 3.056269407272339,
+      "learning_rate": 1.3978805394990367e-05,
+      "loss": 1.897,
+      "step": 1250
+    },
+    {
+      "epoch": 0.30222757375075254,
+      "grad_norm": 3.1246907711029053,
+      "learning_rate": 1.3954720616570329e-05,
+      "loss": 1.9663,
+      "step": 1255
+    },
+    {
+      "epoch": 0.3034316676700783,
+      "grad_norm": 2.964416980743408,
+      "learning_rate": 1.393063583815029e-05,
+      "loss": 2.0476,
+      "step": 1260
+    },
+    {
+      "epoch": 0.304635761589404,
+      "grad_norm": 3.241818904876709,
+      "learning_rate": 1.390655105973025e-05,
+      "loss": 2.0771,
+      "step": 1265
+    },
+    {
+      "epoch": 0.30583985550872966,
+      "grad_norm": 3.29571795463562,
+      "learning_rate": 1.3882466281310214e-05,
+      "loss": 2.0502,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3070439494280554,
+      "grad_norm": 3.6429920196533203,
+      "learning_rate": 1.3858381502890175e-05,
+      "loss": 2.0583,
+      "step": 1275
+    },
+    {
+      "epoch": 0.3082480433473811,
+      "grad_norm": 3.16072416305542,
+      "learning_rate": 1.3834296724470136e-05,
+      "loss": 1.8808,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3094521372667068,
+      "grad_norm": 3.2263102531433105,
+      "learning_rate": 1.3810211946050098e-05,
+      "loss": 1.8824,
+      "step": 1285
+    },
+    {
+      "epoch": 0.3106562311860325,
+      "grad_norm": 3.2608978748321533,
+      "learning_rate": 1.3786127167630059e-05,
+      "loss": 1.8209,
+      "step": 1290
+    },
+    {
+      "epoch": 0.3118603251053582,
+      "grad_norm": 3.1739134788513184,
+      "learning_rate": 1.376204238921002e-05,
+      "loss": 2.0042,
+      "step": 1295
+    },
+    {
+      "epoch": 0.3130644190246839,
+      "grad_norm": 3.8513424396514893,
+      "learning_rate": 1.3737957610789983e-05,
+      "loss": 1.9969,
+      "step": 1300
+    },
+    {
+      "epoch": 0.31426851294400965,
+      "grad_norm": 2.987257957458496,
+      "learning_rate": 1.3713872832369944e-05,
+      "loss": 1.8387,
+      "step": 1305
+    },
+    {
+      "epoch": 0.31547260686333534,
+      "grad_norm": 3.1317367553710938,
+      "learning_rate": 1.3689788053949905e-05,
+      "loss": 2.0304,
+      "step": 1310
+    },
+    {
+      "epoch": 0.316676700782661,
+      "grad_norm": 3.459153413772583,
+      "learning_rate": 1.3665703275529867e-05,
+      "loss": 1.977,
+      "step": 1315
+    },
+    {
+      "epoch": 0.31788079470198677,
+      "grad_norm": 3.1578195095062256,
+      "learning_rate": 1.3641618497109828e-05,
+      "loss": 1.8042,
+      "step": 1320
+    },
+    {
+      "epoch": 0.31908488862131246,
+      "grad_norm": 3.375023126602173,
+      "learning_rate": 1.3617533718689788e-05,
+      "loss": 1.8485,
+      "step": 1325
+    },
+    {
+      "epoch": 0.32028898254063815,
+      "grad_norm": 3.1867401599884033,
+      "learning_rate": 1.359344894026975e-05,
+      "loss": 1.9303,
+      "step": 1330
+    },
+    {
+      "epoch": 0.3214930764599639,
+      "grad_norm": 3.303433418273926,
+      "learning_rate": 1.3569364161849711e-05,
+      "loss": 1.8925,
+      "step": 1335
+    },
+    {
+      "epoch": 0.3226971703792896,
+      "grad_norm": 2.990525960922241,
+      "learning_rate": 1.3545279383429672e-05,
+      "loss": 1.8612,
+      "step": 1340
+    },
+    {
+      "epoch": 0.32390126429861527,
+      "grad_norm": 3.570690155029297,
+      "learning_rate": 1.3521194605009636e-05,
+      "loss": 1.9175,
+      "step": 1345
+    },
+    {
+      "epoch": 0.325105358217941,
+      "grad_norm": 3.3581953048706055,
+      "learning_rate": 1.3497109826589597e-05,
+      "loss": 2.0293,
+      "step": 1350
+    },
+    {
+      "epoch": 0.3263094521372667,
+      "grad_norm": 3.111626148223877,
+      "learning_rate": 1.3473025048169559e-05,
+      "loss": 1.9314,
+      "step": 1355
+    },
+    {
+      "epoch": 0.32751354605659244,
+      "grad_norm": 2.8977465629577637,
+      "learning_rate": 1.344894026974952e-05,
+      "loss": 1.9008,
+      "step": 1360
+    },
+    {
+      "epoch": 0.32871763997591813,
+      "grad_norm": 3.2835822105407715,
+      "learning_rate": 1.342485549132948e-05,
+      "loss": 1.8344,
+      "step": 1365
+    },
+    {
+      "epoch": 0.3299217338952438,
+      "grad_norm": 3.041848659515381,
+      "learning_rate": 1.3400770712909442e-05,
+      "loss": 2.0046,
+      "step": 1370
+    },
+    {
+      "epoch": 0.33112582781456956,
+      "grad_norm": 3.252289295196533,
+      "learning_rate": 1.3376685934489405e-05,
+      "loss": 2.015,
+      "step": 1375
+    },
+    {
+      "epoch": 0.33232992173389525,
+      "grad_norm": 3.0157580375671387,
+      "learning_rate": 1.3352601156069365e-05,
+      "loss": 1.7938,
+      "step": 1380
+    },
+    {
+      "epoch": 0.33353401565322094,
+      "grad_norm": 3.2034637928009033,
+      "learning_rate": 1.3328516377649328e-05,
+      "loss": 1.7438,
+      "step": 1385
+    },
+    {
+      "epoch": 0.3347381095725467,
+      "grad_norm": 2.962069034576416,
+      "learning_rate": 1.3304431599229288e-05,
+      "loss": 1.8915,
+      "step": 1390
+    },
+    {
+      "epoch": 0.33594220349187237,
+      "grad_norm": 3.4406933784484863,
+      "learning_rate": 1.3280346820809249e-05,
+      "loss": 1.7684,
+      "step": 1395
+    },
+    {
+      "epoch": 0.33714629741119806,
+      "grad_norm": 3.531928062438965,
+      "learning_rate": 1.3256262042389211e-05,
+      "loss": 1.8796,
+      "step": 1400
+    },
+    {
+      "epoch": 0.33714629741119806,
+      "eval_loss": 1.9124047756195068,
+      "eval_runtime": 16.4823,
+      "eval_samples_per_second": 6.067,
+      "eval_steps_per_second": 0.789,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3383503913305238,
+      "grad_norm": 3.052105665206909,
+      "learning_rate": 1.3232177263969172e-05,
+      "loss": 1.8796,
+      "step": 1405
+    },
+    {
+      "epoch": 0.3395544852498495,
+      "grad_norm": 3.317544460296631,
+      "learning_rate": 1.3208092485549133e-05,
+      "loss": 1.975,
+      "step": 1410
+    },
+    {
+      "epoch": 0.3407585791691752,
+      "grad_norm": 3.8019814491271973,
+      "learning_rate": 1.3184007707129097e-05,
+      "loss": 1.8014,
+      "step": 1415
+    },
+    {
+      "epoch": 0.3419626730885009,
+      "grad_norm": 3.2202208042144775,
+      "learning_rate": 1.3159922928709057e-05,
+      "loss": 2.0098,
+      "step": 1420
+    },
+    {
+      "epoch": 0.3431667670078266,
+      "grad_norm": 3.2262189388275146,
+      "learning_rate": 1.3135838150289018e-05,
+      "loss": 1.758,
+      "step": 1425
+    },
+    {
+      "epoch": 0.3443708609271523,
+      "grad_norm": 3.03486704826355,
+      "learning_rate": 1.311175337186898e-05,
+      "loss": 1.9254,
+      "step": 1430
+    },
+    {
+      "epoch": 0.34557495484647804,
+      "grad_norm": 3.063431978225708,
+      "learning_rate": 1.3087668593448941e-05,
+      "loss": 1.9091,
+      "step": 1435
+    },
+    {
+      "epoch": 0.34677904876580373,
+      "grad_norm": 3.082839012145996,
+      "learning_rate": 1.3063583815028902e-05,
+      "loss": 1.951,
+      "step": 1440
+    },
+    {
+      "epoch": 0.3479831426851294,
+      "grad_norm": 3.692833662033081,
+      "learning_rate": 1.3039499036608864e-05,
+      "loss": 1.9771,
+      "step": 1445
+    },
+    {
+      "epoch": 0.34918723660445516,
+      "grad_norm": 3.4902002811431885,
+      "learning_rate": 1.3015414258188825e-05,
+      "loss": 1.8954,
+      "step": 1450
+    },
+    {
+      "epoch": 0.35039133052378085,
+      "grad_norm": 2.9592397212982178,
+      "learning_rate": 1.2991329479768787e-05,
+      "loss": 1.8791,
+      "step": 1455
+    },
+    {
+      "epoch": 0.35159542444310654,
+      "grad_norm": 3.3300886154174805,
+      "learning_rate": 1.296724470134875e-05,
+      "loss": 1.9712,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3527995183624323,
+      "grad_norm": 3.206303834915161,
+      "learning_rate": 1.294315992292871e-05,
+      "loss": 1.8155,
+      "step": 1465
+    },
+    {
+      "epoch": 0.354003612281758,
+      "grad_norm": 3.2186410427093506,
+      "learning_rate": 1.2919075144508672e-05,
+      "loss": 1.8583,
+      "step": 1470
+    },
+    {
+      "epoch": 0.35520770620108366,
+      "grad_norm": 3.482147216796875,
+      "learning_rate": 1.2894990366088633e-05,
+      "loss": 1.8233,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3564118001204094,
+      "grad_norm": 3.317288637161255,
+      "learning_rate": 1.2870905587668594e-05,
+      "loss": 2.0094,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3576158940397351,
+      "grad_norm": 2.962334632873535,
+      "learning_rate": 1.2846820809248558e-05,
+      "loss": 2.0041,
+      "step": 1485
+    },
+    {
+      "epoch": 0.3588199879590608,
+      "grad_norm": 3.1497514247894287,
+      "learning_rate": 1.2822736030828518e-05,
+      "loss": 1.8808,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3600240818783865,
+      "grad_norm": 3.3814947605133057,
+      "learning_rate": 1.2798651252408479e-05,
+      "loss": 1.8085,
+      "step": 1495
+    },
+    {
+      "epoch": 0.3612281757977122,
+      "grad_norm": 3.426969051361084,
+      "learning_rate": 1.2774566473988441e-05,
+      "loss": 1.8659,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3624322697170379,
+      "grad_norm": 2.9899697303771973,
+      "learning_rate": 1.2750481695568402e-05,
+      "loss": 1.8327,
+      "step": 1505
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 3.3430888652801514,
+      "learning_rate": 1.2726396917148362e-05,
+      "loss": 1.8923,
+      "step": 1510
+    },
+    {
+      "epoch": 0.36484045755568933,
+      "grad_norm": 3.001516342163086,
+      "learning_rate": 1.2702312138728325e-05,
+      "loss": 1.9374,
+      "step": 1515
+    },
+    {
+      "epoch": 0.3660445514750151,
+      "grad_norm": 3.1876020431518555,
+      "learning_rate": 1.2678227360308285e-05,
+      "loss": 1.9921,
+      "step": 1520
+    },
+    {
+      "epoch": 0.36724864539434077,
+      "grad_norm": 3.2372751235961914,
+      "learning_rate": 1.2654142581888246e-05,
+      "loss": 1.7351,
+      "step": 1525
+    },
+    {
+      "epoch": 0.36845273931366646,
+      "grad_norm": 3.10359525680542,
+      "learning_rate": 1.263005780346821e-05,
+      "loss": 1.9056,
+      "step": 1530
+    },
+    {
+      "epoch": 0.3696568332329922,
+      "grad_norm": 3.0910706520080566,
+      "learning_rate": 1.260597302504817e-05,
+      "loss": 1.7916,
+      "step": 1535
+    },
+    {
+      "epoch": 0.3708609271523179,
+      "grad_norm": 3.0096778869628906,
+      "learning_rate": 1.2581888246628131e-05,
+      "loss": 1.8056,
+      "step": 1540
+    },
+    {
+      "epoch": 0.3720650210716436,
+      "grad_norm": 2.8062777519226074,
+      "learning_rate": 1.2557803468208094e-05,
+      "loss": 1.855,
+      "step": 1545
+    },
+    {
+      "epoch": 0.3732691149909693,
+      "grad_norm": 3.287958860397339,
+      "learning_rate": 1.2533718689788054e-05,
+      "loss": 1.9238,
+      "step": 1550
+    },
+    {
+      "epoch": 0.374473208910295,
+      "grad_norm": 3.459998607635498,
+      "learning_rate": 1.2509633911368015e-05,
+      "loss": 1.9554,
+      "step": 1555
+    },
+    {
+      "epoch": 0.3756773028296207,
+      "grad_norm": 3.3246283531188965,
+      "learning_rate": 1.2485549132947979e-05,
+      "loss": 1.8773,
+      "step": 1560
+    },
+    {
+      "epoch": 0.37688139674894644,
+      "grad_norm": 3.058595895767212,
+      "learning_rate": 1.246146435452794e-05,
+      "loss": 1.923,
+      "step": 1565
+    },
+    {
+      "epoch": 0.37808549066827213,
+      "grad_norm": 2.9321656227111816,
+      "learning_rate": 1.2437379576107902e-05,
+      "loss": 1.9403,
+      "step": 1570
+    },
+    {
+      "epoch": 0.3792895845875978,
+      "grad_norm": 3.1878929138183594,
+      "learning_rate": 1.2413294797687863e-05,
+      "loss": 1.9703,
+      "step": 1575
+    },
+    {
+      "epoch": 0.38049367850692356,
+      "grad_norm": 3.411778450012207,
+      "learning_rate": 1.2389210019267823e-05,
+      "loss": 1.8681,
+      "step": 1580
+    },
+    {
+      "epoch": 0.38169777242624925,
+      "grad_norm": 2.9784305095672607,
+      "learning_rate": 1.2365125240847786e-05,
+      "loss": 1.8222,
+      "step": 1585
+    },
+    {
+      "epoch": 0.38290186634557494,
+      "grad_norm": 3.037580728530884,
+      "learning_rate": 1.2341040462427746e-05,
+      "loss": 1.9808,
+      "step": 1590
+    },
+    {
+      "epoch": 0.3841059602649007,
+      "grad_norm": 2.867035388946533,
+      "learning_rate": 1.2316955684007707e-05,
+      "loss": 1.7734,
+      "step": 1595
+    },
+    {
+      "epoch": 0.38531005418422637,
+      "grad_norm": 2.8721401691436768,
+      "learning_rate": 1.2292870905587671e-05,
+      "loss": 1.8227,
+      "step": 1600
+    },
+    {
+      "epoch": 0.38531005418422637,
+      "eval_loss": 1.8952380418777466,
+      "eval_runtime": 16.4306,
+      "eval_samples_per_second": 6.086,
+      "eval_steps_per_second": 0.791,
+      "step": 1600
+    },
+    {
+      "epoch": 0.38651414810355206,
+      "grad_norm": 3.3023080825805664,
+      "learning_rate": 1.2268786127167632e-05,
+      "loss": 2.0716,
+      "step": 1605
+    },
+    {
+      "epoch": 0.3877182420228778,
+      "grad_norm": 3.0930209159851074,
+      "learning_rate": 1.2244701348747592e-05,
+      "loss": 2.1265,
+      "step": 1610
+    },
+    {
+      "epoch": 0.3889223359422035,
+      "grad_norm": 2.944687843322754,
+      "learning_rate": 1.2220616570327555e-05,
+      "loss": 1.8317,
+      "step": 1615
+    },
+    {
+      "epoch": 0.3901264298615292,
+      "grad_norm": 3.4099764823913574,
+      "learning_rate": 1.2196531791907515e-05,
+      "loss": 1.9413,
+      "step": 1620
+    },
+    {
+      "epoch": 0.3913305237808549,
+      "grad_norm": 4.778016567230225,
+      "learning_rate": 1.2172447013487476e-05,
+      "loss": 1.9718,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3925346177001806,
+      "grad_norm": 3.0482661724090576,
+      "learning_rate": 1.214836223506744e-05,
+      "loss": 1.7691,
+      "step": 1630
+    },
+    {
+      "epoch": 0.3937387116195063,
+      "grad_norm": 3.1143486499786377,
+      "learning_rate": 1.21242774566474e-05,
+      "loss": 1.8427,
+      "step": 1635
+    },
+    {
+      "epoch": 0.39494280553883204,
+      "grad_norm": 3.4189348220825195,
+      "learning_rate": 1.2100192678227361e-05,
+      "loss": 1.7502,
+      "step": 1640
+    },
+    {
+      "epoch": 0.39614689945815773,
+      "grad_norm": 2.9873223304748535,
+      "learning_rate": 1.2076107899807323e-05,
+      "loss": 1.9207,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3973509933774834,
+      "grad_norm": 3.051147699356079,
+      "learning_rate": 1.2052023121387284e-05,
+      "loss": 1.8523,
+      "step": 1650
+    },
+    {
+      "epoch": 0.39855508729680916,
+      "grad_norm": 3.375426769256592,
+      "learning_rate": 1.2027938342967245e-05,
+      "loss": 1.9916,
+      "step": 1655
+    },
+    {
+      "epoch": 0.39975918121613485,
+      "grad_norm": 3.345398426055908,
+      "learning_rate": 1.2003853564547207e-05,
+      "loss": 1.8303,
+      "step": 1660
+    },
+    {
+      "epoch": 0.40096327513546054,
+      "grad_norm": 3.3531017303466797,
+      "learning_rate": 1.1979768786127168e-05,
+      "loss": 1.8113,
+      "step": 1665
+    },
+    {
+      "epoch": 0.4021673690547863,
+      "grad_norm": 3.656094551086426,
+      "learning_rate": 1.1955684007707128e-05,
+      "loss": 1.8728,
+      "step": 1670
+    },
+    {
+      "epoch": 0.40337146297411197,
+      "grad_norm": 3.184237241744995,
+      "learning_rate": 1.1931599229287092e-05,
+      "loss": 1.8423,
+      "step": 1675
+    },
+    {
+      "epoch": 0.4045755568934377,
+      "grad_norm": 3.013655662536621,
+      "learning_rate": 1.1907514450867053e-05,
+      "loss": 2.019,
+      "step": 1680
+    },
+    {
+      "epoch": 0.4057796508127634,
+      "grad_norm": 3.0090904235839844,
+      "learning_rate": 1.1883429672447015e-05,
+      "loss": 1.9982,
+      "step": 1685
+    },
+    {
+      "epoch": 0.4069837447320891,
+      "grad_norm": 2.8152971267700195,
+      "learning_rate": 1.1859344894026976e-05,
+      "loss": 1.8038,
+      "step": 1690
+    },
+    {
+      "epoch": 0.40818783865141484,
+      "grad_norm": 3.1558475494384766,
+      "learning_rate": 1.1835260115606937e-05,
+      "loss": 1.904,
+      "step": 1695
+    },
+    {
+      "epoch": 0.4093919325707405,
+      "grad_norm": 3.0803842544555664,
+      "learning_rate": 1.1811175337186899e-05,
+      "loss": 1.9563,
+      "step": 1700
+    },
+    {
+      "epoch": 0.4105960264900662,
+      "grad_norm": 3.41733717918396,
+      "learning_rate": 1.1787090558766861e-05,
+      "loss": 1.8821,
+      "step": 1705
+    },
+    {
+      "epoch": 0.41180012040939196,
+      "grad_norm": 3.818134069442749,
+      "learning_rate": 1.1763005780346822e-05,
+      "loss": 2.038,
+      "step": 1710
+    },
+    {
+      "epoch": 0.41300421432871764,
+      "grad_norm": 2.931730270385742,
+      "learning_rate": 1.1738921001926784e-05,
+      "loss": 1.7836,
+      "step": 1715
+    },
+    {
+      "epoch": 0.41420830824804333,
+      "grad_norm": 3.2240729331970215,
+      "learning_rate": 1.1714836223506745e-05,
+      "loss": 1.8985,
+      "step": 1720
+    },
+    {
+      "epoch": 0.4154124021673691,
+      "grad_norm": 3.0868842601776123,
+      "learning_rate": 1.1690751445086706e-05,
+      "loss": 1.9309,
+      "step": 1725
+    },
+    {
+      "epoch": 0.41661649608669477,
+      "grad_norm": 3.2010886669158936,
+      "learning_rate": 1.1666666666666668e-05,
+      "loss": 1.9058,
+      "step": 1730
+    },
+    {
+      "epoch": 0.41782059000602045,
+      "grad_norm": 3.248377799987793,
+      "learning_rate": 1.1642581888246629e-05,
+      "loss": 1.9333,
+      "step": 1735
+    },
+    {
+      "epoch": 0.4190246839253462,
+      "grad_norm": 3.792973756790161,
+      "learning_rate": 1.161849710982659e-05,
+      "loss": 1.7065,
+      "step": 1740
+    },
+    {
+      "epoch": 0.4202287778446719,
+      "grad_norm": 3.1620492935180664,
+      "learning_rate": 1.1594412331406553e-05,
+      "loss": 1.9736,
+      "step": 1745
+    },
+    {
+      "epoch": 0.4214328717639976,
+      "grad_norm": 3.8951566219329834,
+      "learning_rate": 1.1570327552986514e-05,
+      "loss": 1.9916,
+      "step": 1750
+    },
+    {
+      "epoch": 0.4226369656833233,
+      "grad_norm": 3.2940852642059326,
+      "learning_rate": 1.1546242774566474e-05,
+      "loss": 1.8398,
+      "step": 1755
+    },
+    {
+      "epoch": 0.423841059602649,
+      "grad_norm": 3.4238831996917725,
+      "learning_rate": 1.1522157996146437e-05,
+      "loss": 1.903,
+      "step": 1760
+    },
+    {
+      "epoch": 0.4250451535219747,
+      "grad_norm": 3.1505134105682373,
+      "learning_rate": 1.1498073217726397e-05,
+      "loss": 1.8617,
+      "step": 1765
+    },
+    {
+      "epoch": 0.42624924744130044,
+      "grad_norm": 3.1795570850372314,
+      "learning_rate": 1.1473988439306358e-05,
+      "loss": 1.7612,
+      "step": 1770
+    },
+    {
+      "epoch": 0.4274533413606261,
+      "grad_norm": 3.108771324157715,
+      "learning_rate": 1.144990366088632e-05,
+      "loss": 1.7434,
+      "step": 1775
+    },
+    {
+      "epoch": 0.4286574352799518,
+      "grad_norm": 3.026624917984009,
+      "learning_rate": 1.1425818882466281e-05,
+      "loss": 2.0082,
+      "step": 1780
+    },
+    {
+      "epoch": 0.42986152919927756,
+      "grad_norm": 3.0684778690338135,
+      "learning_rate": 1.1401734104046245e-05,
+      "loss": 1.9998,
+      "step": 1785
+    },
+    {
+      "epoch": 0.43106562311860325,
+      "grad_norm": 3.261258840560913,
+      "learning_rate": 1.1377649325626206e-05,
+      "loss": 1.9306,
+      "step": 1790
+    },
+    {
+      "epoch": 0.43226971703792894,
+      "grad_norm": 3.167187452316284,
+      "learning_rate": 1.1353564547206166e-05,
+      "loss": 1.7516,
+      "step": 1795
+    },
+    {
+      "epoch": 0.4334738109572547,
+      "grad_norm": 2.8481147289276123,
+      "learning_rate": 1.1329479768786129e-05,
+      "loss": 1.9046,
+      "step": 1800
+    },
+    {
+      "epoch": 0.4334738109572547,
+      "eval_loss": 1.8845782279968262,
+      "eval_runtime": 16.4138,
+      "eval_samples_per_second": 6.092,
+      "eval_steps_per_second": 0.792,
+      "step": 1800
+    },
+    {
+      "epoch": 0.43467790487658037,
+      "grad_norm": 3.374049663543701,
+      "learning_rate": 1.130539499036609e-05,
+      "loss": 1.799,
+      "step": 1805
+    },
+    {
+      "epoch": 0.43588199879590606,
+      "grad_norm": 3.0791234970092773,
+      "learning_rate": 1.128131021194605e-05,
+      "loss": 1.8862,
+      "step": 1810
+    },
+    {
+      "epoch": 0.4370860927152318,
+      "grad_norm": 3.2381889820098877,
+      "learning_rate": 1.1257225433526014e-05,
+      "loss": 1.8981,
+      "step": 1815
+    },
+    {
+      "epoch": 0.4382901866345575,
+      "grad_norm": 3.3203744888305664,
+      "learning_rate": 1.1233140655105975e-05,
+      "loss": 1.9926,
+      "step": 1820
+    },
+    {
+      "epoch": 0.4394942805538832,
+      "grad_norm": 3.075920581817627,
+      "learning_rate": 1.1209055876685935e-05,
+      "loss": 1.8543,
+      "step": 1825
+    },
+    {
+      "epoch": 0.4406983744732089,
+      "grad_norm": 3.020115852355957,
+      "learning_rate": 1.1184971098265898e-05,
+      "loss": 1.9877,
+      "step": 1830
+    },
+    {
+      "epoch": 0.4419024683925346,
+      "grad_norm": 3.152524471282959,
+      "learning_rate": 1.1160886319845858e-05,
+      "loss": 1.9152,
+      "step": 1835
+    },
+    {
+      "epoch": 0.44310656231186035,
+      "grad_norm": 3.2993345260620117,
+      "learning_rate": 1.1136801541425819e-05,
+      "loss": 1.8089,
+      "step": 1840
+    },
+    {
+      "epoch": 0.44431065623118604,
+      "grad_norm": 3.579181671142578,
+      "learning_rate": 1.1112716763005781e-05,
+      "loss": 1.9076,
+      "step": 1845
+    },
+    {
+      "epoch": 0.44551475015051173,
+      "grad_norm": 3.3042211532592773,
+      "learning_rate": 1.1088631984585742e-05,
+      "loss": 1.8324,
+      "step": 1850
+    },
+    {
+      "epoch": 0.4467188440698375,
+      "grad_norm": 3.5857417583465576,
+      "learning_rate": 1.1064547206165703e-05,
+      "loss": 1.7684,
+      "step": 1855
+    },
+    {
+      "epoch": 0.44792293798916316,
+      "grad_norm": 3.2090542316436768,
+      "learning_rate": 1.1040462427745667e-05,
+      "loss": 1.7793,
+      "step": 1860
+    },
+    {
+      "epoch": 0.44912703190848885,
+      "grad_norm": 3.0961151123046875,
+      "learning_rate": 1.1016377649325627e-05,
+      "loss": 1.8414,
+      "step": 1865
+    },
+    {
+      "epoch": 0.4503311258278146,
+      "grad_norm": 3.3767356872558594,
+      "learning_rate": 1.0992292870905588e-05,
+      "loss": 1.8577,
+      "step": 1870
+    },
+    {
+      "epoch": 0.4515352197471403,
+      "grad_norm": 3.22123122215271,
+      "learning_rate": 1.096820809248555e-05,
+      "loss": 1.8515,
+      "step": 1875
+    },
+    {
+      "epoch": 0.45273931366646597,
+      "grad_norm": 3.1328110694885254,
+      "learning_rate": 1.094412331406551e-05,
+      "loss": 1.8725,
+      "step": 1880
+    },
+    {
+      "epoch": 0.4539434075857917,
+      "grad_norm": 3.3135077953338623,
+      "learning_rate": 1.0920038535645471e-05,
+      "loss": 1.8706,
+      "step": 1885
+    },
+    {
+      "epoch": 0.4551475015051174,
+      "grad_norm": 2.829827308654785,
+      "learning_rate": 1.0895953757225436e-05,
+      "loss": 1.8184,
+      "step": 1890
+    },
+    {
+      "epoch": 0.4563515954244431,
+      "grad_norm": 3.203533411026001,
+      "learning_rate": 1.0871868978805396e-05,
+      "loss": 1.8289,
+      "step": 1895
+    },
+    {
+      "epoch": 0.45755568934376883,
+      "grad_norm": 3.491577386856079,
+      "learning_rate": 1.0847784200385358e-05,
+      "loss": 1.888,
+      "step": 1900
+    },
+    {
+      "epoch": 0.4587597832630945,
+      "grad_norm": 3.2121121883392334,
+      "learning_rate": 1.0823699421965319e-05,
+      "loss": 1.8264,
+      "step": 1905
+    },
+    {
+      "epoch": 0.4599638771824202,
+      "grad_norm": 3.110546827316284,
+      "learning_rate": 1.079961464354528e-05,
+      "loss": 1.9191,
+      "step": 1910
+    },
+    {
+      "epoch": 0.46116797110174595,
+      "grad_norm": 4.177798748016357,
+      "learning_rate": 1.0775529865125242e-05,
+      "loss": 1.8732,
+      "step": 1915
+    },
+    {
+      "epoch": 0.46237206502107164,
+      "grad_norm": 3.107978582382202,
+      "learning_rate": 1.0751445086705203e-05,
+      "loss": 1.9634,
+      "step": 1920
+    },
+    {
+      "epoch": 0.46357615894039733,
+      "grad_norm": 3.661637783050537,
+      "learning_rate": 1.0727360308285163e-05,
+      "loss": 1.7731,
+      "step": 1925
+    },
+    {
+      "epoch": 0.4647802528597231,
+      "grad_norm": 3.32905912399292,
+      "learning_rate": 1.0703275529865127e-05,
+      "loss": 1.8689,
+      "step": 1930
+    },
+    {
+      "epoch": 0.46598434677904876,
+      "grad_norm": 3.348806381225586,
+      "learning_rate": 1.0679190751445088e-05,
+      "loss": 2.0132,
+      "step": 1935
+    },
+    {
+      "epoch": 0.46718844069837445,
+      "grad_norm": 3.625046730041504,
+      "learning_rate": 1.0655105973025049e-05,
+      "loss": 1.9329,
+      "step": 1940
+    },
+    {
+      "epoch": 0.4683925346177002,
+      "grad_norm": 2.9613242149353027,
+      "learning_rate": 1.0631021194605011e-05,
+      "loss": 1.8133,
+      "step": 1945
+    },
+    {
+      "epoch": 0.4695966285370259,
+      "grad_norm": 3.4370510578155518,
+      "learning_rate": 1.0606936416184972e-05,
+      "loss": 1.8325,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4708007224563516,
+      "grad_norm": 3.0052490234375,
+      "learning_rate": 1.0582851637764932e-05,
+      "loss": 1.7397,
+      "step": 1955
+    },
+    {
+      "epoch": 0.4720048163756773,
+      "grad_norm": 3.4657671451568604,
+      "learning_rate": 1.0558766859344896e-05,
+      "loss": 1.8374,
+      "step": 1960
+    },
+    {
+      "epoch": 0.473208910295003,
+      "grad_norm": 3.1156861782073975,
+      "learning_rate": 1.0534682080924857e-05,
+      "loss": 1.9296,
+      "step": 1965
+    },
+    {
+      "epoch": 0.4744130042143287,
+      "grad_norm": 2.908092498779297,
+      "learning_rate": 1.0510597302504818e-05,
+      "loss": 1.9279,
+      "step": 1970
+    },
+    {
+      "epoch": 0.47561709813365444,
+      "grad_norm": 3.2144057750701904,
+      "learning_rate": 1.048651252408478e-05,
+      "loss": 1.6978,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4768211920529801,
+      "grad_norm": 3.2203619480133057,
+      "learning_rate": 1.046242774566474e-05,
+      "loss": 1.7573,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4780252859723058,
+      "grad_norm": 3.258633613586426,
+      "learning_rate": 1.0438342967244701e-05,
+      "loss": 1.747,
+      "step": 1985
+    },
+    {
+      "epoch": 0.47922937989163156,
+      "grad_norm": 3.7678489685058594,
+      "learning_rate": 1.0414258188824664e-05,
+      "loss": 1.8558,
+      "step": 1990
+    },
+    {
+      "epoch": 0.48043347381095725,
+      "grad_norm": 3.226168155670166,
+      "learning_rate": 1.0390173410404624e-05,
+      "loss": 1.8387,
+      "step": 1995
+    },
+    {
+      "epoch": 0.481637567730283,
+      "grad_norm": 2.9940948486328125,
+      "learning_rate": 1.0366088631984585e-05,
+      "loss": 1.8159,
+      "step": 2000
+    },
+    {
+      "epoch": 0.481637567730283,
+      "eval_loss": 1.871155858039856,
+      "eval_runtime": 16.4315,
+      "eval_samples_per_second": 6.086,
+      "eval_steps_per_second": 0.791,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2105914563722736.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd020aa9333054ccbd44f2f69522647a4bdda52101c5b015e38e78115982722b
+size 5816

checkpoint-500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|im_end|>": 32000,
+  "<|im_start|>": 32001
+}

checkpoint-500/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "Felladrin/Minueza-32M-Chat",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "head_dim": 26,
+  "hidden_act": "silu",
+  "hidden_size": 312,
+  "initializer_range": 0.02,
+  "intermediate_size": 1092,
+  "max_position_embeddings": 2048,
+  "model_type": "mistral",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 10,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 1024,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": false,
+  "vocab_size": 32002
+}

checkpoint-500/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "transformers_version": "4.48.1"
+}

checkpoint-500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f78022fd000efe193d3842c20c125accf0e08f56767a1083341549b1632cbad5
+size 131181272

checkpoint-500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c45fba25104663249b410ca2e7dcb1da25da0a206f56b5a1ed7e4b8faa977ae
+size 262419258

checkpoint-500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
+size 14244

checkpoint-500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19108033ec901a824f3cecb198c7800f1aa950cde6adc4b84b88b2b0d1248717
+size 1064

checkpoint-500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}