diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..f5ffdfde7de0f88935abe61e4c65e06681b8edc4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-579/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-618/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-772/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-822/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..804657de33fbff0b7c340fecdc1a207d1298802c --- /dev/null +++ b/README.md @@ -0,0 +1,178 @@ +--- +library_name: peft +license: mit +base_model: THUDM/GLM-4-32B-0414 +tags: +- axolotl +- generated_from_trainer +datasets: +- anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 +model-index: +- name: magnum-v5-sft-prototype-glm4-32b-lora + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.8.0` +```yaml +base_model: THUDM/GLM-4-32B-0414 +#base_model_ignore_patterns: "*/*" +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +hub_model_id: anthracite-core/magnum-v5-sft-prototype-glm4-32b-lora +hub_strategy: "all_checkpoints" +push_dataset_to_hub: +hf_use_auth_token: true + + +load_in_8bit: false +load_in_4bit: false +strict: false + +datasets: + - path: anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 + ds_type: parquet + type: +shuffle_merged_datasets: true +dataset_prepared_path: ./data/magnum-32b-data +val_set_size: 0.01 +output_dir: ./data/32b-lora-out + +plugins: + - axolotl.integrations.liger.LigerPlugin + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +#liger_rope: false +liger_rms_norm: true +liger_layer_norm: true +liger_glu_activation: true +liger_fused_linear_cross_entropy: true +cut_cross_entropy: true + +sequence_len: 32768 +sample_packing: true +eval_sample_packing: true +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 128 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: +peft_use_rslora: true +lora_modules_to_save: + - embed_tokens + - lm_head + +wandb_project: 32b-magnum-lora +wandb_entity: +wandb_watch: +wandb_name: run4-Lora-0.001-clip +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 2 +optimizer: paged_ademamix_8bit +lr_scheduler: cosine +learning_rate: 2e-4 +max_grad_norm: 0.001 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +s2_attention: + +warmup_steps: 40 +evals_per_epoch: 4 +eval_table_size: +eval_max_new_tokens: +saves_per_epoch: 2 +debug: +deepspeed: 
./deepspeed_configs/zero3_bf16.json +weight_decay: 0.01 +fsdp: +fsdp_config: +special_tokens: + +``` + +

+ +# magnum-v5-sft-prototype-glm4-32b-lora + +This model is a fine-tuned version of [THUDM/GLM-4-32B-0414](https://huggingface.co/THUDM/GLM-4-32B-0414) on the anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 dataset. +It achieves the following results on the evaluation set: +- Loss: 1.1075 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 1 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 16 +- total_eval_batch_size: 8 +- optimizer: Use paged_ademamix_8bit and the args are: +No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 40 +- num_epochs: 2.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 1.3541 | 0.0024 | 1 | 1.3336 | +| 1.1718 | 0.2503 | 103 | 1.1633 | +| 1.1976 | 0.5006 | 206 | 1.1460 | +| 1.095 | 0.7509 | 309 | 1.1339 | +| 1.1076 | 1.0 | 412 | 1.1213 | +| 1.1063 | 1.2503 | 515 | 1.1128 | +| 1.1214 | 1.5006 | 618 | 1.1089 | +| 1.0286 | 1.7509 | 721 | 1.1075 | + + +### Framework versions + +- PEFT 0.15.1 +- Transformers 4.51.3 +- Pytorch 2.6.0+cu124 +- Datasets 3.5.0 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + 
"inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bfc934021ae2f94535e9442dcecf9427f7b12c1 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9dabe0dcb2a00ba6eca0b1e4fb714d3c1d5289929ed928c9ab44c923fdb4073 +size 5579575888 diff --git a/checkpoint-579/README.md b/checkpoint-579/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-579/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + 
+ +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-579/adapter_config.json b/checkpoint-579/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d9d8fa4860138947c736b05e4c3dd010601e2671 --- /dev/null +++ b/checkpoint-579/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "k_proj", + "gate_up_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + 
"trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-579/adapter_model.safetensors b/checkpoint-579/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6eeb8d196189660cb9e111e74276cb5032cc3611 --- /dev/null +++ b/checkpoint-579/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80565d96a79b139f4b14a3ced21fa8604075474e378f633f8d127b0c555c29e8 +size 5579575888 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78d06f253477859cecff8f985f19dab627cf4e72 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e8ab9ba739bd62cd3ef94481774d288af3ba11a4b5a56079f2ab51f45db23b +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1174a5d7163f76db7590b4ad4d3e2b6ed441625 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547540e83d082664d31e6df0aae97c6a282b14a2f5c740a5fb4fa414ec682262 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6520e8da61049060f11bde0d4305111cda4afe6e --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4999177f18b552e941e0cd332fda8f7680c431b4790b6841cb460c113d6343f7 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a11f5eec69a321387bac2889a5f2c303588494 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280b2a747c884ad982d19970113323be477c314aba0791f90aff38e18ee9c5a1 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe929389ab3d7f46caba34cfb9e5168aeb9bd8f2 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e77d311a3f54d90a3341f461ae15d6a06d107718bcdb6f4932396687a882936 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c503c3559a028a2a489f974499d97220cce7a187 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028d1496caa92ddefb57857eb12c276592f4649d823f2a55734edc2c40a91f73 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77c5eb699cb56108c59110e041fb60f868b51736 
--- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:841382c61e7f5ae2f61b61dbc14d0a699c00618877854f2b3ee8779d148e0012 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0e411fb9dedd864b0b6669b8c5d93f6a5a67142 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cd5dcdfa379dcab50f833b3906b633af94d611228e675cc7f086a04d8b32329 +size 2458601314 diff --git a/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad518f276b9deb5d3acc0b20e8892bad4130da9 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a57e502191a0500b9d1c0f16cbf9b0199741c167c4ce17e8f2b6d4593311aa0 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88bffa1ee37846a276cd537819c8f28c8f17b8b1 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1770f917d929eaba8b68921e9bc477fdaf63abe1dfd431ff350a8abc92d1e66 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..020fa356f6db7120e3491832e4c46c9fce909417 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a2eef29639c2c972919eed193d71d81f7be500743aa508a20fbe7b84849d11 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cfd5b7c09c4ab38527cc09df22d8de40ee0d7ef --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac3c8d1e4ce2007a511263e7a05b661ae353592545b19274f21da30669e3cad +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd508f247f80c27b081f6cad8fdc0daedd168450 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4d97f6979f8f733d796a5c58e564596ab2456e618e44a25cf9ace722c64a73 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57995b565610be7090e12ba1a209b58f121e5c66 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a5a4c0a626c1fe61e7a4db142b78cba919bf3f484a5c494933437f0d774cc1 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e34ff54956729aef29800db4956cc690764d01e --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6aeec15becc8fc42faa3f0575c51ef364181d2665459eddc8d2dc966b403020 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eada1b68299ed58e210d0b97eb369e27e8da8881 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705f93be649ce8d35b21383ed30847f7ffe67a8b1c37e6b0ba22ede8acda0310 +size 747668 diff --git a/checkpoint-579/latest b/checkpoint-579/latest new file mode 100644 index 0000000000000000000000000000000000000000..ca469d91f6dc5559091863658d35124fe4c6a737 --- /dev/null +++ b/checkpoint-579/latest @@ -0,0 +1 @@ +global_step579 \ No newline at end of file diff --git a/checkpoint-579/rng_state_0.pth b/checkpoint-579/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..78ea69bf77bc9a540866bb9542e61b9deec3a3fe --- /dev/null +++ b/checkpoint-579/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c04bc7b73c7a7d50ea429c6932b3c02ee97bad0a60bc0571bce7889d378963 +size 15984 diff --git a/checkpoint-579/rng_state_1.pth b/checkpoint-579/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08b3f9fd9da2159be3018e4f8c90ebc74fb5a928 --- /dev/null +++ b/checkpoint-579/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31217f02580556ec125eca275e53614326586d6949699267269156e796c66602 +size 15984 diff --git 
a/checkpoint-579/rng_state_2.pth b/checkpoint-579/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac55da7b4f89118e261157974e4446dcd655378e --- /dev/null +++ b/checkpoint-579/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05785e9c45b375c31236f1e23d47f249ad4638fee36b5efedd9a8689e393677 +size 15984 diff --git a/checkpoint-579/rng_state_3.pth b/checkpoint-579/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..29c693b5a30633dc93355e02804a1084b69ee8f6 --- /dev/null +++ b/checkpoint-579/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a44490d89fcd7a0d4c86bd5dba58d6d5df0722673a204f059fdccba9d833240 +size 15984 diff --git a/checkpoint-579/rng_state_4.pth b/checkpoint-579/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..16769eb8e1f615c0823adb92b1402f1dcfab4b79 --- /dev/null +++ b/checkpoint-579/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458fcb8dbc7e1b3dd98288e9a49357b31abb705d49b87c597778a99a064a3349 +size 15984 diff --git a/checkpoint-579/rng_state_5.pth b/checkpoint-579/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e52e4a4f125a359879f3db83a2fd95f8a6163cd0 --- /dev/null +++ b/checkpoint-579/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0b12c61dd87f425696599697c6a6601f719556cb7f3bd2ca6f1cd6bbb836e5 +size 15984 diff --git a/checkpoint-579/rng_state_6.pth b/checkpoint-579/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e9d62e251af62e249950da700f94e7c125e05d6 --- /dev/null +++ b/checkpoint-579/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb73b7f37f6a6e5ff92e836afc1a8be3a31ef4cafb9feae0870d034fa0e871e +size 15984 diff --git a/checkpoint-579/rng_state_7.pth 
b/checkpoint-579/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b5ec5c88c370f2afdf7d9f98c9c231de26d6f7e --- /dev/null +++ b/checkpoint-579/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b10ca9cc6239fd14cf357acb50f08aecddecb3971e9cd2e8dd3be177dae75b +size 15984 diff --git a/checkpoint-579/scheduler.pt b/checkpoint-579/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..971728ecba22ed809a021e2ed07e4fe42fc08910 --- /dev/null +++ b/checkpoint-579/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6cbd64ca821dc70aa12c5bad69ea8779cfa628a03651d12c29acb5503462db +size 1064 diff --git a/checkpoint-579/special_tokens_map.json b/checkpoint-579/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-579/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-579/tokenizer.json b/checkpoint-579/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-579/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-579/tokenizer_config.json 
b/checkpoint-579/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-579/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git 
a/checkpoint-579/trainer_state.json b/checkpoint-579/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6df763ccb52fd28c798f148e5b42cdeb7a0dd3d6 --- /dev/null +++ b/checkpoint-579/trainer_state.json @@ -0,0 +1,4135 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5, + "eval_steps": 97, + "global_step": 579, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025906735751295338, + "grad_norm": 758.2562349755826, + "learning_rate": 0.0, + "loss": 1.3719, + "step": 1 + }, + { + "epoch": 0.0025906735751295338, + "eval_loss": 1.3159157037734985, + "eval_runtime": 36.907, + "eval_samples_per_second": 20.159, + "eval_steps_per_second": 1.273, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 666.308184823038, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.36, + "step": 2 + }, + { + "epoch": 0.007772020725388601, + "grad_norm": 211.0771195353068, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3746, + "step": 3 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 431.5114709683218, + "learning_rate": 3e-06, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.012953367875647668, + "grad_norm": 230.87468433791625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3837, + "step": 5 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 635.1636587738542, + "learning_rate": 5e-06, + "loss": 1.3761, + "step": 6 + }, + { + "epoch": 0.018134715025906734, + "grad_norm": 791.5536958334704, + "learning_rate": 6e-06, + "loss": 1.2855, + "step": 7 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 667.7197994216477, + "learning_rate": 7e-06, + "loss": 1.3267, + "step": 8 + }, + { + "epoch": 0.023316062176165803, + "grad_norm": 254.3855973692125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2977, + "step": 9 + }, + { + "epoch": 0.025906735751295335, + 
"grad_norm": 162.29347257682093, + "learning_rate": 9e-06, + "loss": 1.3522, + "step": 10 + }, + { + "epoch": 0.02849740932642487, + "grad_norm": 352.6352930651456, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 11 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 148.2629265526552, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3342, + "step": 12 + }, + { + "epoch": 0.03367875647668394, + "grad_norm": 249.88753789723657, + "learning_rate": 1.2e-05, + "loss": 1.2983, + "step": 13 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 184.03358422636597, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3291, + "step": 14 + }, + { + "epoch": 0.038860103626943004, + "grad_norm": 198.4491469860763, + "learning_rate": 1.4e-05, + "loss": 1.4014, + "step": 15 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 680.9537058769038, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3775, + "step": 16 + }, + { + "epoch": 0.04404145077720207, + "grad_norm": 563.0247638614801, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3228, + "step": 17 + }, + { + "epoch": 0.046632124352331605, + "grad_norm": 271.985463813746, + "learning_rate": 1.7e-05, + "loss": 1.3695, + "step": 18 + }, + { + "epoch": 0.04922279792746114, + "grad_norm": 399.51218452223316, + "learning_rate": 1.8e-05, + "loss": 1.2556, + "step": 19 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 160.70697055826656, + "learning_rate": 1.9e-05, + "loss": 1.2982, + "step": 20 + }, + { + "epoch": 0.054404145077720206, + "grad_norm": 227.8927504687491, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 21 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 550.1538868076032, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2603, + "step": 22 + }, + { + "epoch": 0.05958549222797927, + "grad_norm": 291.8994359919024, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3663, + "step": 23 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 
120.60677833129643, + "learning_rate": 2.3e-05, + "loss": 1.3129, + "step": 24 + }, + { + "epoch": 0.06476683937823834, + "grad_norm": 414.4006662101242, + "learning_rate": 2.4e-05, + "loss": 1.3037, + "step": 25 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 141.48324465317884, + "learning_rate": 2.5e-05, + "loss": 1.3095, + "step": 26 + }, + { + "epoch": 0.06994818652849741, + "grad_norm": 147.86066819937994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2372, + "step": 27 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 214.47337614964576, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3384, + "step": 28 + }, + { + "epoch": 0.07512953367875648, + "grad_norm": 898.4324889241673, + "learning_rate": 2.8e-05, + "loss": 1.2003, + "step": 29 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 128.83026557596128, + "learning_rate": 2.9e-05, + "loss": 1.2172, + "step": 30 + }, + { + "epoch": 0.08031088082901554, + "grad_norm": 183.0777862405529, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.2674, + "step": 31 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 119.01841833358732, + "learning_rate": 3.1e-05, + "loss": 1.2554, + "step": 32 + }, + { + "epoch": 0.08549222797927461, + "grad_norm": 117.65980267542858, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2716, + "step": 33 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 82.40151099433953, + "learning_rate": 3.3e-05, + "loss": 1.2019, + "step": 34 + }, + { + "epoch": 0.09067357512953368, + "grad_norm": 82.61816783653785, + "learning_rate": 3.4e-05, + "loss": 1.2424, + "step": 35 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 136.42743433868276, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.2066, + "step": 36 + }, + { + "epoch": 0.09585492227979274, + "grad_norm": 36.775911657584444, + "learning_rate": 3.6e-05, + "loss": 1.2485, + "step": 37 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 56.55022603284064, + "learning_rate": 
3.7000000000000005e-05, + "loss": 1.2112, + "step": 38 + }, + { + "epoch": 0.10103626943005181, + "grad_norm": 50.09896932886107, + "learning_rate": 3.8e-05, + "loss": 1.2027, + "step": 39 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 54.2661481198025, + "learning_rate": 3.9e-05, + "loss": 1.2673, + "step": 40 + }, + { + "epoch": 0.10621761658031088, + "grad_norm": 60.04145981731815, + "learning_rate": 4e-05, + "loss": 1.1648, + "step": 41 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 169.47741055545822, + "learning_rate": 3.999981580539036e-05, + "loss": 1.2393, + "step": 42 + }, + { + "epoch": 0.11139896373056994, + "grad_norm": 43.64716987307323, + "learning_rate": 3.9999263224954204e-05, + "loss": 1.2906, + "step": 43 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 51.3206609767585, + "learning_rate": 3.999834226886976e-05, + "loss": 1.1807, + "step": 44 + }, + { + "epoch": 0.11658031088082901, + "grad_norm": 38.95055887413869, + "learning_rate": 3.999705295410054e-05, + "loss": 1.1825, + "step": 45 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 40.59968974426338, + "learning_rate": 3.999539530439504e-05, + "loss": 1.193, + "step": 46 + }, + { + "epoch": 0.12176165803108809, + "grad_norm": 34.5796571445333, + "learning_rate": 3.9993369350286265e-05, + "loss": 1.2127, + "step": 47 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.97693356149241, + "learning_rate": 3.99909751290912e-05, + "loss": 1.1543, + "step": 48 + }, + { + "epoch": 0.12694300518134716, + "grad_norm": 82.9217015858092, + "learning_rate": 3.9988212684910107e-05, + "loss": 1.2329, + "step": 49 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 49.256542144400214, + "learning_rate": 3.9985082068625724e-05, + "loss": 1.212, + "step": 50 + }, + { + "epoch": 0.13212435233160622, + "grad_norm": 45.025980435259484, + "learning_rate": 3.998158333790231e-05, + "loss": 1.2129, + "step": 51 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 
45.98465689592428, + "learning_rate": 3.99777165571846e-05, + "loss": 1.1709, + "step": 52 + }, + { + "epoch": 0.13730569948186527, + "grad_norm": 43.481241408477906, + "learning_rate": 3.997348179769661e-05, + "loss": 1.1614, + "step": 53 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 82.17633750834132, + "learning_rate": 3.996887913744033e-05, + "loss": 1.2205, + "step": 54 + }, + { + "epoch": 0.14248704663212436, + "grad_norm": 53.0176514970764, + "learning_rate": 3.9963908661194285e-05, + "loss": 1.1204, + "step": 55 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 67.86382426995611, + "learning_rate": 3.995857046051196e-05, + "loss": 1.1839, + "step": 56 + }, + { + "epoch": 0.14766839378238342, + "grad_norm": 31.282407703790597, + "learning_rate": 3.995286463372013e-05, + "loss": 1.2126, + "step": 57 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 52.200764429265604, + "learning_rate": 3.994679128591706e-05, + "loss": 1.2036, + "step": 58 + }, + { + "epoch": 0.15284974093264247, + "grad_norm": 60.706608653531895, + "learning_rate": 3.9940350528970535e-05, + "loss": 1.1848, + "step": 59 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 47.31754062899529, + "learning_rate": 3.993354248151583e-05, + "loss": 1.0869, + "step": 60 + }, + { + "epoch": 0.15803108808290156, + "grad_norm": 49.42450836392811, + "learning_rate": 3.9926367268953514e-05, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 38.791167030088886, + "learning_rate": 3.991882502344712e-05, + "loss": 1.1881, + "step": 62 + }, + { + "epoch": 0.16321243523316062, + "grad_norm": 56.16339499737216, + "learning_rate": 3.991091588392077e-05, + "loss": 1.1518, + "step": 63 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 861.8559063020828, + "learning_rate": 3.990263999605652e-05, + "loss": 1.1614, + "step": 64 + }, + { + "epoch": 0.16839378238341968, + "grad_norm": 50.92822786500888, + "learning_rate": 3.989399751229179e-05, + 
"loss": 1.1998, + "step": 65 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 31.04121324055666, + "learning_rate": 3.988498859181645e-05, + "loss": 1.1795, + "step": 66 + }, + { + "epoch": 0.17357512953367876, + "grad_norm": 50.33061983380845, + "learning_rate": 3.9875613400569975e-05, + "loss": 1.1742, + "step": 67 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 75.20462514003519, + "learning_rate": 3.986587211123833e-05, + "loss": 1.1856, + "step": 68 + }, + { + "epoch": 0.17875647668393782, + "grad_norm": 38.82139317052205, + "learning_rate": 3.98557649032508e-05, + "loss": 1.1529, + "step": 69 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 36.55988806615175, + "learning_rate": 3.984529196277674e-05, + "loss": 1.1884, + "step": 70 + }, + { + "epoch": 0.18393782383419688, + "grad_norm": 104.8931793971097, + "learning_rate": 3.983445348272203e-05, + "loss": 1.2182, + "step": 71 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 36.50395409234617, + "learning_rate": 3.982324966272566e-05, + "loss": 1.1609, + "step": 72 + }, + { + "epoch": 0.18911917098445596, + "grad_norm": 35.019191693448626, + "learning_rate": 3.981168070915594e-05, + "loss": 1.173, + "step": 73 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 33.378390048053596, + "learning_rate": 3.979974683510677e-05, + "loss": 1.173, + "step": 74 + }, + { + "epoch": 0.19430051813471502, + "grad_norm": 43.356840136984154, + "learning_rate": 3.978744826039366e-05, + "loss": 1.2032, + "step": 75 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 31.285725922510768, + "learning_rate": 3.977478521154974e-05, + "loss": 1.1569, + "step": 76 + }, + { + "epoch": 0.19948186528497408, + "grad_norm": 35.19264482867074, + "learning_rate": 3.9761757921821544e-05, + "loss": 1.1365, + "step": 77 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 44.66037256551279, + "learning_rate": 3.974836663116472e-05, + "loss": 1.164, + "step": 78 + }, + { + "epoch": 0.20466321243523317, + 
"grad_norm": 68.91101457952654, + "learning_rate": 3.973461158623963e-05, + "loss": 1.2256, + "step": 79 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 45.866521854583, + "learning_rate": 3.9720493040406786e-05, + "loss": 1.1697, + "step": 80 + }, + { + "epoch": 0.20984455958549222, + "grad_norm": 59.63095169617338, + "learning_rate": 3.970601125372218e-05, + "loss": 1.2094, + "step": 81 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 39.085597271064216, + "learning_rate": 3.9691166492932535e-05, + "loss": 1.1048, + "step": 82 + }, + { + "epoch": 0.21502590673575128, + "grad_norm": 36.40256073477861, + "learning_rate": 3.9675959031470336e-05, + "loss": 1.248, + "step": 83 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 29.846921716586085, + "learning_rate": 3.966038914944881e-05, + "loss": 1.1718, + "step": 84 + }, + { + "epoch": 0.22020725388601037, + "grad_norm": 50.87052190327881, + "learning_rate": 3.964445713365682e-05, + "loss": 1.1529, + "step": 85 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 35.32915760431302, + "learning_rate": 3.9628163277553486e-05, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.22538860103626943, + "grad_norm": 157.5587514654703, + "learning_rate": 3.961150788126286e-05, + "loss": 1.2194, + "step": 87 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 25.03485489120971, + "learning_rate": 3.9594491251568376e-05, + "loss": 1.1392, + "step": 88 + }, + { + "epoch": 0.23056994818652848, + "grad_norm": 80.55933867045263, + "learning_rate": 3.957711370190716e-05, + "loss": 1.1819, + "step": 89 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 272.22874004071406, + "learning_rate": 3.9559375552364325e-05, + "loss": 1.0998, + "step": 90 + }, + { + "epoch": 0.23575129533678757, + "grad_norm": 91.94671663482514, + "learning_rate": 3.954127712966702e-05, + "loss": 1.2494, + "step": 91 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 54.31533598131098, + "learning_rate": 
3.952281876717843e-05, + "loss": 1.1385, + "step": 92 + }, + { + "epoch": 0.24093264248704663, + "grad_norm": 103.20789745908105, + "learning_rate": 3.950400080489165e-05, + "loss": 1.1398, + "step": 93 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 45.14746362545893, + "learning_rate": 3.94848235894234e-05, + "loss": 1.2697, + "step": 94 + }, + { + "epoch": 0.24611398963730569, + "grad_norm": 21.271923336142002, + "learning_rate": 3.9465287474007654e-05, + "loss": 1.1397, + "step": 95 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 93.89786795431422, + "learning_rate": 3.944539281848912e-05, + "loss": 1.1542, + "step": 96 + }, + { + "epoch": 0.25129533678756477, + "grad_norm": 32.38768349342839, + "learning_rate": 3.942513998931663e-05, + "loss": 1.1693, + "step": 97 + }, + { + "epoch": 0.25129533678756477, + "eval_loss": 1.1344976425170898, + "eval_runtime": 37.8807, + "eval_samples_per_second": 19.641, + "eval_steps_per_second": 1.241, + "step": 97 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 91.41293468177638, + "learning_rate": 3.940452935953639e-05, + "loss": 1.1724, + "step": 98 + }, + { + "epoch": 0.25647668393782386, + "grad_norm": 39.20645478419229, + "learning_rate": 3.9383561308785075e-05, + "loss": 1.1583, + "step": 99 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 35.32804513153546, + "learning_rate": 3.9362236223282885e-05, + "loss": 1.158, + "step": 100 + }, + { + "epoch": 0.2616580310880829, + "grad_norm": 35.24783762804842, + "learning_rate": 3.934055449582641e-05, + "loss": 1.1552, + "step": 101 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 33.743808031979775, + "learning_rate": 3.931851652578137e-05, + "loss": 1.264, + "step": 102 + }, + { + "epoch": 0.266839378238342, + "grad_norm": 113.49798793226394, + "learning_rate": 3.92961227190753e-05, + "loss": 1.2361, + "step": 103 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 31.813807349410364, + "learning_rate": 3.9273373488190036e-05, + "loss": 
1.1246, + "step": 104 + }, + { + "epoch": 0.27202072538860106, + "grad_norm": 29.391695486306187, + "learning_rate": 3.925026925215417e-05, + "loss": 1.1142, + "step": 105 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 33.79933331839905, + "learning_rate": 3.922681043653526e-05, + "loss": 1.1401, + "step": 106 + }, + { + "epoch": 0.2772020725388601, + "grad_norm": 39.09509012730907, + "learning_rate": 3.920299747343204e-05, + "loss": 1.1822, + "step": 107 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 37.81471938433609, + "learning_rate": 3.9178830801466465e-05, + "loss": 1.1592, + "step": 108 + }, + { + "epoch": 0.2823834196891192, + "grad_norm": 69.07753778460207, + "learning_rate": 3.915431086577561e-05, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 28.864787246081605, + "learning_rate": 3.912943811800347e-05, + "loss": 1.1179, + "step": 110 + }, + { + "epoch": 0.28756476683937826, + "grad_norm": 28.842042951717836, + "learning_rate": 3.910421301629264e-05, + "loss": 1.1317, + "step": 111 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 51.475482074695506, + "learning_rate": 3.9078636025275904e-05, + "loss": 1.1451, + "step": 112 + }, + { + "epoch": 0.2927461139896373, + "grad_norm": 33.48279556713943, + "learning_rate": 3.9052707616067654e-05, + "loss": 1.1554, + "step": 113 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 21.279603575929844, + "learning_rate": 3.9026428266255205e-05, + "loss": 1.1636, + "step": 114 + }, + { + "epoch": 0.2979274611398964, + "grad_norm": 36.226178034876675, + "learning_rate": 3.899979845989003e-05, + "loss": 1.1966, + "step": 115 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 29.90506353145981, + "learning_rate": 3.897281868747878e-05, + "loss": 1.1888, + "step": 116 + }, + { + "epoch": 0.30310880829015546, + "grad_norm": 36.04602777809767, + "learning_rate": 3.894548944597434e-05, + "loss": 1.2066, + "step": 117 + }, + { + "epoch": 
0.30569948186528495, + "grad_norm": 36.42793844948301, + "learning_rate": 3.8917811238766606e-05, + "loss": 1.1712, + "step": 118 + }, + { + "epoch": 0.3082901554404145, + "grad_norm": 58.788967662325696, + "learning_rate": 3.888978457567323e-05, + "loss": 1.1225, + "step": 119 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 29.357299816022326, + "learning_rate": 3.886140997293024e-05, + "loss": 1.1315, + "step": 120 + }, + { + "epoch": 0.3134715025906736, + "grad_norm": 95.08345317107502, + "learning_rate": 3.883268795318252e-05, + "loss": 1.1852, + "step": 121 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 33.6623824593179, + "learning_rate": 3.88036190454742e-05, + "loss": 1.16, + "step": 122 + }, + { + "epoch": 0.31865284974093266, + "grad_norm": 42.587546987131105, + "learning_rate": 3.8774203785238886e-05, + "loss": 1.1374, + "step": 123 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 33.360649853064245, + "learning_rate": 3.8744442714289816e-05, + "loss": 1.1757, + "step": 124 + }, + { + "epoch": 0.3238341968911917, + "grad_norm": 49.09256643961471, + "learning_rate": 3.8714336380809874e-05, + "loss": 1.1782, + "step": 125 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 31.505007051172793, + "learning_rate": 3.86838853393415e-05, + "loss": 1.195, + "step": 126 + }, + { + "epoch": 0.3290155440414508, + "grad_norm": 34.36735417254799, + "learning_rate": 3.865309015077645e-05, + "loss": 1.1078, + "step": 127 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 36.63220606142181, + "learning_rate": 3.862195138234551e-05, + "loss": 1.1319, + "step": 128 + }, + { + "epoch": 0.33419689119170987, + "grad_norm": 53.324986862513676, + "learning_rate": 3.859046960760801e-05, + "loss": 1.2301, + "step": 129 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 47.41445409144979, + "learning_rate": 3.855864540644126e-05, + "loss": 1.2366, + "step": 130 + }, + { + "epoch": 0.3393782383419689, + "grad_norm": 32.57355122427366, + 
"learning_rate": 3.8526479365029906e-05, + "loss": 1.142, + "step": 131 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 28.445824333644715, + "learning_rate": 3.849397207585508e-05, + "loss": 1.0847, + "step": 132 + }, + { + "epoch": 0.344559585492228, + "grad_norm": 49.23062726715889, + "learning_rate": 3.846112413768353e-05, + "loss": 1.2241, + "step": 133 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 53.424206543788074, + "learning_rate": 3.842793615555657e-05, + "loss": 1.2392, + "step": 134 + }, + { + "epoch": 0.34974093264248707, + "grad_norm": 38.19316140175426, + "learning_rate": 3.8394408740778934e-05, + "loss": 1.1208, + "step": 135 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 32.35931252369273, + "learning_rate": 3.836054251090755e-05, + "loss": 1.1604, + "step": 136 + }, + { + "epoch": 0.3549222797927461, + "grad_norm": 37.90085344799495, + "learning_rate": 3.83263380897401e-05, + "loss": 1.1134, + "step": 137 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 44.49191588319939, + "learning_rate": 3.829179610730359e-05, + "loss": 1.1281, + "step": 138 + }, + { + "epoch": 0.3601036269430052, + "grad_norm": 141.98524430756757, + "learning_rate": 3.8256917199842715e-05, + "loss": 1.0928, + "step": 139 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 30.887093976524472, + "learning_rate": 3.822170200980815e-05, + "loss": 1.0936, + "step": 140 + }, + { + "epoch": 0.36528497409326427, + "grad_norm": 21.980521878837745, + "learning_rate": 3.818615118584472e-05, + "loss": 1.1368, + "step": 141 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 538.6650762618656, + "learning_rate": 3.815026538277943e-05, + "loss": 1.0918, + "step": 142 + }, + { + "epoch": 0.3704663212435233, + "grad_norm": 40.842881572203, + "learning_rate": 3.811404526160943e-05, + "loss": 1.1705, + "step": 143 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 26.891553492377298, + "learning_rate": 3.8077491489489835e-05, + "loss": 1.1468, + 
"step": 144 + }, + { + "epoch": 0.3756476683937824, + "grad_norm": 45.138483181178074, + "learning_rate": 3.8040604739721415e-05, + "loss": 1.1679, + "step": 145 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 35.133763086168244, + "learning_rate": 3.8003385691738227e-05, + "loss": 1.1029, + "step": 146 + }, + { + "epoch": 0.38082901554404147, + "grad_norm": 36.941250802707344, + "learning_rate": 3.7965835031095065e-05, + "loss": 1.1491, + "step": 147 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 90.1080256703095, + "learning_rate": 3.792795344945485e-05, + "loss": 1.1212, + "step": 148 + }, + { + "epoch": 0.3860103626943005, + "grad_norm": 39.70360899750413, + "learning_rate": 3.7889741644575914e-05, + "loss": 1.15, + "step": 149 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 28.229369877304094, + "learning_rate": 3.78512003202991e-05, + "loss": 1.1111, + "step": 150 + }, + { + "epoch": 0.3911917098445596, + "grad_norm": 31.611752191925987, + "learning_rate": 3.7812330186534815e-05, + "loss": 1.1366, + "step": 151 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 38.196015586772425, + "learning_rate": 3.777313195924998e-05, + "loss": 1.1433, + "step": 152 + }, + { + "epoch": 0.3963730569948187, + "grad_norm": 22.732638044547453, + "learning_rate": 3.773360636045481e-05, + "loss": 1.1125, + "step": 153 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 90.19158665385014, + "learning_rate": 3.7693754118189525e-05, + "loss": 1.1242, + "step": 154 + }, + { + "epoch": 0.4015544041450777, + "grad_norm": 42.43479974993017, + "learning_rate": 3.765357596651095e-05, + "loss": 1.1191, + "step": 155 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 88.0076735720364, + "learning_rate": 3.761307264547899e-05, + "loss": 1.1718, + "step": 156 + }, + { + "epoch": 0.4067357512953368, + "grad_norm": 30.782507703935767, + "learning_rate": 3.757224490114297e-05, + "loss": 1.109, + "step": 157 + }, + { + "epoch": 0.40932642487046633, + 
"grad_norm": 69.89871106113397, + "learning_rate": 3.7531093485527943e-05, + "loss": 1.1018, + "step": 158 + }, + { + "epoch": 0.4119170984455959, + "grad_norm": 37.339006645717305, + "learning_rate": 3.7489619156620796e-05, + "loss": 1.1358, + "step": 159 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 28.06388054378899, + "learning_rate": 3.744782267835632e-05, + "loss": 1.0847, + "step": 160 + }, + { + "epoch": 0.4170984455958549, + "grad_norm": 54.05874281297702, + "learning_rate": 3.740570482060311e-05, + "loss": 1.1682, + "step": 161 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 32.299093265328835, + "learning_rate": 3.73632663591494e-05, + "loss": 1.1413, + "step": 162 + }, + { + "epoch": 0.422279792746114, + "grad_norm": 31.213652090157694, + "learning_rate": 3.732050807568878e-05, + "loss": 1.1313, + "step": 163 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 40.01090035937505, + "learning_rate": 3.727743075780578e-05, + "loss": 1.1513, + "step": 164 + }, + { + "epoch": 0.4274611398963731, + "grad_norm": 47.11352577964853, + "learning_rate": 3.723403519896136e-05, + "loss": 1.2192, + "step": 165 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 28.645086506093037, + "learning_rate": 3.7190322198478355e-05, + "loss": 1.1097, + "step": 166 + }, + { + "epoch": 0.4326424870466321, + "grad_norm": 35.28541113925116, + "learning_rate": 3.7146292561526654e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 58.30281063037669, + "learning_rate": 3.7101947099108425e-05, + "loss": 1.1829, + "step": 168 + }, + { + "epoch": 0.4378238341968912, + "grad_norm": 26.33563548968379, + "learning_rate": 3.70572866280432e-05, + "loss": 1.147, + "step": 169 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 57.00052875402651, + "learning_rate": 3.701231197095277e-05, + "loss": 1.1212, + "step": 170 + }, + { + "epoch": 0.4430051813471503, + "grad_norm": 23.672828037237174, + "learning_rate": 
3.696702395624608e-05, + "loss": 1.1152, + "step": 171 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 41.1264174112964, + "learning_rate": 3.692142341810395e-05, + "loss": 1.1154, + "step": 172 + }, + { + "epoch": 0.4481865284974093, + "grad_norm": 26.72177706144361, + "learning_rate": 3.6875511196463715e-05, + "loss": 1.1725, + "step": 173 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 95.4088800585977, + "learning_rate": 3.682928813700375e-05, + "loss": 1.1339, + "step": 174 + }, + { + "epoch": 0.4533678756476684, + "grad_norm": 34.33666578349465, + "learning_rate": 3.678275509112788e-05, + "loss": 1.1867, + "step": 175 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 31.032304531003014, + "learning_rate": 3.6735912915949745e-05, + "loss": 1.1386, + "step": 176 + }, + { + "epoch": 0.4585492227979275, + "grad_norm": 55.22043313188224, + "learning_rate": 3.6688762474276945e-05, + "loss": 1.1102, + "step": 177 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 29.82713377876857, + "learning_rate": 3.6641304634595216e-05, + "loss": 1.1564, + "step": 178 + }, + { + "epoch": 0.4637305699481865, + "grad_norm": 35.71025459541737, + "learning_rate": 3.659354027105238e-05, + "loss": 1.0939, + "step": 179 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 52.41175655642653, + "learning_rate": 3.6545470263442265e-05, + "loss": 1.1578, + "step": 180 + }, + { + "epoch": 0.4689119170984456, + "grad_norm": 27.682485766528306, + "learning_rate": 3.649709549718849e-05, + "loss": 1.1875, + "step": 181 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 36.53293663303487, + "learning_rate": 3.6448416863328186e-05, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.4740932642487047, + "grad_norm": 31.45177998538027, + "learning_rate": 3.639943525849555e-05, + "loss": 1.113, + "step": 183 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 28.323097072885673, + "learning_rate": 3.635015158490533e-05, + "loss": 1.1159, + "step": 184 + }, + 
{ + "epoch": 0.4792746113989637, + "grad_norm": 47.75573754341213, + "learning_rate": 3.6300566750336225e-05, + "loss": 1.1305, + "step": 185 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 21.384095061494357, + "learning_rate": 3.625068166811418e-05, + "loss": 1.1369, + "step": 186 + }, + { + "epoch": 0.4844559585492228, + "grad_norm": 30.714645036809546, + "learning_rate": 3.6200497257095504e-05, + "loss": 1.1858, + "step": 187 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 35.12161426399798, + "learning_rate": 3.615001444165001e-05, + "loss": 1.1293, + "step": 188 + }, + { + "epoch": 0.4896373056994819, + "grad_norm": 116.83443661381396, + "learning_rate": 3.6099234151643924e-05, + "loss": 1.1515, + "step": 189 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 55.47885243409044, + "learning_rate": 3.604815732242283e-05, + "loss": 1.112, + "step": 190 + }, + { + "epoch": 0.4948186528497409, + "grad_norm": 32.332747429034285, + "learning_rate": 3.5996784894794394e-05, + "loss": 1.1661, + "step": 191 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 33.039210183180046, + "learning_rate": 3.594511781501103e-05, + "loss": 1.1244, + "step": 192 + }, + { + "epoch": 0.5, + "grad_norm": 21.325687337182504, + "learning_rate": 3.58931570347525e-05, + "loss": 1.1634, + "step": 193 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 51.37599478469561, + "learning_rate": 3.584090351110838e-05, + "loss": 1.2106, + "step": 194 + }, + { + "epoch": 0.5025906735751295, + "eval_loss": 1.1119717359542847, + "eval_runtime": 49.6027, + "eval_samples_per_second": 14.999, + "eval_steps_per_second": 0.948, + "step": 194 + }, + { + "epoch": 0.5051813471502591, + "grad_norm": 42.105169991612456, + "learning_rate": 3.57883582065604e-05, + "loss": 1.1303, + "step": 195 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 37.14457014578168, + "learning_rate": 3.573552208896474e-05, + "loss": 1.1483, + "step": 196 + }, + { + "epoch": 0.5103626943005182, + 
"grad_norm": 28.56241612018119, + "learning_rate": 3.568239613153421e-05, + "loss": 1.0843, + "step": 197 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 35.399304035761865, + "learning_rate": 3.5628981312820315e-05, + "loss": 1.1177, + "step": 198 + }, + { + "epoch": 0.5155440414507773, + "grad_norm": 25.91156850470446, + "learning_rate": 3.557527861669522e-05, + "loss": 1.1215, + "step": 199 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 43.509516777992324, + "learning_rate": 3.552128903233363e-05, + "loss": 1.1532, + "step": 200 + }, + { + "epoch": 0.5207253886010362, + "grad_norm": 38.18164449834795, + "learning_rate": 3.54670135541946e-05, + "loss": 1.1142, + "step": 201 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 48.576743289054534, + "learning_rate": 3.541245318200318e-05, + "loss": 1.1152, + "step": 202 + }, + { + "epoch": 0.5259067357512953, + "grad_norm": 38.65411737007163, + "learning_rate": 3.5357608920732e-05, + "loss": 1.1607, + "step": 203 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 35.663493907396834, + "learning_rate": 3.530248178058282e-05, + "loss": 1.1273, + "step": 204 + }, + { + "epoch": 0.5310880829015544, + "grad_norm": 26.829817821665976, + "learning_rate": 3.5247072776967805e-05, + "loss": 1.1174, + "step": 205 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 39.79604912152638, + "learning_rate": 3.519138293049097e-05, + "loss": 1.1811, + "step": 206 + }, + { + "epoch": 0.5362694300518135, + "grad_norm": 32.26179097390416, + "learning_rate": 3.513541326692925e-05, + "loss": 1.1346, + "step": 207 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 24.35769329902787, + "learning_rate": 3.5079164817213684e-05, + "loss": 1.1061, + "step": 208 + }, + { + "epoch": 0.5414507772020726, + "grad_norm": 26.645546258363844, + "learning_rate": 3.5022638617410396e-05, + "loss": 1.0514, + "step": 209 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 105.19676603444857, + "learning_rate": 
3.496583570870152e-05, + "loss": 1.1474, + "step": 210 + }, + { + "epoch": 0.5466321243523317, + "grad_norm": 61.600623030405885, + "learning_rate": 3.4908757137366006e-05, + "loss": 1.104, + "step": 211 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 31.65460129853052, + "learning_rate": 3.485140395476038e-05, + "loss": 1.0737, + "step": 212 + }, + { + "epoch": 0.5518134715025906, + "grad_norm": 26.860379117211497, + "learning_rate": 3.4793777217299346e-05, + "loss": 1.1119, + "step": 213 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 39.89324262309783, + "learning_rate": 3.473587798643633e-05, + "loss": 1.1626, + "step": 214 + }, + { + "epoch": 0.5569948186528497, + "grad_norm": 39.77638257731599, + "learning_rate": 3.467770732864399e-05, + "loss": 1.1545, + "step": 215 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 30.994657564291458, + "learning_rate": 3.461926631539445e-05, + "loss": 1.1646, + "step": 216 + }, + { + "epoch": 0.5621761658031088, + "grad_norm": 51.99674092516571, + "learning_rate": 3.4560556023139695e-05, + "loss": 1.1638, + "step": 217 + }, + { + "epoch": 0.5647668393782384, + "grad_norm": 58.5132713002146, + "learning_rate": 3.450157753329166e-05, + "loss": 1.1461, + "step": 218 + }, + { + "epoch": 0.5673575129533679, + "grad_norm": 30.712469030418482, + "learning_rate": 3.4442331932202326e-05, + "loss": 1.1583, + "step": 219 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 47.00217426642832, + "learning_rate": 3.438282031114374e-05, + "loss": 1.1154, + "step": 220 + }, + { + "epoch": 0.572538860103627, + "grad_norm": 37.33927961163222, + "learning_rate": 3.432304376628787e-05, + "loss": 1.1372, + "step": 221 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 28.858636933974392, + "learning_rate": 3.4263003398686464e-05, + "loss": 1.0488, + "step": 222 + }, + { + "epoch": 0.5777202072538861, + "grad_norm": 37.842230890171486, + "learning_rate": 3.420270031425072e-05, + "loss": 1.1892, + "step": 223 + }, + { + 
"epoch": 0.5803108808290155, + "grad_norm": 32.65394945357516, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.1218, + "step": 224 + }, + { + "epoch": 0.582901554404145, + "grad_norm": 115.22040829465772, + "learning_rate": 3.4081310442696114e-05, + "loss": 1.1546, + "step": 225 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 31.20514468446119, + "learning_rate": 3.402022589151325e-05, + "loss": 1.0969, + "step": 226 + }, + { + "epoch": 0.5880829015544041, + "grad_norm": 52.8397361926395, + "learning_rate": 3.395888309532687e-05, + "loss": 1.1218, + "step": 227 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 51.7991692917308, + "learning_rate": 3.3897283184038215e-05, + "loss": 1.1395, + "step": 228 + }, + { + "epoch": 0.5932642487046632, + "grad_norm": 33.56775233970504, + "learning_rate": 3.3835427292284445e-05, + "loss": 1.1107, + "step": 229 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 46.081120788214314, + "learning_rate": 3.3773316559417734e-05, + "loss": 1.1472, + "step": 230 + }, + { + "epoch": 0.5984455958549223, + "grad_norm": 41.72558170492288, + "learning_rate": 3.371095212948431e-05, + "loss": 1.1871, + "step": 231 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 34.27957927587091, + "learning_rate": 3.364833515120336e-05, + "loss": 1.1376, + "step": 232 + }, + { + "epoch": 0.6036269430051814, + "grad_norm": 36.58452602010953, + "learning_rate": 3.358546677794586e-05, + "loss": 1.1885, + "step": 233 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 28.010809914189192, + "learning_rate": 3.352234816771337e-05, + "loss": 1.102, + "step": 234 + }, + { + "epoch": 0.6088082901554405, + "grad_norm": 24.78419558611963, + "learning_rate": 3.3458980483116664e-05, + "loss": 1.0818, + "step": 235 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 28.12830040081226, + "learning_rate": 3.3395364891354316e-05, + "loss": 1.1862, + "step": 236 + }, + { + "epoch": 0.6139896373056994, + "grad_norm": 37.94181651161551, + 
"learning_rate": 3.333150256419127e-05, + "loss": 1.147, + "step": 237 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 21.809518482701854, + "learning_rate": 3.3267394677937134e-05, + "loss": 1.0994, + "step": 238 + }, + { + "epoch": 0.6191709844559585, + "grad_norm": 32.12135773753589, + "learning_rate": 3.320304241342464e-05, + "loss": 1.1531, + "step": 239 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 51.959731073524054, + "learning_rate": 3.31384469559878e-05, + "loss": 1.1717, + "step": 240 + }, + { + "epoch": 0.6243523316062176, + "grad_norm": 28.045815836372345, + "learning_rate": 3.307360949544012e-05, + "loss": 1.1814, + "step": 241 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 39.55208384578746, + "learning_rate": 3.300853122605268e-05, + "loss": 1.1483, + "step": 242 + }, + { + "epoch": 0.6295336787564767, + "grad_norm": 29.799974205160808, + "learning_rate": 3.294321334653213e-05, + "loss": 1.1838, + "step": 243 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 124.31035254102245, + "learning_rate": 3.2877657059998584e-05, + "loss": 1.0698, + "step": 244 + }, + { + "epoch": 0.6347150259067358, + "grad_norm": 37.989925180187655, + "learning_rate": 3.281186357396351e-05, + "loss": 1.0984, + "step": 245 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 55.72599333657572, + "learning_rate": 3.274583410030745e-05, + "loss": 1.2333, + "step": 246 + }, + { + "epoch": 0.6398963730569949, + "grad_norm": 46.77079456439719, + "learning_rate": 3.267956985525774e-05, + "loss": 1.2157, + "step": 247 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 33.62329915252562, + "learning_rate": 3.261307205936603e-05, + "loss": 1.1752, + "step": 248 + }, + { + "epoch": 0.6450777202072538, + "grad_norm": 34.11794183225494, + "learning_rate": 3.2546341937485884e-05, + "loss": 1.1265, + "step": 249 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 36.027636323913896, + "learning_rate": 3.247938071875017e-05, + "loss": 1.103, + "step": 
250 + }, + { + "epoch": 0.6502590673575129, + "grad_norm": 35.393219337329946, + "learning_rate": 3.2412189636548456e-05, + "loss": 1.1148, + "step": 251 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 31.578919022569924, + "learning_rate": 3.234476992850425e-05, + "loss": 1.1149, + "step": 252 + }, + { + "epoch": 0.655440414507772, + "grad_norm": 28.93717647736964, + "learning_rate": 3.227712283645224e-05, + "loss": 1.1425, + "step": 253 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 34.170026750703684, + "learning_rate": 3.2209249606415394e-05, + "loss": 1.1591, + "step": 254 + }, + { + "epoch": 0.6606217616580311, + "grad_norm": 27.52194954061608, + "learning_rate": 3.214115148858201e-05, + "loss": 1.1704, + "step": 255 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 81.65404753769732, + "learning_rate": 3.207282973728273e-05, + "loss": 1.161, + "step": 256 + }, + { + "epoch": 0.6658031088082902, + "grad_norm": 57.45351536522683, + "learning_rate": 3.200428561096737e-05, + "loss": 1.116, + "step": 257 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 30.968529074463714, + "learning_rate": 3.193552037218179e-05, + "loss": 1.1265, + "step": 258 + }, + { + "epoch": 0.6709844559585493, + "grad_norm": 37.8817748068655, + "learning_rate": 3.186653528754464e-05, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 29.197031189172545, + "learning_rate": 3.179733162772398e-05, + "loss": 1.1045, + "step": 260 + }, + { + "epoch": 0.6761658031088082, + "grad_norm": 36.56253841299107, + "learning_rate": 3.172791066741392e-05, + "loss": 1.1539, + "step": 261 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 25.799921116950998, + "learning_rate": 3.165827368531113e-05, + "loss": 1.0796, + "step": 262 + }, + { + "epoch": 0.6813471502590673, + "grad_norm": 82.81825216532526, + "learning_rate": 3.1588421964091276e-05, + "loss": 1.142, + "step": 263 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 
31.100074747569124, + "learning_rate": 3.151835679038542e-05, + "loss": 1.0908, + "step": 264 + }, + { + "epoch": 0.6865284974093264, + "grad_norm": 25.57297200703221, + "learning_rate": 3.14480794547563e-05, + "loss": 1.1436, + "step": 265 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 23.92492773149328, + "learning_rate": 3.137759125167455e-05, + "loss": 1.1202, + "step": 266 + }, + { + "epoch": 0.6917098445595855, + "grad_norm": 22.14274360766396, + "learning_rate": 3.130689347949486e-05, + "loss": 1.1113, + "step": 267 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 26.68725288649902, + "learning_rate": 3.123598744043211e-05, + "loss": 1.1517, + "step": 268 + }, + { + "epoch": 0.6968911917098446, + "grad_norm": 25.559817524659362, + "learning_rate": 3.1164874440537295e-05, + "loss": 1.0976, + "step": 269 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 28.89996834100355, + "learning_rate": 3.109355578967356e-05, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.7020725388601037, + "grad_norm": 32.09658045195569, + "learning_rate": 3.1022032801492e-05, + "loss": 1.1161, + "step": 271 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 30.623705646213768, + "learning_rate": 3.095030679340751e-05, + "loss": 1.1993, + "step": 272 + }, + { + "epoch": 0.7072538860103627, + "grad_norm": 41.71263710932429, + "learning_rate": 3.0878379086574494e-05, + "loss": 1.1624, + "step": 273 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 34.68352639470226, + "learning_rate": 3.0806251005862535e-05, + "loss": 1.1156, + "step": 274 + }, + { + "epoch": 0.7124352331606217, + "grad_norm": 23.52580702428812, + "learning_rate": 3.073392387983202e-05, + "loss": 1.0963, + "step": 275 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 28.10687988214902, + "learning_rate": 3.0661399040709584e-05, + "loss": 1.1095, + "step": 276 + }, + { + "epoch": 0.7176165803108808, + "grad_norm": 66.72288729975841, + "learning_rate": 3.05886778243637e-05, + "loss": 
1.0865, + "step": 277 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 25.775217430321934, + "learning_rate": 3.051576157027998e-05, + "loss": 1.1058, + "step": 278 + }, + { + "epoch": 0.7227979274611399, + "grad_norm": 36.82942099016794, + "learning_rate": 3.0442651621536502e-05, + "loss": 1.1211, + "step": 279 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 27.878820856521013, + "learning_rate": 3.0369349324779115e-05, + "loss": 1.1471, + "step": 280 + }, + { + "epoch": 0.727979274611399, + "grad_norm": 31.293156717285573, + "learning_rate": 3.0295856030196618e-05, + "loss": 1.0748, + "step": 281 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 39.315952115194435, + "learning_rate": 3.022217309149588e-05, + "loss": 1.0993, + "step": 282 + }, + { + "epoch": 0.7331606217616581, + "grad_norm": 36.79954071435495, + "learning_rate": 3.0148301865876913e-05, + "loss": 1.1045, + "step": 283 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 26.127389502147167, + "learning_rate": 3.0074243714007875e-05, + "loss": 1.1424, + "step": 284 + }, + { + "epoch": 0.7383419689119171, + "grad_norm": 25.608778060317068, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1055, + "step": 285 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 36.22629669671894, + "learning_rate": 2.992557209138249e-05, + "loss": 1.0845, + "step": 286 + }, + { + "epoch": 0.7435233160621761, + "grad_norm": 35.30642111132886, + "learning_rate": 2.9850961359077293e-05, + "loss": 1.204, + "step": 287 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 29.765894622087952, + "learning_rate": 2.977616917737388e-05, + "loss": 1.168, + "step": 288 + }, + { + "epoch": 0.7487046632124352, + "grad_norm": 27.194683587397567, + "learning_rate": 2.9701196923903927e-05, + "loss": 1.1236, + "step": 289 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 63.09779240191165, + "learning_rate": 2.9626045979615928e-05, + "loss": 1.1395, + "step": 290 + }, + { + "epoch": 0.7538860103626943, 
+ "grad_norm": 25.014233377763066, + "learning_rate": 2.9550717728749768e-05, + "loss": 1.1054, + "step": 291 + }, + { + "epoch": 0.7538860103626943, + "eval_loss": 1.0996382236480713, + "eval_runtime": 37.9545, + "eval_samples_per_second": 19.602, + "eval_steps_per_second": 1.238, + "step": 291 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 27.481891737318097, + "learning_rate": 2.947521355881122e-05, + "loss": 1.1252, + "step": 292 + }, + { + "epoch": 0.7590673575129534, + "grad_norm": 67.57807413949878, + "learning_rate": 2.9399534860546404e-05, + "loss": 1.1761, + "step": 293 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 65.66834495909988, + "learning_rate": 2.932368302791614e-05, + "loss": 1.0551, + "step": 294 + }, + { + "epoch": 0.7642487046632125, + "grad_norm": 30.051210942517116, + "learning_rate": 2.92476594580703e-05, + "loss": 1.138, + "step": 295 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 22.693089678510507, + "learning_rate": 2.917146555132206e-05, + "loss": 1.1495, + "step": 296 + }, + { + "epoch": 0.7694300518134715, + "grad_norm": 53.84166280540606, + "learning_rate": 2.909510271112212e-05, + "loss": 1.1409, + "step": 297 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 32.69106061524578, + "learning_rate": 2.9018572344032823e-05, + "loss": 1.1709, + "step": 298 + }, + { + "epoch": 0.7746113989637305, + "grad_norm": 39.44484991312582, + "learning_rate": 2.8941875859702283e-05, + "loss": 1.1138, + "step": 299 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 31.51857596969122, + "learning_rate": 2.88650146708384e-05, + "loss": 1.1931, + "step": 300 + }, + { + "epoch": 0.7797927461139896, + "grad_norm": 70.51218412614058, + "learning_rate": 2.878799019318283e-05, + "loss": 1.155, + "step": 301 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 80.27969224752457, + "learning_rate": 2.8710803845484955e-05, + "loss": 1.1425, + "step": 302 + }, + { + "epoch": 0.7849740932642487, + "grad_norm": 
28.16560857981767, + "learning_rate": 2.8633457049475678e-05, + "loss": 1.1072, + "step": 303 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 41.15138307552231, + "learning_rate": 2.855595122984129e-05, + "loss": 1.1492, + "step": 304 + }, + { + "epoch": 0.7901554404145078, + "grad_norm": 23.894217282116276, + "learning_rate": 2.847828781419722e-05, + "loss": 1.1136, + "step": 305 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 25.005501120810248, + "learning_rate": 2.8400468233061708e-05, + "loss": 1.0921, + "step": 306 + }, + { + "epoch": 0.7953367875647669, + "grad_norm": 30.91791938195468, + "learning_rate": 2.832249391982949e-05, + "loss": 1.1098, + "step": 307 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 44.776563922922726, + "learning_rate": 2.8244366310745398e-05, + "loss": 1.1845, + "step": 308 + }, + { + "epoch": 0.8005181347150259, + "grad_norm": 19.059329544784376, + "learning_rate": 2.816608684487787e-05, + "loss": 1.169, + "step": 309 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 63.97334641962602, + "learning_rate": 2.8087656964092472e-05, + "loss": 1.124, + "step": 310 + }, + { + "epoch": 0.805699481865285, + "grad_norm": 30.878848859015882, + "learning_rate": 2.8009078113025335e-05, + "loss": 1.2087, + "step": 311 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 34.63835471543836, + "learning_rate": 2.7930351739056533e-05, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.810880829015544, + "grad_norm": 30.03178182445718, + "learning_rate": 2.7851479292283442e-05, + "loss": 1.1321, + "step": 313 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 38.42236523356876, + "learning_rate": 2.7772462225494013e-05, + "loss": 1.1557, + "step": 314 + }, + { + "epoch": 0.8160621761658031, + "grad_norm": 39.179683790956744, + "learning_rate": 2.7693301994140026e-05, + "loss": 1.1201, + "step": 315 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 38.32243159447327, + "learning_rate": 2.761400005631028e-05, + 
"loss": 1.1105, + "step": 316 + }, + { + "epoch": 0.8212435233160622, + "grad_norm": 39.913808227411835, + "learning_rate": 2.7534557872703705e-05, + "loss": 1.1598, + "step": 317 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 69.73521867812421, + "learning_rate": 2.7454976906602513e-05, + "loss": 1.1145, + "step": 318 + }, + { + "epoch": 0.8264248704663213, + "grad_norm": 65.55887588207746, + "learning_rate": 2.7375258623845207e-05, + "loss": 1.1255, + "step": 319 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 30.980111545641563, + "learning_rate": 2.7295404492799575e-05, + "loss": 1.122, + "step": 320 + }, + { + "epoch": 0.8316062176165803, + "grad_norm": 30.12179911444832, + "learning_rate": 2.721541598433567e-05, + "loss": 1.113, + "step": 321 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 28.329434659508582, + "learning_rate": 2.7135294571798706e-05, + "loss": 1.0498, + "step": 322 + }, + { + "epoch": 0.8367875647668394, + "grad_norm": 25.114787597049578, + "learning_rate": 2.70550417309819e-05, + "loss": 1.0633, + "step": 323 + }, + { + "epoch": 0.8393782383419689, + "grad_norm": 27.754037709590385, + "learning_rate": 2.6974658940099337e-05, + "loss": 1.1585, + "step": 324 + }, + { + "epoch": 0.8419689119170984, + "grad_norm": 29.489888159179444, + "learning_rate": 2.6894147679758678e-05, + "loss": 1.1259, + "step": 325 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 24.426102194202898, + "learning_rate": 2.6813509432933957e-05, + "loss": 1.1515, + "step": 326 + }, + { + "epoch": 0.8471502590673575, + "grad_norm": 24.75197483331429, + "learning_rate": 2.673274568493821e-05, + "loss": 1.15, + "step": 327 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 40.604864626683366, + "learning_rate": 2.6651857923396132e-05, + "loss": 1.1219, + "step": 328 + }, + { + "epoch": 0.8523316062176166, + "grad_norm": 34.694568404196026, + "learning_rate": 2.6570847638216698e-05, + "loss": 1.103, + "step": 329 + }, + { + "epoch": 
0.8549222797927462, + "grad_norm": 48.715136403425035, + "learning_rate": 2.648971632156569e-05, + "loss": 1.1675, + "step": 330 + }, + { + "epoch": 0.8575129533678757, + "grad_norm": 97.77526410121799, + "learning_rate": 2.6408465467838225e-05, + "loss": 1.1502, + "step": 331 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 54.697215318949276, + "learning_rate": 2.632709657363124e-05, + "loss": 1.1446, + "step": 332 + }, + { + "epoch": 0.8626943005181347, + "grad_norm": 38.09192002041798, + "learning_rate": 2.6245611137715897e-05, + "loss": 1.1333, + "step": 333 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 46.713623556984956, + "learning_rate": 2.6164010661010007e-05, + "loss": 1.1252, + "step": 334 + }, + { + "epoch": 0.8678756476683938, + "grad_norm": 46.40552686286593, + "learning_rate": 2.6082296646550364e-05, + "loss": 1.121, + "step": 335 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 37.57424454065957, + "learning_rate": 2.6000470599465065e-05, + "loss": 1.1671, + "step": 336 + }, + { + "epoch": 0.8730569948186528, + "grad_norm": 38.580777053099204, + "learning_rate": 2.5918534026945787e-05, + "loss": 1.0849, + "step": 337 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 154.3106712010981, + "learning_rate": 2.5836488438220044e-05, + "loss": 1.0663, + "step": 338 + }, + { + "epoch": 0.8782383419689119, + "grad_norm": 34.21394067951015, + "learning_rate": 2.575433534452334e-05, + "loss": 1.0895, + "step": 339 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 36.291611242733886, + "learning_rate": 2.5672076259071385e-05, + "loss": 1.1242, + "step": 340 + }, + { + "epoch": 0.883419689119171, + "grad_norm": 29.411623389655112, + "learning_rate": 2.558971269703219e-05, + "loss": 1.1005, + "step": 341 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 30.24903086761753, + "learning_rate": 2.5507246175498174e-05, + "loss": 1.1134, + "step": 342 + }, + { + "epoch": 0.8886010362694301, + "grad_norm": 22.032293114161938, + 
"learning_rate": 2.5424678213458202e-05, + "loss": 1.1121, + "step": 343 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 34.997361528376956, + "learning_rate": 2.5342010331769635e-05, + "loss": 1.1341, + "step": 344 + }, + { + "epoch": 0.8937823834196891, + "grad_norm": 28.212824875732352, + "learning_rate": 2.5259244053130295e-05, + "loss": 1.0748, + "step": 345 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 23.870011592985897, + "learning_rate": 2.5176380902050418e-05, + "loss": 1.0643, + "step": 346 + }, + { + "epoch": 0.8989637305699482, + "grad_norm": 26.10018699309748, + "learning_rate": 2.5093422404824574e-05, + "loss": 1.1662, + "step": 347 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 30.191468778559166, + "learning_rate": 2.5010370089503578e-05, + "loss": 1.1023, + "step": 348 + }, + { + "epoch": 0.9041450777202072, + "grad_norm": 55.799581973427415, + "learning_rate": 2.4927225485866297e-05, + "loss": 1.1538, + "step": 349 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 35.7030284720465, + "learning_rate": 2.4843990125391516e-05, + "loss": 1.1, + "step": 350 + }, + { + "epoch": 0.9093264248704663, + "grad_norm": 28.61763302791738, + "learning_rate": 2.4760665541229712e-05, + "loss": 1.0914, + "step": 351 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 33.34233685155311, + "learning_rate": 2.467725326817481e-05, + "loss": 1.0862, + "step": 352 + }, + { + "epoch": 0.9145077720207254, + "grad_norm": 25.441052078480084, + "learning_rate": 2.4593754842635917e-05, + "loss": 1.1422, + "step": 353 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 24.217974454985058, + "learning_rate": 2.451017180260902e-05, + "loss": 1.132, + "step": 354 + }, + { + "epoch": 0.9196891191709845, + "grad_norm": 57.986011465793155, + "learning_rate": 2.4426505687648653e-05, + "loss": 1.2082, + "step": 355 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 34.058264716876195, + "learning_rate": 2.4342758038839573e-05, + "loss": 1.1679, + 
"step": 356 + }, + { + "epoch": 0.9248704663212435, + "grad_norm": 28.621514922275253, + "learning_rate": 2.4258930398768317e-05, + "loss": 1.1319, + "step": 357 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 35.33355417283227, + "learning_rate": 2.4175024311494835e-05, + "loss": 1.0705, + "step": 358 + }, + { + "epoch": 0.9300518134715026, + "grad_norm": 46.579572933583265, + "learning_rate": 2.4091041322524023e-05, + "loss": 1.0842, + "step": 359 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 35.494740787672974, + "learning_rate": 2.4006982978777263e-05, + "loss": 1.1072, + "step": 360 + }, + { + "epoch": 0.9352331606217616, + "grad_norm": 44.56606839509262, + "learning_rate": 2.392285082856394e-05, + "loss": 1.1125, + "step": 361 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 46.26363869084929, + "learning_rate": 2.3838646421552917e-05, + "loss": 1.1268, + "step": 362 + }, + { + "epoch": 0.9404145077720207, + "grad_norm": 89.17676267680146, + "learning_rate": 2.3754371308743975e-05, + "loss": 1.0893, + "step": 363 + }, + { + "epoch": 0.9430051813471503, + "grad_norm": 34.87700187494181, + "learning_rate": 2.367002704243927e-05, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.9455958549222798, + "grad_norm": 32.92806939217504, + "learning_rate": 2.3585615176214716e-05, + "loss": 1.1488, + "step": 365 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 27.27458755248548, + "learning_rate": 2.3501137264891396e-05, + "loss": 1.0874, + "step": 366 + }, + { + "epoch": 0.9507772020725389, + "grad_norm": 24.959123789739834, + "learning_rate": 2.3416594864506887e-05, + "loss": 1.1783, + "step": 367 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 31.838670988369724, + "learning_rate": 2.333198953228664e-05, + "loss": 1.0759, + "step": 368 + }, + { + "epoch": 0.9559585492227979, + "grad_norm": 28.112870222863155, + "learning_rate": 2.3247322826615276e-05, + "loss": 1.1481, + "step": 369 + }, + { + "epoch": 0.9585492227979274, + 
"grad_norm": 35.08461098450067, + "learning_rate": 2.316259630700787e-05, + "loss": 1.0953, + "step": 370 + }, + { + "epoch": 0.961139896373057, + "grad_norm": 37.80899503618479, + "learning_rate": 2.307781153408124e-05, + "loss": 1.1224, + "step": 371 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 31.644978122007387, + "learning_rate": 2.2992970069525202e-05, + "loss": 1.1608, + "step": 372 + }, + { + "epoch": 0.966321243523316, + "grad_norm": 23.51029318210938, + "learning_rate": 2.29080734760738e-05, + "loss": 1.0914, + "step": 373 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 28.97240481418573, + "learning_rate": 2.2823123317476522e-05, + "loss": 1.1117, + "step": 374 + }, + { + "epoch": 0.9715025906735751, + "grad_norm": 36.613893678320395, + "learning_rate": 2.273812115846951e-05, + "loss": 1.1118, + "step": 375 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 26.402979304578093, + "learning_rate": 2.2653068564746692e-05, + "loss": 1.13, + "step": 376 + }, + { + "epoch": 0.9766839378238342, + "grad_norm": 114.3000444613392, + "learning_rate": 2.2567967102931025e-05, + "loss": 1.1539, + "step": 377 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 26.861359932396834, + "learning_rate": 2.2482818340545534e-05, + "loss": 1.0566, + "step": 378 + }, + { + "epoch": 0.9818652849740933, + "grad_norm": 32.75509374223994, + "learning_rate": 2.2397623845984548e-05, + "loss": 1.1746, + "step": 379 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 34.11964206838379, + "learning_rate": 2.2312385188484718e-05, + "loss": 1.0834, + "step": 380 + }, + { + "epoch": 0.9870466321243523, + "grad_norm": 38.019564122226434, + "learning_rate": 2.2227103938096176e-05, + "loss": 1.1074, + "step": 381 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 39.5073811375391, + "learning_rate": 2.2141781665653584e-05, + "loss": 1.1082, + "step": 382 + }, + { + "epoch": 0.9922279792746114, + "grad_norm": 298.4258332795163, + "learning_rate": 
2.205641994274721e-05, + "loss": 1.125, + "step": 383 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 36.444415670935506, + "learning_rate": 2.1971020341693973e-05, + "loss": 1.0935, + "step": 384 + }, + { + "epoch": 0.9974093264248705, + "grad_norm": 28.96533429210575, + "learning_rate": 2.188558443550849e-05, + "loss": 1.0957, + "step": 385 + }, + { + "epoch": 1.0, + "grad_norm": 66.41241684127401, + "learning_rate": 2.180011379787411e-05, + "loss": 1.1335, + "step": 386 + }, + { + "epoch": 1.0025906735751295, + "grad_norm": 28.75549619538953, + "learning_rate": 2.1714610003113887e-05, + "loss": 1.1316, + "step": 387 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 26.911837500852275, + "learning_rate": 2.1629074626161647e-05, + "loss": 1.1026, + "step": 388 + }, + { + "epoch": 1.005181347150259, + "eval_loss": 1.0908173322677612, + "eval_runtime": 37.7642, + "eval_samples_per_second": 19.701, + "eval_steps_per_second": 1.245, + "step": 388 + }, + { + "epoch": 1.0077720207253886, + "grad_norm": 34.28722746775385, + "learning_rate": 2.1543509242532932e-05, + "loss": 1.1104, + "step": 389 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 37.97709310694863, + "learning_rate": 2.145791542829597e-05, + "loss": 1.0663, + "step": 390 + }, + { + "epoch": 1.0129533678756477, + "grad_norm": 39.379668162327384, + "learning_rate": 2.1372294760042686e-05, + "loss": 1.1405, + "step": 391 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 27.136201219298698, + "learning_rate": 2.1286648814859636e-05, + "loss": 1.0963, + "step": 392 + }, + { + "epoch": 1.0181347150259068, + "grad_norm": 39.34261641469313, + "learning_rate": 2.120097917029897e-05, + "loss": 1.1276, + "step": 393 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 46.77583801285328, + "learning_rate": 2.1115287404349357e-05, + "loss": 1.1171, + "step": 394 + }, + { + "epoch": 1.0233160621761659, + "grad_norm": 55.10335066695868, + "learning_rate": 2.1029575095406933e-05, + "loss": 1.0831, + 
"step": 395 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 76.88533851789373, + "learning_rate": 2.0943843822246234e-05, + "loss": 1.0925, + "step": 396 + }, + { + "epoch": 1.028497409326425, + "grad_norm": 29.604569209708462, + "learning_rate": 2.0858095163991094e-05, + "loss": 1.1259, + "step": 397 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 37.71348366628868, + "learning_rate": 2.077233070008557e-05, + "loss": 1.0792, + "step": 398 + }, + { + "epoch": 1.0336787564766838, + "grad_norm": 26.866133194031644, + "learning_rate": 2.0686552010264872e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 35.739274800620635, + "learning_rate": 2.060076067452622e-05, + "loss": 1.0837, + "step": 400 + }, + { + "epoch": 1.038860103626943, + "grad_norm": 24.479129391259896, + "learning_rate": 2.0514958273099778e-05, + "loss": 1.073, + "step": 401 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 50.49963650108008, + "learning_rate": 2.042914638641952e-05, + "loss": 1.0912, + "step": 402 + }, + { + "epoch": 1.044041450777202, + "grad_norm": 35.6875451072032, + "learning_rate": 2.0343326595094154e-05, + "loss": 1.0936, + "step": 403 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 30.212298193414487, + "learning_rate": 2.0257500479877965e-05, + "loss": 1.089, + "step": 404 + }, + { + "epoch": 1.049222797927461, + "grad_norm": 28.65828720015124, + "learning_rate": 2.0171669621641743e-05, + "loss": 1.1727, + "step": 405 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 39.2199058392425, + "learning_rate": 2.0085835601343627e-05, + "loss": 1.1493, + "step": 406 + }, + { + "epoch": 1.0544041450777202, + "grad_norm": 110.01204177059546, + "learning_rate": 2e-05, + "loss": 1.1245, + "step": 407 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 43.427381349600374, + "learning_rate": 1.9914164398656383e-05, + "loss": 1.1183, + "step": 408 + }, + { + "epoch": 1.0595854922279793, + "grad_norm": 
64.78768909817894, + "learning_rate": 1.9828330378358264e-05, + "loss": 1.1528, + "step": 409 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 26.50257915912425, + "learning_rate": 1.974249952012204e-05, + "loss": 1.1568, + "step": 410 + }, + { + "epoch": 1.0647668393782384, + "grad_norm": 27.63159204178893, + "learning_rate": 1.9656673404905852e-05, + "loss": 1.1071, + "step": 411 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 27.0795355533723, + "learning_rate": 1.957085361358049e-05, + "loss": 1.0809, + "step": 412 + }, + { + "epoch": 1.0699481865284974, + "grad_norm": 41.84795332660821, + "learning_rate": 1.9485041726900232e-05, + "loss": 1.0744, + "step": 413 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 143.2109134427192, + "learning_rate": 1.939923932547379e-05, + "loss": 1.0905, + "step": 414 + }, + { + "epoch": 1.0751295336787565, + "grad_norm": 89.55384065946154, + "learning_rate": 1.931344798973513e-05, + "loss": 1.1012, + "step": 415 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 31.072074793068015, + "learning_rate": 1.922766929991443e-05, + "loss": 1.1141, + "step": 416 + }, + { + "epoch": 1.0803108808290156, + "grad_norm": 29.82683189045969, + "learning_rate": 1.914190483600891e-05, + "loss": 1.0842, + "step": 417 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 30.09708662586305, + "learning_rate": 1.9056156177753776e-05, + "loss": 1.1088, + "step": 418 + }, + { + "epoch": 1.0854922279792747, + "grad_norm": 27.637437518920503, + "learning_rate": 1.897042490459307e-05, + "loss": 1.058, + "step": 419 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 69.34285700381683, + "learning_rate": 1.8884712595650653e-05, + "loss": 1.0314, + "step": 420 + }, + { + "epoch": 1.0906735751295338, + "grad_norm": 25.644927284592956, + "learning_rate": 1.8799020829701036e-05, + "loss": 1.0916, + "step": 421 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 30.3898986852319, + "learning_rate": 1.871335118514037e-05, + "loss": 
1.0797, + "step": 422 + }, + { + "epoch": 1.0958549222797926, + "grad_norm": 22.271334693423444, + "learning_rate": 1.862770523995732e-05, + "loss": 1.1134, + "step": 423 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 35.85874616678876, + "learning_rate": 1.854208457170404e-05, + "loss": 1.0927, + "step": 424 + }, + { + "epoch": 1.1010362694300517, + "grad_norm": 43.06832041948097, + "learning_rate": 1.8456490757467075e-05, + "loss": 1.093, + "step": 425 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 37.83777637993467, + "learning_rate": 1.8370925373838356e-05, + "loss": 1.1268, + "step": 426 + }, + { + "epoch": 1.1062176165803108, + "grad_norm": 23.798059023605177, + "learning_rate": 1.8285389996886113e-05, + "loss": 1.0989, + "step": 427 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 25.443104465500795, + "learning_rate": 1.8199886202125897e-05, + "loss": 1.0581, + "step": 428 + }, + { + "epoch": 1.11139896373057, + "grad_norm": 23.76241444847441, + "learning_rate": 1.8114415564491513e-05, + "loss": 1.0908, + "step": 429 + }, + { + "epoch": 1.1139896373056994, + "grad_norm": 26.5600693044426, + "learning_rate": 1.8028979658306033e-05, + "loss": 1.1321, + "step": 430 + }, + { + "epoch": 1.116580310880829, + "grad_norm": 44.854375199828986, + "learning_rate": 1.794358005725279e-05, + "loss": 1.0762, + "step": 431 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 28.05797777410846, + "learning_rate": 1.785821833434642e-05, + "loss": 1.0698, + "step": 432 + }, + { + "epoch": 1.121761658031088, + "grad_norm": 26.488479630212364, + "learning_rate": 1.7772896061903824e-05, + "loss": 1.1223, + "step": 433 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 32.77084542157883, + "learning_rate": 1.768761481151529e-05, + "loss": 1.0984, + "step": 434 + }, + { + "epoch": 1.1269430051813472, + "grad_norm": 39.13198413130026, + "learning_rate": 1.7602376154015456e-05, + "loss": 1.1551, + "step": 435 + }, + { + "epoch": 1.1295336787564767, + 
"grad_norm": 23.878966995283953, + "learning_rate": 1.751718165945447e-05, + "loss": 1.1133, + "step": 436 + }, + { + "epoch": 1.1321243523316062, + "grad_norm": 33.90472985566232, + "learning_rate": 1.743203289706898e-05, + "loss": 1.1219, + "step": 437 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 23.340369938533712, + "learning_rate": 1.734693143525331e-05, + "loss": 1.1244, + "step": 438 + }, + { + "epoch": 1.1373056994818653, + "grad_norm": 105.6885206147852, + "learning_rate": 1.7261878841530494e-05, + "loss": 1.0788, + "step": 439 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 28.453526076458317, + "learning_rate": 1.717687668252348e-05, + "loss": 1.1576, + "step": 440 + }, + { + "epoch": 1.1424870466321244, + "grad_norm": 36.1473991485961, + "learning_rate": 1.7091926523926205e-05, + "loss": 1.0859, + "step": 441 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 27.043461146902448, + "learning_rate": 1.7007029930474804e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 1.1476683937823835, + "grad_norm": 28.066170619981435, + "learning_rate": 1.6922188465918763e-05, + "loss": 1.1279, + "step": 443 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 38.62445822837212, + "learning_rate": 1.6837403692992136e-05, + "loss": 1.1275, + "step": 444 + }, + { + "epoch": 1.1528497409326426, + "grad_norm": 28.077258963587767, + "learning_rate": 1.6752677173384734e-05, + "loss": 1.1004, + "step": 445 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 42.1405744301338, + "learning_rate": 1.6668010467713363e-05, + "loss": 1.1141, + "step": 446 + }, + { + "epoch": 1.1580310880829017, + "grad_norm": 26.827291684301034, + "learning_rate": 1.658340513549312e-05, + "loss": 1.1216, + "step": 447 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 30.863489441619983, + "learning_rate": 1.649886273510861e-05, + "loss": 1.1898, + "step": 448 + }, + { + "epoch": 1.1632124352331605, + "grad_norm": 27.73579733476068, + "learning_rate": 
1.641438482378529e-05, + "loss": 1.0971, + "step": 449 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 32.84347174567353, + "learning_rate": 1.6329972957560736e-05, + "loss": 1.0579, + "step": 450 + }, + { + "epoch": 1.1683937823834196, + "grad_norm": 30.06456192962641, + "learning_rate": 1.6245628691256032e-05, + "loss": 1.1057, + "step": 451 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 36.554506394377846, + "learning_rate": 1.616135357844709e-05, + "loss": 1.1008, + "step": 452 + }, + { + "epoch": 1.1735751295336787, + "grad_norm": 27.358643056184114, + "learning_rate": 1.6077149171436063e-05, + "loss": 1.101, + "step": 453 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 111.13373813893604, + "learning_rate": 1.599301702122274e-05, + "loss": 1.0688, + "step": 454 + }, + { + "epoch": 1.1787564766839378, + "grad_norm": 33.94168250727336, + "learning_rate": 1.590895867747599e-05, + "loss": 1.0721, + "step": 455 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 53.93978395349692, + "learning_rate": 1.582497568850517e-05, + "loss": 1.0584, + "step": 456 + }, + { + "epoch": 1.1839378238341969, + "grad_norm": 29.19245794937285, + "learning_rate": 1.574106960123169e-05, + "loss": 1.067, + "step": 457 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 28.06897801999048, + "learning_rate": 1.5657241961160434e-05, + "loss": 1.0899, + "step": 458 + }, + { + "epoch": 1.189119170984456, + "grad_norm": 52.31256652964293, + "learning_rate": 1.557349431235135e-05, + "loss": 1.0925, + "step": 459 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 65.39771110845307, + "learning_rate": 1.5489828197390988e-05, + "loss": 1.1448, + "step": 460 + }, + { + "epoch": 1.194300518134715, + "grad_norm": 27.062780348557254, + "learning_rate": 1.5406245157364093e-05, + "loss": 1.0871, + "step": 461 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 41.667025056250424, + "learning_rate": 1.5322746731825195e-05, + "loss": 1.048, + "step": 462 + }, + { + 
"epoch": 1.1994818652849741, + "grad_norm": 24.936669803360665, + "learning_rate": 1.5239334458770291e-05, + "loss": 1.1243, + "step": 463 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 26.65392149600558, + "learning_rate": 1.5156009874608484e-05, + "loss": 1.0919, + "step": 464 + }, + { + "epoch": 1.2046632124352332, + "grad_norm": 48.57730651937978, + "learning_rate": 1.5072774514133708e-05, + "loss": 1.1259, + "step": 465 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 31.34891257114439, + "learning_rate": 1.4989629910496424e-05, + "loss": 1.0733, + "step": 466 + }, + { + "epoch": 1.2098445595854923, + "grad_norm": 24.541559850584985, + "learning_rate": 1.4906577595175428e-05, + "loss": 1.1166, + "step": 467 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 20.4345832961354, + "learning_rate": 1.4823619097949584e-05, + "loss": 1.0916, + "step": 468 + }, + { + "epoch": 1.2150259067357512, + "grad_norm": 28.860712194727487, + "learning_rate": 1.4740755946869708e-05, + "loss": 1.1043, + "step": 469 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 25.71820242946282, + "learning_rate": 1.4657989668230363e-05, + "loss": 1.0949, + "step": 470 + }, + { + "epoch": 1.2202072538860103, + "grad_norm": 51.16994773097077, + "learning_rate": 1.4575321786541801e-05, + "loss": 1.141, + "step": 471 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 32.70442309640389, + "learning_rate": 1.4492753824501833e-05, + "loss": 1.1127, + "step": 472 + }, + { + "epoch": 1.2253886010362693, + "grad_norm": 21.913285172411495, + "learning_rate": 1.4410287302967813e-05, + "loss": 1.084, + "step": 473 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 34.45727214001296, + "learning_rate": 1.4327923740928613e-05, + "loss": 1.0836, + "step": 474 + }, + { + "epoch": 1.2305699481865284, + "grad_norm": 26.768013926034776, + "learning_rate": 1.4245664655476663e-05, + "loss": 1.1264, + "step": 475 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 
28.401965255935572, + "learning_rate": 1.4163511561779956e-05, + "loss": 1.0805, + "step": 476 + }, + { + "epoch": 1.2357512953367875, + "grad_norm": 29.19935757288793, + "learning_rate": 1.4081465973054216e-05, + "loss": 1.0825, + "step": 477 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 24.55918541541201, + "learning_rate": 1.3999529400534941e-05, + "loss": 1.1164, + "step": 478 + }, + { + "epoch": 1.2409326424870466, + "grad_norm": 25.35635406268312, + "learning_rate": 1.3917703353449646e-05, + "loss": 1.1334, + "step": 479 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 45.453901005004184, + "learning_rate": 1.3835989338989996e-05, + "loss": 1.1387, + "step": 480 + }, + { + "epoch": 1.2461139896373057, + "grad_norm": 21.67852694202104, + "learning_rate": 1.375438886228411e-05, + "loss": 1.0846, + "step": 481 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 171.2474074894732, + "learning_rate": 1.3672903426368773e-05, + "loss": 1.1388, + "step": 482 + }, + { + "epoch": 1.2512953367875648, + "grad_norm": 43.18223835070906, + "learning_rate": 1.3591534532161781e-05, + "loss": 1.1483, + "step": 483 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 29.447332565856644, + "learning_rate": 1.3510283678434317e-05, + "loss": 1.07, + "step": 484 + }, + { + "epoch": 1.2564766839378239, + "grad_norm": 28.600251051615228, + "learning_rate": 1.3429152361783307e-05, + "loss": 1.0798, + "step": 485 + }, + { + "epoch": 1.2564766839378239, + "eval_loss": 1.085669755935669, + "eval_runtime": 38.1134, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.233, + "step": 485 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 47.124643074410464, + "learning_rate": 1.3348142076603876e-05, + "loss": 1.0875, + "step": 486 + }, + { + "epoch": 1.261658031088083, + "grad_norm": 42.06019726307143, + "learning_rate": 1.3267254315061797e-05, + "loss": 1.1429, + "step": 487 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 18.950734630756962, + 
"learning_rate": 1.318649056706605e-05, + "loss": 1.0747, + "step": 488 + }, + { + "epoch": 1.266839378238342, + "grad_norm": 31.903949502516806, + "learning_rate": 1.3105852320241326e-05, + "loss": 1.1041, + "step": 489 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 22.957473008085927, + "learning_rate": 1.3025341059900675e-05, + "loss": 1.1046, + "step": 490 + }, + { + "epoch": 1.2720207253886011, + "grad_norm": 22.325983256563678, + "learning_rate": 1.2944958269018103e-05, + "loss": 1.0643, + "step": 491 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 29.689383331974955, + "learning_rate": 1.2864705428201307e-05, + "loss": 1.0949, + "step": 492 + }, + { + "epoch": 1.2772020725388602, + "grad_norm": 25.338298442945575, + "learning_rate": 1.2784584015664337e-05, + "loss": 1.0725, + "step": 493 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 31.591732488078588, + "learning_rate": 1.2704595507200435e-05, + "loss": 1.0347, + "step": 494 + }, + { + "epoch": 1.2823834196891193, + "grad_norm": 42.96243570696118, + "learning_rate": 1.26247413761548e-05, + "loss": 1.1196, + "step": 495 + }, + { + "epoch": 1.2849740932642488, + "grad_norm": 26.559546676266024, + "learning_rate": 1.254502309339749e-05, + "loss": 1.0187, + "step": 496 + }, + { + "epoch": 1.2875647668393784, + "grad_norm": 27.58444017584016, + "learning_rate": 1.2465442127296297e-05, + "loss": 1.0985, + "step": 497 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 36.53028730423797, + "learning_rate": 1.2385999943689732e-05, + "loss": 1.068, + "step": 498 + }, + { + "epoch": 1.2927461139896372, + "grad_norm": 38.94837307599113, + "learning_rate": 1.2306698005859975e-05, + "loss": 1.0736, + "step": 499 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 36.67208266195125, + "learning_rate": 1.2227537774505996e-05, + "loss": 1.119, + "step": 500 + }, + { + "epoch": 1.2979274611398963, + "grad_norm": 31.086410648635283, + "learning_rate": 1.2148520707716567e-05, + "loss": 1.1094, + 
"step": 501 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 27.96977481605826, + "learning_rate": 1.2069648260943473e-05, + "loss": 1.1345, + "step": 502 + }, + { + "epoch": 1.3031088082901554, + "grad_norm": 22.89450502840197, + "learning_rate": 1.1990921886974669e-05, + "loss": 1.12, + "step": 503 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 18.54206032224653, + "learning_rate": 1.1912343035907535e-05, + "loss": 1.0929, + "step": 504 + }, + { + "epoch": 1.3082901554404145, + "grad_norm": 38.9386007237313, + "learning_rate": 1.1833913155122132e-05, + "loss": 1.1381, + "step": 505 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 37.05899458809635, + "learning_rate": 1.1755633689254609e-05, + "loss": 1.0535, + "step": 506 + }, + { + "epoch": 1.3134715025906736, + "grad_norm": 27.716372794195156, + "learning_rate": 1.1677506080170512e-05, + "loss": 1.1342, + "step": 507 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 40.42306246079416, + "learning_rate": 1.1599531766938306e-05, + "loss": 1.0887, + "step": 508 + }, + { + "epoch": 1.3186528497409327, + "grad_norm": 98.56681767405578, + "learning_rate": 1.1521712185802789e-05, + "loss": 1.0954, + "step": 509 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 34.42816933350743, + "learning_rate": 1.1444048770158718e-05, + "loss": 1.0512, + "step": 510 + }, + { + "epoch": 1.3238341968911918, + "grad_norm": 52.457523653614096, + "learning_rate": 1.136654295052433e-05, + "loss": 1.1599, + "step": 511 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 26.832339531661276, + "learning_rate": 1.1289196154515048e-05, + "loss": 1.0602, + "step": 512 + }, + { + "epoch": 1.3290155440414508, + "grad_norm": 32.746047673769816, + "learning_rate": 1.1212009806817163e-05, + "loss": 1.1544, + "step": 513 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 37.44483451702055, + "learning_rate": 1.1134985329161608e-05, + "loss": 1.1421, + "step": 514 + }, + { + "epoch": 1.33419689119171, + "grad_norm": 
28.625976525737606, + "learning_rate": 1.1058124140297718e-05, + "loss": 1.0858, + "step": 515 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 38.64141195246213, + "learning_rate": 1.0981427655967183e-05, + "loss": 1.0983, + "step": 516 + }, + { + "epoch": 1.3393782383419688, + "grad_norm": 29.989753893533425, + "learning_rate": 1.0904897288877891e-05, + "loss": 1.1269, + "step": 517 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 48.63990665515511, + "learning_rate": 1.0828534448677942e-05, + "loss": 1.0844, + "step": 518 + }, + { + "epoch": 1.3445595854922279, + "grad_norm": 25.477227318250847, + "learning_rate": 1.0752340541929711e-05, + "loss": 1.0742, + "step": 519 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 26.363588814537763, + "learning_rate": 1.0676316972083867e-05, + "loss": 1.0533, + "step": 520 + }, + { + "epoch": 1.349740932642487, + "grad_norm": 34.59968737708606, + "learning_rate": 1.060046513945361e-05, + "loss": 1.0983, + "step": 521 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 52.51652561846762, + "learning_rate": 1.0524786441188786e-05, + "loss": 1.1319, + "step": 522 + }, + { + "epoch": 1.354922279792746, + "grad_norm": 21.360221214301127, + "learning_rate": 1.0449282271250239e-05, + "loss": 1.0627, + "step": 523 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 37.00053933682603, + "learning_rate": 1.0373954020384073e-05, + "loss": 1.096, + "step": 524 + }, + { + "epoch": 1.3601036269430051, + "grad_norm": 39.212240822687484, + "learning_rate": 1.029880307609608e-05, + "loss": 1.0512, + "step": 525 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 24.89842378385804, + "learning_rate": 1.0223830822626124e-05, + "loss": 1.0538, + "step": 526 + }, + { + "epoch": 1.3652849740932642, + "grad_norm": 29.14416894424653, + "learning_rate": 1.0149038640922715e-05, + "loss": 1.1538, + "step": 527 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 31.688722122648855, + "learning_rate": 
1.0074427908617515e-05, + "loss": 1.171, + "step": 528 + }, + { + "epoch": 1.3704663212435233, + "grad_norm": 41.918909004413734, + "learning_rate": 1.0000000000000006e-05, + "loss": 1.1203, + "step": 529 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 26.70963454516576, + "learning_rate": 9.92575628599213e-06, + "loss": 1.0855, + "step": 530 + }, + { + "epoch": 1.3756476683937824, + "grad_norm": 24.819351173466824, + "learning_rate": 9.851698134123095e-06, + "loss": 1.0972, + "step": 531 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 22.100465399566815, + "learning_rate": 9.777826908504126e-06, + "loss": 1.08, + "step": 532 + }, + { + "epoch": 1.3808290155440415, + "grad_norm": 29.31574709406259, + "learning_rate": 9.704143969803392e-06, + "loss": 1.0835, + "step": 533 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 25.551326748473052, + "learning_rate": 9.630650675220892e-06, + "loss": 1.0396, + "step": 534 + }, + { + "epoch": 1.3860103626943006, + "grad_norm": 59.07595627892596, + "learning_rate": 9.557348378463503e-06, + "loss": 1.0814, + "step": 535 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 24.96501978981908, + "learning_rate": 9.484238429720018e-06, + "loss": 1.0187, + "step": 536 + }, + { + "epoch": 1.3911917098445596, + "grad_norm": 42.530604702279234, + "learning_rate": 9.411322175636298e-06, + "loss": 1.074, + "step": 537 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 34.91129065632851, + "learning_rate": 9.338600959290414e-06, + "loss": 1.0878, + "step": 538 + }, + { + "epoch": 1.3963730569948187, + "grad_norm": 32.07525956876426, + "learning_rate": 9.266076120167992e-06, + "loss": 1.0962, + "step": 539 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 40.18387743296675, + "learning_rate": 9.193748994137462e-06, + "loss": 1.1033, + "step": 540 + }, + { + "epoch": 1.4015544041450778, + "grad_norm": 66.68031460980451, + "learning_rate": 9.121620913425508e-06, + "loss": 1.1466, + "step": 541 + }, + { + "epoch": 
1.4041450777202074, + "grad_norm": 34.07506059584738, + "learning_rate": 9.04969320659249e-06, + "loss": 1.1184, + "step": 542 + }, + { + "epoch": 1.406735751295337, + "grad_norm": 17.130845779169075, + "learning_rate": 8.977967198508001e-06, + "loss": 1.0803, + "step": 543 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 22.4457025132615, + "learning_rate": 8.906444210326441e-06, + "loss": 1.0745, + "step": 544 + }, + { + "epoch": 1.411917098445596, + "grad_norm": 73.43971735356851, + "learning_rate": 8.83512555946271e-06, + "loss": 1.0717, + "step": 545 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 38.16321297719761, + "learning_rate": 8.764012559567899e-06, + "loss": 1.1371, + "step": 546 + }, + { + "epoch": 1.4170984455958548, + "grad_norm": 56.14718024907725, + "learning_rate": 8.693106520505147e-06, + "loss": 1.0185, + "step": 547 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 53.3812598790062, + "learning_rate": 8.622408748325461e-06, + "loss": 1.0859, + "step": 548 + }, + { + "epoch": 1.422279792746114, + "grad_norm": 39.69041631433326, + "learning_rate": 8.551920545243704e-06, + "loss": 1.1146, + "step": 549 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 24.099260758984773, + "learning_rate": 8.481643209614576e-06, + "loss": 1.0968, + "step": 550 + }, + { + "epoch": 1.427461139896373, + "grad_norm": 22.623850373369237, + "learning_rate": 8.411578035908728e-06, + "loss": 1.0642, + "step": 551 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 25.343746374404027, + "learning_rate": 8.341726314688875e-06, + "loss": 1.0815, + "step": 552 + }, + { + "epoch": 1.432642487046632, + "grad_norm": 35.82641011588973, + "learning_rate": 8.272089332586089e-06, + "loss": 1.1012, + "step": 553 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 24.81161215784662, + "learning_rate": 8.20266837227603e-06, + "loss": 1.1086, + "step": 554 + }, + { + "epoch": 1.4378238341968912, + "grad_norm": 54.18243481591251, + "learning_rate": 
8.133464712455364e-06, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 23.602598217141395, + "learning_rate": 8.064479627818213e-06, + "loss": 1.1519, + "step": 556 + }, + { + "epoch": 1.4430051813471503, + "grad_norm": 31.124404868409982, + "learning_rate": 7.995714389032638e-06, + "loss": 1.0705, + "step": 557 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 24.14171016995626, + "learning_rate": 7.927170262717284e-06, + "loss": 1.1083, + "step": 558 + }, + { + "epoch": 1.4481865284974094, + "grad_norm": 47.987203109917175, + "learning_rate": 7.858848511417998e-06, + "loss": 1.0836, + "step": 559 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 25.871447098066056, + "learning_rate": 7.790750393584616e-06, + "loss": 1.0787, + "step": 560 + }, + { + "epoch": 1.4533678756476685, + "grad_norm": 23.820249113937482, + "learning_rate": 7.72287716354776e-06, + "loss": 1.1165, + "step": 561 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 48.04131308947624, + "learning_rate": 7.65523007149575e-06, + "loss": 1.0819, + "step": 562 + }, + { + "epoch": 1.4585492227979275, + "grad_norm": 29.273494083692352, + "learning_rate": 7.587810363451544e-06, + "loss": 1.0302, + "step": 563 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 120.01571222366722, + "learning_rate": 7.5206192812498345e-06, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 1.4637305699481864, + "grad_norm": 33.16947662083338, + "learning_rate": 7.4536580625141244e-06, + "loss": 1.0842, + "step": 565 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 29.979556378166713, + "learning_rate": 7.386927940633981e-06, + "loss": 1.1116, + "step": 566 + }, + { + "epoch": 1.4689119170984455, + "grad_norm": 27.172344859281896, + "learning_rate": 7.32043014474227e-06, + "loss": 1.0676, + "step": 567 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 30.208548637757318, + "learning_rate": 7.254165899692554e-06, + "loss": 1.1104, + "step": 568 + }, + { + 
"epoch": 1.4740932642487046, + "grad_norm": 19.385421184583773, + "learning_rate": 7.188136426036498e-06, + "loss": 1.0085, + "step": 569 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 30.350787749309685, + "learning_rate": 7.12234294000143e-06, + "loss": 1.0584, + "step": 570 + }, + { + "epoch": 1.4792746113989637, + "grad_norm": 31.520305600900198, + "learning_rate": 7.056786653467882e-06, + "loss": 1.0831, + "step": 571 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 46.13006972574487, + "learning_rate": 6.991468773947321e-06, + "loss": 1.1761, + "step": 572 + }, + { + "epoch": 1.4844559585492227, + "grad_norm": 26.72340868362835, + "learning_rate": 6.926390504559879e-06, + "loss": 1.0605, + "step": 573 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 25.992965411102556, + "learning_rate": 6.861553044012206e-06, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 1.4896373056994818, + "grad_norm": 38.60187420279626, + "learning_rate": 6.796957586575364e-06, + "loss": 1.1232, + "step": 575 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 21.7618591565717, + "learning_rate": 6.732605322062869e-06, + "loss": 1.1196, + "step": 576 + }, + { + "epoch": 1.494818652849741, + "grad_norm": 28.233093007170996, + "learning_rate": 6.668497435808736e-06, + "loss": 1.1451, + "step": 577 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 28.061514297823816, + "learning_rate": 6.604635108645683e-06, + "loss": 1.0832, + "step": 578 + }, + { + "epoch": 1.5, + "grad_norm": 35.34503147975386, + "learning_rate": 6.5410195168833425e-06, + "loss": 1.118, + "step": 579 + } + ], + "logging_steps": 1, + "max_steps": 772, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 193, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 
1.0022991232499712e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-579/training_args.bin b/checkpoint-579/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4d661b15e5bbd8390fd11a502bea76680041301 --- /dev/null +++ b/checkpoint-579/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe76c44cf1ade69372a2b861f80cfcfc5ba88f283683f660a4a0605f642aee3 +size 8568 diff --git a/checkpoint-579/zero_to_fp32.py b/checkpoint-579/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-579/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-618/README.md b/checkpoint-618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-618/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-618/adapter_config.json b/checkpoint-618/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/checkpoint-618/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-618/adapter_model.safetensors b/checkpoint-618/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33fd135a729971587398eb81df85b84a291ab4dc --- /dev/null +++ b/checkpoint-618/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac6338f4da6e13c15823f24e19a271d8fc65df47f503bcfc14c8766e14ef0bc +size 5579575888 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a191af173b723477e42b0a79af6ddab2af6fb19c --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2313d9207c50abb87475adf598a5d6fd3d3d02d22a7b8852855f6dd1abd67977 +size 2458601314 
diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26be5b394465290ffffb20cfd48342f4a3b9c19 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbeb45abbdd046dc63cd170c5bde2c73d64227159812131ab33ec194a12b170a +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..843d35b523ca76fd1d85413c4ce50465be4372b1 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84e269a78f25aad5152fd4bdef3ec4e6635ed14d6d08702d3f5e125a14ac28c +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87b8ad101a0fcd3fa5b32fd26360252d3edc77f4 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215071f0888a851073f0f40280a8aca3d352c7cc71c16f9c407988c78fcfa8f7 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d4a4d1e0e0887a6a7d7f801c73150fec2793ec --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a73d035178aead156f96b9f208248db39158203b83ef9aed34d8ec3c6b174236 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c614ae3828df91d62a62c1c93450bb83342b11c1 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee07b54f69db0b9e3953e949e178cc87592e3eaa1a845556be656befaaed324 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37a860fddeaffb071a6f42a5bf2de6cefd1a328c --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7012e2b66a6f72edf266b4e20132b482c19058bb357065fb8aa7f0314c069b +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af90a6ee085d81fe0119d7767a2e116eca3205f --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a9d463c8c32fe26cb6ceb06f636a283e30cc9d24c48080d6ef467ea2ba506d +size 2458601314 diff --git a/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a94c0321b519c597a0c94bc9cb4e9370ca2b765e --- 
/dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258423171c32e9d6cc74b23612ceb2e89e38df0d91d795457185e713a7644523 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae518370a60f71674df5904752ba108a45154fe4 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9856c3fab0d45b7a84d519c6a5781e65fe4d56e33855490b163619887fb91be +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ad81e3065870e7d7ac7488f5289b5165736ba9c --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2580444416d0f9c1ba57df79e9e685673596d1ad95a07002eb4da35198eb4c0d +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57d3de63b3c7899b8abb561a3992fe65ecb19ca0 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69b821888fc1affcbcc9b822d45a27588b805eb45eceefe64b5afea53beac0b +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e44cf635aa0308f463f0e4f646e20bb29030acdf --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ba8e1d5086335ef4cd1bb701d69c884685bc094916d51534e55a2a9aa06693 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..850a94913244cb47e8ac73cb6a237a2578c77831 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe34e372dedacdb33571ab2b4347da88102b5a7930be9126607a44d798627a52 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01209e2f8c1a36487754beef6fb0ba7ea473a84 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c432819caff1085cbf70acadac49f8169a1ab6bbb00b8b4c71c97577c34f02 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c6cb3c270e8abba3cc7a3cd5b42413f4aaa99b5 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ca8f6fbaf4df3d5f83c349c9b21bc45b5976a02529ffea751083000bde298e +size 752148 diff --git a/checkpoint-618/latest b/checkpoint-618/latest new file mode 100644 index 
0000000000000000000000000000000000000000..e3591976dc02bcf876afbbae81c6b15992f0b7cc --- /dev/null +++ b/checkpoint-618/latest @@ -0,0 +1 @@ +global_step617 \ No newline at end of file diff --git a/checkpoint-618/rng_state_0.pth b/checkpoint-618/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..21af2edb36d5dc2f0f272356f08666b8ba46404d --- /dev/null +++ b/checkpoint-618/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139ab52a8b7888bb2b6bc57022d15e95bde7158a58f3b96c075a46432bd804f7 +size 15984 diff --git a/checkpoint-618/rng_state_1.pth b/checkpoint-618/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa67ef3f64f0b3e19384d6312b6150bf6c01d9c7 --- /dev/null +++ b/checkpoint-618/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ac300c23c0f0222dfbcfb4e7bf191c1b9c07f7e759e1445653318a00154087 +size 15984 diff --git a/checkpoint-618/rng_state_2.pth b/checkpoint-618/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e61d76ca61b2704bb56bc0c9216ddac760d3cfd --- /dev/null +++ b/checkpoint-618/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b8aef61e966c433db5c300020f4cc5f72210c72167d63a56502cf32efde2e +size 15984 diff --git a/checkpoint-618/rng_state_3.pth b/checkpoint-618/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8822c9d82969c59d7e0e39285330ac97a4afbd62 --- /dev/null +++ b/checkpoint-618/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9b368f180513edf8a6b276c52fad1cee0a5669655c79fd700779055266cbdc +size 15984 diff --git a/checkpoint-618/rng_state_4.pth b/checkpoint-618/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9999feb8ddfc2a1800d467e6ba26eee2216bbf50 --- /dev/null +++ b/checkpoint-618/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7a7949cc8c56130fc5744bf28cb582eadff20beb2a6046521867b30d17e2db36 +size 15984 diff --git a/checkpoint-618/rng_state_5.pth b/checkpoint-618/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..31e754380a3988fb4e0a4cc3f3dc27ebea4e2534 --- /dev/null +++ b/checkpoint-618/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e91cc570010bc761d4607a41efb25dcbbe25b17376ef2b2f9f56979e35df8a6 +size 15984 diff --git a/checkpoint-618/rng_state_6.pth b/checkpoint-618/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a2c4060c498d2f9af4a1b7515a9626929ff400f --- /dev/null +++ b/checkpoint-618/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62293b3776c3666306e46e8e4089019cc3a093559b478e0523e85fcf1f00c09 +size 15984 diff --git a/checkpoint-618/rng_state_7.pth b/checkpoint-618/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7572f4efdf870301e9beaedf8f8837610435241 --- /dev/null +++ b/checkpoint-618/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d5c6cfa7ad06775825ebbc35e15d8b73c7ef8a7546a855de02ef444f52ab9d +size 15984 diff --git a/checkpoint-618/scheduler.pt b/checkpoint-618/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d3f0cae23c97a8fa62dba8839bd6d30fab58e6 --- /dev/null +++ b/checkpoint-618/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9c98ae9f0c0fe9c37ce7e2650666ea461d75f496b952bbbb48a98af405c315 +size 1064 diff --git a/checkpoint-618/special_tokens_map.json b/checkpoint-618/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-618/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-618/tokenizer.json b/checkpoint-618/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-618/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-618/tokenizer_config.json b/checkpoint-618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-618/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-618/trainer_state.json b/checkpoint-618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3e3e7b728d3492f804c2962a06bcb51c0c8c38 --- /dev/null +++ b/checkpoint-618/trainer_state.json @@ -0,0 +1,4416 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5006075334143378, + "eval_steps": 103, + "global_step": 618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002430133657351154, + "grad_norm": 715.4923219036787, + "learning_rate": 0.0, + "loss": 1.3541, + "step": 1 + }, + { + "epoch": 0.002430133657351154, + "eval_loss": 1.3335719108581543, + "eval_runtime": 53.4883, + "eval_samples_per_second": 13.91, + "eval_steps_per_second": 1.739, + "step": 1 + }, + { + "epoch": 0.004860267314702308, + "grad_norm": 614.6970578314867, + "learning_rate": 5e-06, + "loss": 1.3775, + "step": 2 + }, + { + "epoch": 0.007290400972053463, + "grad_norm": 471.59017991123795, + "learning_rate": 1e-05, + "loss": 1.339, + 
"step": 3 + }, + { + "epoch": 0.009720534629404616, + "grad_norm": 238.72216262259653, + "learning_rate": 1.5e-05, + "loss": 1.3829, + "step": 4 + }, + { + "epoch": 0.012150668286755772, + "grad_norm": 355.68955726709873, + "learning_rate": 2e-05, + "loss": 1.3597, + "step": 5 + }, + { + "epoch": 0.014580801944106925, + "grad_norm": 414.5627284272111, + "learning_rate": 2.5e-05, + "loss": 1.3862, + "step": 6 + }, + { + "epoch": 0.01701093560145808, + "grad_norm": 534.9877222052693, + "learning_rate": 3e-05, + "loss": 1.2784, + "step": 7 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 153.38895635666677, + "learning_rate": 3.5e-05, + "loss": 1.3521, + "step": 8 + }, + { + "epoch": 0.02187120291616039, + "grad_norm": 858.293734138087, + "learning_rate": 4e-05, + "loss": 1.2461, + "step": 9 + }, + { + "epoch": 0.024301336573511544, + "grad_norm": 255.81989388533376, + "learning_rate": 4.5e-05, + "loss": 1.2778, + "step": 10 + }, + { + "epoch": 0.026731470230862697, + "grad_norm": 368.91949003479226, + "learning_rate": 5e-05, + "loss": 1.3412, + "step": 11 + }, + { + "epoch": 0.02916160388821385, + "grad_norm": 176.49481799555898, + "learning_rate": 5.500000000000001e-05, + "loss": 1.3437, + "step": 12 + }, + { + "epoch": 0.031591737545565005, + "grad_norm": 208.57742104974147, + "learning_rate": 6e-05, + "loss": 1.2859, + "step": 13 + }, + { + "epoch": 0.03402187120291616, + "grad_norm": 93.26742036471734, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1843, + "step": 14 + }, + { + "epoch": 0.03645200486026731, + "grad_norm": 145.53380444622215, + "learning_rate": 7e-05, + "loss": 1.4281, + "step": 15 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 126.56724937430516, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3908, + "step": 16 + }, + { + "epoch": 0.041312272174969626, + "grad_norm": 106.19246390662754, + "learning_rate": 8e-05, + "loss": 1.344, + "step": 17 + }, + { + "epoch": 0.04374240583232078, + "grad_norm": 289.348178084847, 
+ "learning_rate": 8.5e-05, + "loss": 1.2708, + "step": 18 + }, + { + "epoch": 0.046172539489671933, + "grad_norm": 286.63676887065634, + "learning_rate": 9e-05, + "loss": 1.3564, + "step": 19 + }, + { + "epoch": 0.04860267314702309, + "grad_norm": 269.6096299101413, + "learning_rate": 9.5e-05, + "loss": 1.2184, + "step": 20 + }, + { + "epoch": 0.05103280680437424, + "grad_norm": 151.28678796160915, + "learning_rate": 0.0001, + "loss": 1.2974, + "step": 21 + }, + { + "epoch": 0.053462940461725394, + "grad_norm": 265.5625538646362, + "learning_rate": 0.000105, + "loss": 1.2703, + "step": 22 + }, + { + "epoch": 0.05589307411907655, + "grad_norm": 724.7157187586193, + "learning_rate": 0.00011000000000000002, + "loss": 1.2691, + "step": 23 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 425.3768239347252, + "learning_rate": 0.00011499999999999999, + "loss": 1.375, + "step": 24 + }, + { + "epoch": 0.060753341433778855, + "grad_norm": 314.5119318308783, + "learning_rate": 0.00012, + "loss": 1.2952, + "step": 25 + }, + { + "epoch": 0.06318347509113001, + "grad_norm": 557.519173033834, + "learning_rate": 0.000125, + "loss": 1.2923, + "step": 26 + }, + { + "epoch": 0.06561360874848117, + "grad_norm": 211.4069356529637, + "learning_rate": 0.00013000000000000002, + "loss": 1.2629, + "step": 27 + }, + { + "epoch": 0.06804374240583232, + "grad_norm": 299.7742653722713, + "learning_rate": 0.00013500000000000003, + "loss": 1.3099, + "step": 28 + }, + { + "epoch": 0.07047387606318348, + "grad_norm": 182.18551965886013, + "learning_rate": 0.00014, + "loss": 1.2215, + "step": 29 + }, + { + "epoch": 0.07290400972053462, + "grad_norm": 153.38300520125887, + "learning_rate": 0.000145, + "loss": 1.2799, + "step": 30 + }, + { + "epoch": 0.07533414337788578, + "grad_norm": 849.4472853252786, + "learning_rate": 0.00015000000000000001, + "loss": 1.2012, + "step": 31 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 179.94814586965418, + "learning_rate": 0.000155, + "loss": 
1.2103, + "step": 32 + }, + { + "epoch": 0.08019441069258809, + "grad_norm": 180.36681057956048, + "learning_rate": 0.00016, + "loss": 1.2414, + "step": 33 + }, + { + "epoch": 0.08262454434993925, + "grad_norm": 113.72852454032189, + "learning_rate": 0.000165, + "loss": 1.2508, + "step": 34 + }, + { + "epoch": 0.0850546780072904, + "grad_norm": 150.53415363213057, + "learning_rate": 0.00017, + "loss": 1.2528, + "step": 35 + }, + { + "epoch": 0.08748481166464156, + "grad_norm": 156.19567878683574, + "learning_rate": 0.000175, + "loss": 1.2016, + "step": 36 + }, + { + "epoch": 0.0899149453219927, + "grad_norm": 416.34884765145057, + "learning_rate": 0.00018, + "loss": 1.254, + "step": 37 + }, + { + "epoch": 0.09234507897934387, + "grad_norm": 269.7105025581372, + "learning_rate": 0.00018500000000000002, + "loss": 1.2215, + "step": 38 + }, + { + "epoch": 0.09477521263669501, + "grad_norm": 249.35069047655023, + "learning_rate": 0.00019, + "loss": 1.2078, + "step": 39 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 167.16896045613478, + "learning_rate": 0.000195, + "loss": 1.1866, + "step": 40 + }, + { + "epoch": 0.09963547995139732, + "grad_norm": 248.22240554128427, + "learning_rate": 0.0002, + "loss": 1.252, + "step": 41 + }, + { + "epoch": 0.10206561360874848, + "grad_norm": 180.89520841022969, + "learning_rate": 0.0001999991930332148, + "loss": 1.2251, + "step": 42 + }, + { + "epoch": 0.10449574726609964, + "grad_norm": 614.4291375430485, + "learning_rate": 0.00019999677214588312, + "loss": 1.2563, + "step": 43 + }, + { + "epoch": 0.10692588092345079, + "grad_norm": 211.7523427355369, + "learning_rate": 0.00019999273737707646, + "loss": 1.193, + "step": 44 + }, + { + "epoch": 0.10935601458080195, + "grad_norm": 181.56788458769344, + "learning_rate": 0.00019998708879191335, + "loss": 1.2598, + "step": 45 + }, + { + "epoch": 0.1117861482381531, + "grad_norm": 157.5783414916277, + "learning_rate": 0.00019997982648155814, + "loss": 1.2663, + "step": 46 + }, + 
{ + "epoch": 0.11421628189550426, + "grad_norm": 155.78006251192625, + "learning_rate": 0.00019997095056321971, + "loss": 1.1637, + "step": 47 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 202.0253360488958, + "learning_rate": 0.00019996046118014955, + "loss": 1.2508, + "step": 48 + }, + { + "epoch": 0.11907654921020656, + "grad_norm": 192.7576297264874, + "learning_rate": 0.00019994835850163924, + "loss": 1.2014, + "step": 49 + }, + { + "epoch": 0.12150668286755771, + "grad_norm": 132.5484871621418, + "learning_rate": 0.00019993464272301804, + "loss": 1.2279, + "step": 50 + }, + { + "epoch": 0.12393681652490887, + "grad_norm": 128.32285438248965, + "learning_rate": 0.00019991931406564944, + "loss": 1.2179, + "step": 51 + }, + { + "epoch": 0.12636695018226002, + "grad_norm": 552.3669463716512, + "learning_rate": 0.00019990237277692788, + "loss": 1.1498, + "step": 52 + }, + { + "epoch": 0.12879708383961117, + "grad_norm": 86.17911790260192, + "learning_rate": 0.00019988381913027442, + "loss": 1.2784, + "step": 53 + }, + { + "epoch": 0.13122721749696234, + "grad_norm": 70.83294605515782, + "learning_rate": 0.00019986365342513265, + "loss": 1.2224, + "step": 54 + }, + { + "epoch": 0.1336573511543135, + "grad_norm": 45.23624563299466, + "learning_rate": 0.00019984187598696363, + "loss": 1.1746, + "step": 55 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 57.67645735585192, + "learning_rate": 0.00019981848716724073, + "loss": 1.2154, + "step": 56 + }, + { + "epoch": 0.1385176184690158, + "grad_norm": 45.661268047129674, + "learning_rate": 0.00019979348734344398, + "loss": 1.1411, + "step": 57 + }, + { + "epoch": 0.14094775212636695, + "grad_norm": 53.10628399970359, + "learning_rate": 0.00019976687691905393, + "loss": 1.2029, + "step": 58 + }, + { + "epoch": 0.1433778857837181, + "grad_norm": 38.71353325803162, + "learning_rate": 0.00019973865632354516, + "loss": 1.1976, + "step": 59 + }, + { + "epoch": 0.14580801944106925, + "grad_norm": 
42.789208063581114, + "learning_rate": 0.0001997088260123793, + "loss": 1.1477, + "step": 60 + }, + { + "epoch": 0.14823815309842042, + "grad_norm": 37.613194740192164, + "learning_rate": 0.0001996773864669978, + "loss": 1.2529, + "step": 61 + }, + { + "epoch": 0.15066828675577157, + "grad_norm": 47.96813084127655, + "learning_rate": 0.00019964433819481405, + "loss": 1.2328, + "step": 62 + }, + { + "epoch": 0.15309842041312272, + "grad_norm": 55.30483872428545, + "learning_rate": 0.00019960968172920516, + "loss": 1.1996, + "step": 63 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 35.58995799070749, + "learning_rate": 0.00019957341762950344, + "loss": 1.1248, + "step": 64 + }, + { + "epoch": 0.15795868772782504, + "grad_norm": 58.86131222300149, + "learning_rate": 0.00019953554648098748, + "loss": 1.3017, + "step": 65 + }, + { + "epoch": 0.16038882138517618, + "grad_norm": 32.12091331878439, + "learning_rate": 0.00019949606889487233, + "loss": 1.1961, + "step": 66 + }, + { + "epoch": 0.16281895504252733, + "grad_norm": 167.27433996357928, + "learning_rate": 0.0001994549855083001, + "loss": 1.1768, + "step": 67 + }, + { + "epoch": 0.1652490886998785, + "grad_norm": 32.3328494297432, + "learning_rate": 0.0001994122969843293, + "loss": 1.1802, + "step": 68 + }, + { + "epoch": 0.16767922235722965, + "grad_norm": 39.92530074438497, + "learning_rate": 0.0001993680040119244, + "loss": 1.2098, + "step": 69 + }, + { + "epoch": 0.1701093560145808, + "grad_norm": 45.60830517129956, + "learning_rate": 0.0001993221073059445, + "loss": 1.2159, + "step": 70 + }, + { + "epoch": 0.17253948967193194, + "grad_norm": 35.462695032736335, + "learning_rate": 0.00019927460760713197, + "loss": 1.1818, + "step": 71 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 43.05751624597826, + "learning_rate": 0.0001992255056821004, + "loss": 1.2011, + "step": 72 + }, + { + "epoch": 0.17739975698663427, + "grad_norm": 47.13143404969894, + "learning_rate": 0.00019917480232332224, + 
"loss": 1.1669, + "step": 73 + }, + { + "epoch": 0.1798298906439854, + "grad_norm": 72.07146401418987, + "learning_rate": 0.000199122498349116, + "loss": 1.181, + "step": 74 + }, + { + "epoch": 0.1822600243013366, + "grad_norm": 36.289202348834955, + "learning_rate": 0.00019906859460363307, + "loss": 1.1787, + "step": 75 + }, + { + "epoch": 0.18469015795868773, + "grad_norm": 46.92636167228936, + "learning_rate": 0.00019901309195684416, + "loss": 1.2316, + "step": 76 + }, + { + "epoch": 0.18712029161603888, + "grad_norm": 31.71425340357504, + "learning_rate": 0.00019895599130452505, + "loss": 1.1607, + "step": 77 + }, + { + "epoch": 0.18955042527339003, + "grad_norm": 43.94199928621344, + "learning_rate": 0.00019889729356824235, + "loss": 1.1919, + "step": 78 + }, + { + "epoch": 0.1919805589307412, + "grad_norm": 45.33073791860179, + "learning_rate": 0.0001988369996953386, + "loss": 1.2237, + "step": 79 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 135.89980489661897, + "learning_rate": 0.00019877511065891673, + "loss": 1.1822, + "step": 80 + }, + { + "epoch": 0.1968408262454435, + "grad_norm": 439.6770852212966, + "learning_rate": 0.00019871162745782478, + "loss": 1.1441, + "step": 81 + }, + { + "epoch": 0.19927095990279464, + "grad_norm": 80.73319798776026, + "learning_rate": 0.0001986465511166394, + "loss": 1.1709, + "step": 82 + }, + { + "epoch": 0.20170109356014582, + "grad_norm": 87.76515297497458, + "learning_rate": 0.00019857988268564953, + "loss": 1.1549, + "step": 83 + }, + { + "epoch": 0.20413122721749696, + "grad_norm": 70.08754986406095, + "learning_rate": 0.00019851162324083932, + "loss": 1.1771, + "step": 84 + }, + { + "epoch": 0.2065613608748481, + "grad_norm": 187.8198997057664, + "learning_rate": 0.0001984417738838709, + "loss": 1.2068, + "step": 85 + }, + { + "epoch": 0.20899149453219928, + "grad_norm": 127.78818684755072, + "learning_rate": 0.00019837033574206646, + "loss": 1.1974, + "step": 86 + }, + { + "epoch": 0.21142162818955043, 
+ "grad_norm": 127.82979216871074, + "learning_rate": 0.0001982973099683902, + "loss": 1.185, + "step": 87 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 142.35425084857746, + "learning_rate": 0.00019822269774142954, + "loss": 1.2225, + "step": 88 + }, + { + "epoch": 0.21628189550425272, + "grad_norm": 246.64019353564817, + "learning_rate": 0.0001981465002653763, + "loss": 1.2574, + "step": 89 + }, + { + "epoch": 0.2187120291616039, + "grad_norm": 189.88471076285524, + "learning_rate": 0.0001980687187700071, + "loss": 1.1635, + "step": 90 + }, + { + "epoch": 0.22114216281895505, + "grad_norm": 116.65693373141701, + "learning_rate": 0.00019798935451066361, + "loss": 1.1457, + "step": 91 + }, + { + "epoch": 0.2235722964763062, + "grad_norm": 71.76422539970217, + "learning_rate": 0.00019790840876823232, + "loss": 1.2354, + "step": 92 + }, + { + "epoch": 0.22600243013365734, + "grad_norm": 139.42330509386431, + "learning_rate": 0.0001978258828491236, + "loss": 1.18, + "step": 93 + }, + { + "epoch": 0.2284325637910085, + "grad_norm": 131.88308820601443, + "learning_rate": 0.00019774177808525113, + "loss": 1.1868, + "step": 94 + }, + { + "epoch": 0.23086269744835966, + "grad_norm": 85.81071125615291, + "learning_rate": 0.00019765609583400977, + "loss": 1.1814, + "step": 95 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 84.43756298541064, + "learning_rate": 0.00019756883747825424, + "loss": 1.1658, + "step": 96 + }, + { + "epoch": 0.23572296476306198, + "grad_norm": 114.24245545143974, + "learning_rate": 0.0001974800044262764, + "loss": 1.2497, + "step": 97 + }, + { + "epoch": 0.23815309842041313, + "grad_norm": 76.577511222722, + "learning_rate": 0.00019738959811178272, + "loss": 1.1414, + "step": 98 + }, + { + "epoch": 0.24058323207776428, + "grad_norm": 171.8084830895381, + "learning_rate": 0.00019729761999387103, + "loss": 1.1619, + "step": 99 + }, + { + "epoch": 0.24301336573511542, + "grad_norm": 221.87752250936416, + "learning_rate": 
0.00019720407155700707, + "loss": 1.2718, + "step": 100 + }, + { + "epoch": 0.2454434993924666, + "grad_norm": 205.64943975370608, + "learning_rate": 0.00019710895431100046, + "loss": 1.1786, + "step": 101 + }, + { + "epoch": 0.24787363304981774, + "grad_norm": 160.16582903260615, + "learning_rate": 0.00019701226979098037, + "loss": 1.1426, + "step": 102 + }, + { + "epoch": 0.2503037667071689, + "grad_norm": 82.85031394537334, + "learning_rate": 0.00019691401955737072, + "loss": 1.1718, + "step": 103 + }, + { + "epoch": 0.2503037667071689, + "eval_loss": 1.1633374691009521, + "eval_runtime": 52.6182, + "eval_samples_per_second": 14.14, + "eval_steps_per_second": 1.767, + "step": 103 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 94.74469296109082, + "learning_rate": 0.000196814205195865, + "loss": 1.2255, + "step": 104 + }, + { + "epoch": 0.2551640340218712, + "grad_norm": 126.15797466756656, + "learning_rate": 0.00019671282831740076, + "loss": 1.1623, + "step": 105 + }, + { + "epoch": 0.25759416767922233, + "grad_norm": 79.41156434272008, + "learning_rate": 0.0001966098905581334, + "loss": 1.1606, + "step": 106 + }, + { + "epoch": 0.2600243013365735, + "grad_norm": 70.33104031058372, + "learning_rate": 0.00019650539357941003, + "loss": 1.196, + "step": 107 + }, + { + "epoch": 0.2624544349939247, + "grad_norm": 69.57260733822498, + "learning_rate": 0.0001963993390677424, + "loss": 1.1939, + "step": 108 + }, + { + "epoch": 0.2648845686512758, + "grad_norm": 81.78820691772725, + "learning_rate": 0.00019629172873477995, + "loss": 1.2553, + "step": 109 + }, + { + "epoch": 0.267314702308627, + "grad_norm": 117.06324110268656, + "learning_rate": 0.00019618256431728194, + "loss": 1.2535, + "step": 110 + }, + { + "epoch": 0.26974483596597815, + "grad_norm": 83.26993317104247, + "learning_rate": 0.00019607184757708951, + "loss": 1.157, + "step": 111 + }, + { + "epoch": 0.27217496962332927, + "grad_norm": 51.990829456422375, + "learning_rate": 
0.00019595958030109735, + "loss": 1.1274, + "step": 112 + }, + { + "epoch": 0.27460510328068044, + "grad_norm": 119.7487160875729, + "learning_rate": 0.00019584576430122473, + "loss": 1.1422, + "step": 113 + }, + { + "epoch": 0.2770352369380316, + "grad_norm": 88.15636932272304, + "learning_rate": 0.00019573040141438624, + "loss": 1.1599, + "step": 114 + }, + { + "epoch": 0.27946537059538273, + "grad_norm": 62.346402225534774, + "learning_rate": 0.00019561349350246226, + "loss": 1.1909, + "step": 115 + }, + { + "epoch": 0.2818955042527339, + "grad_norm": 76.40612150653034, + "learning_rate": 0.0001954950424522688, + "loss": 1.1646, + "step": 116 + }, + { + "epoch": 0.284325637910085, + "grad_norm": 94.8711613055073, + "learning_rate": 0.00019537505017552716, + "loss": 1.1547, + "step": 117 + }, + { + "epoch": 0.2867557715674362, + "grad_norm": 63.86961661796314, + "learning_rate": 0.00019525351860883293, + "loss": 1.1841, + "step": 118 + }, + { + "epoch": 0.2891859052247874, + "grad_norm": 133.2417924150684, + "learning_rate": 0.00019513044971362494, + "loss": 1.1365, + "step": 119 + }, + { + "epoch": 0.2916160388821385, + "grad_norm": 133.44891510996445, + "learning_rate": 0.00019500584547615333, + "loss": 1.1696, + "step": 120 + }, + { + "epoch": 0.29404617253948967, + "grad_norm": 58.51701768739601, + "learning_rate": 0.00019487970790744774, + "loss": 1.1874, + "step": 121 + }, + { + "epoch": 0.29647630619684084, + "grad_norm": 49.536158238056196, + "learning_rate": 0.00019475203904328474, + "loss": 1.1798, + "step": 122 + }, + { + "epoch": 0.29890643985419196, + "grad_norm": 94.27608706983857, + "learning_rate": 0.000194622840944155, + "loss": 1.2443, + "step": 123 + }, + { + "epoch": 0.30133657351154314, + "grad_norm": 103.868243202843, + "learning_rate": 0.00019449211569523, + "loss": 1.1759, + "step": 124 + }, + { + "epoch": 0.3037667071688943, + "grad_norm": 73.31536435980003, + "learning_rate": 0.00019435986540632843, + "loss": 1.1885, + "step": 125 + }, + 
{ + "epoch": 0.30619684082624543, + "grad_norm": 64.91149114745738, + "learning_rate": 0.00019422609221188207, + "loss": 1.1864, + "step": 126 + }, + { + "epoch": 0.3086269744835966, + "grad_norm": 95.34449184763653, + "learning_rate": 0.00019409079827090145, + "loss": 1.1339, + "step": 127 + }, + { + "epoch": 0.3110571081409477, + "grad_norm": 67.36156159754226, + "learning_rate": 0.00019395398576694086, + "loss": 1.1845, + "step": 128 + }, + { + "epoch": 0.3134872417982989, + "grad_norm": 36.94913176821407, + "learning_rate": 0.00019381565690806328, + "loss": 1.2154, + "step": 129 + }, + { + "epoch": 0.3159173754556501, + "grad_norm": 69.05265214547647, + "learning_rate": 0.00019367581392680457, + "loss": 1.1642, + "step": 130 + }, + { + "epoch": 0.3183475091130012, + "grad_norm": 38.974761165559855, + "learning_rate": 0.00019353445908013755, + "loss": 1.1508, + "step": 131 + }, + { + "epoch": 0.32077764277035237, + "grad_norm": 48.47215142199794, + "learning_rate": 0.00019339159464943557, + "loss": 1.2011, + "step": 132 + }, + { + "epoch": 0.32320777642770354, + "grad_norm": 41.88512063342574, + "learning_rate": 0.00019324722294043558, + "loss": 1.1643, + "step": 133 + }, + { + "epoch": 0.32563791008505466, + "grad_norm": 25.59403215229145, + "learning_rate": 0.00019310134628320114, + "loss": 1.1954, + "step": 134 + }, + { + "epoch": 0.32806804374240583, + "grad_norm": 58.02634988046396, + "learning_rate": 0.00019295396703208453, + "loss": 1.1544, + "step": 135 + }, + { + "epoch": 0.330498177399757, + "grad_norm": 31.26218977398251, + "learning_rate": 0.00019280508756568896, + "loss": 1.1613, + "step": 136 + }, + { + "epoch": 0.33292831105710813, + "grad_norm": 31.81234539284103, + "learning_rate": 0.00019265471028683014, + "loss": 1.1892, + "step": 137 + }, + { + "epoch": 0.3353584447144593, + "grad_norm": 54.44930114675527, + "learning_rate": 0.00019250283762249748, + "loss": 1.2801, + "step": 138 + }, + { + "epoch": 0.3377885783718105, + "grad_norm": 
30.320486287732734, + "learning_rate": 0.00019234947202381486, + "loss": 1.1934, + "step": 139 + }, + { + "epoch": 0.3402187120291616, + "grad_norm": 32.76175001943503, + "learning_rate": 0.00019219461596600113, + "loss": 1.1436, + "step": 140 + }, + { + "epoch": 0.34264884568651277, + "grad_norm": 36.802264122697316, + "learning_rate": 0.00019203827194833026, + "loss": 1.1418, + "step": 141 + }, + { + "epoch": 0.3450789793438639, + "grad_norm": 35.03898729580271, + "learning_rate": 0.0001918804424940908, + "loss": 1.2479, + "step": 142 + }, + { + "epoch": 0.34750911300121506, + "grad_norm": 89.58068030461165, + "learning_rate": 0.00019172113015054532, + "loss": 1.2504, + "step": 143 + }, + { + "epoch": 0.34993924665856624, + "grad_norm": 30.05799668441019, + "learning_rate": 0.00019156033748888917, + "loss": 1.1662, + "step": 144 + }, + { + "epoch": 0.35236938031591736, + "grad_norm": 33.80121199203598, + "learning_rate": 0.00019139806710420914, + "loss": 1.1862, + "step": 145 + }, + { + "epoch": 0.35479951397326853, + "grad_norm": 31.510896023067872, + "learning_rate": 0.00019123432161544142, + "loss": 1.147, + "step": 146 + }, + { + "epoch": 0.3572296476306197, + "grad_norm": 32.92613286618093, + "learning_rate": 0.00019106910366532942, + "loss": 1.1421, + "step": 147 + }, + { + "epoch": 0.3596597812879708, + "grad_norm": 245.36013493823395, + "learning_rate": 0.00019090241592038113, + "loss": 1.1306, + "step": 148 + }, + { + "epoch": 0.362089914945322, + "grad_norm": 72.3061625644275, + "learning_rate": 0.000190734261070826, + "loss": 1.1144, + "step": 149 + }, + { + "epoch": 0.3645200486026732, + "grad_norm": 63.77748866336388, + "learning_rate": 0.00019056464183057157, + "loss": 1.1249, + "step": 150 + }, + { + "epoch": 0.3669501822600243, + "grad_norm": 633.2421324308109, + "learning_rate": 0.00019039356093715975, + "loss": 1.1359, + "step": 151 + }, + { + "epoch": 0.36938031591737547, + "grad_norm": 34.456657555313704, + "learning_rate": 
0.00019022102115172248, + "loss": 1.1397, + "step": 152 + }, + { + "epoch": 0.3718104495747266, + "grad_norm": 35.21328820959324, + "learning_rate": 0.00019004702525893732, + "loss": 1.1741, + "step": 153 + }, + { + "epoch": 0.37424058323207776, + "grad_norm": 90.32405227187036, + "learning_rate": 0.00018987157606698235, + "loss": 1.1844, + "step": 154 + }, + { + "epoch": 0.37667071688942894, + "grad_norm": 39.348755664527914, + "learning_rate": 0.000189694676407491, + "loss": 1.1216, + "step": 155 + }, + { + "epoch": 0.37910085054678005, + "grad_norm": 58.85540744859834, + "learning_rate": 0.00018951632913550626, + "loss": 1.115, + "step": 156 + }, + { + "epoch": 0.38153098420413123, + "grad_norm": 39.849945227365325, + "learning_rate": 0.0001893365371294346, + "loss": 1.1705, + "step": 157 + }, + { + "epoch": 0.3839611178614824, + "grad_norm": 40.300954908722304, + "learning_rate": 0.0001891553032909996, + "loss": 1.1831, + "step": 158 + }, + { + "epoch": 0.3863912515188335, + "grad_norm": 53.72009888405355, + "learning_rate": 0.00018897263054519498, + "loss": 1.1613, + "step": 159 + }, + { + "epoch": 0.3888213851761847, + "grad_norm": 142.22686975859034, + "learning_rate": 0.0001887885218402375, + "loss": 1.1639, + "step": 160 + }, + { + "epoch": 0.39125151883353587, + "grad_norm": 50.141889086717356, + "learning_rate": 0.00018860298014751944, + "loss": 1.1659, + "step": 161 + }, + { + "epoch": 0.393681652490887, + "grad_norm": 63.25519968311113, + "learning_rate": 0.0001884160084615604, + "loss": 1.168, + "step": 162 + }, + { + "epoch": 0.39611178614823817, + "grad_norm": 50.59325246324073, + "learning_rate": 0.0001882276097999592, + "loss": 1.1202, + "step": 163 + }, + { + "epoch": 0.3985419198055893, + "grad_norm": 58.32587879810431, + "learning_rate": 0.0001880377872033451, + "loss": 1.1587, + "step": 164 + }, + { + "epoch": 0.40097205346294046, + "grad_norm": 211.50882688314653, + "learning_rate": 0.00018784654373532866, + "loss": 1.1551, + "step": 165 + }, 
+ { + "epoch": 0.40340218712029163, + "grad_norm": 47.82888424614203, + "learning_rate": 0.00018765388248245246, + "loss": 1.2274, + "step": 166 + }, + { + "epoch": 0.40583232077764275, + "grad_norm": 97.94922685274778, + "learning_rate": 0.00018745980655414114, + "loss": 1.0872, + "step": 167 + }, + { + "epoch": 0.4082624544349939, + "grad_norm": 44.74994721544976, + "learning_rate": 0.0001872643190826512, + "loss": 1.1244, + "step": 168 + }, + { + "epoch": 0.4106925880923451, + "grad_norm": 53.84692426866845, + "learning_rate": 0.00018706742322302064, + "loss": 1.1576, + "step": 169 + }, + { + "epoch": 0.4131227217496962, + "grad_norm": 54.43599132185614, + "learning_rate": 0.0001868691221530178, + "loss": 1.0957, + "step": 170 + }, + { + "epoch": 0.4155528554070474, + "grad_norm": 39.21766518089018, + "learning_rate": 0.00018666941907309026, + "loss": 1.1625, + "step": 171 + }, + { + "epoch": 0.41798298906439857, + "grad_norm": 49.40030697752548, + "learning_rate": 0.000186468317206313, + "loss": 1.1556, + "step": 172 + }, + { + "epoch": 0.4204131227217497, + "grad_norm": 101.50309647820374, + "learning_rate": 0.0001862658197983366, + "loss": 1.1687, + "step": 173 + }, + { + "epoch": 0.42284325637910086, + "grad_norm": 105.41233861946563, + "learning_rate": 0.0001860619301173347, + "loss": 1.1687, + "step": 174 + }, + { + "epoch": 0.425273390036452, + "grad_norm": 103.99749987770305, + "learning_rate": 0.0001858566514539513, + "loss": 1.144, + "step": 175 + }, + { + "epoch": 0.42770352369380316, + "grad_norm": 78.83490301242213, + "learning_rate": 0.0001856499871212477, + "loss": 1.2318, + "step": 176 + }, + { + "epoch": 0.43013365735115433, + "grad_norm": 62.325757489859335, + "learning_rate": 0.00018544194045464886, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.43256379100850545, + "grad_norm": 81.32804926878099, + "learning_rate": 0.00018523251481188986, + "loss": 1.2233, + "step": 178 + }, + { + "epoch": 0.4349939246658566, + "grad_norm": 
38.97928032166606, + "learning_rate": 0.00018502171357296144, + "loss": 1.2371, + "step": 179 + }, + { + "epoch": 0.4374240583232078, + "grad_norm": 82.62345361244209, + "learning_rate": 0.0001848095401400555, + "loss": 1.1562, + "step": 180 + }, + { + "epoch": 0.4398541919805589, + "grad_norm": 47.793381366401626, + "learning_rate": 0.0001845959979375104, + "loss": 1.1249, + "step": 181 + }, + { + "epoch": 0.4422843256379101, + "grad_norm": 53.6022948471739, + "learning_rate": 0.00018438109041175532, + "loss": 1.1415, + "step": 182 + }, + { + "epoch": 0.44471445929526127, + "grad_norm": 65.92717051568573, + "learning_rate": 0.00018416482103125506, + "loss": 1.1748, + "step": 183 + }, + { + "epoch": 0.4471445929526124, + "grad_norm": 59.410481167619494, + "learning_rate": 0.0001839471932864537, + "loss": 1.1399, + "step": 184 + }, + { + "epoch": 0.44957472660996356, + "grad_norm": 64.22740395872977, + "learning_rate": 0.0001837282106897185, + "loss": 1.2193, + "step": 185 + }, + { + "epoch": 0.4520048602673147, + "grad_norm": 54.63497168787729, + "learning_rate": 0.00018350787677528306, + "loss": 1.153, + "step": 186 + }, + { + "epoch": 0.45443499392466585, + "grad_norm": 49.60676029637355, + "learning_rate": 0.00018328619509919044, + "loss": 1.1509, + "step": 187 + }, + { + "epoch": 0.456865127582017, + "grad_norm": 32.29074835877607, + "learning_rate": 0.00018306316923923563, + "loss": 1.1851, + "step": 188 + }, + { + "epoch": 0.45929526123936815, + "grad_norm": 61.13632454163589, + "learning_rate": 0.0001828388027949078, + "loss": 1.1323, + "step": 189 + }, + { + "epoch": 0.4617253948967193, + "grad_norm": 67.48617660835801, + "learning_rate": 0.00018261309938733238, + "loss": 1.1956, + "step": 190 + }, + { + "epoch": 0.4641555285540705, + "grad_norm": 38.31182257784929, + "learning_rate": 0.00018238606265921238, + "loss": 1.1379, + "step": 191 + }, + { + "epoch": 0.4665856622114216, + "grad_norm": 47.30995766708629, + "learning_rate": 0.00018215769627476984, + 
"loss": 1.1462, + "step": 192 + }, + { + "epoch": 0.4690157958687728, + "grad_norm": 34.57093925891121, + "learning_rate": 0.00018192800391968642, + "loss": 1.1979, + "step": 193 + }, + { + "epoch": 0.47144592952612396, + "grad_norm": 34.45645740457662, + "learning_rate": 0.0001816969893010442, + "loss": 1.1763, + "step": 194 + }, + { + "epoch": 0.4738760631834751, + "grad_norm": 39.21862152859671, + "learning_rate": 0.00018146465614726567, + "loss": 1.1514, + "step": 195 + }, + { + "epoch": 0.47630619684082626, + "grad_norm": 34.765347344568106, + "learning_rate": 0.00018123100820805355, + "loss": 1.1426, + "step": 196 + }, + { + "epoch": 0.4787363304981774, + "grad_norm": 35.04245362239315, + "learning_rate": 0.00018099604925433043, + "loss": 1.143, + "step": 197 + }, + { + "epoch": 0.48116646415552855, + "grad_norm": 103.45636476066032, + "learning_rate": 0.00018075978307817764, + "loss": 1.1713, + "step": 198 + }, + { + "epoch": 0.4835965978128797, + "grad_norm": 43.0297373660821, + "learning_rate": 0.00018052221349277442, + "loss": 1.2226, + "step": 199 + }, + { + "epoch": 0.48602673147023084, + "grad_norm": 32.80474372048966, + "learning_rate": 0.000180283344332336, + "loss": 1.1556, + "step": 200 + }, + { + "epoch": 0.488456865127582, + "grad_norm": 59.42688731224296, + "learning_rate": 0.00018004317945205197, + "loss": 1.1411, + "step": 201 + }, + { + "epoch": 0.4908869987849332, + "grad_norm": 102.0917822407188, + "learning_rate": 0.000179801722728024, + "loss": 1.1309, + "step": 202 + }, + { + "epoch": 0.4933171324422843, + "grad_norm": 309.9346821950787, + "learning_rate": 0.0001795589780572031, + "loss": 1.1953, + "step": 203 + }, + { + "epoch": 0.4957472660996355, + "grad_norm": 344.5019267346993, + "learning_rate": 0.0001793149493573271, + "loss": 1.1524, + "step": 204 + }, + { + "epoch": 0.49817739975698666, + "grad_norm": 50.075205946207085, + "learning_rate": 0.00017906964056685706, + "loss": 1.1495, + "step": 205 + }, + { + "epoch": 
0.5006075334143378, + "grad_norm": 132.32227258331488, + "learning_rate": 0.00017882305564491396, + "loss": 1.1976, + "step": 206 + }, + { + "epoch": 0.5006075334143378, + "eval_loss": 1.146019458770752, + "eval_runtime": 52.7816, + "eval_samples_per_second": 14.096, + "eval_steps_per_second": 1.762, + "step": 206 + }, + { + "epoch": 0.503037667071689, + "grad_norm": 138.57200377669218, + "learning_rate": 0.00017857519857121458, + "loss": 1.2159, + "step": 207 + }, + { + "epoch": 0.5054678007290401, + "grad_norm": 268.41109734161546, + "learning_rate": 0.00017832607334600746, + "loss": 1.1748, + "step": 208 + }, + { + "epoch": 0.5078979343863913, + "grad_norm": 72.44153953442401, + "learning_rate": 0.00017807568399000822, + "loss": 1.1758, + "step": 209 + }, + { + "epoch": 0.5103280680437424, + "grad_norm": 97.75400124096738, + "learning_rate": 0.00017782403454433477, + "loss": 1.1004, + "step": 210 + }, + { + "epoch": 0.5127582017010935, + "grad_norm": 84.19522802756285, + "learning_rate": 0.000177571129070442, + "loss": 1.1397, + "step": 211 + }, + { + "epoch": 0.5151883353584447, + "grad_norm": 132.95081835535706, + "learning_rate": 0.00017731697165005618, + "loss": 1.146, + "step": 212 + }, + { + "epoch": 0.5176184690157959, + "grad_norm": 560.3351292126325, + "learning_rate": 0.0001770615663851093, + "loss": 1.1937, + "step": 213 + }, + { + "epoch": 0.520048602673147, + "grad_norm": 252.72862614645885, + "learning_rate": 0.0001768049173976727, + "loss": 1.1213, + "step": 214 + }, + { + "epoch": 0.5224787363304981, + "grad_norm": 356.2985211032981, + "learning_rate": 0.0001765470288298905, + "loss": 1.22, + "step": 215 + }, + { + "epoch": 0.5249088699878494, + "grad_norm": 952.600672502031, + "learning_rate": 0.00017628790484391284, + "loss": 1.1321, + "step": 216 + }, + { + "epoch": 0.5273390036452005, + "grad_norm": 289.9357041930161, + "learning_rate": 0.0001760275496218288, + "loss": 1.1688, + "step": 217 + }, + { + "epoch": 0.5297691373025516, + 
"grad_norm": 48.69445264741508, + "learning_rate": 0.0001757659673655986, + "loss": 1.1551, + "step": 218 + }, + { + "epoch": 0.5321992709599028, + "grad_norm": 40.15160247154335, + "learning_rate": 0.0001755031622969862, + "loss": 1.1459, + "step": 219 + }, + { + "epoch": 0.534629404617254, + "grad_norm": 44.59390817019205, + "learning_rate": 0.00017523913865749078, + "loss": 1.2012, + "step": 220 + }, + { + "epoch": 0.5370595382746051, + "grad_norm": 30.189717624412484, + "learning_rate": 0.00017497390070827848, + "loss": 1.15, + "step": 221 + }, + { + "epoch": 0.5394896719319563, + "grad_norm": 27.185608574176108, + "learning_rate": 0.00017470745273011362, + "loss": 1.0763, + "step": 222 + }, + { + "epoch": 0.5419198055893074, + "grad_norm": 99.44121390806423, + "learning_rate": 0.00017443979902328956, + "loss": 1.1478, + "step": 223 + }, + { + "epoch": 0.5443499392466585, + "grad_norm": 29.684499344634585, + "learning_rate": 0.00017417094390755934, + "loss": 1.1123, + "step": 224 + }, + { + "epoch": 0.5467800729040098, + "grad_norm": 26.788847114635054, + "learning_rate": 0.00017390089172206592, + "loss": 1.1169, + "step": 225 + }, + { + "epoch": 0.5492102065613609, + "grad_norm": 31.84817878214798, + "learning_rate": 0.00017362964682527218, + "loss": 1.1524, + "step": 226 + }, + { + "epoch": 0.551640340218712, + "grad_norm": 34.834632993822424, + "learning_rate": 0.00017335721359489057, + "loss": 1.1761, + "step": 227 + }, + { + "epoch": 0.5540704738760632, + "grad_norm": 66.6084234453716, + "learning_rate": 0.00017308359642781242, + "loss": 1.1175, + "step": 228 + }, + { + "epoch": 0.5565006075334143, + "grad_norm": 35.15720180142773, + "learning_rate": 0.00017280879974003707, + "loss": 1.2012, + "step": 229 + }, + { + "epoch": 0.5589307411907655, + "grad_norm": 35.975450782756226, + "learning_rate": 0.00017253282796660056, + "loss": 1.1801, + "step": 230 + }, + { + "epoch": 0.5613608748481167, + "grad_norm": 83.49050230764925, + "learning_rate": 
0.0001722556855615039, + "loss": 1.1576, + "step": 231 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 150.44630441002784, + "learning_rate": 0.00017197737699764146, + "loss": 1.1826, + "step": 232 + }, + { + "epoch": 0.5662211421628189, + "grad_norm": 31.322382197739042, + "learning_rate": 0.00017169790676672858, + "loss": 1.1784, + "step": 233 + }, + { + "epoch": 0.56865127582017, + "grad_norm": 33.15983653687515, + "learning_rate": 0.0001714172793792291, + "loss": 1.1411, + "step": 234 + }, + { + "epoch": 0.5710814094775213, + "grad_norm": 22.206850165103052, + "learning_rate": 0.0001711354993642827, + "loss": 1.1772, + "step": 235 + }, + { + "epoch": 0.5735115431348724, + "grad_norm": 43.35721272668955, + "learning_rate": 0.00017085257126963152, + "loss": 1.0915, + "step": 236 + }, + { + "epoch": 0.5759416767922235, + "grad_norm": 29.57234737116712, + "learning_rate": 0.0001705684996615472, + "loss": 1.0977, + "step": 237 + }, + { + "epoch": 0.5783718104495748, + "grad_norm": 42.929644875053214, + "learning_rate": 0.00017028328912475668, + "loss": 1.1782, + "step": 238 + }, + { + "epoch": 0.5808019441069259, + "grad_norm": 32.15711272871687, + "learning_rate": 0.0001699969442623686, + "loss": 1.1855, + "step": 239 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 43.64453730184205, + "learning_rate": 0.00016970946969579887, + "loss": 1.1171, + "step": 240 + }, + { + "epoch": 0.5856622114216282, + "grad_norm": 26.145541544112593, + "learning_rate": 0.00016942087006469592, + "loss": 1.1656, + "step": 241 + }, + { + "epoch": 0.5880923450789793, + "grad_norm": 53.98173886095731, + "learning_rate": 0.00016913115002686616, + "loss": 1.1378, + "step": 242 + }, + { + "epoch": 0.5905224787363305, + "grad_norm": 50.851193586801195, + "learning_rate": 0.00016884031425819853, + "loss": 1.1338, + "step": 243 + }, + { + "epoch": 0.5929526123936817, + "grad_norm": 30.166674036386443, + "learning_rate": 0.0001685483674525891, + "loss": 1.1732, + "step": 244 + }, + { 
+ "epoch": 0.5953827460510328, + "grad_norm": 32.580505176392656, + "learning_rate": 0.00016825531432186543, + "loss": 1.143, + "step": 245 + }, + { + "epoch": 0.5978128797083839, + "grad_norm": 35.087231952662634, + "learning_rate": 0.0001679611595957103, + "loss": 1.212, + "step": 246 + }, + { + "epoch": 0.6002430133657352, + "grad_norm": 44.69578306542608, + "learning_rate": 0.00016766590802158566, + "loss": 1.1527, + "step": 247 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 39.8378839133733, + "learning_rate": 0.00016736956436465573, + "loss": 1.2174, + "step": 248 + }, + { + "epoch": 0.6051032806804374, + "grad_norm": 25.571860004032857, + "learning_rate": 0.0001670721334077103, + "loss": 1.1031, + "step": 249 + }, + { + "epoch": 0.6075334143377886, + "grad_norm": 27.626061413643438, + "learning_rate": 0.00016677361995108743, + "loss": 1.107, + "step": 250 + }, + { + "epoch": 0.6099635479951397, + "grad_norm": 47.405627339857176, + "learning_rate": 0.00016647402881259598, + "loss": 1.1521, + "step": 251 + }, + { + "epoch": 0.6123936816524909, + "grad_norm": 31.951762409660272, + "learning_rate": 0.00016617336482743794, + "loss": 1.174, + "step": 252 + }, + { + "epoch": 0.6148238153098421, + "grad_norm": 44.304437144236104, + "learning_rate": 0.00016587163284813032, + "loss": 1.1286, + "step": 253 + }, + { + "epoch": 0.6172539489671932, + "grad_norm": 21.990501251879344, + "learning_rate": 0.00016556883774442675, + "loss": 1.1927, + "step": 254 + }, + { + "epoch": 0.6196840826245443, + "grad_norm": 43.91119350789936, + "learning_rate": 0.00016526498440323914, + "loss": 1.1399, + "step": 255 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 28.064569132249982, + "learning_rate": 0.00016496007772855853, + "loss": 1.1913, + "step": 256 + }, + { + "epoch": 0.6245443499392467, + "grad_norm": 99.97142272243896, + "learning_rate": 0.0001646541226413761, + "loss": 1.1694, + "step": 257 + }, + { + "epoch": 0.6269744835965978, + "grad_norm": 
27.12524206817854, + "learning_rate": 0.00016434712407960373, + "loss": 1.2398, + "step": 258 + }, + { + "epoch": 0.6294046172539489, + "grad_norm": 42.99171796479219, + "learning_rate": 0.00016403908699799425, + "loss": 1.145, + "step": 259 + }, + { + "epoch": 0.6318347509113001, + "grad_norm": 24.064938768293658, + "learning_rate": 0.00016373001636806153, + "loss": 1.098, + "step": 260 + }, + { + "epoch": 0.6342648845686513, + "grad_norm": 31.72232981247621, + "learning_rate": 0.00016341991717800023, + "loss": 1.1779, + "step": 261 + }, + { + "epoch": 0.6366950182260024, + "grad_norm": 39.97326887390835, + "learning_rate": 0.00016310879443260528, + "loss": 1.3142, + "step": 262 + }, + { + "epoch": 0.6391251518833536, + "grad_norm": 27.519208072826963, + "learning_rate": 0.00016279665315319114, + "loss": 1.2039, + "step": 263 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 52.94895557810481, + "learning_rate": 0.00016248349837751062, + "loss": 1.1718, + "step": 264 + }, + { + "epoch": 0.6439854191980559, + "grad_norm": 23.603047222747566, + "learning_rate": 0.0001621693351596739, + "loss": 1.1155, + "step": 265 + }, + { + "epoch": 0.6464155528554071, + "grad_norm": 21.400341520569807, + "learning_rate": 0.00016185416857006647, + "loss": 1.1242, + "step": 266 + }, + { + "epoch": 0.6488456865127582, + "grad_norm": 51.167335508822276, + "learning_rate": 0.00016153800369526788, + "loss": 1.1746, + "step": 267 + }, + { + "epoch": 0.6512758201701093, + "grad_norm": 26.219581065473573, + "learning_rate": 0.00016122084563796905, + "loss": 1.0836, + "step": 268 + }, + { + "epoch": 0.6537059538274606, + "grad_norm": 56.820249886600706, + "learning_rate": 0.0001609026995168904, + "loss": 1.1625, + "step": 269 + }, + { + "epoch": 0.6561360874848117, + "grad_norm": 37.43384869992443, + "learning_rate": 0.00016058357046669898, + "loss": 1.2143, + "step": 270 + }, + { + "epoch": 0.6585662211421628, + "grad_norm": 31.885237168871473, + "learning_rate": 
0.00016026346363792567, + "loss": 1.1536, + "step": 271 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 34.66147983279251, + "learning_rate": 0.00015994238419688199, + "loss": 1.2095, + "step": 272 + }, + { + "epoch": 0.6634264884568651, + "grad_norm": 86.90365354594917, + "learning_rate": 0.00015962033732557686, + "loss": 1.1149, + "step": 273 + }, + { + "epoch": 0.6658566221142163, + "grad_norm": 52.21177462889067, + "learning_rate": 0.00015929732822163287, + "loss": 1.1861, + "step": 274 + }, + { + "epoch": 0.6682867557715675, + "grad_norm": 92.11184701145604, + "learning_rate": 0.00015897336209820239, + "loss": 1.1853, + "step": 275 + }, + { + "epoch": 0.6707168894289186, + "grad_norm": 30.662475573811115, + "learning_rate": 0.00015864844418388342, + "loss": 1.0912, + "step": 276 + }, + { + "epoch": 0.6731470230862697, + "grad_norm": 26.15855468837027, + "learning_rate": 0.00015832257972263523, + "loss": 1.1618, + "step": 277 + }, + { + "epoch": 0.675577156743621, + "grad_norm": 41.14250673970726, + "learning_rate": 0.00015799577397369375, + "loss": 1.1499, + "step": 278 + }, + { + "epoch": 0.6780072904009721, + "grad_norm": 31.93253644773631, + "learning_rate": 0.00015766803221148673, + "loss": 1.1229, + "step": 279 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 39.87120131585165, + "learning_rate": 0.00015733935972554844, + "loss": 1.1647, + "step": 280 + }, + { + "epoch": 0.6828675577156743, + "grad_norm": 52.741654062271124, + "learning_rate": 0.0001570097618204345, + "loss": 1.1362, + "step": 281 + }, + { + "epoch": 0.6852976913730255, + "grad_norm": 33.13137686002526, + "learning_rate": 0.0001566792438156362, + "loss": 1.1825, + "step": 282 + }, + { + "epoch": 0.6877278250303767, + "grad_norm": 20.284041564566042, + "learning_rate": 0.00015634781104549442, + "loss": 1.1439, + "step": 283 + }, + { + "epoch": 0.6901579586877278, + "grad_norm": 164.9222932471453, + "learning_rate": 0.00015601546885911404, + "loss": 1.122, + "step": 284 + }, + { 
+ "epoch": 0.692588092345079, + "grad_norm": 27.092346730158148, + "learning_rate": 0.00015568222262027717, + "loss": 1.157, + "step": 285 + }, + { + "epoch": 0.6950182260024301, + "grad_norm": 39.46898996008012, + "learning_rate": 0.00015534807770735664, + "loss": 1.1092, + "step": 286 + }, + { + "epoch": 0.6974483596597812, + "grad_norm": 30.00942949300714, + "learning_rate": 0.00015501303951322943, + "loss": 1.243, + "step": 287 + }, + { + "epoch": 0.6998784933171325, + "grad_norm": 31.435817418038887, + "learning_rate": 0.00015467711344518942, + "loss": 1.1034, + "step": 288 + }, + { + "epoch": 0.7023086269744836, + "grad_norm": 54.53572773177548, + "learning_rate": 0.00015434030492486023, + "loss": 1.2216, + "step": 289 + }, + { + "epoch": 0.7047387606318347, + "grad_norm": 24.51082708234768, + "learning_rate": 0.00015400261938810757, + "loss": 1.1532, + "step": 290 + }, + { + "epoch": 0.707168894289186, + "grad_norm": 104.85480514443172, + "learning_rate": 0.00015366406228495172, + "loss": 1.1156, + "step": 291 + }, + { + "epoch": 0.7095990279465371, + "grad_norm": 26.398830117870997, + "learning_rate": 0.0001533246390794794, + "loss": 1.0934, + "step": 292 + }, + { + "epoch": 0.7120291616038882, + "grad_norm": 25.062392373037707, + "learning_rate": 0.00015298435524975572, + "loss": 1.1453, + "step": 293 + }, + { + "epoch": 0.7144592952612394, + "grad_norm": 25.385505352027444, + "learning_rate": 0.0001526432162877356, + "loss": 1.1359, + "step": 294 + }, + { + "epoch": 0.7168894289185905, + "grad_norm": 18.00146943000571, + "learning_rate": 0.00015230122769917527, + "loss": 1.1129, + "step": 295 + }, + { + "epoch": 0.7193195625759417, + "grad_norm": 22.55383473288135, + "learning_rate": 0.00015195839500354335, + "loss": 1.142, + "step": 296 + }, + { + "epoch": 0.7217496962332929, + "grad_norm": 30.013723395820165, + "learning_rate": 0.00015161472373393186, + "loss": 1.1379, + "step": 297 + }, + { + "epoch": 0.724179829890644, + "grad_norm": 
40.566201545240425, + "learning_rate": 0.0001512702194369668, + "loss": 1.1326, + "step": 298 + }, + { + "epoch": 0.7266099635479951, + "grad_norm": 27.34716639907029, + "learning_rate": 0.00015092488767271857, + "loss": 1.0782, + "step": 299 + }, + { + "epoch": 0.7290400972053463, + "grad_norm": 45.0837594669075, + "learning_rate": 0.00015057873401461253, + "loss": 1.2054, + "step": 300 + }, + { + "epoch": 0.7314702308626975, + "grad_norm": 22.39794101270309, + "learning_rate": 0.00015023176404933874, + "loss": 1.1052, + "step": 301 + }, + { + "epoch": 0.7339003645200486, + "grad_norm": 21.818512025585306, + "learning_rate": 0.00014988398337676198, + "loss": 1.1664, + "step": 302 + }, + { + "epoch": 0.7363304981773997, + "grad_norm": 33.09386163968815, + "learning_rate": 0.00014953539760983122, + "loss": 1.1364, + "step": 303 + }, + { + "epoch": 0.7387606318347509, + "grad_norm": 26.3253592215911, + "learning_rate": 0.00014918601237448923, + "loss": 1.1093, + "step": 304 + }, + { + "epoch": 0.741190765492102, + "grad_norm": 32.54878723405212, + "learning_rate": 0.0001488358333095816, + "loss": 1.182, + "step": 305 + }, + { + "epoch": 0.7436208991494532, + "grad_norm": 28.645473311846015, + "learning_rate": 0.0001484848660667658, + "loss": 1.2064, + "step": 306 + }, + { + "epoch": 0.7460510328068044, + "grad_norm": 29.02693042820854, + "learning_rate": 0.00014813311631041995, + "loss": 1.1545, + "step": 307 + }, + { + "epoch": 0.7484811664641555, + "grad_norm": 20.28193033099828, + "learning_rate": 0.00014778058971755154, + "loss": 1.1885, + "step": 308 + }, + { + "epoch": 0.7509113001215066, + "grad_norm": 121.86121371804961, + "learning_rate": 0.00014742729197770552, + "loss": 1.095, + "step": 309 + }, + { + "epoch": 0.7509113001215066, + "eval_loss": 1.133868932723999, + "eval_runtime": 52.6711, + "eval_samples_per_second": 14.125, + "eval_steps_per_second": 1.766, + "step": 309 + }, + { + "epoch": 0.7533414337788579, + "grad_norm": 50.1793074315811, + 
"learning_rate": 0.00014707322879287276, + "loss": 1.1679, + "step": 310 + }, + { + "epoch": 0.755771567436209, + "grad_norm": 31.791309498678103, + "learning_rate": 0.00014671840587739783, + "loss": 1.1277, + "step": 311 + }, + { + "epoch": 0.7582017010935601, + "grad_norm": 56.88911226488106, + "learning_rate": 0.00014636282895788688, + "loss": 1.1492, + "step": 312 + }, + { + "epoch": 0.7606318347509113, + "grad_norm": 117.29437608667352, + "learning_rate": 0.00014600650377311522, + "loss": 1.1123, + "step": 313 + }, + { + "epoch": 0.7630619684082625, + "grad_norm": 107.56728772749254, + "learning_rate": 0.00014564943607393459, + "loss": 1.171, + "step": 314 + }, + { + "epoch": 0.7654921020656136, + "grad_norm": 34.085830256919685, + "learning_rate": 0.0001452916316231805, + "loss": 1.1854, + "step": 315 + }, + { + "epoch": 0.7679222357229648, + "grad_norm": 23.625747202851176, + "learning_rate": 0.000144933096195579, + "loss": 1.1622, + "step": 316 + }, + { + "epoch": 0.7703523693803159, + "grad_norm": 56.9917185309248, + "learning_rate": 0.00014457383557765386, + "loss": 1.2037, + "step": 317 + }, + { + "epoch": 0.772782503037667, + "grad_norm": 34.55554043725056, + "learning_rate": 0.00014421385556763266, + "loss": 1.1273, + "step": 318 + }, + { + "epoch": 0.7752126366950183, + "grad_norm": 34.205286759913115, + "learning_rate": 0.00014385316197535372, + "loss": 1.2039, + "step": 319 + }, + { + "epoch": 0.7776427703523694, + "grad_norm": 27.30015395778206, + "learning_rate": 0.00014349176062217195, + "loss": 1.1903, + "step": 320 + }, + { + "epoch": 0.7800729040097205, + "grad_norm": 23.077745147127867, + "learning_rate": 0.00014312965734086518, + "loss": 1.1539, + "step": 321 + }, + { + "epoch": 0.7825030376670717, + "grad_norm": 26.22112568156326, + "learning_rate": 0.00014276685797553977, + "loss": 1.1807, + "step": 322 + }, + { + "epoch": 0.7849331713244229, + "grad_norm": 34.813719314948514, + "learning_rate": 0.0001424033683815365, + "loss": 1.1247, + 
"step": 323 + }, + { + "epoch": 0.787363304981774, + "grad_norm": 27.109609629038324, + "learning_rate": 0.00014203919442533597, + "loss": 1.1735, + "step": 324 + }, + { + "epoch": 0.7897934386391251, + "grad_norm": 144.91672798575476, + "learning_rate": 0.00014167434198446383, + "loss": 1.1007, + "step": 325 + }, + { + "epoch": 0.7922235722964763, + "grad_norm": 42.19042828736382, + "learning_rate": 0.00014130881694739616, + "loss": 1.1398, + "step": 326 + }, + { + "epoch": 0.7946537059538274, + "grad_norm": 43.00144921766715, + "learning_rate": 0.00014094262521346427, + "loss": 1.1712, + "step": 327 + }, + { + "epoch": 0.7970838396111786, + "grad_norm": 26.343159670729925, + "learning_rate": 0.0001405757726927595, + "loss": 1.2103, + "step": 328 + }, + { + "epoch": 0.7995139732685298, + "grad_norm": 31.68271222195729, + "learning_rate": 0.00014020826530603776, + "loss": 1.1578, + "step": 329 + }, + { + "epoch": 0.8019441069258809, + "grad_norm": 39.08920292536896, + "learning_rate": 0.00013984010898462416, + "loss": 1.1377, + "step": 330 + }, + { + "epoch": 0.804374240583232, + "grad_norm": 34.56898084569197, + "learning_rate": 0.00013947130967031717, + "loss": 1.1886, + "step": 331 + }, + { + "epoch": 0.8068043742405833, + "grad_norm": 42.016356369933895, + "learning_rate": 0.00013910187331529276, + "loss": 1.1577, + "step": 332 + }, + { + "epoch": 0.8092345078979344, + "grad_norm": 21.25953597879822, + "learning_rate": 0.00013873180588200827, + "loss": 1.1259, + "step": 333 + }, + { + "epoch": 0.8116646415552855, + "grad_norm": 39.49634140985428, + "learning_rate": 0.0001383611133431062, + "loss": 1.173, + "step": 334 + }, + { + "epoch": 0.8140947752126367, + "grad_norm": 29.837690582268863, + "learning_rate": 0.00013798980168131794, + "loss": 1.1322, + "step": 335 + }, + { + "epoch": 0.8165249088699879, + "grad_norm": 23.510451396240928, + "learning_rate": 0.000137617876889367, + "loss": 1.1392, + "step": 336 + }, + { + "epoch": 0.818955042527339, + 
"grad_norm": 19.183017199526635, + "learning_rate": 0.00013724534496987247, + "loss": 1.157, + "step": 337 + }, + { + "epoch": 0.8213851761846902, + "grad_norm": 51.85037647612581, + "learning_rate": 0.0001368722119352521, + "loss": 1.1255, + "step": 338 + }, + { + "epoch": 0.8238153098420413, + "grad_norm": 31.635699477838273, + "learning_rate": 0.00013649848380762513, + "loss": 1.1429, + "step": 339 + }, + { + "epoch": 0.8262454434993924, + "grad_norm": 39.6479124739029, + "learning_rate": 0.00013612416661871533, + "loss": 1.1609, + "step": 340 + }, + { + "epoch": 0.8286755771567437, + "grad_norm": 21.453228401011238, + "learning_rate": 0.0001357492664097534, + "loss": 1.1247, + "step": 341 + }, + { + "epoch": 0.8311057108140948, + "grad_norm": 28.514958428145494, + "learning_rate": 0.00013537378923137973, + "loss": 1.0845, + "step": 342 + }, + { + "epoch": 0.8335358444714459, + "grad_norm": 26.98663985253516, + "learning_rate": 0.00013499774114354655, + "loss": 1.1092, + "step": 343 + }, + { + "epoch": 0.8359659781287971, + "grad_norm": 30.76143424141064, + "learning_rate": 0.00013462112821542016, + "loss": 1.1759, + "step": 344 + }, + { + "epoch": 0.8383961117861483, + "grad_norm": 39.023771167108656, + "learning_rate": 0.0001342439565252831, + "loss": 1.1024, + "step": 345 + }, + { + "epoch": 0.8408262454434994, + "grad_norm": 29.787639099820225, + "learning_rate": 0.0001338662321604358, + "loss": 1.2141, + "step": 346 + }, + { + "epoch": 0.8432563791008505, + "grad_norm": 25.60634301240642, + "learning_rate": 0.00013348796121709862, + "loss": 1.1244, + "step": 347 + }, + { + "epoch": 0.8456865127582017, + "grad_norm": 76.98542857181108, + "learning_rate": 0.00013310914980031334, + "loss": 1.19, + "step": 348 + }, + { + "epoch": 0.8481166464155528, + "grad_norm": 110.28982985071892, + "learning_rate": 0.0001327298040238446, + "loss": 1.1295, + "step": 349 + }, + { + "epoch": 0.850546780072904, + "grad_norm": 22.610631125609732, + "learning_rate": 
0.0001323499300100811, + "loss": 1.1445, + "step": 350 + }, + { + "epoch": 0.8529769137302552, + "grad_norm": 29.958515973723888, + "learning_rate": 0.00013196953388993726, + "loss": 1.2048, + "step": 351 + }, + { + "epoch": 0.8554070473876063, + "grad_norm": 30.691798031468103, + "learning_rate": 0.00013158862180275363, + "loss": 1.1628, + "step": 352 + }, + { + "epoch": 0.8578371810449574, + "grad_norm": 28.568576369680258, + "learning_rate": 0.00013120719989619833, + "loss": 1.0899, + "step": 353 + }, + { + "epoch": 0.8602673147023087, + "grad_norm": 42.12623456189728, + "learning_rate": 0.0001308252743261675, + "loss": 1.1451, + "step": 354 + }, + { + "epoch": 0.8626974483596598, + "grad_norm": 112.39248005736448, + "learning_rate": 0.00013044285125668614, + "loss": 1.154, + "step": 355 + }, + { + "epoch": 0.8651275820170109, + "grad_norm": 28.013602355549782, + "learning_rate": 0.0001300599368598086, + "loss": 1.1937, + "step": 356 + }, + { + "epoch": 0.8675577156743621, + "grad_norm": 27.763517972300694, + "learning_rate": 0.0001296765373155188, + "loss": 1.1243, + "step": 357 + }, + { + "epoch": 0.8699878493317132, + "grad_norm": 112.85815824767063, + "learning_rate": 0.0001292926588116308, + "loss": 1.1595, + "step": 358 + }, + { + "epoch": 0.8724179829890644, + "grad_norm": 27.085127886556087, + "learning_rate": 0.00012890830754368855, + "loss": 1.1196, + "step": 359 + }, + { + "epoch": 0.8748481166464156, + "grad_norm": 31.56336829128541, + "learning_rate": 0.00012852348971486617, + "loss": 1.1231, + "step": 360 + }, + { + "epoch": 0.8772782503037667, + "grad_norm": 31.904393738907178, + "learning_rate": 0.0001281382115358679, + "loss": 1.097, + "step": 361 + }, + { + "epoch": 0.8797083839611178, + "grad_norm": 25.034453894065827, + "learning_rate": 0.00012775247922482748, + "loss": 1.1246, + "step": 362 + }, + { + "epoch": 0.8821385176184691, + "grad_norm": 33.221958266501474, + "learning_rate": 0.0001273662990072083, + "loss": 1.1189, + "step": 363 + }, 
+ { + "epoch": 0.8845686512758202, + "grad_norm": 26.638980136773224, + "learning_rate": 0.00012697967711570242, + "loss": 1.1315, + "step": 364 + }, + { + "epoch": 0.8869987849331713, + "grad_norm": 27.231479341362885, + "learning_rate": 0.00012659261979013043, + "loss": 1.1464, + "step": 365 + }, + { + "epoch": 0.8894289185905225, + "grad_norm": 19.654091006710207, + "learning_rate": 0.0001262051332773404, + "loss": 1.1271, + "step": 366 + }, + { + "epoch": 0.8918590522478737, + "grad_norm": 50.3934263865559, + "learning_rate": 0.00012581722383110718, + "loss": 1.1002, + "step": 367 + }, + { + "epoch": 0.8942891859052248, + "grad_norm": 20.25952031318632, + "learning_rate": 0.00012542889771203166, + "loss": 1.0629, + "step": 368 + }, + { + "epoch": 0.8967193195625759, + "grad_norm": 19.16914945262315, + "learning_rate": 0.00012504016118743935, + "loss": 1.1597, + "step": 369 + }, + { + "epoch": 0.8991494532199271, + "grad_norm": 35.65941460173898, + "learning_rate": 0.00012465102053127957, + "loss": 1.1501, + "step": 370 + }, + { + "epoch": 0.9015795868772782, + "grad_norm": 26.093269180565315, + "learning_rate": 0.00012426148202402404, + "loss": 1.1455, + "step": 371 + }, + { + "epoch": 0.9040097205346294, + "grad_norm": 30.928987547424892, + "learning_rate": 0.00012387155195256537, + "loss": 1.1392, + "step": 372 + }, + { + "epoch": 0.9064398541919806, + "grad_norm": 20.17512596846915, + "learning_rate": 0.00012348123661011601, + "loss": 1.1196, + "step": 373 + }, + { + "epoch": 0.9088699878493317, + "grad_norm": 24.380789157356805, + "learning_rate": 0.00012309054229610623, + "loss": 1.1, + "step": 374 + }, + { + "epoch": 0.9113001215066828, + "grad_norm": 95.49408387682203, + "learning_rate": 0.00012269947531608276, + "loss": 1.1825, + "step": 375 + }, + { + "epoch": 0.913730255164034, + "grad_norm": 23.635286340368726, + "learning_rate": 0.0001223080419816069, + "loss": 1.1717, + "step": 376 + }, + { + "epoch": 0.9161603888213852, + "grad_norm": 
21.942478063568313, + "learning_rate": 0.00012191624861015254, + "loss": 1.1661, + "step": 377 + }, + { + "epoch": 0.9185905224787363, + "grad_norm": 74.12601397150299, + "learning_rate": 0.00012152410152500453, + "loss": 1.1967, + "step": 378 + }, + { + "epoch": 0.9210206561360875, + "grad_norm": 37.26720386499629, + "learning_rate": 0.00012113160705515625, + "loss": 1.1566, + "step": 379 + }, + { + "epoch": 0.9234507897934386, + "grad_norm": 34.080854733427635, + "learning_rate": 0.00012073877153520776, + "loss": 1.0847, + "step": 380 + }, + { + "epoch": 0.9258809234507898, + "grad_norm": 26.50842916877183, + "learning_rate": 0.0001203456013052634, + "loss": 1.0824, + "step": 381 + }, + { + "epoch": 0.928311057108141, + "grad_norm": 37.92039651416441, + "learning_rate": 0.00011995210271082944, + "loss": 1.1485, + "step": 382 + }, + { + "epoch": 0.9307411907654921, + "grad_norm": 38.56931832374284, + "learning_rate": 0.00011955828210271187, + "loss": 1.0737, + "step": 383 + }, + { + "epoch": 0.9331713244228432, + "grad_norm": 24.419015296791592, + "learning_rate": 0.0001191641458369136, + "loss": 1.1208, + "step": 384 + }, + { + "epoch": 0.9356014580801945, + "grad_norm": 28.75379656643836, + "learning_rate": 0.00011876970027453222, + "loss": 1.1071, + "step": 385 + }, + { + "epoch": 0.9380315917375456, + "grad_norm": 138.39305133994282, + "learning_rate": 0.00011837495178165706, + "loss": 1.1405, + "step": 386 + }, + { + "epoch": 0.9404617253948967, + "grad_norm": 22.200435229928654, + "learning_rate": 0.00011797990672926652, + "loss": 1.124, + "step": 387 + }, + { + "epoch": 0.9428918590522479, + "grad_norm": 40.21978055156661, + "learning_rate": 0.00011758457149312538, + "loss": 1.1875, + "step": 388 + }, + { + "epoch": 0.945321992709599, + "grad_norm": 23.592672098002485, + "learning_rate": 0.00011718895245368167, + "loss": 1.1748, + "step": 389 + }, + { + "epoch": 0.9477521263669502, + "grad_norm": 17.463183827323444, + "learning_rate": 
0.00011679305599596393, + "loss": 1.1794, + "step": 390 + }, + { + "epoch": 0.9501822600243013, + "grad_norm": 36.219441964332646, + "learning_rate": 0.00011639688850947799, + "loss": 1.1459, + "step": 391 + }, + { + "epoch": 0.9526123936816525, + "grad_norm": 23.727472560980413, + "learning_rate": 0.00011600045638810386, + "loss": 1.076, + "step": 392 + }, + { + "epoch": 0.9550425273390036, + "grad_norm": 57.63284414960702, + "learning_rate": 0.00011560376602999272, + "loss": 1.1919, + "step": 393 + }, + { + "epoch": 0.9574726609963548, + "grad_norm": 40.23829998466358, + "learning_rate": 0.00011520682383746333, + "loss": 1.0701, + "step": 394 + }, + { + "epoch": 0.959902794653706, + "grad_norm": 58.2018640218209, + "learning_rate": 0.00011480963621689905, + "loss": 1.1745, + "step": 395 + }, + { + "epoch": 0.9623329283110571, + "grad_norm": 27.693448904288406, + "learning_rate": 0.00011441220957864421, + "loss": 1.1323, + "step": 396 + }, + { + "epoch": 0.9647630619684082, + "grad_norm": 34.94430005820724, + "learning_rate": 0.00011401455033690076, + "loss": 1.1497, + "step": 397 + }, + { + "epoch": 0.9671931956257594, + "grad_norm": 17.521922247865188, + "learning_rate": 0.00011361666490962468, + "loss": 1.1319, + "step": 398 + }, + { + "epoch": 0.9696233292831106, + "grad_norm": 25.886687159935246, + "learning_rate": 0.00011321855971842243, + "loss": 1.1418, + "step": 399 + }, + { + "epoch": 0.9720534629404617, + "grad_norm": 31.388154506614836, + "learning_rate": 0.00011282024118844738, + "loss": 1.1282, + "step": 400 + }, + { + "epoch": 0.9744835965978129, + "grad_norm": 27.458601253675347, + "learning_rate": 0.00011242171574829599, + "loss": 1.1647, + "step": 401 + }, + { + "epoch": 0.976913730255164, + "grad_norm": 25.922873022924257, + "learning_rate": 0.00011202298982990411, + "loss": 1.091, + "step": 402 + }, + { + "epoch": 0.9793438639125152, + "grad_norm": 20.129467589894766, + "learning_rate": 0.00011162406986844323, + "loss": 1.2, + "step": 403 + }, 
+ { + "epoch": 0.9817739975698664, + "grad_norm": 25.11892123906363, + "learning_rate": 0.00011122496230221645, + "loss": 1.0731, + "step": 404 + }, + { + "epoch": 0.9842041312272175, + "grad_norm": 26.416884392453543, + "learning_rate": 0.00011082567357255484, + "loss": 1.1836, + "step": 405 + }, + { + "epoch": 0.9866342648845686, + "grad_norm": 18.768078773975784, + "learning_rate": 0.00011042621012371322, + "loss": 1.1275, + "step": 406 + }, + { + "epoch": 0.9890643985419199, + "grad_norm": 22.275756523796257, + "learning_rate": 0.00011002657840276627, + "loss": 1.1228, + "step": 407 + }, + { + "epoch": 0.991494532199271, + "grad_norm": 29.605335344828575, + "learning_rate": 0.00010962678485950455, + "loss": 1.0255, + "step": 408 + }, + { + "epoch": 0.9939246658566221, + "grad_norm": 41.1718200727633, + "learning_rate": 0.00010922683594633021, + "loss": 1.1876, + "step": 409 + }, + { + "epoch": 0.9963547995139733, + "grad_norm": 20.46397475257922, + "learning_rate": 0.00010882673811815304, + "loss": 1.1168, + "step": 410 + }, + { + "epoch": 0.9987849331713244, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010842649783228624, + "loss": 1.1948, + "step": 411 + }, + { + "epoch": 1.0, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010802612154834211, + "loss": 1.1076, + "step": 412 + }, + { + "epoch": 1.0, + "eval_loss": 1.121336579322815, + "eval_runtime": 52.7043, + "eval_samples_per_second": 14.116, + "eval_steps_per_second": 1.765, + "step": 412 + }, + { + "epoch": 1.0024301336573511, + "grad_norm": 35.25758968935371, + "learning_rate": 0.00010762561572812788, + "loss": 1.1335, + "step": 413 + }, + { + "epoch": 1.0048602673147022, + "grad_norm": 20.78715726366623, + "learning_rate": 0.0001072249868355415, + "loss": 1.1003, + "step": 414 + }, + { + "epoch": 1.0072904009720534, + "grad_norm": 31.01116633763719, + "learning_rate": 0.0001068242413364671, + "loss": 1.1225, + "step": 415 + }, + { + "epoch": 1.0097205346294047, + "grad_norm": 
19.050638172672897, + "learning_rate": 0.00010642338569867086, + "loss": 1.0595, + "step": 416 + }, + { + "epoch": 1.0121506682867558, + "grad_norm": 41.54235389574412, + "learning_rate": 0.00010602242639169648, + "loss": 1.1719, + "step": 417 + }, + { + "epoch": 1.014580801944107, + "grad_norm": 41.34218206464363, + "learning_rate": 0.00010562136988676078, + "loss": 1.1292, + "step": 418 + }, + { + "epoch": 1.017010935601458, + "grad_norm": 32.436985934581934, + "learning_rate": 0.0001052202226566494, + "loss": 1.1244, + "step": 419 + }, + { + "epoch": 1.0194410692588092, + "grad_norm": 19.631825450596665, + "learning_rate": 0.0001048189911756121, + "loss": 1.1323, + "step": 420 + }, + { + "epoch": 1.0218712029161603, + "grad_norm": 23.275029440216805, + "learning_rate": 0.00010441768191925847, + "loss": 1.1605, + "step": 421 + }, + { + "epoch": 1.0243013365735116, + "grad_norm": 21.44161988455765, + "learning_rate": 0.0001040163013644533, + "loss": 1.0886, + "step": 422 + }, + { + "epoch": 1.0267314702308628, + "grad_norm": 31.9765167465431, + "learning_rate": 0.00010361485598921212, + "loss": 1.1378, + "step": 423 + }, + { + "epoch": 1.0291616038882139, + "grad_norm": 22.340741556027833, + "learning_rate": 0.00010321335227259661, + "loss": 1.1278, + "step": 424 + }, + { + "epoch": 1.031591737545565, + "grad_norm": 29.27286563037163, + "learning_rate": 0.00010281179669461005, + "loss": 1.1186, + "step": 425 + }, + { + "epoch": 1.034021871202916, + "grad_norm": 65.85877610734141, + "learning_rate": 0.00010241019573609269, + "loss": 1.1673, + "step": 426 + }, + { + "epoch": 1.0364520048602672, + "grad_norm": 35.173784527846884, + "learning_rate": 0.00010200855587861724, + "loss": 1.0903, + "step": 427 + }, + { + "epoch": 1.0388821385176186, + "grad_norm": 29.91546238299385, + "learning_rate": 0.00010160688360438419, + "loss": 1.0884, + "step": 428 + }, + { + "epoch": 1.0413122721749697, + "grad_norm": 26.873308685100223, + "learning_rate": 0.0001012051853961172, + 
"loss": 1.1296, + "step": 429 + }, + { + "epoch": 1.0437424058323208, + "grad_norm": 25.90622275527891, + "learning_rate": 0.00010080346773695853, + "loss": 1.1349, + "step": 430 + }, + { + "epoch": 1.046172539489672, + "grad_norm": 21.388851321680434, + "learning_rate": 0.00010040173711036431, + "loss": 1.0947, + "step": 431 + }, + { + "epoch": 1.048602673147023, + "grad_norm": 31.206506843880053, + "learning_rate": 0.0001, + "loss": 1.1541, + "step": 432 + }, + { + "epoch": 1.0510328068043742, + "grad_norm": 19.486767323523555, + "learning_rate": 9.959826288963571e-05, + "loss": 1.1574, + "step": 433 + }, + { + "epoch": 1.0534629404617255, + "grad_norm": 102.81325604770561, + "learning_rate": 9.919653226304148e-05, + "loss": 1.1762, + "step": 434 + }, + { + "epoch": 1.0558930741190766, + "grad_norm": 17.18170280255333, + "learning_rate": 9.879481460388282e-05, + "loss": 1.1208, + "step": 435 + }, + { + "epoch": 1.0583232077764277, + "grad_norm": 29.88292309614927, + "learning_rate": 9.839311639561583e-05, + "loss": 1.1114, + "step": 436 + }, + { + "epoch": 1.0607533414337789, + "grad_norm": 23.50392429976475, + "learning_rate": 9.799144412138275e-05, + "loss": 1.2026, + "step": 437 + }, + { + "epoch": 1.06318347509113, + "grad_norm": 24.794408487434744, + "learning_rate": 9.758980426390732e-05, + "loss": 1.1587, + "step": 438 + }, + { + "epoch": 1.065613608748481, + "grad_norm": 38.726295800289655, + "learning_rate": 9.718820330538998e-05, + "loss": 1.14, + "step": 439 + }, + { + "epoch": 1.0680437424058322, + "grad_norm": 31.152256057732977, + "learning_rate": 9.678664772740343e-05, + "loss": 1.0882, + "step": 440 + }, + { + "epoch": 1.0704738760631836, + "grad_norm": 65.73380095432839, + "learning_rate": 9.638514401078788e-05, + "loss": 1.1213, + "step": 441 + }, + { + "epoch": 1.0729040097205347, + "grad_norm": 69.07317297910537, + "learning_rate": 9.598369863554673e-05, + "loss": 1.1285, + "step": 442 + }, + { + "epoch": 1.0753341433778858, + "grad_norm": 
62.55969576940585, + "learning_rate": 9.558231808074156e-05, + "loss": 1.1252, + "step": 443 + }, + { + "epoch": 1.077764277035237, + "grad_norm": 26.35106444530265, + "learning_rate": 9.51810088243879e-05, + "loss": 1.108, + "step": 444 + }, + { + "epoch": 1.080194410692588, + "grad_norm": 76.70006955440516, + "learning_rate": 9.477977734335061e-05, + "loss": 1.1144, + "step": 445 + }, + { + "epoch": 1.0826245443499392, + "grad_norm": 22.376983523395264, + "learning_rate": 9.437863011323922e-05, + "loss": 1.173, + "step": 446 + }, + { + "epoch": 1.0850546780072905, + "grad_norm": 33.51322062360491, + "learning_rate": 9.397757360830353e-05, + "loss": 1.089, + "step": 447 + }, + { + "epoch": 1.0874848116646416, + "grad_norm": 24.87252097324779, + "learning_rate": 9.357661430132915e-05, + "loss": 1.098, + "step": 448 + }, + { + "epoch": 1.0899149453219927, + "grad_norm": 48.95371674408058, + "learning_rate": 9.317575866353292e-05, + "loss": 1.0491, + "step": 449 + }, + { + "epoch": 1.0923450789793439, + "grad_norm": 25.50740340531524, + "learning_rate": 9.277501316445854e-05, + "loss": 1.0939, + "step": 450 + }, + { + "epoch": 1.094775212636695, + "grad_norm": 27.60998778610316, + "learning_rate": 9.23743842718721e-05, + "loss": 1.1564, + "step": 451 + }, + { + "epoch": 1.097205346294046, + "grad_norm": 63.99226186124907, + "learning_rate": 9.197387845165793e-05, + "loss": 1.1088, + "step": 452 + }, + { + "epoch": 1.0996354799513974, + "grad_norm": 36.441157466567596, + "learning_rate": 9.157350216771378e-05, + "loss": 1.0897, + "step": 453 + }, + { + "epoch": 1.1020656136087486, + "grad_norm": 32.32587774153429, + "learning_rate": 9.117326188184695e-05, + "loss": 1.1285, + "step": 454 + }, + { + "epoch": 1.1044957472660997, + "grad_norm": 33.39257750037465, + "learning_rate": 9.077316405366981e-05, + "loss": 1.1568, + "step": 455 + }, + { + "epoch": 1.1069258809234508, + "grad_norm": 45.03485873480868, + "learning_rate": 9.037321514049548e-05, + "loss": 1.0791, + 
"step": 456 + }, + { + "epoch": 1.109356014580802, + "grad_norm": 35.1451377482015, + "learning_rate": 8.997342159723371e-05, + "loss": 1.1243, + "step": 457 + }, + { + "epoch": 1.111786148238153, + "grad_norm": 67.01465976966, + "learning_rate": 8.957378987628682e-05, + "loss": 1.0978, + "step": 458 + }, + { + "epoch": 1.1142162818955041, + "grad_norm": 33.057859846207634, + "learning_rate": 8.917432642744518e-05, + "loss": 1.1431, + "step": 459 + }, + { + "epoch": 1.1166464155528555, + "grad_norm": 30.602840863536635, + "learning_rate": 8.877503769778356e-05, + "loss": 1.1157, + "step": 460 + }, + { + "epoch": 1.1190765492102066, + "grad_norm": 38.088467248288964, + "learning_rate": 8.83759301315568e-05, + "loss": 1.0776, + "step": 461 + }, + { + "epoch": 1.1215066828675577, + "grad_norm": 66.03671829863266, + "learning_rate": 8.797701017009591e-05, + "loss": 1.1468, + "step": 462 + }, + { + "epoch": 1.1239368165249088, + "grad_norm": 32.293691874682686, + "learning_rate": 8.757828425170404e-05, + "loss": 1.1115, + "step": 463 + }, + { + "epoch": 1.12636695018226, + "grad_norm": 32.70707175332633, + "learning_rate": 8.717975881155261e-05, + "loss": 1.1677, + "step": 464 + }, + { + "epoch": 1.128797083839611, + "grad_norm": 48.79069594971439, + "learning_rate": 8.678144028157759e-05, + "loss": 1.1341, + "step": 465 + }, + { + "epoch": 1.1312272174969624, + "grad_norm": 37.52808559072613, + "learning_rate": 8.638333509037536e-05, + "loss": 1.1414, + "step": 466 + }, + { + "epoch": 1.1336573511543135, + "grad_norm": 27.096068124970536, + "learning_rate": 8.598544966309925e-05, + "loss": 1.1719, + "step": 467 + }, + { + "epoch": 1.1360874848116647, + "grad_norm": 16.019227077248434, + "learning_rate": 8.55877904213558e-05, + "loss": 1.1148, + "step": 468 + }, + { + "epoch": 1.1385176184690158, + "grad_norm": 29.861941956913498, + "learning_rate": 8.519036378310096e-05, + "loss": 1.1486, + "step": 469 + }, + { + "epoch": 1.140947752126367, + "grad_norm": 
23.058998452019107, + "learning_rate": 8.47931761625367e-05, + "loss": 1.0745, + "step": 470 + }, + { + "epoch": 1.143377885783718, + "grad_norm": 24.486692418227875, + "learning_rate": 8.43962339700073e-05, + "loss": 1.1333, + "step": 471 + }, + { + "epoch": 1.1458080194410694, + "grad_norm": 31.632544516924323, + "learning_rate": 8.399954361189615e-05, + "loss": 1.1565, + "step": 472 + }, + { + "epoch": 1.1482381530984205, + "grad_norm": 21.67735267443374, + "learning_rate": 8.360311149052205e-05, + "loss": 1.109, + "step": 473 + }, + { + "epoch": 1.1506682867557716, + "grad_norm": 29.096918560226527, + "learning_rate": 8.320694400403606e-05, + "loss": 1.1517, + "step": 474 + }, + { + "epoch": 1.1530984204131227, + "grad_norm": 46.067313216206955, + "learning_rate": 8.281104754631835e-05, + "loss": 1.1043, + "step": 475 + }, + { + "epoch": 1.1555285540704738, + "grad_norm": 30.84953769166141, + "learning_rate": 8.241542850687465e-05, + "loss": 1.1081, + "step": 476 + }, + { + "epoch": 1.157958687727825, + "grad_norm": 39.34158523904847, + "learning_rate": 8.20200932707335e-05, + "loss": 1.1787, + "step": 477 + }, + { + "epoch": 1.160388821385176, + "grad_norm": 39.14663302484904, + "learning_rate": 8.162504821834295e-05, + "loss": 1.202, + "step": 478 + }, + { + "epoch": 1.1628189550425274, + "grad_norm": 49.7279004249915, + "learning_rate": 8.123029972546781e-05, + "loss": 1.1439, + "step": 479 + }, + { + "epoch": 1.1652490886998785, + "grad_norm": 35.49897960878779, + "learning_rate": 8.083585416308642e-05, + "loss": 1.0741, + "step": 480 + }, + { + "epoch": 1.1676792223572297, + "grad_norm": 31.306252618855535, + "learning_rate": 8.044171789728816e-05, + "loss": 1.0697, + "step": 481 + }, + { + "epoch": 1.1701093560145808, + "grad_norm": 22.40745672651249, + "learning_rate": 8.004789728917059e-05, + "loss": 1.1498, + "step": 482 + }, + { + "epoch": 1.172539489671932, + "grad_norm": 32.19326746671122, + "learning_rate": 7.965439869473664e-05, + "loss": 1.1392, 
+ "step": 483 + }, + { + "epoch": 1.1749696233292832, + "grad_norm": 33.66876390791385, + "learning_rate": 7.926122846479224e-05, + "loss": 1.1049, + "step": 484 + }, + { + "epoch": 1.1773997569866343, + "grad_norm": 35.43357233261174, + "learning_rate": 7.886839294484377e-05, + "loss": 1.0467, + "step": 485 + }, + { + "epoch": 1.1798298906439855, + "grad_norm": 50.660998166256256, + "learning_rate": 7.84758984749955e-05, + "loss": 1.1244, + "step": 486 + }, + { + "epoch": 1.1822600243013366, + "grad_norm": 41.356845334605936, + "learning_rate": 7.808375138984745e-05, + "loss": 1.1279, + "step": 487 + }, + { + "epoch": 1.1846901579586877, + "grad_norm": 22.947663723281487, + "learning_rate": 7.769195801839313e-05, + "loss": 1.0787, + "step": 488 + }, + { + "epoch": 1.1871202916160388, + "grad_norm": 36.434647074399905, + "learning_rate": 7.730052468391725e-05, + "loss": 1.1148, + "step": 489 + }, + { + "epoch": 1.18955042527339, + "grad_norm": 75.94549877059467, + "learning_rate": 7.690945770389377e-05, + "loss": 1.1127, + "step": 490 + }, + { + "epoch": 1.1919805589307413, + "grad_norm": 68.03126664734435, + "learning_rate": 7.6518763389884e-05, + "loss": 1.1672, + "step": 491 + }, + { + "epoch": 1.1944106925880924, + "grad_norm": 40.15361719091623, + "learning_rate": 7.612844804743466e-05, + "loss": 1.0962, + "step": 492 + }, + { + "epoch": 1.1968408262454435, + "grad_norm": 105.80023571763755, + "learning_rate": 7.573851797597602e-05, + "loss": 1.1091, + "step": 493 + }, + { + "epoch": 1.1992709599027946, + "grad_norm": 41.84401502420881, + "learning_rate": 7.534897946872042e-05, + "loss": 1.1359, + "step": 494 + }, + { + "epoch": 1.2017010935601458, + "grad_norm": 21.985533615468846, + "learning_rate": 7.495983881256067e-05, + "loss": 1.1024, + "step": 495 + }, + { + "epoch": 1.2041312272174969, + "grad_norm": 23.02649898605792, + "learning_rate": 7.457110228796838e-05, + "loss": 1.1089, + "step": 496 + }, + { + "epoch": 1.206561360874848, + "grad_norm": 
74.4950498938832, + "learning_rate": 7.418277616889282e-05, + "loss": 1.0439, + "step": 497 + }, + { + "epoch": 1.2089914945321993, + "grad_norm": 27.637660484960865, + "learning_rate": 7.379486672265964e-05, + "loss": 1.1453, + "step": 498 + }, + { + "epoch": 1.2114216281895505, + "grad_norm": 34.98561655821008, + "learning_rate": 7.340738020986961e-05, + "loss": 1.139, + "step": 499 + }, + { + "epoch": 1.2138517618469016, + "grad_norm": 28.47627677351389, + "learning_rate": 7.302032288429756e-05, + "loss": 1.0623, + "step": 500 + }, + { + "epoch": 1.2162818955042527, + "grad_norm": 39.551486186427596, + "learning_rate": 7.263370099279172e-05, + "loss": 1.1277, + "step": 501 + }, + { + "epoch": 1.2187120291616038, + "grad_norm": 44.12973085459368, + "learning_rate": 7.224752077517253e-05, + "loss": 1.1768, + "step": 502 + }, + { + "epoch": 1.2211421628189552, + "grad_norm": 84.84836585196132, + "learning_rate": 7.186178846413214e-05, + "loss": 1.1892, + "step": 503 + }, + { + "epoch": 1.2235722964763063, + "grad_norm": 34.94807915131505, + "learning_rate": 7.147651028513383e-05, + "loss": 1.1108, + "step": 504 + }, + { + "epoch": 1.2260024301336574, + "grad_norm": 46.19847384406232, + "learning_rate": 7.109169245631149e-05, + "loss": 1.0956, + "step": 505 + }, + { + "epoch": 1.2284325637910085, + "grad_norm": 38.58484473058957, + "learning_rate": 7.070734118836925e-05, + "loss": 1.1175, + "step": 506 + }, + { + "epoch": 1.2308626974483596, + "grad_norm": 37.84739298111386, + "learning_rate": 7.032346268448118e-05, + "loss": 1.1411, + "step": 507 + }, + { + "epoch": 1.2332928311057108, + "grad_norm": 53.5471335398439, + "learning_rate": 6.994006314019141e-05, + "loss": 1.1332, + "step": 508 + }, + { + "epoch": 1.2357229647630619, + "grad_norm": 91.55067777365485, + "learning_rate": 6.955714874331387e-05, + "loss": 1.1205, + "step": 509 + }, + { + "epoch": 1.2381530984204132, + "grad_norm": 27.05333642785952, + "learning_rate": 6.917472567383252e-05, + "loss": 
1.099, + "step": 510 + }, + { + "epoch": 1.2405832320777643, + "grad_norm": 24.519879042487336, + "learning_rate": 6.87928001038017e-05, + "loss": 1.1401, + "step": 511 + }, + { + "epoch": 1.2430133657351154, + "grad_norm": 33.763495598365786, + "learning_rate": 6.84113781972464e-05, + "loss": 1.2058, + "step": 512 + }, + { + "epoch": 1.2454434993924666, + "grad_norm": 34.49114206138826, + "learning_rate": 6.803046611006278e-05, + "loss": 1.1044, + "step": 513 + }, + { + "epoch": 1.2478736330498177, + "grad_norm": 74.20211157975073, + "learning_rate": 6.765006998991888e-05, + "loss": 1.111, + "step": 514 + }, + { + "epoch": 1.250303766707169, + "grad_norm": 32.30436806042553, + "learning_rate": 6.727019597615545e-05, + "loss": 1.1063, + "step": 515 + }, + { + "epoch": 1.250303766707169, + "eval_loss": 1.1128273010253906, + "eval_runtime": 53.4998, + "eval_samples_per_second": 13.907, + "eval_steps_per_second": 1.738, + "step": 515 + }, + { + "epoch": 1.25273390036452, + "grad_norm": 42.104054612880084, + "learning_rate": 6.689085019968669e-05, + "loss": 1.1315, + "step": 516 + }, + { + "epoch": 1.2551640340218713, + "grad_norm": 25.66097714624212, + "learning_rate": 6.651203878290139e-05, + "loss": 1.0916, + "step": 517 + }, + { + "epoch": 1.2575941676792224, + "grad_norm": 35.12310576456352, + "learning_rate": 6.613376783956423e-05, + "loss": 1.0699, + "step": 518 + }, + { + "epoch": 1.2600243013365735, + "grad_norm": 34.172951559594566, + "learning_rate": 6.575604347471695e-05, + "loss": 1.1412, + "step": 519 + }, + { + "epoch": 1.2624544349939246, + "grad_norm": 54.373563773275116, + "learning_rate": 6.537887178457984e-05, + "loss": 1.1255, + "step": 520 + }, + { + "epoch": 1.2648845686512757, + "grad_norm": 33.806385046788755, + "learning_rate": 6.500225885645346e-05, + "loss": 1.101, + "step": 521 + }, + { + "epoch": 1.267314702308627, + "grad_norm": 34.17813695957543, + "learning_rate": 6.46262107686203e-05, + "loss": 1.1226, + "step": 522 + }, + { + "epoch": 
1.2697448359659782, + "grad_norm": 24.68048087106548, + "learning_rate": 6.425073359024663e-05, + "loss": 1.1787, + "step": 523 + }, + { + "epoch": 1.2721749696233293, + "grad_norm": 32.78749757697808, + "learning_rate": 6.387583338128471e-05, + "loss": 1.0541, + "step": 524 + }, + { + "epoch": 1.2746051032806804, + "grad_norm": 30.906673844090044, + "learning_rate": 6.350151619237488e-05, + "loss": 1.0964, + "step": 525 + }, + { + "epoch": 1.2770352369380316, + "grad_norm": 32.571858392892736, + "learning_rate": 6.312778806474795e-05, + "loss": 1.1251, + "step": 526 + }, + { + "epoch": 1.2794653705953827, + "grad_norm": 43.02428916532565, + "learning_rate": 6.275465503012751e-05, + "loss": 1.0473, + "step": 527 + }, + { + "epoch": 1.2818955042527338, + "grad_norm": 60.93587506764561, + "learning_rate": 6.2382123110633e-05, + "loss": 1.078, + "step": 528 + }, + { + "epoch": 1.2843256379100851, + "grad_norm": 64.6934775930251, + "learning_rate": 6.201019831868208e-05, + "loss": 1.0904, + "step": 529 + }, + { + "epoch": 1.2867557715674363, + "grad_norm": 32.977077613035426, + "learning_rate": 6.16388866568938e-05, + "loss": 1.0705, + "step": 530 + }, + { + "epoch": 1.2891859052247874, + "grad_norm": 28.27407310492513, + "learning_rate": 6.126819411799175e-05, + "loss": 1.1252, + "step": 531 + }, + { + "epoch": 1.2916160388821385, + "grad_norm": 33.73515826089828, + "learning_rate": 6.0898126684707265e-05, + "loss": 1.1262, + "step": 532 + }, + { + "epoch": 1.2940461725394896, + "grad_norm": 25.370361818959903, + "learning_rate": 6.052869032968285e-05, + "loss": 1.0845, + "step": 533 + }, + { + "epoch": 1.296476306196841, + "grad_norm": 37.389287060597105, + "learning_rate": 6.015989101537586e-05, + "loss": 1.1352, + "step": 534 + }, + { + "epoch": 1.2989064398541919, + "grad_norm": 39.04755104008223, + "learning_rate": 5.979173469396227e-05, + "loss": 1.1538, + "step": 535 + }, + { + "epoch": 1.3013365735115432, + "grad_norm": 34.33676719612293, + "learning_rate": 
5.9424227307240554e-05, + "loss": 1.1725, + "step": 536 + }, + { + "epoch": 1.3037667071688943, + "grad_norm": 64.66076997769457, + "learning_rate": 5.905737478653572e-05, + "loss": 1.1146, + "step": 537 + }, + { + "epoch": 1.3061968408262454, + "grad_norm": 48.043289790386325, + "learning_rate": 5.8691183052603834e-05, + "loss": 1.1035, + "step": 538 + }, + { + "epoch": 1.3086269744835966, + "grad_norm": 49.08397341659928, + "learning_rate": 5.83256580155362e-05, + "loss": 1.1653, + "step": 539 + }, + { + "epoch": 1.3110571081409477, + "grad_norm": 46.688886812303515, + "learning_rate": 5.796080557466406e-05, + "loss": 1.1328, + "step": 540 + }, + { + "epoch": 1.313487241798299, + "grad_norm": 27.503882325413493, + "learning_rate": 5.7596631618463514e-05, + "loss": 1.1019, + "step": 541 + }, + { + "epoch": 1.3159173754556501, + "grad_norm": 48.88974129574653, + "learning_rate": 5.723314202446026e-05, + "loss": 1.121, + "step": 542 + }, + { + "epoch": 1.3183475091130012, + "grad_norm": 28.105881157995345, + "learning_rate": 5.687034265913485e-05, + "loss": 1.0898, + "step": 543 + }, + { + "epoch": 1.3207776427703524, + "grad_norm": 30.410731278414804, + "learning_rate": 5.6508239377828034e-05, + "loss": 1.07, + "step": 544 + }, + { + "epoch": 1.3232077764277035, + "grad_norm": 38.08324176765882, + "learning_rate": 5.614683802464631e-05, + "loss": 1.1503, + "step": 545 + }, + { + "epoch": 1.3256379100850546, + "grad_norm": 46.28952293745534, + "learning_rate": 5.578614443236738e-05, + "loss": 1.1282, + "step": 546 + }, + { + "epoch": 1.3280680437424057, + "grad_norm": 68.2597453597135, + "learning_rate": 5.542616442234618e-05, + "loss": 1.1373, + "step": 547 + }, + { + "epoch": 1.330498177399757, + "grad_norm": 30.351663825014143, + "learning_rate": 5.5066903804421025e-05, + "loss": 1.1633, + "step": 548 + }, + { + "epoch": 1.3329283110571082, + "grad_norm": 38.2711285636887, + "learning_rate": 5.470836837681954e-05, + "loss": 1.1604, + "step": 549 + }, + { + 
"epoch": 1.3353584447144593, + "grad_norm": 35.64230091531108, + "learning_rate": 5.4350563926065404e-05, + "loss": 1.0564, + "step": 550 + }, + { + "epoch": 1.3377885783718104, + "grad_norm": 44.869816046925564, + "learning_rate": 5.399349622688479e-05, + "loss": 1.1376, + "step": 551 + }, + { + "epoch": 1.3402187120291615, + "grad_norm": 26.681037126315633, + "learning_rate": 5.3637171042113146e-05, + "loss": 1.0867, + "step": 552 + }, + { + "epoch": 1.3426488456865129, + "grad_norm": 34.6124686262535, + "learning_rate": 5.32815941226022e-05, + "loss": 1.0474, + "step": 553 + }, + { + "epoch": 1.3450789793438638, + "grad_norm": 35.92639009060983, + "learning_rate": 5.2926771207127254e-05, + "loss": 1.0958, + "step": 554 + }, + { + "epoch": 1.3475091130012151, + "grad_norm": 39.08938922562224, + "learning_rate": 5.2572708022294504e-05, + "loss": 1.074, + "step": 555 + }, + { + "epoch": 1.3499392466585662, + "grad_norm": 76.06708166273745, + "learning_rate": 5.2219410282448514e-05, + "loss": 1.0865, + "step": 556 + }, + { + "epoch": 1.3523693803159174, + "grad_norm": 74.14222265654887, + "learning_rate": 5.1866883689580056e-05, + "loss": 1.1567, + "step": 557 + }, + { + "epoch": 1.3547995139732685, + "grad_norm": 34.82441678662901, + "learning_rate": 5.151513393323426e-05, + "loss": 1.0802, + "step": 558 + }, + { + "epoch": 1.3572296476306196, + "grad_norm": 75.53504846566143, + "learning_rate": 5.116416669041843e-05, + "loss": 1.0623, + "step": 559 + }, + { + "epoch": 1.359659781287971, + "grad_norm": 29.423475817434785, + "learning_rate": 5.0813987625510775e-05, + "loss": 1.077, + "step": 560 + }, + { + "epoch": 1.362089914945322, + "grad_norm": 44.607486168434534, + "learning_rate": 5.046460239016879e-05, + "loss": 1.096, + "step": 561 + }, + { + "epoch": 1.3645200486026732, + "grad_norm": 40.684125033315404, + "learning_rate": 5.011601662323807e-05, + "loss": 1.148, + "step": 562 + }, + { + "epoch": 1.3669501822600243, + "grad_norm": 47.33103026318705, + 
"learning_rate": 4.976823595066128e-05, + "loss": 1.1712, + "step": 563 + }, + { + "epoch": 1.3693803159173754, + "grad_norm": 51.17017845058186, + "learning_rate": 4.9421265985387476e-05, + "loss": 1.1287, + "step": 564 + }, + { + "epoch": 1.3718104495747265, + "grad_norm": 50.76665552103517, + "learning_rate": 4.907511232728145e-05, + "loss": 1.1156, + "step": 565 + }, + { + "epoch": 1.3742405832320777, + "grad_norm": 32.6007633025874, + "learning_rate": 4.872978056303327e-05, + "loss": 1.1477, + "step": 566 + }, + { + "epoch": 1.376670716889429, + "grad_norm": 29.696241441710107, + "learning_rate": 4.8385276266068146e-05, + "loss": 1.0874, + "step": 567 + }, + { + "epoch": 1.37910085054678, + "grad_norm": 58.96613500379004, + "learning_rate": 4.804160499645667e-05, + "loss": 1.0616, + "step": 568 + }, + { + "epoch": 1.3815309842041312, + "grad_norm": 37.104100020310334, + "learning_rate": 4.7698772300824756e-05, + "loss": 1.0878, + "step": 569 + }, + { + "epoch": 1.3839611178614823, + "grad_norm": 51.735902941979305, + "learning_rate": 4.735678371226441e-05, + "loss": 1.0836, + "step": 570 + }, + { + "epoch": 1.3863912515188335, + "grad_norm": 55.49190976804079, + "learning_rate": 4.7015644750244306e-05, + "loss": 1.0473, + "step": 571 + }, + { + "epoch": 1.3888213851761848, + "grad_norm": 34.27972449829039, + "learning_rate": 4.6675360920520625e-05, + "loss": 1.0723, + "step": 572 + }, + { + "epoch": 1.391251518833536, + "grad_norm": 28.508157856527724, + "learning_rate": 4.6335937715048306e-05, + "loss": 1.0723, + "step": 573 + }, + { + "epoch": 1.393681652490887, + "grad_norm": 106.84009565003795, + "learning_rate": 4.599738061189244e-05, + "loss": 1.149, + "step": 574 + }, + { + "epoch": 1.3961117861482382, + "grad_norm": 50.543394606036294, + "learning_rate": 4.565969507513981e-05, + "loss": 1.0991, + "step": 575 + }, + { + "epoch": 1.3985419198055893, + "grad_norm": 30.409124335052745, + "learning_rate": 4.532288655481062e-05, + "loss": 1.1157, + "step": 
576 + }, + { + "epoch": 1.4009720534629404, + "grad_norm": 89.92061876679301, + "learning_rate": 4.498696048677059e-05, + "loss": 1.1526, + "step": 577 + }, + { + "epoch": 1.4034021871202915, + "grad_norm": 84.27775422110602, + "learning_rate": 4.465192229264337e-05, + "loss": 1.1418, + "step": 578 + }, + { + "epoch": 1.4058323207776429, + "grad_norm": 40.7815489623743, + "learning_rate": 4.4317777379722866e-05, + "loss": 1.0831, + "step": 579 + }, + { + "epoch": 1.408262454434994, + "grad_norm": 66.6911504313278, + "learning_rate": 4.3984531140885943e-05, + "loss": 1.1088, + "step": 580 + }, + { + "epoch": 1.410692588092345, + "grad_norm": 137.00882181835217, + "learning_rate": 4.365218895450558e-05, + "loss": 1.1089, + "step": 581 + }, + { + "epoch": 1.4131227217496962, + "grad_norm": 41.139168895296855, + "learning_rate": 4.332075618436386e-05, + "loss": 1.1603, + "step": 582 + }, + { + "epoch": 1.4155528554070473, + "grad_norm": 35.443969765428506, + "learning_rate": 4.29902381795655e-05, + "loss": 1.0301, + "step": 583 + }, + { + "epoch": 1.4179829890643987, + "grad_norm": 32.931514576694674, + "learning_rate": 4.266064027445155e-05, + "loss": 1.1016, + "step": 584 + }, + { + "epoch": 1.4204131227217496, + "grad_norm": 64.21015694858382, + "learning_rate": 4.2331967788513295e-05, + "loss": 1.0789, + "step": 585 + }, + { + "epoch": 1.422843256379101, + "grad_norm": 84.13251752827094, + "learning_rate": 4.200422602630629e-05, + "loss": 1.1573, + "step": 586 + }, + { + "epoch": 1.425273390036452, + "grad_norm": 53.61636603108024, + "learning_rate": 4.167742027736482e-05, + "loss": 1.0942, + "step": 587 + }, + { + "epoch": 1.4277035236938032, + "grad_norm": 133.20877569415256, + "learning_rate": 4.135155581611661e-05, + "loss": 1.0877, + "step": 588 + }, + { + "epoch": 1.4301336573511543, + "grad_norm": 49.85736467319357, + "learning_rate": 4.102663790179764e-05, + "loss": 1.0619, + "step": 589 + }, + { + "epoch": 1.4325637910085054, + "grad_norm": 
91.13217639524017, + "learning_rate": 4.070267177836712e-05, + "loss": 1.1093, + "step": 590 + }, + { + "epoch": 1.4349939246658567, + "grad_norm": 49.25558128250457, + "learning_rate": 4.037966267442315e-05, + "loss": 1.1344, + "step": 591 + }, + { + "epoch": 1.4374240583232079, + "grad_norm": 95.87244356130316, + "learning_rate": 4.005761580311805e-05, + "loss": 1.0929, + "step": 592 + }, + { + "epoch": 1.439854191980559, + "grad_norm": 74.28903671045653, + "learning_rate": 3.973653636207437e-05, + "loss": 1.1263, + "step": 593 + }, + { + "epoch": 1.44228432563791, + "grad_norm": 53.99454529785116, + "learning_rate": 3.941642953330103e-05, + "loss": 1.0916, + "step": 594 + }, + { + "epoch": 1.4447144592952612, + "grad_norm": 113.26015597338959, + "learning_rate": 3.909730048310962e-05, + "loss": 1.1009, + "step": 595 + }, + { + "epoch": 1.4471445929526123, + "grad_norm": 134.4015550981493, + "learning_rate": 3.8779154362030986e-05, + "loss": 1.1351, + "step": 596 + }, + { + "epoch": 1.4495747266099634, + "grad_norm": 90.61611981238187, + "learning_rate": 3.846199630473216e-05, + "loss": 1.0827, + "step": 597 + }, + { + "epoch": 1.4520048602673148, + "grad_norm": 56.55050791518521, + "learning_rate": 3.814583142993352e-05, + "loss": 1.1145, + "step": 598 + }, + { + "epoch": 1.454434993924666, + "grad_norm": 265.6916535243014, + "learning_rate": 3.7830664840326145e-05, + "loss": 1.1459, + "step": 599 + }, + { + "epoch": 1.456865127582017, + "grad_norm": 72.81191101030372, + "learning_rate": 3.7516501622489367e-05, + "loss": 1.0903, + "step": 600 + }, + { + "epoch": 1.4592952612393681, + "grad_norm": 58.309143549086556, + "learning_rate": 3.720334684680889e-05, + "loss": 1.1041, + "step": 601 + }, + { + "epoch": 1.4617253948967193, + "grad_norm": 35.19205741792398, + "learning_rate": 3.689120556739475e-05, + "loss": 1.1523, + "step": 602 + }, + { + "epoch": 1.4641555285540706, + "grad_norm": 88.97226951757321, + "learning_rate": 3.6580082821999786e-05, + "loss": 
1.1117, + "step": 603 + }, + { + "epoch": 1.4665856622114215, + "grad_norm": 64.50873879301322, + "learning_rate": 3.6269983631938475e-05, + "loss": 1.1256, + "step": 604 + }, + { + "epoch": 1.4690157958687728, + "grad_norm": 78.10556611104111, + "learning_rate": 3.596091300200578e-05, + "loss": 1.0834, + "step": 605 + }, + { + "epoch": 1.471445929526124, + "grad_norm": 69.38449946362529, + "learning_rate": 3.565287592039628e-05, + "loss": 1.1026, + "step": 606 + }, + { + "epoch": 1.473876063183475, + "grad_norm": 79.60241521456905, + "learning_rate": 3.534587735862391e-05, + "loss": 1.0456, + "step": 607 + }, + { + "epoch": 1.4763061968408262, + "grad_norm": 89.68581306071424, + "learning_rate": 3.503992227144147e-05, + "loss": 1.0809, + "step": 608 + }, + { + "epoch": 1.4787363304981773, + "grad_norm": 68.570527237558, + "learning_rate": 3.473501559676088e-05, + "loss": 1.0754, + "step": 609 + }, + { + "epoch": 1.4811664641555287, + "grad_norm": 54.94762317625427, + "learning_rate": 3.4431162255573245e-05, + "loss": 1.1751, + "step": 610 + }, + { + "epoch": 1.4835965978128798, + "grad_norm": 109.12821602719706, + "learning_rate": 3.4128367151869714e-05, + "loss": 1.1055, + "step": 611 + }, + { + "epoch": 1.486026731470231, + "grad_norm": 198.79030469542352, + "learning_rate": 3.3826635172562094e-05, + "loss": 1.1369, + "step": 612 + }, + { + "epoch": 1.488456865127582, + "grad_norm": 62.002866716809, + "learning_rate": 3.352597118740404e-05, + "loss": 1.1611, + "step": 613 + }, + { + "epoch": 1.4908869987849331, + "grad_norm": 79.21193137029579, + "learning_rate": 3.3226380048912585e-05, + "loss": 1.1688, + "step": 614 + }, + { + "epoch": 1.4933171324422843, + "grad_norm": 68.6722934326242, + "learning_rate": 3.292786659228973e-05, + "loss": 1.1248, + "step": 615 + }, + { + "epoch": 1.4957472660996354, + "grad_norm": 104.34122241838278, + "learning_rate": 3.263043563534428e-05, + "loss": 1.1425, + "step": 616 + }, + { + "epoch": 1.4981773997569867, + "grad_norm": 
86.43862038340298, + "learning_rate": 3.233409197841437e-05, + "loss": 1.0562, + "step": 617 + }, + { + "epoch": 1.5006075334143378, + "grad_norm": 79.74137751394451, + "learning_rate": 3.2038840404289705e-05, + "loss": 1.1214, + "step": 618 + }, + { + "epoch": 1.5006075334143378, + "eval_loss": 1.1088899374008179, + "eval_runtime": 53.0545, + "eval_samples_per_second": 14.023, + "eval_steps_per_second": 1.753, + "step": 618 + } + ], + "logging_steps": 1, + "max_steps": 822, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.157723878347244e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-618/training_args.bin b/checkpoint-618/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/checkpoint-618/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568 diff --git a/checkpoint-618/zero_to_fp32.py b/checkpoint-618/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-618/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) 
+ + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = 
torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-772/README.md b/checkpoint-772/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-772/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-772/adapter_config.json b/checkpoint-772/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d9d8fa4860138947c736b05e4c3dd010601e2671 --- /dev/null +++ b/checkpoint-772/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "k_proj", + "gate_up_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-772/adapter_model.safetensors b/checkpoint-772/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efd4f6d0842ac4eee5367d6307eb60c53f719da1 --- /dev/null +++ b/checkpoint-772/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a429bd0213ee3f039d3495189dd78fc3c718e5e7b9dc021d50db49606805cb5 +size 5579575888 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e4efed2d9158f5e347589f668179255d748ce3f --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52bf210bfbacef345afbf22fef52f863a0c4dd794b34d8ba6fb4625b51747425 +size 2458601314 
diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdab4a2c4fd26bcc09ebcce62851498b709396ac --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0ec0fca7fa53b6ebdc629bdb92ce80df032e2245cb222a1d27f4c658c3cce1 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a759ce7a9292456cc182bf48012e5a40d472bb7a --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779b6930d95dca330aa3cf0c8072e4a0dc3fcc8a1879f06578b790ec8e9785f3 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2cf1ddf420cab05d96e179879577e1380c9c507 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9235fbd49d0080521cf226ea88045fd4131d451b2f9f65206e2a491bef10633 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..883bfa0675976c5bfbc2a48a7adc7565d4917e6d --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8608560fe69cbbee152b21c18551b87860579716512b78189d5b6f54fe38381b +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd0f103692fe42c845fa4984297d83e079fde655 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4688795cec4d22423afd1a59ecde6329428f4a7767c23d7b69cac6470de36e84 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9e563b0900546dc3cb5fedb972e14dac98e9f26 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad73a534b1a7b01cb75d306d2485eba0701a079986666dc31b518895a8e31494 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00e956b249dc8f7c960fa4edc9b6e5e2ddc2aa68 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1136fb48987f32a04186db2b48ff9a8e56d87c14dc9fb5910b490500b78a1808 +size 2458601314 diff --git a/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bccd5d3f2e4a5344a8ab671ba9e574655d48069 --- 
/dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b978b747317a3b603149b9fd0f9a2890bbde79d056e71d33616c49732c2a400 +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38fdddfb5369b380374aea97240d35c142317877 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5de312ccaa1b40bf0b2e53f82e4af34f398cd675ae82c42ca1e5328ce1475a +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f56495a0e7653e0375a21dbfb44840faf342d137 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2088a9c158a8e29039f2d25c01ca5831bbd50ae7f05818eee42ea02f3556dc +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..411dcbfd58343a708c500a7b991957fefd2f5baf --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ec4aa947bc922c74815c7db480fe0dbd4f7991f2b141f1252b3593f6ad7ee71 +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..401331e9ba26f657834656a20c84d359573d7a06 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9264b08d23fc9636ea17bcc8238a8cfd1c7064a22711c1557e7ec0708174988c +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dec1589e5d65e770275fb05d448b576f87a19b2 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae179cada4a016a1aa2818065da449ae275da7e5d76cd91a8ab3c9d294de2ae +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2b96f0f56fbd4b86e69f48a7b27687b75e9105e --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67ae6578c5308d2ba390277dc3e8885b9cd57f62550755b8f5648dcd3a6b3be +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdbec0ec4a1b8ce9f9b78b0442fc58c6e80601e4 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eba29263e25cb4d1647a7500cf93bc0158d98d6403828e7794d0bc6c78af264 +size 747668 diff --git a/checkpoint-772/latest b/checkpoint-772/latest new file mode 100644 index 
0000000000000000000000000000000000000000..5bbd92f860917d9be16238187bc2f86bba0c5670 --- /dev/null +++ b/checkpoint-772/latest @@ -0,0 +1 @@ +global_step772 \ No newline at end of file diff --git a/checkpoint-772/rng_state_0.pth b/checkpoint-772/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ee67a9fe52438bf9b329a6ee618dfda99e3f467 --- /dev/null +++ b/checkpoint-772/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18566e96d351f85c9faf808d2b8e8b090ac0eebabafe863d5320bf7cc2562e69 +size 15984 diff --git a/checkpoint-772/rng_state_1.pth b/checkpoint-772/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d4331d73f7582d82d99fa612f1d416646c40ce7 --- /dev/null +++ b/checkpoint-772/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef6e1e354a2761c9dfe8da34c560d5a5ee9fefedac31317c8ff85710de1261b +size 15984 diff --git a/checkpoint-772/rng_state_2.pth b/checkpoint-772/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..16d822119b77a126e5baa615a68181a03d099a7a --- /dev/null +++ b/checkpoint-772/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5df4b4b7b92851c781ba46584013741a933dde8af0b0cb5c1fa07712e79cc8 +size 15984 diff --git a/checkpoint-772/rng_state_3.pth b/checkpoint-772/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c98bd3f46a8b7286cc1d121246b38da950881056 --- /dev/null +++ b/checkpoint-772/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19e8ce609ad14ca28d6ad7eb241877b2d8d1550e78093a062a56bd58615f2cf +size 15984 diff --git a/checkpoint-772/rng_state_4.pth b/checkpoint-772/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4bfbfc6202d81260748e53fb67a9f4a49020dd28 --- /dev/null +++ b/checkpoint-772/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b57c77e995abe0fba5f3846694b27200af5934217086635b6cb04a2c25be8e3e +size 15984 diff --git a/checkpoint-772/rng_state_5.pth b/checkpoint-772/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..343a53021b34d918a5b8bb0dba622462755bc641 --- /dev/null +++ b/checkpoint-772/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e287ad85b1923f8fa5b635d0b38e32a77e1bc312a43abc82def3622ed2a6e5 +size 15984 diff --git a/checkpoint-772/rng_state_6.pth b/checkpoint-772/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2f477a9676518042d0e60f50bbeef3f682b93e4 --- /dev/null +++ b/checkpoint-772/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684f4313476ed839fa25cdc36ba6b47f3152341389952bc02b263da4c5ae8f8e +size 15984 diff --git a/checkpoint-772/rng_state_7.pth b/checkpoint-772/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ccfbc33a0b3571f51e17967b949631ffcefeb919 --- /dev/null +++ b/checkpoint-772/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d969ebb64ba903d248a0b8df9875e21ddc9fbb3219bf580c656a4fc3043e6c9 +size 15984 diff --git a/checkpoint-772/scheduler.pt b/checkpoint-772/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2c6c76fbd456c67cb8872f27364f51ef0e22313 --- /dev/null +++ b/checkpoint-772/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d419ec6d40fe5adabb5591a31055f0370131c25b4bfb62a6dbe9046d1d19b5 +size 1064 diff --git a/checkpoint-772/special_tokens_map.json b/checkpoint-772/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-772/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-772/tokenizer.json b/checkpoint-772/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-772/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-772/tokenizer_config.json b/checkpoint-772/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-772/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-772/trainer_state.json b/checkpoint-772/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..821e291800d05f5a6fc85764080147c4a6f5aab4 --- /dev/null +++ b/checkpoint-772/trainer_state.json @@ -0,0 +1,5502 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 97, + "global_step": 772, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025906735751295338, + "grad_norm": 758.2562349755826, + "learning_rate": 0.0, + "loss": 1.3719, + "step": 1 + }, + { + "epoch": 0.0025906735751295338, + "eval_loss": 1.3159157037734985, + "eval_runtime": 36.907, + "eval_samples_per_second": 20.159, + "eval_steps_per_second": 1.273, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 666.308184823038, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.36, + "step": 2 + }, + { + "epoch": 0.007772020725388601, + "grad_norm": 211.0771195353068, + "learning_rate": 2.0000000000000003e-06, + 
"loss": 1.3746, + "step": 3 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 431.5114709683218, + "learning_rate": 3e-06, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.012953367875647668, + "grad_norm": 230.87468433791625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3837, + "step": 5 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 635.1636587738542, + "learning_rate": 5e-06, + "loss": 1.3761, + "step": 6 + }, + { + "epoch": 0.018134715025906734, + "grad_norm": 791.5536958334704, + "learning_rate": 6e-06, + "loss": 1.2855, + "step": 7 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 667.7197994216477, + "learning_rate": 7e-06, + "loss": 1.3267, + "step": 8 + }, + { + "epoch": 0.023316062176165803, + "grad_norm": 254.3855973692125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2977, + "step": 9 + }, + { + "epoch": 0.025906735751295335, + "grad_norm": 162.29347257682093, + "learning_rate": 9e-06, + "loss": 1.3522, + "step": 10 + }, + { + "epoch": 0.02849740932642487, + "grad_norm": 352.6352930651456, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 11 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 148.2629265526552, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3342, + "step": 12 + }, + { + "epoch": 0.03367875647668394, + "grad_norm": 249.88753789723657, + "learning_rate": 1.2e-05, + "loss": 1.2983, + "step": 13 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 184.03358422636597, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3291, + "step": 14 + }, + { + "epoch": 0.038860103626943004, + "grad_norm": 198.4491469860763, + "learning_rate": 1.4e-05, + "loss": 1.4014, + "step": 15 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 680.9537058769038, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3775, + "step": 16 + }, + { + "epoch": 0.04404145077720207, + "grad_norm": 563.0247638614801, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3228, + "step": 17 + }, + { + 
"epoch": 0.046632124352331605, + "grad_norm": 271.985463813746, + "learning_rate": 1.7e-05, + "loss": 1.3695, + "step": 18 + }, + { + "epoch": 0.04922279792746114, + "grad_norm": 399.51218452223316, + "learning_rate": 1.8e-05, + "loss": 1.2556, + "step": 19 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 160.70697055826656, + "learning_rate": 1.9e-05, + "loss": 1.2982, + "step": 20 + }, + { + "epoch": 0.054404145077720206, + "grad_norm": 227.8927504687491, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 21 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 550.1538868076032, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2603, + "step": 22 + }, + { + "epoch": 0.05958549222797927, + "grad_norm": 291.8994359919024, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3663, + "step": 23 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 120.60677833129643, + "learning_rate": 2.3e-05, + "loss": 1.3129, + "step": 24 + }, + { + "epoch": 0.06476683937823834, + "grad_norm": 414.4006662101242, + "learning_rate": 2.4e-05, + "loss": 1.3037, + "step": 25 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 141.48324465317884, + "learning_rate": 2.5e-05, + "loss": 1.3095, + "step": 26 + }, + { + "epoch": 0.06994818652849741, + "grad_norm": 147.86066819937994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2372, + "step": 27 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 214.47337614964576, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3384, + "step": 28 + }, + { + "epoch": 0.07512953367875648, + "grad_norm": 898.4324889241673, + "learning_rate": 2.8e-05, + "loss": 1.2003, + "step": 29 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 128.83026557596128, + "learning_rate": 2.9e-05, + "loss": 1.2172, + "step": 30 + }, + { + "epoch": 0.08031088082901554, + "grad_norm": 183.0777862405529, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.2674, + "step": 31 + }, + { + "epoch": 0.08290155440414508, + 
"grad_norm": 119.01841833358732, + "learning_rate": 3.1e-05, + "loss": 1.2554, + "step": 32 + }, + { + "epoch": 0.08549222797927461, + "grad_norm": 117.65980267542858, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2716, + "step": 33 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 82.40151099433953, + "learning_rate": 3.3e-05, + "loss": 1.2019, + "step": 34 + }, + { + "epoch": 0.09067357512953368, + "grad_norm": 82.61816783653785, + "learning_rate": 3.4e-05, + "loss": 1.2424, + "step": 35 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 136.42743433868276, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.2066, + "step": 36 + }, + { + "epoch": 0.09585492227979274, + "grad_norm": 36.775911657584444, + "learning_rate": 3.6e-05, + "loss": 1.2485, + "step": 37 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 56.55022603284064, + "learning_rate": 3.7000000000000005e-05, + "loss": 1.2112, + "step": 38 + }, + { + "epoch": 0.10103626943005181, + "grad_norm": 50.09896932886107, + "learning_rate": 3.8e-05, + "loss": 1.2027, + "step": 39 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 54.2661481198025, + "learning_rate": 3.9e-05, + "loss": 1.2673, + "step": 40 + }, + { + "epoch": 0.10621761658031088, + "grad_norm": 60.04145981731815, + "learning_rate": 4e-05, + "loss": 1.1648, + "step": 41 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 169.47741055545822, + "learning_rate": 3.999981580539036e-05, + "loss": 1.2393, + "step": 42 + }, + { + "epoch": 0.11139896373056994, + "grad_norm": 43.64716987307323, + "learning_rate": 3.9999263224954204e-05, + "loss": 1.2906, + "step": 43 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 51.3206609767585, + "learning_rate": 3.999834226886976e-05, + "loss": 1.1807, + "step": 44 + }, + { + "epoch": 0.11658031088082901, + "grad_norm": 38.95055887413869, + "learning_rate": 3.999705295410054e-05, + "loss": 1.1825, + "step": 45 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 
40.59968974426338, + "learning_rate": 3.999539530439504e-05, + "loss": 1.193, + "step": 46 + }, + { + "epoch": 0.12176165803108809, + "grad_norm": 34.5796571445333, + "learning_rate": 3.9993369350286265e-05, + "loss": 1.2127, + "step": 47 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.97693356149241, + "learning_rate": 3.99909751290912e-05, + "loss": 1.1543, + "step": 48 + }, + { + "epoch": 0.12694300518134716, + "grad_norm": 82.9217015858092, + "learning_rate": 3.9988212684910107e-05, + "loss": 1.2329, + "step": 49 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 49.256542144400214, + "learning_rate": 3.9985082068625724e-05, + "loss": 1.212, + "step": 50 + }, + { + "epoch": 0.13212435233160622, + "grad_norm": 45.025980435259484, + "learning_rate": 3.998158333790231e-05, + "loss": 1.2129, + "step": 51 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 45.98465689592428, + "learning_rate": 3.99777165571846e-05, + "loss": 1.1709, + "step": 52 + }, + { + "epoch": 0.13730569948186527, + "grad_norm": 43.481241408477906, + "learning_rate": 3.997348179769661e-05, + "loss": 1.1614, + "step": 53 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 82.17633750834132, + "learning_rate": 3.996887913744033e-05, + "loss": 1.2205, + "step": 54 + }, + { + "epoch": 0.14248704663212436, + "grad_norm": 53.0176514970764, + "learning_rate": 3.9963908661194285e-05, + "loss": 1.1204, + "step": 55 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 67.86382426995611, + "learning_rate": 3.995857046051196e-05, + "loss": 1.1839, + "step": 56 + }, + { + "epoch": 0.14766839378238342, + "grad_norm": 31.282407703790597, + "learning_rate": 3.995286463372013e-05, + "loss": 1.2126, + "step": 57 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 52.200764429265604, + "learning_rate": 3.994679128591706e-05, + "loss": 1.2036, + "step": 58 + }, + { + "epoch": 0.15284974093264247, + "grad_norm": 60.706608653531895, + "learning_rate": 3.9940350528970535e-05, + "loss": 
1.1848, + "step": 59 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 47.31754062899529, + "learning_rate": 3.993354248151583e-05, + "loss": 1.0869, + "step": 60 + }, + { + "epoch": 0.15803108808290156, + "grad_norm": 49.42450836392811, + "learning_rate": 3.9926367268953514e-05, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 38.791167030088886, + "learning_rate": 3.991882502344712e-05, + "loss": 1.1881, + "step": 62 + }, + { + "epoch": 0.16321243523316062, + "grad_norm": 56.16339499737216, + "learning_rate": 3.991091588392077e-05, + "loss": 1.1518, + "step": 63 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 861.8559063020828, + "learning_rate": 3.990263999605652e-05, + "loss": 1.1614, + "step": 64 + }, + { + "epoch": 0.16839378238341968, + "grad_norm": 50.92822786500888, + "learning_rate": 3.989399751229179e-05, + "loss": 1.1998, + "step": 65 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 31.04121324055666, + "learning_rate": 3.988498859181645e-05, + "loss": 1.1795, + "step": 66 + }, + { + "epoch": 0.17357512953367876, + "grad_norm": 50.33061983380845, + "learning_rate": 3.9875613400569975e-05, + "loss": 1.1742, + "step": 67 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 75.20462514003519, + "learning_rate": 3.986587211123833e-05, + "loss": 1.1856, + "step": 68 + }, + { + "epoch": 0.17875647668393782, + "grad_norm": 38.82139317052205, + "learning_rate": 3.98557649032508e-05, + "loss": 1.1529, + "step": 69 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 36.55988806615175, + "learning_rate": 3.984529196277674e-05, + "loss": 1.1884, + "step": 70 + }, + { + "epoch": 0.18393782383419688, + "grad_norm": 104.8931793971097, + "learning_rate": 3.983445348272203e-05, + "loss": 1.2182, + "step": 71 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 36.50395409234617, + "learning_rate": 3.982324966272566e-05, + "loss": 1.1609, + "step": 72 + }, + { + "epoch": 0.18911917098445596, + 
"grad_norm": 35.019191693448626, + "learning_rate": 3.981168070915594e-05, + "loss": 1.173, + "step": 73 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 33.378390048053596, + "learning_rate": 3.979974683510677e-05, + "loss": 1.173, + "step": 74 + }, + { + "epoch": 0.19430051813471502, + "grad_norm": 43.356840136984154, + "learning_rate": 3.978744826039366e-05, + "loss": 1.2032, + "step": 75 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 31.285725922510768, + "learning_rate": 3.977478521154974e-05, + "loss": 1.1569, + "step": 76 + }, + { + "epoch": 0.19948186528497408, + "grad_norm": 35.19264482867074, + "learning_rate": 3.9761757921821544e-05, + "loss": 1.1365, + "step": 77 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 44.66037256551279, + "learning_rate": 3.974836663116472e-05, + "loss": 1.164, + "step": 78 + }, + { + "epoch": 0.20466321243523317, + "grad_norm": 68.91101457952654, + "learning_rate": 3.973461158623963e-05, + "loss": 1.2256, + "step": 79 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 45.866521854583, + "learning_rate": 3.9720493040406786e-05, + "loss": 1.1697, + "step": 80 + }, + { + "epoch": 0.20984455958549222, + "grad_norm": 59.63095169617338, + "learning_rate": 3.970601125372218e-05, + "loss": 1.2094, + "step": 81 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 39.085597271064216, + "learning_rate": 3.9691166492932535e-05, + "loss": 1.1048, + "step": 82 + }, + { + "epoch": 0.21502590673575128, + "grad_norm": 36.40256073477861, + "learning_rate": 3.9675959031470336e-05, + "loss": 1.248, + "step": 83 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 29.846921716586085, + "learning_rate": 3.966038914944881e-05, + "loss": 1.1718, + "step": 84 + }, + { + "epoch": 0.22020725388601037, + "grad_norm": 50.87052190327881, + "learning_rate": 3.964445713365682e-05, + "loss": 1.1529, + "step": 85 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 35.32915760431302, + "learning_rate": 
3.9628163277553486e-05, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.22538860103626943, + "grad_norm": 157.5587514654703, + "learning_rate": 3.961150788126286e-05, + "loss": 1.2194, + "step": 87 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 25.03485489120971, + "learning_rate": 3.9594491251568376e-05, + "loss": 1.1392, + "step": 88 + }, + { + "epoch": 0.23056994818652848, + "grad_norm": 80.55933867045263, + "learning_rate": 3.957711370190716e-05, + "loss": 1.1819, + "step": 89 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 272.22874004071406, + "learning_rate": 3.9559375552364325e-05, + "loss": 1.0998, + "step": 90 + }, + { + "epoch": 0.23575129533678757, + "grad_norm": 91.94671663482514, + "learning_rate": 3.954127712966702e-05, + "loss": 1.2494, + "step": 91 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 54.31533598131098, + "learning_rate": 3.952281876717843e-05, + "loss": 1.1385, + "step": 92 + }, + { + "epoch": 0.24093264248704663, + "grad_norm": 103.20789745908105, + "learning_rate": 3.950400080489165e-05, + "loss": 1.1398, + "step": 93 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 45.14746362545893, + "learning_rate": 3.94848235894234e-05, + "loss": 1.2697, + "step": 94 + }, + { + "epoch": 0.24611398963730569, + "grad_norm": 21.271923336142002, + "learning_rate": 3.9465287474007654e-05, + "loss": 1.1397, + "step": 95 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 93.89786795431422, + "learning_rate": 3.944539281848912e-05, + "loss": 1.1542, + "step": 96 + }, + { + "epoch": 0.25129533678756477, + "grad_norm": 32.38768349342839, + "learning_rate": 3.942513998931663e-05, + "loss": 1.1693, + "step": 97 + }, + { + "epoch": 0.25129533678756477, + "eval_loss": 1.1344976425170898, + "eval_runtime": 37.8807, + "eval_samples_per_second": 19.641, + "eval_steps_per_second": 1.241, + "step": 97 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 91.41293468177638, + "learning_rate": 3.940452935953639e-05, + 
"loss": 1.1724, + "step": 98 + }, + { + "epoch": 0.25647668393782386, + "grad_norm": 39.20645478419229, + "learning_rate": 3.9383561308785075e-05, + "loss": 1.1583, + "step": 99 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 35.32804513153546, + "learning_rate": 3.9362236223282885e-05, + "loss": 1.158, + "step": 100 + }, + { + "epoch": 0.2616580310880829, + "grad_norm": 35.24783762804842, + "learning_rate": 3.934055449582641e-05, + "loss": 1.1552, + "step": 101 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 33.743808031979775, + "learning_rate": 3.931851652578137e-05, + "loss": 1.264, + "step": 102 + }, + { + "epoch": 0.266839378238342, + "grad_norm": 113.49798793226394, + "learning_rate": 3.92961227190753e-05, + "loss": 1.2361, + "step": 103 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 31.813807349410364, + "learning_rate": 3.9273373488190036e-05, + "loss": 1.1246, + "step": 104 + }, + { + "epoch": 0.27202072538860106, + "grad_norm": 29.391695486306187, + "learning_rate": 3.925026925215417e-05, + "loss": 1.1142, + "step": 105 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 33.79933331839905, + "learning_rate": 3.922681043653526e-05, + "loss": 1.1401, + "step": 106 + }, + { + "epoch": 0.2772020725388601, + "grad_norm": 39.09509012730907, + "learning_rate": 3.920299747343204e-05, + "loss": 1.1822, + "step": 107 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 37.81471938433609, + "learning_rate": 3.9178830801466465e-05, + "loss": 1.1592, + "step": 108 + }, + { + "epoch": 0.2823834196891192, + "grad_norm": 69.07753778460207, + "learning_rate": 3.915431086577561e-05, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 28.864787246081605, + "learning_rate": 3.912943811800347e-05, + "loss": 1.1179, + "step": 110 + }, + { + "epoch": 0.28756476683937826, + "grad_norm": 28.842042951717836, + "learning_rate": 3.910421301629264e-05, + "loss": 1.1317, + "step": 111 + }, + { + "epoch": 
0.29015544041450775, + "grad_norm": 51.475482074695506, + "learning_rate": 3.9078636025275904e-05, + "loss": 1.1451, + "step": 112 + }, + { + "epoch": 0.2927461139896373, + "grad_norm": 33.48279556713943, + "learning_rate": 3.9052707616067654e-05, + "loss": 1.1554, + "step": 113 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 21.279603575929844, + "learning_rate": 3.9026428266255205e-05, + "loss": 1.1636, + "step": 114 + }, + { + "epoch": 0.2979274611398964, + "grad_norm": 36.226178034876675, + "learning_rate": 3.899979845989003e-05, + "loss": 1.1966, + "step": 115 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 29.90506353145981, + "learning_rate": 3.897281868747878e-05, + "loss": 1.1888, + "step": 116 + }, + { + "epoch": 0.30310880829015546, + "grad_norm": 36.04602777809767, + "learning_rate": 3.894548944597434e-05, + "loss": 1.2066, + "step": 117 + }, + { + "epoch": 0.30569948186528495, + "grad_norm": 36.42793844948301, + "learning_rate": 3.8917811238766606e-05, + "loss": 1.1712, + "step": 118 + }, + { + "epoch": 0.3082901554404145, + "grad_norm": 58.788967662325696, + "learning_rate": 3.888978457567323e-05, + "loss": 1.1225, + "step": 119 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 29.357299816022326, + "learning_rate": 3.886140997293024e-05, + "loss": 1.1315, + "step": 120 + }, + { + "epoch": 0.3134715025906736, + "grad_norm": 95.08345317107502, + "learning_rate": 3.883268795318252e-05, + "loss": 1.1852, + "step": 121 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 33.6623824593179, + "learning_rate": 3.88036190454742e-05, + "loss": 1.16, + "step": 122 + }, + { + "epoch": 0.31865284974093266, + "grad_norm": 42.587546987131105, + "learning_rate": 3.8774203785238886e-05, + "loss": 1.1374, + "step": 123 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 33.360649853064245, + "learning_rate": 3.8744442714289816e-05, + "loss": 1.1757, + "step": 124 + }, + { + "epoch": 0.3238341968911917, + "grad_norm": 49.09256643961471, + 
"learning_rate": 3.8714336380809874e-05, + "loss": 1.1782, + "step": 125 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 31.505007051172793, + "learning_rate": 3.86838853393415e-05, + "loss": 1.195, + "step": 126 + }, + { + "epoch": 0.3290155440414508, + "grad_norm": 34.36735417254799, + "learning_rate": 3.865309015077645e-05, + "loss": 1.1078, + "step": 127 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 36.63220606142181, + "learning_rate": 3.862195138234551e-05, + "loss": 1.1319, + "step": 128 + }, + { + "epoch": 0.33419689119170987, + "grad_norm": 53.324986862513676, + "learning_rate": 3.859046960760801e-05, + "loss": 1.2301, + "step": 129 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 47.41445409144979, + "learning_rate": 3.855864540644126e-05, + "loss": 1.2366, + "step": 130 + }, + { + "epoch": 0.3393782383419689, + "grad_norm": 32.57355122427366, + "learning_rate": 3.8526479365029906e-05, + "loss": 1.142, + "step": 131 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 28.445824333644715, + "learning_rate": 3.849397207585508e-05, + "loss": 1.0847, + "step": 132 + }, + { + "epoch": 0.344559585492228, + "grad_norm": 49.23062726715889, + "learning_rate": 3.846112413768353e-05, + "loss": 1.2241, + "step": 133 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 53.424206543788074, + "learning_rate": 3.842793615555657e-05, + "loss": 1.2392, + "step": 134 + }, + { + "epoch": 0.34974093264248707, + "grad_norm": 38.19316140175426, + "learning_rate": 3.8394408740778934e-05, + "loss": 1.1208, + "step": 135 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 32.35931252369273, + "learning_rate": 3.836054251090755e-05, + "loss": 1.1604, + "step": 136 + }, + { + "epoch": 0.3549222797927461, + "grad_norm": 37.90085344799495, + "learning_rate": 3.83263380897401e-05, + "loss": 1.1134, + "step": 137 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 44.49191588319939, + "learning_rate": 3.829179610730359e-05, + "loss": 1.1281, + 
"step": 138 + }, + { + "epoch": 0.3601036269430052, + "grad_norm": 141.98524430756757, + "learning_rate": 3.8256917199842715e-05, + "loss": 1.0928, + "step": 139 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 30.887093976524472, + "learning_rate": 3.822170200980815e-05, + "loss": 1.0936, + "step": 140 + }, + { + "epoch": 0.36528497409326427, + "grad_norm": 21.980521878837745, + "learning_rate": 3.818615118584472e-05, + "loss": 1.1368, + "step": 141 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 538.6650762618656, + "learning_rate": 3.815026538277943e-05, + "loss": 1.0918, + "step": 142 + }, + { + "epoch": 0.3704663212435233, + "grad_norm": 40.842881572203, + "learning_rate": 3.811404526160943e-05, + "loss": 1.1705, + "step": 143 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 26.891553492377298, + "learning_rate": 3.8077491489489835e-05, + "loss": 1.1468, + "step": 144 + }, + { + "epoch": 0.3756476683937824, + "grad_norm": 45.138483181178074, + "learning_rate": 3.8040604739721415e-05, + "loss": 1.1679, + "step": 145 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 35.133763086168244, + "learning_rate": 3.8003385691738227e-05, + "loss": 1.1029, + "step": 146 + }, + { + "epoch": 0.38082901554404147, + "grad_norm": 36.941250802707344, + "learning_rate": 3.7965835031095065e-05, + "loss": 1.1491, + "step": 147 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 90.1080256703095, + "learning_rate": 3.792795344945485e-05, + "loss": 1.1212, + "step": 148 + }, + { + "epoch": 0.3860103626943005, + "grad_norm": 39.70360899750413, + "learning_rate": 3.7889741644575914e-05, + "loss": 1.15, + "step": 149 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 28.229369877304094, + "learning_rate": 3.78512003202991e-05, + "loss": 1.1111, + "step": 150 + }, + { + "epoch": 0.3911917098445596, + "grad_norm": 31.611752191925987, + "learning_rate": 3.7812330186534815e-05, + "loss": 1.1366, + "step": 151 + }, + { + "epoch": 0.39378238341968913, + 
"grad_norm": 38.196015586772425, + "learning_rate": 3.777313195924998e-05, + "loss": 1.1433, + "step": 152 + }, + { + "epoch": 0.3963730569948187, + "grad_norm": 22.732638044547453, + "learning_rate": 3.773360636045481e-05, + "loss": 1.1125, + "step": 153 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 90.19158665385014, + "learning_rate": 3.7693754118189525e-05, + "loss": 1.1242, + "step": 154 + }, + { + "epoch": 0.4015544041450777, + "grad_norm": 42.43479974993017, + "learning_rate": 3.765357596651095e-05, + "loss": 1.1191, + "step": 155 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 88.0076735720364, + "learning_rate": 3.761307264547899e-05, + "loss": 1.1718, + "step": 156 + }, + { + "epoch": 0.4067357512953368, + "grad_norm": 30.782507703935767, + "learning_rate": 3.757224490114297e-05, + "loss": 1.109, + "step": 157 + }, + { + "epoch": 0.40932642487046633, + "grad_norm": 69.89871106113397, + "learning_rate": 3.7531093485527943e-05, + "loss": 1.1018, + "step": 158 + }, + { + "epoch": 0.4119170984455959, + "grad_norm": 37.339006645717305, + "learning_rate": 3.7489619156620796e-05, + "loss": 1.1358, + "step": 159 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 28.06388054378899, + "learning_rate": 3.744782267835632e-05, + "loss": 1.0847, + "step": 160 + }, + { + "epoch": 0.4170984455958549, + "grad_norm": 54.05874281297702, + "learning_rate": 3.740570482060311e-05, + "loss": 1.1682, + "step": 161 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 32.299093265328835, + "learning_rate": 3.73632663591494e-05, + "loss": 1.1413, + "step": 162 + }, + { + "epoch": 0.422279792746114, + "grad_norm": 31.213652090157694, + "learning_rate": 3.732050807568878e-05, + "loss": 1.1313, + "step": 163 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 40.01090035937505, + "learning_rate": 3.727743075780578e-05, + "loss": 1.1513, + "step": 164 + }, + { + "epoch": 0.4274611398963731, + "grad_norm": 47.11352577964853, + "learning_rate": 
3.723403519896136e-05, + "loss": 1.2192, + "step": 165 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 28.645086506093037, + "learning_rate": 3.7190322198478355e-05, + "loss": 1.1097, + "step": 166 + }, + { + "epoch": 0.4326424870466321, + "grad_norm": 35.28541113925116, + "learning_rate": 3.7146292561526654e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 58.30281063037669, + "learning_rate": 3.7101947099108425e-05, + "loss": 1.1829, + "step": 168 + }, + { + "epoch": 0.4378238341968912, + "grad_norm": 26.33563548968379, + "learning_rate": 3.70572866280432e-05, + "loss": 1.147, + "step": 169 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 57.00052875402651, + "learning_rate": 3.701231197095277e-05, + "loss": 1.1212, + "step": 170 + }, + { + "epoch": 0.4430051813471503, + "grad_norm": 23.672828037237174, + "learning_rate": 3.696702395624608e-05, + "loss": 1.1152, + "step": 171 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 41.1264174112964, + "learning_rate": 3.692142341810395e-05, + "loss": 1.1154, + "step": 172 + }, + { + "epoch": 0.4481865284974093, + "grad_norm": 26.72177706144361, + "learning_rate": 3.6875511196463715e-05, + "loss": 1.1725, + "step": 173 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 95.4088800585977, + "learning_rate": 3.682928813700375e-05, + "loss": 1.1339, + "step": 174 + }, + { + "epoch": 0.4533678756476684, + "grad_norm": 34.33666578349465, + "learning_rate": 3.678275509112788e-05, + "loss": 1.1867, + "step": 175 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 31.032304531003014, + "learning_rate": 3.6735912915949745e-05, + "loss": 1.1386, + "step": 176 + }, + { + "epoch": 0.4585492227979275, + "grad_norm": 55.22043313188224, + "learning_rate": 3.6688762474276945e-05, + "loss": 1.1102, + "step": 177 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 29.82713377876857, + "learning_rate": 3.6641304634595216e-05, + "loss": 1.1564, + "step": 178 + }, + 
{ + "epoch": 0.4637305699481865, + "grad_norm": 35.71025459541737, + "learning_rate": 3.659354027105238e-05, + "loss": 1.0939, + "step": 179 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 52.41175655642653, + "learning_rate": 3.6545470263442265e-05, + "loss": 1.1578, + "step": 180 + }, + { + "epoch": 0.4689119170984456, + "grad_norm": 27.682485766528306, + "learning_rate": 3.649709549718849e-05, + "loss": 1.1875, + "step": 181 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 36.53293663303487, + "learning_rate": 3.6448416863328186e-05, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.4740932642487047, + "grad_norm": 31.45177998538027, + "learning_rate": 3.639943525849555e-05, + "loss": 1.113, + "step": 183 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 28.323097072885673, + "learning_rate": 3.635015158490533e-05, + "loss": 1.1159, + "step": 184 + }, + { + "epoch": 0.4792746113989637, + "grad_norm": 47.75573754341213, + "learning_rate": 3.6300566750336225e-05, + "loss": 1.1305, + "step": 185 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 21.384095061494357, + "learning_rate": 3.625068166811418e-05, + "loss": 1.1369, + "step": 186 + }, + { + "epoch": 0.4844559585492228, + "grad_norm": 30.714645036809546, + "learning_rate": 3.6200497257095504e-05, + "loss": 1.1858, + "step": 187 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 35.12161426399798, + "learning_rate": 3.615001444165001e-05, + "loss": 1.1293, + "step": 188 + }, + { + "epoch": 0.4896373056994819, + "grad_norm": 116.83443661381396, + "learning_rate": 3.6099234151643924e-05, + "loss": 1.1515, + "step": 189 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 55.47885243409044, + "learning_rate": 3.604815732242283e-05, + "loss": 1.112, + "step": 190 + }, + { + "epoch": 0.4948186528497409, + "grad_norm": 32.332747429034285, + "learning_rate": 3.5996784894794394e-05, + "loss": 1.1661, + "step": 191 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 
33.039210183180046, + "learning_rate": 3.594511781501103e-05, + "loss": 1.1244, + "step": 192 + }, + { + "epoch": 0.5, + "grad_norm": 21.325687337182504, + "learning_rate": 3.58931570347525e-05, + "loss": 1.1634, + "step": 193 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 51.37599478469561, + "learning_rate": 3.584090351110838e-05, + "loss": 1.2106, + "step": 194 + }, + { + "epoch": 0.5025906735751295, + "eval_loss": 1.1119717359542847, + "eval_runtime": 49.6027, + "eval_samples_per_second": 14.999, + "eval_steps_per_second": 0.948, + "step": 194 + }, + { + "epoch": 0.5051813471502591, + "grad_norm": 42.105169991612456, + "learning_rate": 3.57883582065604e-05, + "loss": 1.1303, + "step": 195 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 37.14457014578168, + "learning_rate": 3.573552208896474e-05, + "loss": 1.1483, + "step": 196 + }, + { + "epoch": 0.5103626943005182, + "grad_norm": 28.56241612018119, + "learning_rate": 3.568239613153421e-05, + "loss": 1.0843, + "step": 197 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 35.399304035761865, + "learning_rate": 3.5628981312820315e-05, + "loss": 1.1177, + "step": 198 + }, + { + "epoch": 0.5155440414507773, + "grad_norm": 25.91156850470446, + "learning_rate": 3.557527861669522e-05, + "loss": 1.1215, + "step": 199 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 43.509516777992324, + "learning_rate": 3.552128903233363e-05, + "loss": 1.1532, + "step": 200 + }, + { + "epoch": 0.5207253886010362, + "grad_norm": 38.18164449834795, + "learning_rate": 3.54670135541946e-05, + "loss": 1.1142, + "step": 201 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 48.576743289054534, + "learning_rate": 3.541245318200318e-05, + "loss": 1.1152, + "step": 202 + }, + { + "epoch": 0.5259067357512953, + "grad_norm": 38.65411737007163, + "learning_rate": 3.5357608920732e-05, + "loss": 1.1607, + "step": 203 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 35.663493907396834, + "learning_rate": 
3.530248178058282e-05, + "loss": 1.1273, + "step": 204 + }, + { + "epoch": 0.5310880829015544, + "grad_norm": 26.829817821665976, + "learning_rate": 3.5247072776967805e-05, + "loss": 1.1174, + "step": 205 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 39.79604912152638, + "learning_rate": 3.519138293049097e-05, + "loss": 1.1811, + "step": 206 + }, + { + "epoch": 0.5362694300518135, + "grad_norm": 32.26179097390416, + "learning_rate": 3.513541326692925e-05, + "loss": 1.1346, + "step": 207 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 24.35769329902787, + "learning_rate": 3.5079164817213684e-05, + "loss": 1.1061, + "step": 208 + }, + { + "epoch": 0.5414507772020726, + "grad_norm": 26.645546258363844, + "learning_rate": 3.5022638617410396e-05, + "loss": 1.0514, + "step": 209 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 105.19676603444857, + "learning_rate": 3.496583570870152e-05, + "loss": 1.1474, + "step": 210 + }, + { + "epoch": 0.5466321243523317, + "grad_norm": 61.600623030405885, + "learning_rate": 3.4908757137366006e-05, + "loss": 1.104, + "step": 211 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 31.65460129853052, + "learning_rate": 3.485140395476038e-05, + "loss": 1.0737, + "step": 212 + }, + { + "epoch": 0.5518134715025906, + "grad_norm": 26.860379117211497, + "learning_rate": 3.4793777217299346e-05, + "loss": 1.1119, + "step": 213 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 39.89324262309783, + "learning_rate": 3.473587798643633e-05, + "loss": 1.1626, + "step": 214 + }, + { + "epoch": 0.5569948186528497, + "grad_norm": 39.77638257731599, + "learning_rate": 3.467770732864399e-05, + "loss": 1.1545, + "step": 215 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 30.994657564291458, + "learning_rate": 3.461926631539445e-05, + "loss": 1.1646, + "step": 216 + }, + { + "epoch": 0.5621761658031088, + "grad_norm": 51.99674092516571, + "learning_rate": 3.4560556023139695e-05, + "loss": 1.1638, + "step": 217 + }, + { + 
"epoch": 0.5647668393782384, + "grad_norm": 58.5132713002146, + "learning_rate": 3.450157753329166e-05, + "loss": 1.1461, + "step": 218 + }, + { + "epoch": 0.5673575129533679, + "grad_norm": 30.712469030418482, + "learning_rate": 3.4442331932202326e-05, + "loss": 1.1583, + "step": 219 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 47.00217426642832, + "learning_rate": 3.438282031114374e-05, + "loss": 1.1154, + "step": 220 + }, + { + "epoch": 0.572538860103627, + "grad_norm": 37.33927961163222, + "learning_rate": 3.432304376628787e-05, + "loss": 1.1372, + "step": 221 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 28.858636933974392, + "learning_rate": 3.4263003398686464e-05, + "loss": 1.0488, + "step": 222 + }, + { + "epoch": 0.5777202072538861, + "grad_norm": 37.842230890171486, + "learning_rate": 3.420270031425072e-05, + "loss": 1.1892, + "step": 223 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 32.65394945357516, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.1218, + "step": 224 + }, + { + "epoch": 0.582901554404145, + "grad_norm": 115.22040829465772, + "learning_rate": 3.4081310442696114e-05, + "loss": 1.1546, + "step": 225 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 31.20514468446119, + "learning_rate": 3.402022589151325e-05, + "loss": 1.0969, + "step": 226 + }, + { + "epoch": 0.5880829015544041, + "grad_norm": 52.8397361926395, + "learning_rate": 3.395888309532687e-05, + "loss": 1.1218, + "step": 227 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 51.7991692917308, + "learning_rate": 3.3897283184038215e-05, + "loss": 1.1395, + "step": 228 + }, + { + "epoch": 0.5932642487046632, + "grad_norm": 33.56775233970504, + "learning_rate": 3.3835427292284445e-05, + "loss": 1.1107, + "step": 229 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 46.081120788214314, + "learning_rate": 3.3773316559417734e-05, + "loss": 1.1472, + "step": 230 + }, + { + "epoch": 0.5984455958549223, + "grad_norm": 41.72558170492288, + 
"learning_rate": 3.371095212948431e-05, + "loss": 1.1871, + "step": 231 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 34.27957927587091, + "learning_rate": 3.364833515120336e-05, + "loss": 1.1376, + "step": 232 + }, + { + "epoch": 0.6036269430051814, + "grad_norm": 36.58452602010953, + "learning_rate": 3.358546677794586e-05, + "loss": 1.1885, + "step": 233 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 28.010809914189192, + "learning_rate": 3.352234816771337e-05, + "loss": 1.102, + "step": 234 + }, + { + "epoch": 0.6088082901554405, + "grad_norm": 24.78419558611963, + "learning_rate": 3.3458980483116664e-05, + "loss": 1.0818, + "step": 235 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 28.12830040081226, + "learning_rate": 3.3395364891354316e-05, + "loss": 1.1862, + "step": 236 + }, + { + "epoch": 0.6139896373056994, + "grad_norm": 37.94181651161551, + "learning_rate": 3.333150256419127e-05, + "loss": 1.147, + "step": 237 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 21.809518482701854, + "learning_rate": 3.3267394677937134e-05, + "loss": 1.0994, + "step": 238 + }, + { + "epoch": 0.6191709844559585, + "grad_norm": 32.12135773753589, + "learning_rate": 3.320304241342464e-05, + "loss": 1.1531, + "step": 239 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 51.959731073524054, + "learning_rate": 3.31384469559878e-05, + "loss": 1.1717, + "step": 240 + }, + { + "epoch": 0.6243523316062176, + "grad_norm": 28.045815836372345, + "learning_rate": 3.307360949544012e-05, + "loss": 1.1814, + "step": 241 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 39.55208384578746, + "learning_rate": 3.300853122605268e-05, + "loss": 1.1483, + "step": 242 + }, + { + "epoch": 0.6295336787564767, + "grad_norm": 29.799974205160808, + "learning_rate": 3.294321334653213e-05, + "loss": 1.1838, + "step": 243 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 124.31035254102245, + "learning_rate": 3.2877657059998584e-05, + "loss": 1.0698, + "step": 
244 + }, + { + "epoch": 0.6347150259067358, + "grad_norm": 37.989925180187655, + "learning_rate": 3.281186357396351e-05, + "loss": 1.0984, + "step": 245 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 55.72599333657572, + "learning_rate": 3.274583410030745e-05, + "loss": 1.2333, + "step": 246 + }, + { + "epoch": 0.6398963730569949, + "grad_norm": 46.77079456439719, + "learning_rate": 3.267956985525774e-05, + "loss": 1.2157, + "step": 247 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 33.62329915252562, + "learning_rate": 3.261307205936603e-05, + "loss": 1.1752, + "step": 248 + }, + { + "epoch": 0.6450777202072538, + "grad_norm": 34.11794183225494, + "learning_rate": 3.2546341937485884e-05, + "loss": 1.1265, + "step": 249 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 36.027636323913896, + "learning_rate": 3.247938071875017e-05, + "loss": 1.103, + "step": 250 + }, + { + "epoch": 0.6502590673575129, + "grad_norm": 35.393219337329946, + "learning_rate": 3.2412189636548456e-05, + "loss": 1.1148, + "step": 251 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 31.578919022569924, + "learning_rate": 3.234476992850425e-05, + "loss": 1.1149, + "step": 252 + }, + { + "epoch": 0.655440414507772, + "grad_norm": 28.93717647736964, + "learning_rate": 3.227712283645224e-05, + "loss": 1.1425, + "step": 253 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 34.170026750703684, + "learning_rate": 3.2209249606415394e-05, + "loss": 1.1591, + "step": 254 + }, + { + "epoch": 0.6606217616580311, + "grad_norm": 27.52194954061608, + "learning_rate": 3.214115148858201e-05, + "loss": 1.1704, + "step": 255 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 81.65404753769732, + "learning_rate": 3.207282973728273e-05, + "loss": 1.161, + "step": 256 + }, + { + "epoch": 0.6658031088082902, + "grad_norm": 57.45351536522683, + "learning_rate": 3.200428561096737e-05, + "loss": 1.116, + "step": 257 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 
30.968529074463714, + "learning_rate": 3.193552037218179e-05, + "loss": 1.1265, + "step": 258 + }, + { + "epoch": 0.6709844559585493, + "grad_norm": 37.8817748068655, + "learning_rate": 3.186653528754464e-05, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 29.197031189172545, + "learning_rate": 3.179733162772398e-05, + "loss": 1.1045, + "step": 260 + }, + { + "epoch": 0.6761658031088082, + "grad_norm": 36.56253841299107, + "learning_rate": 3.172791066741392e-05, + "loss": 1.1539, + "step": 261 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 25.799921116950998, + "learning_rate": 3.165827368531113e-05, + "loss": 1.0796, + "step": 262 + }, + { + "epoch": 0.6813471502590673, + "grad_norm": 82.81825216532526, + "learning_rate": 3.1588421964091276e-05, + "loss": 1.142, + "step": 263 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 31.100074747569124, + "learning_rate": 3.151835679038542e-05, + "loss": 1.0908, + "step": 264 + }, + { + "epoch": 0.6865284974093264, + "grad_norm": 25.57297200703221, + "learning_rate": 3.14480794547563e-05, + "loss": 1.1436, + "step": 265 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 23.92492773149328, + "learning_rate": 3.137759125167455e-05, + "loss": 1.1202, + "step": 266 + }, + { + "epoch": 0.6917098445595855, + "grad_norm": 22.14274360766396, + "learning_rate": 3.130689347949486e-05, + "loss": 1.1113, + "step": 267 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 26.68725288649902, + "learning_rate": 3.123598744043211e-05, + "loss": 1.1517, + "step": 268 + }, + { + "epoch": 0.6968911917098446, + "grad_norm": 25.559817524659362, + "learning_rate": 3.1164874440537295e-05, + "loss": 1.0976, + "step": 269 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 28.89996834100355, + "learning_rate": 3.109355578967356e-05, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.7020725388601037, + "grad_norm": 32.09658045195569, + "learning_rate": 3.1022032801492e-05, + "loss": 
1.1161, + "step": 271 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 30.623705646213768, + "learning_rate": 3.095030679340751e-05, + "loss": 1.1993, + "step": 272 + }, + { + "epoch": 0.7072538860103627, + "grad_norm": 41.71263710932429, + "learning_rate": 3.0878379086574494e-05, + "loss": 1.1624, + "step": 273 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 34.68352639470226, + "learning_rate": 3.0806251005862535e-05, + "loss": 1.1156, + "step": 274 + }, + { + "epoch": 0.7124352331606217, + "grad_norm": 23.52580702428812, + "learning_rate": 3.073392387983202e-05, + "loss": 1.0963, + "step": 275 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 28.10687988214902, + "learning_rate": 3.0661399040709584e-05, + "loss": 1.1095, + "step": 276 + }, + { + "epoch": 0.7176165803108808, + "grad_norm": 66.72288729975841, + "learning_rate": 3.05886778243637e-05, + "loss": 1.0865, + "step": 277 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 25.775217430321934, + "learning_rate": 3.051576157027998e-05, + "loss": 1.1058, + "step": 278 + }, + { + "epoch": 0.7227979274611399, + "grad_norm": 36.82942099016794, + "learning_rate": 3.0442651621536502e-05, + "loss": 1.1211, + "step": 279 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 27.878820856521013, + "learning_rate": 3.0369349324779115e-05, + "loss": 1.1471, + "step": 280 + }, + { + "epoch": 0.727979274611399, + "grad_norm": 31.293156717285573, + "learning_rate": 3.0295856030196618e-05, + "loss": 1.0748, + "step": 281 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 39.315952115194435, + "learning_rate": 3.022217309149588e-05, + "loss": 1.0993, + "step": 282 + }, + { + "epoch": 0.7331606217616581, + "grad_norm": 36.79954071435495, + "learning_rate": 3.0148301865876913e-05, + "loss": 1.1045, + "step": 283 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 26.127389502147167, + "learning_rate": 3.0074243714007875e-05, + "loss": 1.1424, + "step": 284 + }, + { + "epoch": 0.7383419689119171, + 
"grad_norm": 25.608778060317068, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1055, + "step": 285 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 36.22629669671894, + "learning_rate": 2.992557209138249e-05, + "loss": 1.0845, + "step": 286 + }, + { + "epoch": 0.7435233160621761, + "grad_norm": 35.30642111132886, + "learning_rate": 2.9850961359077293e-05, + "loss": 1.204, + "step": 287 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 29.765894622087952, + "learning_rate": 2.977616917737388e-05, + "loss": 1.168, + "step": 288 + }, + { + "epoch": 0.7487046632124352, + "grad_norm": 27.194683587397567, + "learning_rate": 2.9701196923903927e-05, + "loss": 1.1236, + "step": 289 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 63.09779240191165, + "learning_rate": 2.9626045979615928e-05, + "loss": 1.1395, + "step": 290 + }, + { + "epoch": 0.7538860103626943, + "grad_norm": 25.014233377763066, + "learning_rate": 2.9550717728749768e-05, + "loss": 1.1054, + "step": 291 + }, + { + "epoch": 0.7538860103626943, + "eval_loss": 1.0996382236480713, + "eval_runtime": 37.9545, + "eval_samples_per_second": 19.602, + "eval_steps_per_second": 1.238, + "step": 291 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 27.481891737318097, + "learning_rate": 2.947521355881122e-05, + "loss": 1.1252, + "step": 292 + }, + { + "epoch": 0.7590673575129534, + "grad_norm": 67.57807413949878, + "learning_rate": 2.9399534860546404e-05, + "loss": 1.1761, + "step": 293 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 65.66834495909988, + "learning_rate": 2.932368302791614e-05, + "loss": 1.0551, + "step": 294 + }, + { + "epoch": 0.7642487046632125, + "grad_norm": 30.051210942517116, + "learning_rate": 2.92476594580703e-05, + "loss": 1.138, + "step": 295 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 22.693089678510507, + "learning_rate": 2.917146555132206e-05, + "loss": 1.1495, + "step": 296 + }, + { + "epoch": 0.7694300518134715, + "grad_norm": 
53.84166280540606, + "learning_rate": 2.909510271112212e-05, + "loss": 1.1409, + "step": 297 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 32.69106061524578, + "learning_rate": 2.9018572344032823e-05, + "loss": 1.1709, + "step": 298 + }, + { + "epoch": 0.7746113989637305, + "grad_norm": 39.44484991312582, + "learning_rate": 2.8941875859702283e-05, + "loss": 1.1138, + "step": 299 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 31.51857596969122, + "learning_rate": 2.88650146708384e-05, + "loss": 1.1931, + "step": 300 + }, + { + "epoch": 0.7797927461139896, + "grad_norm": 70.51218412614058, + "learning_rate": 2.878799019318283e-05, + "loss": 1.155, + "step": 301 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 80.27969224752457, + "learning_rate": 2.8710803845484955e-05, + "loss": 1.1425, + "step": 302 + }, + { + "epoch": 0.7849740932642487, + "grad_norm": 28.16560857981767, + "learning_rate": 2.8633457049475678e-05, + "loss": 1.1072, + "step": 303 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 41.15138307552231, + "learning_rate": 2.855595122984129e-05, + "loss": 1.1492, + "step": 304 + }, + { + "epoch": 0.7901554404145078, + "grad_norm": 23.894217282116276, + "learning_rate": 2.847828781419722e-05, + "loss": 1.1136, + "step": 305 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 25.005501120810248, + "learning_rate": 2.8400468233061708e-05, + "loss": 1.0921, + "step": 306 + }, + { + "epoch": 0.7953367875647669, + "grad_norm": 30.91791938195468, + "learning_rate": 2.832249391982949e-05, + "loss": 1.1098, + "step": 307 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 44.776563922922726, + "learning_rate": 2.8244366310745398e-05, + "loss": 1.1845, + "step": 308 + }, + { + "epoch": 0.8005181347150259, + "grad_norm": 19.059329544784376, + "learning_rate": 2.816608684487787e-05, + "loss": 1.169, + "step": 309 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 63.97334641962602, + "learning_rate": 2.8087656964092472e-05, + 
"loss": 1.124, + "step": 310 + }, + { + "epoch": 0.805699481865285, + "grad_norm": 30.878848859015882, + "learning_rate": 2.8009078113025335e-05, + "loss": 1.2087, + "step": 311 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 34.63835471543836, + "learning_rate": 2.7930351739056533e-05, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.810880829015544, + "grad_norm": 30.03178182445718, + "learning_rate": 2.7851479292283442e-05, + "loss": 1.1321, + "step": 313 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 38.42236523356876, + "learning_rate": 2.7772462225494013e-05, + "loss": 1.1557, + "step": 314 + }, + { + "epoch": 0.8160621761658031, + "grad_norm": 39.179683790956744, + "learning_rate": 2.7693301994140026e-05, + "loss": 1.1201, + "step": 315 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 38.32243159447327, + "learning_rate": 2.761400005631028e-05, + "loss": 1.1105, + "step": 316 + }, + { + "epoch": 0.8212435233160622, + "grad_norm": 39.913808227411835, + "learning_rate": 2.7534557872703705e-05, + "loss": 1.1598, + "step": 317 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 69.73521867812421, + "learning_rate": 2.7454976906602513e-05, + "loss": 1.1145, + "step": 318 + }, + { + "epoch": 0.8264248704663213, + "grad_norm": 65.55887588207746, + "learning_rate": 2.7375258623845207e-05, + "loss": 1.1255, + "step": 319 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 30.980111545641563, + "learning_rate": 2.7295404492799575e-05, + "loss": 1.122, + "step": 320 + }, + { + "epoch": 0.8316062176165803, + "grad_norm": 30.12179911444832, + "learning_rate": 2.721541598433567e-05, + "loss": 1.113, + "step": 321 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 28.329434659508582, + "learning_rate": 2.7135294571798706e-05, + "loss": 1.0498, + "step": 322 + }, + { + "epoch": 0.8367875647668394, + "grad_norm": 25.114787597049578, + "learning_rate": 2.70550417309819e-05, + "loss": 1.0633, + "step": 323 + }, + { + "epoch": 
0.8393782383419689, + "grad_norm": 27.754037709590385, + "learning_rate": 2.6974658940099337e-05, + "loss": 1.1585, + "step": 324 + }, + { + "epoch": 0.8419689119170984, + "grad_norm": 29.489888159179444, + "learning_rate": 2.6894147679758678e-05, + "loss": 1.1259, + "step": 325 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 24.426102194202898, + "learning_rate": 2.6813509432933957e-05, + "loss": 1.1515, + "step": 326 + }, + { + "epoch": 0.8471502590673575, + "grad_norm": 24.75197483331429, + "learning_rate": 2.673274568493821e-05, + "loss": 1.15, + "step": 327 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 40.604864626683366, + "learning_rate": 2.6651857923396132e-05, + "loss": 1.1219, + "step": 328 + }, + { + "epoch": 0.8523316062176166, + "grad_norm": 34.694568404196026, + "learning_rate": 2.6570847638216698e-05, + "loss": 1.103, + "step": 329 + }, + { + "epoch": 0.8549222797927462, + "grad_norm": 48.715136403425035, + "learning_rate": 2.648971632156569e-05, + "loss": 1.1675, + "step": 330 + }, + { + "epoch": 0.8575129533678757, + "grad_norm": 97.77526410121799, + "learning_rate": 2.6408465467838225e-05, + "loss": 1.1502, + "step": 331 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 54.697215318949276, + "learning_rate": 2.632709657363124e-05, + "loss": 1.1446, + "step": 332 + }, + { + "epoch": 0.8626943005181347, + "grad_norm": 38.09192002041798, + "learning_rate": 2.6245611137715897e-05, + "loss": 1.1333, + "step": 333 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 46.713623556984956, + "learning_rate": 2.6164010661010007e-05, + "loss": 1.1252, + "step": 334 + }, + { + "epoch": 0.8678756476683938, + "grad_norm": 46.40552686286593, + "learning_rate": 2.6082296646550364e-05, + "loss": 1.121, + "step": 335 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 37.57424454065957, + "learning_rate": 2.6000470599465065e-05, + "loss": 1.1671, + "step": 336 + }, + { + "epoch": 0.8730569948186528, + "grad_norm": 38.580777053099204, + 
"learning_rate": 2.5918534026945787e-05, + "loss": 1.0849, + "step": 337 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 154.3106712010981, + "learning_rate": 2.5836488438220044e-05, + "loss": 1.0663, + "step": 338 + }, + { + "epoch": 0.8782383419689119, + "grad_norm": 34.21394067951015, + "learning_rate": 2.575433534452334e-05, + "loss": 1.0895, + "step": 339 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 36.291611242733886, + "learning_rate": 2.5672076259071385e-05, + "loss": 1.1242, + "step": 340 + }, + { + "epoch": 0.883419689119171, + "grad_norm": 29.411623389655112, + "learning_rate": 2.558971269703219e-05, + "loss": 1.1005, + "step": 341 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 30.24903086761753, + "learning_rate": 2.5507246175498174e-05, + "loss": 1.1134, + "step": 342 + }, + { + "epoch": 0.8886010362694301, + "grad_norm": 22.032293114161938, + "learning_rate": 2.5424678213458202e-05, + "loss": 1.1121, + "step": 343 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 34.997361528376956, + "learning_rate": 2.5342010331769635e-05, + "loss": 1.1341, + "step": 344 + }, + { + "epoch": 0.8937823834196891, + "grad_norm": 28.212824875732352, + "learning_rate": 2.5259244053130295e-05, + "loss": 1.0748, + "step": 345 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 23.870011592985897, + "learning_rate": 2.5176380902050418e-05, + "loss": 1.0643, + "step": 346 + }, + { + "epoch": 0.8989637305699482, + "grad_norm": 26.10018699309748, + "learning_rate": 2.5093422404824574e-05, + "loss": 1.1662, + "step": 347 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 30.191468778559166, + "learning_rate": 2.5010370089503578e-05, + "loss": 1.1023, + "step": 348 + }, + { + "epoch": 0.9041450777202072, + "grad_norm": 55.799581973427415, + "learning_rate": 2.4927225485866297e-05, + "loss": 1.1538, + "step": 349 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 35.7030284720465, + "learning_rate": 2.4843990125391516e-05, + "loss": 1.1, + 
"step": 350 + }, + { + "epoch": 0.9093264248704663, + "grad_norm": 28.61763302791738, + "learning_rate": 2.4760665541229712e-05, + "loss": 1.0914, + "step": 351 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 33.34233685155311, + "learning_rate": 2.467725326817481e-05, + "loss": 1.0862, + "step": 352 + }, + { + "epoch": 0.9145077720207254, + "grad_norm": 25.441052078480084, + "learning_rate": 2.4593754842635917e-05, + "loss": 1.1422, + "step": 353 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 24.217974454985058, + "learning_rate": 2.451017180260902e-05, + "loss": 1.132, + "step": 354 + }, + { + "epoch": 0.9196891191709845, + "grad_norm": 57.986011465793155, + "learning_rate": 2.4426505687648653e-05, + "loss": 1.2082, + "step": 355 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 34.058264716876195, + "learning_rate": 2.4342758038839573e-05, + "loss": 1.1679, + "step": 356 + }, + { + "epoch": 0.9248704663212435, + "grad_norm": 28.621514922275253, + "learning_rate": 2.4258930398768317e-05, + "loss": 1.1319, + "step": 357 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 35.33355417283227, + "learning_rate": 2.4175024311494835e-05, + "loss": 1.0705, + "step": 358 + }, + { + "epoch": 0.9300518134715026, + "grad_norm": 46.579572933583265, + "learning_rate": 2.4091041322524023e-05, + "loss": 1.0842, + "step": 359 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 35.494740787672974, + "learning_rate": 2.4006982978777263e-05, + "loss": 1.1072, + "step": 360 + }, + { + "epoch": 0.9352331606217616, + "grad_norm": 44.56606839509262, + "learning_rate": 2.392285082856394e-05, + "loss": 1.1125, + "step": 361 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 46.26363869084929, + "learning_rate": 2.3838646421552917e-05, + "loss": 1.1268, + "step": 362 + }, + { + "epoch": 0.9404145077720207, + "grad_norm": 89.17676267680146, + "learning_rate": 2.3754371308743975e-05, + "loss": 1.0893, + "step": 363 + }, + { + "epoch": 0.9430051813471503, + 
"grad_norm": 34.87700187494181, + "learning_rate": 2.367002704243927e-05, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.9455958549222798, + "grad_norm": 32.92806939217504, + "learning_rate": 2.3585615176214716e-05, + "loss": 1.1488, + "step": 365 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 27.27458755248548, + "learning_rate": 2.3501137264891396e-05, + "loss": 1.0874, + "step": 366 + }, + { + "epoch": 0.9507772020725389, + "grad_norm": 24.959123789739834, + "learning_rate": 2.3416594864506887e-05, + "loss": 1.1783, + "step": 367 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 31.838670988369724, + "learning_rate": 2.333198953228664e-05, + "loss": 1.0759, + "step": 368 + }, + { + "epoch": 0.9559585492227979, + "grad_norm": 28.112870222863155, + "learning_rate": 2.3247322826615276e-05, + "loss": 1.1481, + "step": 369 + }, + { + "epoch": 0.9585492227979274, + "grad_norm": 35.08461098450067, + "learning_rate": 2.316259630700787e-05, + "loss": 1.0953, + "step": 370 + }, + { + "epoch": 0.961139896373057, + "grad_norm": 37.80899503618479, + "learning_rate": 2.307781153408124e-05, + "loss": 1.1224, + "step": 371 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 31.644978122007387, + "learning_rate": 2.2992970069525202e-05, + "loss": 1.1608, + "step": 372 + }, + { + "epoch": 0.966321243523316, + "grad_norm": 23.51029318210938, + "learning_rate": 2.29080734760738e-05, + "loss": 1.0914, + "step": 373 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 28.97240481418573, + "learning_rate": 2.2823123317476522e-05, + "loss": 1.1117, + "step": 374 + }, + { + "epoch": 0.9715025906735751, + "grad_norm": 36.613893678320395, + "learning_rate": 2.273812115846951e-05, + "loss": 1.1118, + "step": 375 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 26.402979304578093, + "learning_rate": 2.2653068564746692e-05, + "loss": 1.13, + "step": 376 + }, + { + "epoch": 0.9766839378238342, + "grad_norm": 114.3000444613392, + "learning_rate": 
2.2567967102931025e-05, + "loss": 1.1539, + "step": 377 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 26.861359932396834, + "learning_rate": 2.2482818340545534e-05, + "loss": 1.0566, + "step": 378 + }, + { + "epoch": 0.9818652849740933, + "grad_norm": 32.75509374223994, + "learning_rate": 2.2397623845984548e-05, + "loss": 1.1746, + "step": 379 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 34.11964206838379, + "learning_rate": 2.2312385188484718e-05, + "loss": 1.0834, + "step": 380 + }, + { + "epoch": 0.9870466321243523, + "grad_norm": 38.019564122226434, + "learning_rate": 2.2227103938096176e-05, + "loss": 1.1074, + "step": 381 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 39.5073811375391, + "learning_rate": 2.2141781665653584e-05, + "loss": 1.1082, + "step": 382 + }, + { + "epoch": 0.9922279792746114, + "grad_norm": 298.4258332795163, + "learning_rate": 2.205641994274721e-05, + "loss": 1.125, + "step": 383 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 36.444415670935506, + "learning_rate": 2.1971020341693973e-05, + "loss": 1.0935, + "step": 384 + }, + { + "epoch": 0.9974093264248705, + "grad_norm": 28.96533429210575, + "learning_rate": 2.188558443550849e-05, + "loss": 1.0957, + "step": 385 + }, + { + "epoch": 1.0, + "grad_norm": 66.41241684127401, + "learning_rate": 2.180011379787411e-05, + "loss": 1.1335, + "step": 386 + }, + { + "epoch": 1.0025906735751295, + "grad_norm": 28.75549619538953, + "learning_rate": 2.1714610003113887e-05, + "loss": 1.1316, + "step": 387 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 26.911837500852275, + "learning_rate": 2.1629074626161647e-05, + "loss": 1.1026, + "step": 388 + }, + { + "epoch": 1.005181347150259, + "eval_loss": 1.0908173322677612, + "eval_runtime": 37.7642, + "eval_samples_per_second": 19.701, + "eval_steps_per_second": 1.245, + "step": 388 + }, + { + "epoch": 1.0077720207253886, + "grad_norm": 34.28722746775385, + "learning_rate": 2.1543509242532932e-05, + "loss": 1.1104, 
+ "step": 389 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 37.97709310694863, + "learning_rate": 2.145791542829597e-05, + "loss": 1.0663, + "step": 390 + }, + { + "epoch": 1.0129533678756477, + "grad_norm": 39.379668162327384, + "learning_rate": 2.1372294760042686e-05, + "loss": 1.1405, + "step": 391 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 27.136201219298698, + "learning_rate": 2.1286648814859636e-05, + "loss": 1.0963, + "step": 392 + }, + { + "epoch": 1.0181347150259068, + "grad_norm": 39.34261641469313, + "learning_rate": 2.120097917029897e-05, + "loss": 1.1276, + "step": 393 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 46.77583801285328, + "learning_rate": 2.1115287404349357e-05, + "loss": 1.1171, + "step": 394 + }, + { + "epoch": 1.0233160621761659, + "grad_norm": 55.10335066695868, + "learning_rate": 2.1029575095406933e-05, + "loss": 1.0831, + "step": 395 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 76.88533851789373, + "learning_rate": 2.0943843822246234e-05, + "loss": 1.0925, + "step": 396 + }, + { + "epoch": 1.028497409326425, + "grad_norm": 29.604569209708462, + "learning_rate": 2.0858095163991094e-05, + "loss": 1.1259, + "step": 397 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 37.71348366628868, + "learning_rate": 2.077233070008557e-05, + "loss": 1.0792, + "step": 398 + }, + { + "epoch": 1.0336787564766838, + "grad_norm": 26.866133194031644, + "learning_rate": 2.0686552010264872e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 35.739274800620635, + "learning_rate": 2.060076067452622e-05, + "loss": 1.0837, + "step": 400 + }, + { + "epoch": 1.038860103626943, + "grad_norm": 24.479129391259896, + "learning_rate": 2.0514958273099778e-05, + "loss": 1.073, + "step": 401 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 50.49963650108008, + "learning_rate": 2.042914638641952e-05, + "loss": 1.0912, + "step": 402 + }, + { + "epoch": 1.044041450777202, + 
"grad_norm": 35.6875451072032, + "learning_rate": 2.0343326595094154e-05, + "loss": 1.0936, + "step": 403 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 30.212298193414487, + "learning_rate": 2.0257500479877965e-05, + "loss": 1.089, + "step": 404 + }, + { + "epoch": 1.049222797927461, + "grad_norm": 28.65828720015124, + "learning_rate": 2.0171669621641743e-05, + "loss": 1.1727, + "step": 405 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 39.2199058392425, + "learning_rate": 2.0085835601343627e-05, + "loss": 1.1493, + "step": 406 + }, + { + "epoch": 1.0544041450777202, + "grad_norm": 110.01204177059546, + "learning_rate": 2e-05, + "loss": 1.1245, + "step": 407 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 43.427381349600374, + "learning_rate": 1.9914164398656383e-05, + "loss": 1.1183, + "step": 408 + }, + { + "epoch": 1.0595854922279793, + "grad_norm": 64.78768909817894, + "learning_rate": 1.9828330378358264e-05, + "loss": 1.1528, + "step": 409 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 26.50257915912425, + "learning_rate": 1.974249952012204e-05, + "loss": 1.1568, + "step": 410 + }, + { + "epoch": 1.0647668393782384, + "grad_norm": 27.63159204178893, + "learning_rate": 1.9656673404905852e-05, + "loss": 1.1071, + "step": 411 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 27.0795355533723, + "learning_rate": 1.957085361358049e-05, + "loss": 1.0809, + "step": 412 + }, + { + "epoch": 1.0699481865284974, + "grad_norm": 41.84795332660821, + "learning_rate": 1.9485041726900232e-05, + "loss": 1.0744, + "step": 413 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 143.2109134427192, + "learning_rate": 1.939923932547379e-05, + "loss": 1.0905, + "step": 414 + }, + { + "epoch": 1.0751295336787565, + "grad_norm": 89.55384065946154, + "learning_rate": 1.931344798973513e-05, + "loss": 1.1012, + "step": 415 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 31.072074793068015, + "learning_rate": 1.922766929991443e-05, + "loss": 
1.1141, + "step": 416 + }, + { + "epoch": 1.0803108808290156, + "grad_norm": 29.82683189045969, + "learning_rate": 1.914190483600891e-05, + "loss": 1.0842, + "step": 417 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 30.09708662586305, + "learning_rate": 1.9056156177753776e-05, + "loss": 1.1088, + "step": 418 + }, + { + "epoch": 1.0854922279792747, + "grad_norm": 27.637437518920503, + "learning_rate": 1.897042490459307e-05, + "loss": 1.058, + "step": 419 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 69.34285700381683, + "learning_rate": 1.8884712595650653e-05, + "loss": 1.0314, + "step": 420 + }, + { + "epoch": 1.0906735751295338, + "grad_norm": 25.644927284592956, + "learning_rate": 1.8799020829701036e-05, + "loss": 1.0916, + "step": 421 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 30.3898986852319, + "learning_rate": 1.871335118514037e-05, + "loss": 1.0797, + "step": 422 + }, + { + "epoch": 1.0958549222797926, + "grad_norm": 22.271334693423444, + "learning_rate": 1.862770523995732e-05, + "loss": 1.1134, + "step": 423 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 35.85874616678876, + "learning_rate": 1.854208457170404e-05, + "loss": 1.0927, + "step": 424 + }, + { + "epoch": 1.1010362694300517, + "grad_norm": 43.06832041948097, + "learning_rate": 1.8456490757467075e-05, + "loss": 1.093, + "step": 425 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 37.83777637993467, + "learning_rate": 1.8370925373838356e-05, + "loss": 1.1268, + "step": 426 + }, + { + "epoch": 1.1062176165803108, + "grad_norm": 23.798059023605177, + "learning_rate": 1.8285389996886113e-05, + "loss": 1.0989, + "step": 427 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 25.443104465500795, + "learning_rate": 1.8199886202125897e-05, + "loss": 1.0581, + "step": 428 + }, + { + "epoch": 1.11139896373057, + "grad_norm": 23.76241444847441, + "learning_rate": 1.8114415564491513e-05, + "loss": 1.0908, + "step": 429 + }, + { + "epoch": 1.1139896373056994, + 
"grad_norm": 26.5600693044426, + "learning_rate": 1.8028979658306033e-05, + "loss": 1.1321, + "step": 430 + }, + { + "epoch": 1.116580310880829, + "grad_norm": 44.854375199828986, + "learning_rate": 1.794358005725279e-05, + "loss": 1.0762, + "step": 431 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 28.05797777410846, + "learning_rate": 1.785821833434642e-05, + "loss": 1.0698, + "step": 432 + }, + { + "epoch": 1.121761658031088, + "grad_norm": 26.488479630212364, + "learning_rate": 1.7772896061903824e-05, + "loss": 1.1223, + "step": 433 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 32.77084542157883, + "learning_rate": 1.768761481151529e-05, + "loss": 1.0984, + "step": 434 + }, + { + "epoch": 1.1269430051813472, + "grad_norm": 39.13198413130026, + "learning_rate": 1.7602376154015456e-05, + "loss": 1.1551, + "step": 435 + }, + { + "epoch": 1.1295336787564767, + "grad_norm": 23.878966995283953, + "learning_rate": 1.751718165945447e-05, + "loss": 1.1133, + "step": 436 + }, + { + "epoch": 1.1321243523316062, + "grad_norm": 33.90472985566232, + "learning_rate": 1.743203289706898e-05, + "loss": 1.1219, + "step": 437 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 23.340369938533712, + "learning_rate": 1.734693143525331e-05, + "loss": 1.1244, + "step": 438 + }, + { + "epoch": 1.1373056994818653, + "grad_norm": 105.6885206147852, + "learning_rate": 1.7261878841530494e-05, + "loss": 1.0788, + "step": 439 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 28.453526076458317, + "learning_rate": 1.717687668252348e-05, + "loss": 1.1576, + "step": 440 + }, + { + "epoch": 1.1424870466321244, + "grad_norm": 36.1473991485961, + "learning_rate": 1.7091926523926205e-05, + "loss": 1.0859, + "step": 441 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 27.043461146902448, + "learning_rate": 1.7007029930474804e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 1.1476683937823835, + "grad_norm": 28.066170619981435, + "learning_rate": 
1.6922188465918763e-05, + "loss": 1.1279, + "step": 443 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 38.62445822837212, + "learning_rate": 1.6837403692992136e-05, + "loss": 1.1275, + "step": 444 + }, + { + "epoch": 1.1528497409326426, + "grad_norm": 28.077258963587767, + "learning_rate": 1.6752677173384734e-05, + "loss": 1.1004, + "step": 445 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 42.1405744301338, + "learning_rate": 1.6668010467713363e-05, + "loss": 1.1141, + "step": 446 + }, + { + "epoch": 1.1580310880829017, + "grad_norm": 26.827291684301034, + "learning_rate": 1.658340513549312e-05, + "loss": 1.1216, + "step": 447 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 30.863489441619983, + "learning_rate": 1.649886273510861e-05, + "loss": 1.1898, + "step": 448 + }, + { + "epoch": 1.1632124352331605, + "grad_norm": 27.73579733476068, + "learning_rate": 1.641438482378529e-05, + "loss": 1.0971, + "step": 449 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 32.84347174567353, + "learning_rate": 1.6329972957560736e-05, + "loss": 1.0579, + "step": 450 + }, + { + "epoch": 1.1683937823834196, + "grad_norm": 30.06456192962641, + "learning_rate": 1.6245628691256032e-05, + "loss": 1.1057, + "step": 451 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 36.554506394377846, + "learning_rate": 1.616135357844709e-05, + "loss": 1.1008, + "step": 452 + }, + { + "epoch": 1.1735751295336787, + "grad_norm": 27.358643056184114, + "learning_rate": 1.6077149171436063e-05, + "loss": 1.101, + "step": 453 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 111.13373813893604, + "learning_rate": 1.599301702122274e-05, + "loss": 1.0688, + "step": 454 + }, + { + "epoch": 1.1787564766839378, + "grad_norm": 33.94168250727336, + "learning_rate": 1.590895867747599e-05, + "loss": 1.0721, + "step": 455 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 53.93978395349692, + "learning_rate": 1.582497568850517e-05, + "loss": 1.0584, + "step": 456 + }, + { + 
"epoch": 1.1839378238341969, + "grad_norm": 29.19245794937285, + "learning_rate": 1.574106960123169e-05, + "loss": 1.067, + "step": 457 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 28.06897801999048, + "learning_rate": 1.5657241961160434e-05, + "loss": 1.0899, + "step": 458 + }, + { + "epoch": 1.189119170984456, + "grad_norm": 52.31256652964293, + "learning_rate": 1.557349431235135e-05, + "loss": 1.0925, + "step": 459 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 65.39771110845307, + "learning_rate": 1.5489828197390988e-05, + "loss": 1.1448, + "step": 460 + }, + { + "epoch": 1.194300518134715, + "grad_norm": 27.062780348557254, + "learning_rate": 1.5406245157364093e-05, + "loss": 1.0871, + "step": 461 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 41.667025056250424, + "learning_rate": 1.5322746731825195e-05, + "loss": 1.048, + "step": 462 + }, + { + "epoch": 1.1994818652849741, + "grad_norm": 24.936669803360665, + "learning_rate": 1.5239334458770291e-05, + "loss": 1.1243, + "step": 463 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 26.65392149600558, + "learning_rate": 1.5156009874608484e-05, + "loss": 1.0919, + "step": 464 + }, + { + "epoch": 1.2046632124352332, + "grad_norm": 48.57730651937978, + "learning_rate": 1.5072774514133708e-05, + "loss": 1.1259, + "step": 465 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 31.34891257114439, + "learning_rate": 1.4989629910496424e-05, + "loss": 1.0733, + "step": 466 + }, + { + "epoch": 1.2098445595854923, + "grad_norm": 24.541559850584985, + "learning_rate": 1.4906577595175428e-05, + "loss": 1.1166, + "step": 467 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 20.4345832961354, + "learning_rate": 1.4823619097949584e-05, + "loss": 1.0916, + "step": 468 + }, + { + "epoch": 1.2150259067357512, + "grad_norm": 28.860712194727487, + "learning_rate": 1.4740755946869708e-05, + "loss": 1.1043, + "step": 469 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 25.71820242946282, + 
"learning_rate": 1.4657989668230363e-05, + "loss": 1.0949, + "step": 470 + }, + { + "epoch": 1.2202072538860103, + "grad_norm": 51.16994773097077, + "learning_rate": 1.4575321786541801e-05, + "loss": 1.141, + "step": 471 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 32.70442309640389, + "learning_rate": 1.4492753824501833e-05, + "loss": 1.1127, + "step": 472 + }, + { + "epoch": 1.2253886010362693, + "grad_norm": 21.913285172411495, + "learning_rate": 1.4410287302967813e-05, + "loss": 1.084, + "step": 473 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 34.45727214001296, + "learning_rate": 1.4327923740928613e-05, + "loss": 1.0836, + "step": 474 + }, + { + "epoch": 1.2305699481865284, + "grad_norm": 26.768013926034776, + "learning_rate": 1.4245664655476663e-05, + "loss": 1.1264, + "step": 475 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 28.401965255935572, + "learning_rate": 1.4163511561779956e-05, + "loss": 1.0805, + "step": 476 + }, + { + "epoch": 1.2357512953367875, + "grad_norm": 29.19935757288793, + "learning_rate": 1.4081465973054216e-05, + "loss": 1.0825, + "step": 477 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 24.55918541541201, + "learning_rate": 1.3999529400534941e-05, + "loss": 1.1164, + "step": 478 + }, + { + "epoch": 1.2409326424870466, + "grad_norm": 25.35635406268312, + "learning_rate": 1.3917703353449646e-05, + "loss": 1.1334, + "step": 479 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 45.453901005004184, + "learning_rate": 1.3835989338989996e-05, + "loss": 1.1387, + "step": 480 + }, + { + "epoch": 1.2461139896373057, + "grad_norm": 21.67852694202104, + "learning_rate": 1.375438886228411e-05, + "loss": 1.0846, + "step": 481 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 171.2474074894732, + "learning_rate": 1.3672903426368773e-05, + "loss": 1.1388, + "step": 482 + }, + { + "epoch": 1.2512953367875648, + "grad_norm": 43.18223835070906, + "learning_rate": 1.3591534532161781e-05, + "loss": 1.1483, + 
"step": 483 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 29.447332565856644, + "learning_rate": 1.3510283678434317e-05, + "loss": 1.07, + "step": 484 + }, + { + "epoch": 1.2564766839378239, + "grad_norm": 28.600251051615228, + "learning_rate": 1.3429152361783307e-05, + "loss": 1.0798, + "step": 485 + }, + { + "epoch": 1.2564766839378239, + "eval_loss": 1.085669755935669, + "eval_runtime": 38.1134, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.233, + "step": 485 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 47.124643074410464, + "learning_rate": 1.3348142076603876e-05, + "loss": 1.0875, + "step": 486 + }, + { + "epoch": 1.261658031088083, + "grad_norm": 42.06019726307143, + "learning_rate": 1.3267254315061797e-05, + "loss": 1.1429, + "step": 487 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 18.950734630756962, + "learning_rate": 1.318649056706605e-05, + "loss": 1.0747, + "step": 488 + }, + { + "epoch": 1.266839378238342, + "grad_norm": 31.903949502516806, + "learning_rate": 1.3105852320241326e-05, + "loss": 1.1041, + "step": 489 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 22.957473008085927, + "learning_rate": 1.3025341059900675e-05, + "loss": 1.1046, + "step": 490 + }, + { + "epoch": 1.2720207253886011, + "grad_norm": 22.325983256563678, + "learning_rate": 1.2944958269018103e-05, + "loss": 1.0643, + "step": 491 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 29.689383331974955, + "learning_rate": 1.2864705428201307e-05, + "loss": 1.0949, + "step": 492 + }, + { + "epoch": 1.2772020725388602, + "grad_norm": 25.338298442945575, + "learning_rate": 1.2784584015664337e-05, + "loss": 1.0725, + "step": 493 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 31.591732488078588, + "learning_rate": 1.2704595507200435e-05, + "loss": 1.0347, + "step": 494 + }, + { + "epoch": 1.2823834196891193, + "grad_norm": 42.96243570696118, + "learning_rate": 1.26247413761548e-05, + "loss": 1.1196, + "step": 495 + }, + { + 
"epoch": 1.2849740932642488, + "grad_norm": 26.559546676266024, + "learning_rate": 1.254502309339749e-05, + "loss": 1.0187, + "step": 496 + }, + { + "epoch": 1.2875647668393784, + "grad_norm": 27.58444017584016, + "learning_rate": 1.2465442127296297e-05, + "loss": 1.0985, + "step": 497 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 36.53028730423797, + "learning_rate": 1.2385999943689732e-05, + "loss": 1.068, + "step": 498 + }, + { + "epoch": 1.2927461139896372, + "grad_norm": 38.94837307599113, + "learning_rate": 1.2306698005859975e-05, + "loss": 1.0736, + "step": 499 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 36.67208266195125, + "learning_rate": 1.2227537774505996e-05, + "loss": 1.119, + "step": 500 + }, + { + "epoch": 1.2979274611398963, + "grad_norm": 31.086410648635283, + "learning_rate": 1.2148520707716567e-05, + "loss": 1.1094, + "step": 501 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 27.96977481605826, + "learning_rate": 1.2069648260943473e-05, + "loss": 1.1345, + "step": 502 + }, + { + "epoch": 1.3031088082901554, + "grad_norm": 22.89450502840197, + "learning_rate": 1.1990921886974669e-05, + "loss": 1.12, + "step": 503 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 18.54206032224653, + "learning_rate": 1.1912343035907535e-05, + "loss": 1.0929, + "step": 504 + }, + { + "epoch": 1.3082901554404145, + "grad_norm": 38.9386007237313, + "learning_rate": 1.1833913155122132e-05, + "loss": 1.1381, + "step": 505 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 37.05899458809635, + "learning_rate": 1.1755633689254609e-05, + "loss": 1.0535, + "step": 506 + }, + { + "epoch": 1.3134715025906736, + "grad_norm": 27.716372794195156, + "learning_rate": 1.1677506080170512e-05, + "loss": 1.1342, + "step": 507 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 40.42306246079416, + "learning_rate": 1.1599531766938306e-05, + "loss": 1.0887, + "step": 508 + }, + { + "epoch": 1.3186528497409327, + "grad_norm": 98.56681767405578, + 
"learning_rate": 1.1521712185802789e-05, + "loss": 1.0954, + "step": 509 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 34.42816933350743, + "learning_rate": 1.1444048770158718e-05, + "loss": 1.0512, + "step": 510 + }, + { + "epoch": 1.3238341968911918, + "grad_norm": 52.457523653614096, + "learning_rate": 1.136654295052433e-05, + "loss": 1.1599, + "step": 511 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 26.832339531661276, + "learning_rate": 1.1289196154515048e-05, + "loss": 1.0602, + "step": 512 + }, + { + "epoch": 1.3290155440414508, + "grad_norm": 32.746047673769816, + "learning_rate": 1.1212009806817163e-05, + "loss": 1.1544, + "step": 513 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 37.44483451702055, + "learning_rate": 1.1134985329161608e-05, + "loss": 1.1421, + "step": 514 + }, + { + "epoch": 1.33419689119171, + "grad_norm": 28.625976525737606, + "learning_rate": 1.1058124140297718e-05, + "loss": 1.0858, + "step": 515 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 38.64141195246213, + "learning_rate": 1.0981427655967183e-05, + "loss": 1.0983, + "step": 516 + }, + { + "epoch": 1.3393782383419688, + "grad_norm": 29.989753893533425, + "learning_rate": 1.0904897288877891e-05, + "loss": 1.1269, + "step": 517 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 48.63990665515511, + "learning_rate": 1.0828534448677942e-05, + "loss": 1.0844, + "step": 518 + }, + { + "epoch": 1.3445595854922279, + "grad_norm": 25.477227318250847, + "learning_rate": 1.0752340541929711e-05, + "loss": 1.0742, + "step": 519 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 26.363588814537763, + "learning_rate": 1.0676316972083867e-05, + "loss": 1.0533, + "step": 520 + }, + { + "epoch": 1.349740932642487, + "grad_norm": 34.59968737708606, + "learning_rate": 1.060046513945361e-05, + "loss": 1.0983, + "step": 521 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 52.51652561846762, + "learning_rate": 1.0524786441188786e-05, + "loss": 1.1319, + 
"step": 522 + }, + { + "epoch": 1.354922279792746, + "grad_norm": 21.360221214301127, + "learning_rate": 1.0449282271250239e-05, + "loss": 1.0627, + "step": 523 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 37.00053933682603, + "learning_rate": 1.0373954020384073e-05, + "loss": 1.096, + "step": 524 + }, + { + "epoch": 1.3601036269430051, + "grad_norm": 39.212240822687484, + "learning_rate": 1.029880307609608e-05, + "loss": 1.0512, + "step": 525 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 24.89842378385804, + "learning_rate": 1.0223830822626124e-05, + "loss": 1.0538, + "step": 526 + }, + { + "epoch": 1.3652849740932642, + "grad_norm": 29.14416894424653, + "learning_rate": 1.0149038640922715e-05, + "loss": 1.1538, + "step": 527 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 31.688722122648855, + "learning_rate": 1.0074427908617515e-05, + "loss": 1.171, + "step": 528 + }, + { + "epoch": 1.3704663212435233, + "grad_norm": 41.918909004413734, + "learning_rate": 1.0000000000000006e-05, + "loss": 1.1203, + "step": 529 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 26.70963454516576, + "learning_rate": 9.92575628599213e-06, + "loss": 1.0855, + "step": 530 + }, + { + "epoch": 1.3756476683937824, + "grad_norm": 24.819351173466824, + "learning_rate": 9.851698134123095e-06, + "loss": 1.0972, + "step": 531 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 22.100465399566815, + "learning_rate": 9.777826908504126e-06, + "loss": 1.08, + "step": 532 + }, + { + "epoch": 1.3808290155440415, + "grad_norm": 29.31574709406259, + "learning_rate": 9.704143969803392e-06, + "loss": 1.0835, + "step": 533 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 25.551326748473052, + "learning_rate": 9.630650675220892e-06, + "loss": 1.0396, + "step": 534 + }, + { + "epoch": 1.3860103626943006, + "grad_norm": 59.07595627892596, + "learning_rate": 9.557348378463503e-06, + "loss": 1.0814, + "step": 535 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 
24.96501978981908, + "learning_rate": 9.484238429720018e-06, + "loss": 1.0187, + "step": 536 + }, + { + "epoch": 1.3911917098445596, + "grad_norm": 42.530604702279234, + "learning_rate": 9.411322175636298e-06, + "loss": 1.074, + "step": 537 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 34.91129065632851, + "learning_rate": 9.338600959290414e-06, + "loss": 1.0878, + "step": 538 + }, + { + "epoch": 1.3963730569948187, + "grad_norm": 32.07525956876426, + "learning_rate": 9.266076120167992e-06, + "loss": 1.0962, + "step": 539 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 40.18387743296675, + "learning_rate": 9.193748994137462e-06, + "loss": 1.1033, + "step": 540 + }, + { + "epoch": 1.4015544041450778, + "grad_norm": 66.68031460980451, + "learning_rate": 9.121620913425508e-06, + "loss": 1.1466, + "step": 541 + }, + { + "epoch": 1.4041450777202074, + "grad_norm": 34.07506059584738, + "learning_rate": 9.04969320659249e-06, + "loss": 1.1184, + "step": 542 + }, + { + "epoch": 1.406735751295337, + "grad_norm": 17.130845779169075, + "learning_rate": 8.977967198508001e-06, + "loss": 1.0803, + "step": 543 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 22.4457025132615, + "learning_rate": 8.906444210326441e-06, + "loss": 1.0745, + "step": 544 + }, + { + "epoch": 1.411917098445596, + "grad_norm": 73.43971735356851, + "learning_rate": 8.83512555946271e-06, + "loss": 1.0717, + "step": 545 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 38.16321297719761, + "learning_rate": 8.764012559567899e-06, + "loss": 1.1371, + "step": 546 + }, + { + "epoch": 1.4170984455958548, + "grad_norm": 56.14718024907725, + "learning_rate": 8.693106520505147e-06, + "loss": 1.0185, + "step": 547 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 53.3812598790062, + "learning_rate": 8.622408748325461e-06, + "loss": 1.0859, + "step": 548 + }, + { + "epoch": 1.422279792746114, + "grad_norm": 39.69041631433326, + "learning_rate": 8.551920545243704e-06, + "loss": 1.1146, + 
"step": 549 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 24.099260758984773, + "learning_rate": 8.481643209614576e-06, + "loss": 1.0968, + "step": 550 + }, + { + "epoch": 1.427461139896373, + "grad_norm": 22.623850373369237, + "learning_rate": 8.411578035908728e-06, + "loss": 1.0642, + "step": 551 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 25.343746374404027, + "learning_rate": 8.341726314688875e-06, + "loss": 1.0815, + "step": 552 + }, + { + "epoch": 1.432642487046632, + "grad_norm": 35.82641011588973, + "learning_rate": 8.272089332586089e-06, + "loss": 1.1012, + "step": 553 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 24.81161215784662, + "learning_rate": 8.20266837227603e-06, + "loss": 1.1086, + "step": 554 + }, + { + "epoch": 1.4378238341968912, + "grad_norm": 54.18243481591251, + "learning_rate": 8.133464712455364e-06, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 23.602598217141395, + "learning_rate": 8.064479627818213e-06, + "loss": 1.1519, + "step": 556 + }, + { + "epoch": 1.4430051813471503, + "grad_norm": 31.124404868409982, + "learning_rate": 7.995714389032638e-06, + "loss": 1.0705, + "step": 557 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 24.14171016995626, + "learning_rate": 7.927170262717284e-06, + "loss": 1.1083, + "step": 558 + }, + { + "epoch": 1.4481865284974094, + "grad_norm": 47.987203109917175, + "learning_rate": 7.858848511417998e-06, + "loss": 1.0836, + "step": 559 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 25.871447098066056, + "learning_rate": 7.790750393584616e-06, + "loss": 1.0787, + "step": 560 + }, + { + "epoch": 1.4533678756476685, + "grad_norm": 23.820249113937482, + "learning_rate": 7.72287716354776e-06, + "loss": 1.1165, + "step": 561 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 48.04131308947624, + "learning_rate": 7.65523007149575e-06, + "loss": 1.0819, + "step": 562 + }, + { + "epoch": 1.4585492227979275, + "grad_norm": 
29.273494083692352, + "learning_rate": 7.587810363451544e-06, + "loss": 1.0302, + "step": 563 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 120.01571222366722, + "learning_rate": 7.5206192812498345e-06, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 1.4637305699481864, + "grad_norm": 33.16947662083338, + "learning_rate": 7.4536580625141244e-06, + "loss": 1.0842, + "step": 565 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 29.979556378166713, + "learning_rate": 7.386927940633981e-06, + "loss": 1.1116, + "step": 566 + }, + { + "epoch": 1.4689119170984455, + "grad_norm": 27.172344859281896, + "learning_rate": 7.32043014474227e-06, + "loss": 1.0676, + "step": 567 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 30.208548637757318, + "learning_rate": 7.254165899692554e-06, + "loss": 1.1104, + "step": 568 + }, + { + "epoch": 1.4740932642487046, + "grad_norm": 19.385421184583773, + "learning_rate": 7.188136426036498e-06, + "loss": 1.0085, + "step": 569 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 30.350787749309685, + "learning_rate": 7.12234294000143e-06, + "loss": 1.0584, + "step": 570 + }, + { + "epoch": 1.4792746113989637, + "grad_norm": 31.520305600900198, + "learning_rate": 7.056786653467882e-06, + "loss": 1.0831, + "step": 571 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 46.13006972574487, + "learning_rate": 6.991468773947321e-06, + "loss": 1.1761, + "step": 572 + }, + { + "epoch": 1.4844559585492227, + "grad_norm": 26.72340868362835, + "learning_rate": 6.926390504559879e-06, + "loss": 1.0605, + "step": 573 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 25.992965411102556, + "learning_rate": 6.861553044012206e-06, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 1.4896373056994818, + "grad_norm": 38.60187420279626, + "learning_rate": 6.796957586575364e-06, + "loss": 1.1232, + "step": 575 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 21.7618591565717, + "learning_rate": 6.732605322062869e-06, + 
"loss": 1.1196, + "step": 576 + }, + { + "epoch": 1.494818652849741, + "grad_norm": 28.233093007170996, + "learning_rate": 6.668497435808736e-06, + "loss": 1.1451, + "step": 577 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 28.061514297823816, + "learning_rate": 6.604635108645683e-06, + "loss": 1.0832, + "step": 578 + }, + { + "epoch": 1.5, + "grad_norm": 35.34503147975386, + "learning_rate": 6.5410195168833425e-06, + "loss": 1.118, + "step": 579 + }, + { + "epoch": 1.5025906735751295, + "grad_norm": 31.940516004139344, + "learning_rate": 6.477651832286633e-06, + "loss": 1.1052, + "step": 580 + }, + { + "epoch": 1.505181347150259, + "grad_norm": 25.647504733675635, + "learning_rate": 6.414533222054138e-06, + "loss": 1.1055, + "step": 581 + }, + { + "epoch": 1.5077720207253886, + "grad_norm": 68.16422579698298, + "learning_rate": 6.3516648487966456e-06, + "loss": 1.0784, + "step": 582 + }, + { + "epoch": 1.5077720207253886, + "eval_loss": 1.0824710130691528, + "eval_runtime": 37.4923, + "eval_samples_per_second": 19.844, + "eval_steps_per_second": 1.254, + "step": 582 + }, + { + "epoch": 1.5103626943005182, + "grad_norm": 46.95363643283118, + "learning_rate": 6.289047870515692e-06, + "loss": 1.1271, + "step": 583 + }, + { + "epoch": 1.5129533678756477, + "grad_norm": 37.80701104174098, + "learning_rate": 6.226683440582268e-06, + "loss": 1.126, + "step": 584 + }, + { + "epoch": 1.5155440414507773, + "grad_norm": 32.03225059321182, + "learning_rate": 6.164572707715564e-06, + "loss": 1.0152, + "step": 585 + }, + { + "epoch": 1.5181347150259068, + "grad_norm": 31.21438627768379, + "learning_rate": 6.102716815961787e-06, + "loss": 1.1595, + "step": 586 + }, + { + "epoch": 1.5207253886010363, + "grad_norm": 23.55515793723355, + "learning_rate": 6.041116904673125e-06, + "loss": 1.0943, + "step": 587 + }, + { + "epoch": 1.5233160621761659, + "grad_norm": 26.92022994571063, + "learning_rate": 5.979774108486751e-06, + "loss": 1.0554, + "step": 588 + }, + { + "epoch": 
1.5259067357512954, + "grad_norm": 24.957086694295352, + "learning_rate": 5.918689557303885e-06, + "loss": 1.0711, + "step": 589 + }, + { + "epoch": 1.528497409326425, + "grad_norm": 87.48440577770464, + "learning_rate": 5.857864376269051e-06, + "loss": 1.1679, + "step": 590 + }, + { + "epoch": 1.5310880829015545, + "grad_norm": 21.756969247026838, + "learning_rate": 5.7972996857492896e-06, + "loss": 1.0716, + "step": 591 + }, + { + "epoch": 1.533678756476684, + "grad_norm": 33.92695136944769, + "learning_rate": 5.736996601313545e-06, + "loss": 1.0376, + "step": 592 + }, + { + "epoch": 1.5362694300518136, + "grad_norm": 32.738888590276794, + "learning_rate": 5.676956233712139e-06, + "loss": 1.0245, + "step": 593 + }, + { + "epoch": 1.5388601036269431, + "grad_norm": 22.38597679049821, + "learning_rate": 5.617179688856271e-06, + "loss": 1.1103, + "step": 594 + }, + { + "epoch": 1.5414507772020727, + "grad_norm": 30.168619654124416, + "learning_rate": 5.557668067797677e-06, + "loss": 1.2007, + "step": 595 + }, + { + "epoch": 1.5440414507772022, + "grad_norm": 24.460334668593116, + "learning_rate": 5.498422466708349e-06, + "loss": 1.0842, + "step": 596 + }, + { + "epoch": 1.5466321243523318, + "grad_norm": 25.877463433966412, + "learning_rate": 5.439443976860306e-06, + "loss": 1.0537, + "step": 597 + }, + { + "epoch": 1.549222797927461, + "grad_norm": 27.67111694532404, + "learning_rate": 5.38073368460555e-06, + "loss": 1.0863, + "step": 598 + }, + { + "epoch": 1.5518134715025906, + "grad_norm": 43.112045139256026, + "learning_rate": 5.32229267135602e-06, + "loss": 1.1168, + "step": 599 + }, + { + "epoch": 1.5544041450777202, + "grad_norm": 31.60344278763487, + "learning_rate": 5.2641220135636685e-06, + "loss": 1.0939, + "step": 600 + }, + { + "epoch": 1.5569948186528497, + "grad_norm": 37.795536334167195, + "learning_rate": 5.206222782700667e-06, + "loss": 1.1084, + "step": 601 + }, + { + "epoch": 1.5595854922279793, + "grad_norm": 27.529824319458413, + 
"learning_rate": 5.1485960452396266e-06, + "loss": 1.0755, + "step": 602 + }, + { + "epoch": 1.5621761658031088, + "grad_norm": 29.172376961452496, + "learning_rate": 5.091242862634e-06, + "loss": 1.0231, + "step": 603 + }, + { + "epoch": 1.5647668393782384, + "grad_norm": 24.94560254083931, + "learning_rate": 5.0341642912984844e-06, + "loss": 1.0782, + "step": 604 + }, + { + "epoch": 1.567357512953368, + "grad_norm": 31.79546143794924, + "learning_rate": 4.977361382589607e-06, + "loss": 1.1202, + "step": 605 + }, + { + "epoch": 1.5699481865284974, + "grad_norm": 39.3795372477718, + "learning_rate": 4.920835182786316e-06, + "loss": 1.0349, + "step": 606 + }, + { + "epoch": 1.572538860103627, + "grad_norm": 31.308429467189708, + "learning_rate": 4.864586733070755e-06, + "loss": 1.0582, + "step": 607 + }, + { + "epoch": 1.5751295336787565, + "grad_norm": 32.82748366949945, + "learning_rate": 4.808617069509034e-06, + "loss": 1.1246, + "step": 608 + }, + { + "epoch": 1.577720207253886, + "grad_norm": 24.281936328515055, + "learning_rate": 4.752927223032196e-06, + "loss": 1.0679, + "step": 609 + }, + { + "epoch": 1.5803108808290154, + "grad_norm": 111.23884469313498, + "learning_rate": 4.697518219417188e-06, + "loss": 1.1319, + "step": 610 + }, + { + "epoch": 1.582901554404145, + "grad_norm": 35.484299416160596, + "learning_rate": 4.6423910792680005e-06, + "loss": 1.1348, + "step": 611 + }, + { + "epoch": 1.5854922279792745, + "grad_norm": 27.135342529418295, + "learning_rate": 4.587546817996826e-06, + "loss": 1.0948, + "step": 612 + }, + { + "epoch": 1.588082901554404, + "grad_norm": 81.98158494527004, + "learning_rate": 4.532986445805405e-06, + "loss": 1.0864, + "step": 613 + }, + { + "epoch": 1.5906735751295336, + "grad_norm": 61.490418707157346, + "learning_rate": 4.478710967666371e-06, + "loss": 1.0693, + "step": 614 + }, + { + "epoch": 1.593264248704663, + "grad_norm": 25.633018846282962, + "learning_rate": 4.424721383304791e-06, + "loss": 1.1084, + "step": 615 + 
}, + { + "epoch": 1.5958549222797926, + "grad_norm": 28.194280804517373, + "learning_rate": 4.371018687179689e-06, + "loss": 1.1722, + "step": 616 + }, + { + "epoch": 1.5984455958549222, + "grad_norm": 27.8080566828581, + "learning_rate": 4.317603868465794e-06, + "loss": 1.1171, + "step": 617 + }, + { + "epoch": 1.6010362694300517, + "grad_norm": 42.959036729178806, + "learning_rate": 4.264477911035265e-06, + "loss": 1.074, + "step": 618 + }, + { + "epoch": 1.6036269430051813, + "grad_norm": 23.937218136554392, + "learning_rate": 4.211641793439609e-06, + "loss": 1.13, + "step": 619 + }, + { + "epoch": 1.6062176165803108, + "grad_norm": 43.913677975121566, + "learning_rate": 4.159096488891623e-06, + "loss": 1.1671, + "step": 620 + }, + { + "epoch": 1.6088082901554404, + "grad_norm": 48.107566289352114, + "learning_rate": 4.106842965247497e-06, + "loss": 1.1071, + "step": 621 + }, + { + "epoch": 1.61139896373057, + "grad_norm": 28.25790913819402, + "learning_rate": 4.054882184988971e-06, + "loss": 1.0716, + "step": 622 + }, + { + "epoch": 1.6139896373056994, + "grad_norm": 26.59960827233381, + "learning_rate": 4.003215105205613e-06, + "loss": 1.146, + "step": 623 + }, + { + "epoch": 1.616580310880829, + "grad_norm": 22.79614250574067, + "learning_rate": 3.951842677577171e-06, + "loss": 1.0761, + "step": 624 + }, + { + "epoch": 1.6191709844559585, + "grad_norm": 24.24036779343114, + "learning_rate": 3.900765848356083e-06, + "loss": 1.1037, + "step": 625 + }, + { + "epoch": 1.621761658031088, + "grad_norm": 27.295669679621373, + "learning_rate": 3.849985558349998e-06, + "loss": 1.1015, + "step": 626 + }, + { + "epoch": 1.6243523316062176, + "grad_norm": 54.413225233914176, + "learning_rate": 3.799502742904497e-06, + "loss": 1.0318, + "step": 627 + }, + { + "epoch": 1.6269430051813472, + "grad_norm": 38.84848713400369, + "learning_rate": 3.749318331885825e-06, + "loss": 1.1147, + "step": 628 + }, + { + "epoch": 1.6295336787564767, + "grad_norm": 23.912199342429506, + 
"learning_rate": 3.699433249663775e-06, + "loss": 1.1439, + "step": 629 + }, + { + "epoch": 1.6321243523316062, + "grad_norm": 48.95526983090661, + "learning_rate": 3.649848415094681e-06, + "loss": 1.0229, + "step": 630 + }, + { + "epoch": 1.6347150259067358, + "grad_norm": 32.099897123524585, + "learning_rate": 3.60056474150446e-06, + "loss": 1.0589, + "step": 631 + }, + { + "epoch": 1.6373056994818653, + "grad_norm": 31.802660850585973, + "learning_rate": 3.551583136671817e-06, + "loss": 1.1137, + "step": 632 + }, + { + "epoch": 1.6398963730569949, + "grad_norm": 34.2655686599537, + "learning_rate": 3.5029045028115105e-06, + "loss": 1.1318, + "step": 633 + }, + { + "epoch": 1.6424870466321244, + "grad_norm": 191.48847051006786, + "learning_rate": 3.4545297365577437e-06, + "loss": 1.0921, + "step": 634 + }, + { + "epoch": 1.645077720207254, + "grad_norm": 24.236450154622357, + "learning_rate": 3.406459728947622e-06, + "loss": 1.0851, + "step": 635 + }, + { + "epoch": 1.6476683937823835, + "grad_norm": 38.819342476228876, + "learning_rate": 3.358695365404785e-06, + "loss": 1.0962, + "step": 636 + }, + { + "epoch": 1.650259067357513, + "grad_norm": 31.53545103406636, + "learning_rate": 3.3112375257230547e-06, + "loss": 1.0994, + "step": 637 + }, + { + "epoch": 1.6528497409326426, + "grad_norm": 71.55299438562814, + "learning_rate": 3.2640870840502646e-06, + "loss": 1.08, + "step": 638 + }, + { + "epoch": 1.6554404145077721, + "grad_norm": 57.94234006640972, + "learning_rate": 3.2172449088721235e-06, + "loss": 1.0921, + "step": 639 + }, + { + "epoch": 1.6580310880829017, + "grad_norm": 58.15229256885828, + "learning_rate": 3.1707118629962607e-06, + "loss": 1.0981, + "step": 640 + }, + { + "epoch": 1.6606217616580312, + "grad_norm": 25.105795165561457, + "learning_rate": 3.1244888035362875e-06, + "loss": 1.101, + "step": 641 + }, + { + "epoch": 1.6632124352331608, + "grad_norm": 33.15366058006866, + "learning_rate": 3.0785765818960534e-06, + "loss": 1.0517, + "step": 
642 + }, + { + "epoch": 1.6658031088082903, + "grad_norm": 35.79893709161297, + "learning_rate": 3.0329760437539233e-06, + "loss": 1.0886, + "step": 643 + }, + { + "epoch": 1.6683937823834198, + "grad_norm": 49.59918009099835, + "learning_rate": 2.9876880290472376e-06, + "loss": 1.0756, + "step": 644 + }, + { + "epoch": 1.6709844559585494, + "grad_norm": 21.485142494367135, + "learning_rate": 2.942713371956809e-06, + "loss": 1.1017, + "step": 645 + }, + { + "epoch": 1.6735751295336787, + "grad_norm": 29.23169287520316, + "learning_rate": 2.8980529008915793e-06, + "loss": 1.1241, + "step": 646 + }, + { + "epoch": 1.6761658031088082, + "grad_norm": 27.913868608886553, + "learning_rate": 2.853707438473352e-06, + "loss": 1.0841, + "step": 647 + }, + { + "epoch": 1.6787564766839378, + "grad_norm": 18.438597602055644, + "learning_rate": 2.8096778015216484e-06, + "loss": 1.0891, + "step": 648 + }, + { + "epoch": 1.6813471502590673, + "grad_norm": 54.0556941620233, + "learning_rate": 2.7659648010386365e-06, + "loss": 1.0589, + "step": 649 + }, + { + "epoch": 1.6839378238341969, + "grad_norm": 108.10101848740734, + "learning_rate": 2.7225692421942306e-06, + "loss": 1.0766, + "step": 650 + }, + { + "epoch": 1.6865284974093264, + "grad_norm": 106.58835736628185, + "learning_rate": 2.679491924311226e-06, + "loss": 1.1144, + "step": 651 + }, + { + "epoch": 1.689119170984456, + "grad_norm": 31.53371570516213, + "learning_rate": 2.6367336408506063e-06, + "loss": 1.02, + "step": 652 + }, + { + "epoch": 1.6917098445595855, + "grad_norm": 36.263088086669775, + "learning_rate": 2.594295179396895e-06, + "loss": 1.0679, + "step": 653 + }, + { + "epoch": 1.694300518134715, + "grad_norm": 24.47507184337666, + "learning_rate": 2.5521773216436875e-06, + "loss": 1.1092, + "step": 654 + }, + { + "epoch": 1.6968911917098446, + "grad_norm": 33.05899532106974, + "learning_rate": 2.5103808433792075e-06, + "loss": 1.053, + "step": 655 + }, + { + "epoch": 1.6994818652849741, + "grad_norm": 
29.132344102799873, + "learning_rate": 2.468906514472065e-06, + "loss": 1.0518, + "step": 656 + }, + { + "epoch": 1.7020725388601037, + "grad_norm": 43.48960854254409, + "learning_rate": 2.4277550988570362e-06, + "loss": 1.0537, + "step": 657 + }, + { + "epoch": 1.704663212435233, + "grad_norm": 28.13627467897817, + "learning_rate": 2.3869273545210158e-06, + "loss": 1.0558, + "step": 658 + }, + { + "epoch": 1.7072538860103625, + "grad_norm": 33.18164212520423, + "learning_rate": 2.3464240334890496e-06, + "loss": 1.054, + "step": 659 + }, + { + "epoch": 1.709844559585492, + "grad_norm": 41.884394437273144, + "learning_rate": 2.3062458818104804e-06, + "loss": 1.0871, + "step": 660 + }, + { + "epoch": 1.7124352331606216, + "grad_norm": 27.119840736470916, + "learning_rate": 2.266393639545197e-06, + "loss": 1.0743, + "step": 661 + }, + { + "epoch": 1.7150259067357512, + "grad_norm": 20.70474999023591, + "learning_rate": 2.22686804075003e-06, + "loss": 1.0718, + "step": 662 + }, + { + "epoch": 1.7176165803108807, + "grad_norm": 21.469651089617198, + "learning_rate": 2.187669813465192e-06, + "loss": 1.0584, + "step": 663 + }, + { + "epoch": 1.7202072538860103, + "grad_norm": 29.901704269591495, + "learning_rate": 2.1487996797009103e-06, + "loss": 1.1175, + "step": 664 + }, + { + "epoch": 1.7227979274611398, + "grad_norm": 75.06310533674302, + "learning_rate": 2.110258355424093e-06, + "loss": 1.124, + "step": 665 + }, + { + "epoch": 1.7253886010362693, + "grad_norm": 34.13349153293387, + "learning_rate": 2.0720465505451524e-06, + "loss": 1.1395, + "step": 666 + }, + { + "epoch": 1.7279792746113989, + "grad_norm": 26.83922350447555, + "learning_rate": 2.0341649689049458e-06, + "loss": 1.0449, + "step": 667 + }, + { + "epoch": 1.7305699481865284, + "grad_norm": 37.284339589086024, + "learning_rate": 1.9966143082617797e-06, + "loss": 1.0332, + "step": 668 + }, + { + "epoch": 1.733160621761658, + "grad_norm": 46.453238969399074, + "learning_rate": 1.959395260278587e-06, + 
"loss": 1.1303, + "step": 669 + }, + { + "epoch": 1.7357512953367875, + "grad_norm": 22.743791018223284, + "learning_rate": 1.922508510510166e-06, + "loss": 1.0993, + "step": 670 + }, + { + "epoch": 1.738341968911917, + "grad_norm": 27.788137087891727, + "learning_rate": 1.885954738390572e-06, + "loss": 1.1234, + "step": 671 + }, + { + "epoch": 1.7409326424870466, + "grad_norm": 34.03637743502625, + "learning_rate": 1.8497346172205733e-06, + "loss": 1.085, + "step": 672 + }, + { + "epoch": 1.7435233160621761, + "grad_norm": 30.308363072599853, + "learning_rate": 1.8138488141552856e-06, + "loss": 1.0348, + "step": 673 + }, + { + "epoch": 1.7461139896373057, + "grad_norm": 26.81612464278571, + "learning_rate": 1.7782979901918507e-06, + "loss": 1.0672, + "step": 674 + }, + { + "epoch": 1.7487046632124352, + "grad_norm": 46.96340147563577, + "learning_rate": 1.7430828001572897e-06, + "loss": 1.0807, + "step": 675 + }, + { + "epoch": 1.7512953367875648, + "grad_norm": 30.87064631308438, + "learning_rate": 1.7082038926964162e-06, + "loss": 1.1411, + "step": 676 + }, + { + "epoch": 1.7538860103626943, + "grad_norm": 79.59411718865987, + "learning_rate": 1.6736619102599073e-06, + "loss": 1.0234, + "step": 677 + }, + { + "epoch": 1.7564766839378239, + "grad_norm": 30.875792565440594, + "learning_rate": 1.6394574890924574e-06, + "loss": 1.1506, + "step": 678 + }, + { + "epoch": 1.7590673575129534, + "grad_norm": 34.227935587917464, + "learning_rate": 1.605591259221071e-06, + "loss": 1.0981, + "step": 679 + }, + { + "epoch": 1.7590673575129534, + "eval_loss": 1.0809757709503174, + "eval_runtime": 37.9729, + "eval_samples_per_second": 19.593, + "eval_steps_per_second": 1.238, + "step": 679 + }, + { + "epoch": 1.761658031088083, + "grad_norm": 31.849171622198522, + "learning_rate": 1.572063844443441e-06, + "loss": 1.1227, + "step": 680 + }, + { + "epoch": 1.7642487046632125, + "grad_norm": 32.75765881856165, + "learning_rate": 1.5388758623164802e-06, + "loss": 1.0842, + "step": 
681 + }, + { + "epoch": 1.766839378238342, + "grad_norm": 27.83779558188967, + "learning_rate": 1.5060279241449304e-06, + "loss": 1.0419, + "step": 682 + }, + { + "epoch": 1.7694300518134716, + "grad_norm": 30.646833576522408, + "learning_rate": 1.4735206349701003e-06, + "loss": 1.0983, + "step": 683 + }, + { + "epoch": 1.7720207253886011, + "grad_norm": 29.748071428344947, + "learning_rate": 1.4413545935587415e-06, + "loss": 1.1276, + "step": 684 + }, + { + "epoch": 1.7746113989637307, + "grad_norm": 32.57104117085742, + "learning_rate": 1.4095303923919956e-06, + "loss": 1.0728, + "step": 685 + }, + { + "epoch": 1.7772020725388602, + "grad_norm": 32.02209671450587, + "learning_rate": 1.3780486176544905e-06, + "loss": 1.1148, + "step": 686 + }, + { + "epoch": 1.7797927461139897, + "grad_norm": 31.902388050458736, + "learning_rate": 1.3469098492235521e-06, + "loss": 1.0873, + "step": 687 + }, + { + "epoch": 1.7823834196891193, + "grad_norm": 33.159581668201604, + "learning_rate": 1.316114660658505e-06, + "loss": 1.0308, + "step": 688 + }, + { + "epoch": 1.7849740932642488, + "grad_norm": 25.531240947030152, + "learning_rate": 1.2856636191901296e-06, + "loss": 1.0893, + "step": 689 + }, + { + "epoch": 1.7875647668393784, + "grad_norm": 25.382870674663973, + "learning_rate": 1.255557285710185e-06, + "loss": 1.1089, + "step": 690 + }, + { + "epoch": 1.790155440414508, + "grad_norm": 26.184606368046406, + "learning_rate": 1.225796214761117e-06, + "loss": 1.1515, + "step": 691 + }, + { + "epoch": 1.7927461139896375, + "grad_norm": 27.78595815725415, + "learning_rate": 1.196380954525802e-06, + "loss": 1.0871, + "step": 692 + }, + { + "epoch": 1.795336787564767, + "grad_norm": 32.137607036645285, + "learning_rate": 1.1673120468174837e-06, + "loss": 1.1396, + "step": 693 + }, + { + "epoch": 1.7979274611398963, + "grad_norm": 31.931928767500203, + "learning_rate": 1.1385900270697658e-06, + "loss": 1.1175, + "step": 694 + }, + { + "epoch": 1.8005181347150259, + "grad_norm": 
36.61199052966244, + "learning_rate": 1.110215424326775e-06, + "loss": 1.1867, + "step": 695 + }, + { + "epoch": 1.8031088082901554, + "grad_norm": 49.9081839820131, + "learning_rate": 1.0821887612333959e-06, + "loss": 1.1266, + "step": 696 + }, + { + "epoch": 1.805699481865285, + "grad_norm": 25.346034138603734, + "learning_rate": 1.0545105540256628e-06, + "loss": 1.0614, + "step": 697 + }, + { + "epoch": 1.8082901554404145, + "grad_norm": 47.53838459679947, + "learning_rate": 1.0271813125212237e-06, + "loss": 1.1314, + "step": 698 + }, + { + "epoch": 1.810880829015544, + "grad_norm": 30.496460286583815, + "learning_rate": 1.0002015401099797e-06, + "loss": 1.1067, + "step": 699 + }, + { + "epoch": 1.8134715025906736, + "grad_norm": 29.929097539381686, + "learning_rate": 9.735717337447981e-07, + "loss": 1.0424, + "step": 700 + }, + { + "epoch": 1.8160621761658031, + "grad_norm": 30.887132457194266, + "learning_rate": 9.4729238393235e-07, + "loss": 1.1248, + "step": 701 + }, + { + "epoch": 1.8186528497409327, + "grad_norm": 24.26916275448189, + "learning_rate": 9.21363974724101e-07, + "loss": 1.0577, + "step": 702 + }, + { + "epoch": 1.8212435233160622, + "grad_norm": 40.34641617989283, + "learning_rate": 8.957869837073673e-07, + "loss": 1.1639, + "step": 703 + }, + { + "epoch": 1.8238341968911918, + "grad_norm": 34.3133374466777, + "learning_rate": 8.705618819965411e-07, + "loss": 1.0866, + "step": 704 + }, + { + "epoch": 1.8264248704663213, + "grad_norm": 25.164299615685284, + "learning_rate": 8.456891342243945e-07, + "loss": 1.1232, + "step": 705 + }, + { + "epoch": 1.8290155440414506, + "grad_norm": 129.91297199628124, + "learning_rate": 8.211691985335357e-07, + "loss": 1.1542, + "step": 706 + }, + { + "epoch": 1.8316062176165802, + "grad_norm": 23.928927141144797, + "learning_rate": 7.970025265679648e-07, + "loss": 1.0813, + "step": 707 + }, + { + "epoch": 1.8341968911917097, + "grad_norm": 22.631504479886225, + "learning_rate": 7.731895634647513e-07, + "loss": 
1.1164, + "step": 708 + }, + { + "epoch": 1.8367875647668392, + "grad_norm": 84.2359250723018, + "learning_rate": 7.497307478458382e-07, + "loss": 1.1081, + "step": 709 + }, + { + "epoch": 1.8393782383419688, + "grad_norm": 51.39142883893451, + "learning_rate": 7.266265118099669e-07, + "loss": 1.105, + "step": 710 + }, + { + "epoch": 1.8419689119170983, + "grad_norm": 41.18280727079993, + "learning_rate": 7.038772809247075e-07, + "loss": 1.1211, + "step": 711 + }, + { + "epoch": 1.8445595854922279, + "grad_norm": 34.330855277813534, + "learning_rate": 6.814834742186361e-07, + "loss": 1.0783, + "step": 712 + }, + { + "epoch": 1.8471502590673574, + "grad_norm": 46.858780552576334, + "learning_rate": 6.594455041735925e-07, + "loss": 1.0214, + "step": 713 + }, + { + "epoch": 1.849740932642487, + "grad_norm": 94.2712798319484, + "learning_rate": 6.377637767171152e-07, + "loss": 1.098, + "step": 714 + }, + { + "epoch": 1.8523316062176165, + "grad_norm": 33.00073975184253, + "learning_rate": 6.164386912149289e-07, + "loss": 1.0906, + "step": 715 + }, + { + "epoch": 1.854922279792746, + "grad_norm": 30.030119862133272, + "learning_rate": 5.954706404636179e-07, + "loss": 1.1073, + "step": 716 + }, + { + "epoch": 1.8575129533678756, + "grad_norm": 46.42282973245658, + "learning_rate": 5.748600106833735e-07, + "loss": 1.1553, + "step": 717 + }, + { + "epoch": 1.8601036269430051, + "grad_norm": 26.48910946182044, + "learning_rate": 5.546071815108845e-07, + "loss": 1.0704, + "step": 718 + }, + { + "epoch": 1.8626943005181347, + "grad_norm": 29.34093197155635, + "learning_rate": 5.347125259923491e-07, + "loss": 1.1, + "step": 719 + }, + { + "epoch": 1.8652849740932642, + "grad_norm": 24.689130499541356, + "learning_rate": 5.151764105766011e-07, + "loss": 1.067, + "step": 720 + }, + { + "epoch": 1.8678756476683938, + "grad_norm": 21.25619644617847, + "learning_rate": 4.959991951083498e-07, + "loss": 1.1125, + "step": 721 + }, + { + "epoch": 1.8704663212435233, + "grad_norm": 
23.946272802272112, + "learning_rate": 4.771812328215708e-07, + "loss": 1.0798, + "step": 722 + }, + { + "epoch": 1.8730569948186528, + "grad_norm": 33.286030816378954, + "learning_rate": 4.587228703329838e-07, + "loss": 1.0756, + "step": 723 + }, + { + "epoch": 1.8756476683937824, + "grad_norm": 109.02542545414109, + "learning_rate": 4.40624447635678e-07, + "loss": 1.073, + "step": 724 + }, + { + "epoch": 1.878238341968912, + "grad_norm": 133.80505789447585, + "learning_rate": 4.228862980928439e-07, + "loss": 1.1218, + "step": 725 + }, + { + "epoch": 1.8808290155440415, + "grad_norm": 28.671374209715793, + "learning_rate": 4.0550874843163337e-07, + "loss": 1.1546, + "step": 726 + }, + { + "epoch": 1.883419689119171, + "grad_norm": 20.092775273550536, + "learning_rate": 3.8849211873714266e-07, + "loss": 1.0608, + "step": 727 + }, + { + "epoch": 1.8860103626943006, + "grad_norm": 18.87195408427635, + "learning_rate": 3.7183672244652135e-07, + "loss": 1.0437, + "step": 728 + }, + { + "epoch": 1.88860103626943, + "grad_norm": 24.985644120932864, + "learning_rate": 3.5554286634318814e-07, + "loss": 1.0989, + "step": 729 + }, + { + "epoch": 1.8911917098445596, + "grad_norm": 24.09887960702925, + "learning_rate": 3.3961085055119083e-07, + "loss": 1.0347, + "step": 730 + }, + { + "epoch": 1.8937823834196892, + "grad_norm": 98.50926523613283, + "learning_rate": 3.2404096852967305e-07, + "loss": 1.1163, + "step": 731 + }, + { + "epoch": 1.8963730569948187, + "grad_norm": 42.45357973111845, + "learning_rate": 3.0883350706746973e-07, + "loss": 1.1497, + "step": 732 + }, + { + "epoch": 1.8989637305699483, + "grad_norm": 25.430184794482617, + "learning_rate": 2.9398874627782014e-07, + "loss": 1.0154, + "step": 733 + }, + { + "epoch": 1.9015544041450778, + "grad_norm": 32.56552224066898, + "learning_rate": 2.7950695959322093e-07, + "loss": 1.0976, + "step": 734 + }, + { + "epoch": 1.9041450777202074, + "grad_norm": 25.518391980867197, + "learning_rate": 2.653884137603702e-07, + 
"loss": 1.1122, + "step": 735 + }, + { + "epoch": 1.906735751295337, + "grad_norm": 20.537146853099735, + "learning_rate": 2.516333688352801e-07, + "loss": 1.0592, + "step": 736 + }, + { + "epoch": 1.9093264248704664, + "grad_norm": 25.28898033119641, + "learning_rate": 2.382420781784589e-07, + "loss": 1.0706, + "step": 737 + }, + { + "epoch": 1.911917098445596, + "grad_norm": 55.74230904177274, + "learning_rate": 2.2521478845025867e-07, + "loss": 1.1706, + "step": 738 + }, + { + "epoch": 1.9145077720207255, + "grad_norm": 42.768439146141375, + "learning_rate": 2.1255173960634146e-07, + "loss": 1.0917, + "step": 739 + }, + { + "epoch": 1.917098445595855, + "grad_norm": 31.627146067352545, + "learning_rate": 2.0025316489323597e-07, + "loss": 1.0842, + "step": 740 + }, + { + "epoch": 1.9196891191709846, + "grad_norm": 67.01614151937272, + "learning_rate": 1.8831929084406119e-07, + "loss": 1.1287, + "step": 741 + }, + { + "epoch": 1.922279792746114, + "grad_norm": 56.931018082229045, + "learning_rate": 1.7675033727434288e-07, + "loss": 1.148, + "step": 742 + }, + { + "epoch": 1.9248704663212435, + "grad_norm": 35.24107640275113, + "learning_rate": 1.655465172779702e-07, + "loss": 1.0814, + "step": 743 + }, + { + "epoch": 1.927461139896373, + "grad_norm": 28.45308969334642, + "learning_rate": 1.547080372232679e-07, + "loss": 1.1092, + "step": 744 + }, + { + "epoch": 1.9300518134715026, + "grad_norm": 67.36918357149847, + "learning_rate": 1.44235096749199e-07, + "loss": 1.1332, + "step": 745 + }, + { + "epoch": 1.932642487046632, + "grad_norm": 33.50866269131509, + "learning_rate": 1.3412788876167925e-07, + "loss": 1.0884, + "step": 746 + }, + { + "epoch": 1.9352331606217616, + "grad_norm": 34.359505767271465, + "learning_rate": 1.2438659943003306e-07, + "loss": 0.9982, + "step": 747 + }, + { + "epoch": 1.9378238341968912, + "grad_norm": 44.805290236152125, + "learning_rate": 1.1501140818355627e-07, + "loss": 1.065, + "step": 748 + }, + { + "epoch": 1.9404145077720207, 
+ "grad_norm": 35.70322964853727, + "learning_rate": 1.0600248770821886e-07, + "loss": 1.1435, + "step": 749 + }, + { + "epoch": 1.9430051813471503, + "grad_norm": 37.7037381444634, + "learning_rate": 9.736000394348299e-08, + "loss": 1.1085, + "step": 750 + }, + { + "epoch": 1.9455958549222798, + "grad_norm": 19.88028370873119, + "learning_rate": 8.908411607923884e-08, + "loss": 1.0903, + "step": 751 + }, + { + "epoch": 1.9481865284974094, + "grad_norm": 22.037441897095253, + "learning_rate": 8.117497655287798e-08, + "loss": 1.0621, + "step": 752 + }, + { + "epoch": 1.950777202072539, + "grad_norm": 36.597366625713235, + "learning_rate": 7.363273104648904e-08, + "loss": 1.134, + "step": 753 + }, + { + "epoch": 1.9533678756476682, + "grad_norm": 36.91544331752125, + "learning_rate": 6.645751848417093e-08, + "loss": 1.0894, + "step": 754 + }, + { + "epoch": 1.9559585492227978, + "grad_norm": 30.791496804716704, + "learning_rate": 5.964947102946594e-08, + "loss": 1.0774, + "step": 755 + }, + { + "epoch": 1.9585492227979273, + "grad_norm": 24.76204564200231, + "learning_rate": 5.320871408294403e-08, + "loss": 1.1167, + "step": 756 + }, + { + "epoch": 1.9611398963730569, + "grad_norm": 31.78111531944549, + "learning_rate": 4.713536627987347e-08, + "loss": 1.0709, + "step": 757 + }, + { + "epoch": 1.9637305699481864, + "grad_norm": 36.388018093644106, + "learning_rate": 4.1429539488047066e-08, + "loss": 1.0492, + "step": 758 + }, + { + "epoch": 1.966321243523316, + "grad_norm": 27.235358627643226, + "learning_rate": 3.6091338805719356e-08, + "loss": 1.1128, + "step": 759 + }, + { + "epoch": 1.9689119170984455, + "grad_norm": 26.526882273916378, + "learning_rate": 3.1120862559670396e-08, + "loss": 1.1129, + "step": 760 + }, + { + "epoch": 1.971502590673575, + "grad_norm": 28.962449597773997, + "learning_rate": 2.651820230338942e-08, + "loss": 1.1286, + "step": 761 + }, + { + "epoch": 1.9740932642487046, + "grad_norm": 104.33848533313731, + "learning_rate": 
2.2283442815402845e-08, + "loss": 1.117, + "step": 762 + }, + { + "epoch": 1.9766839378238341, + "grad_norm": 179.66099272542536, + "learning_rate": 1.8416662097693326e-08, + "loss": 1.0788, + "step": 763 + }, + { + "epoch": 1.9792746113989637, + "grad_norm": 28.438877123785307, + "learning_rate": 1.491793137427866e-08, + "loss": 1.1436, + "step": 764 + }, + { + "epoch": 1.9818652849740932, + "grad_norm": 44.454308819411644, + "learning_rate": 1.1787315089895057e-08, + "loss": 1.1108, + "step": 765 + }, + { + "epoch": 1.9844559585492227, + "grad_norm": 53.23249975862293, + "learning_rate": 9.024870908802552e-09, + "loss": 0.9971, + "step": 766 + }, + { + "epoch": 1.9870466321243523, + "grad_norm": 35.2043549019015, + "learning_rate": 6.630649713739168e-09, + "loss": 1.1205, + "step": 767 + }, + { + "epoch": 1.9896373056994818, + "grad_norm": 22.286284343829376, + "learning_rate": 4.6046956049639045e-09, + "loss": 1.0848, + "step": 768 + }, + { + "epoch": 1.9922279792746114, + "grad_norm": 24.94719200433733, + "learning_rate": 2.94704589946182e-09, + "loss": 1.1308, + "step": 769 + }, + { + "epoch": 1.994818652849741, + "grad_norm": 41.684623957583106, + "learning_rate": 1.657731130246809e-09, + "loss": 1.1555, + "step": 770 + }, + { + "epoch": 1.9974093264248705, + "grad_norm": 55.480495348949425, + "learning_rate": 7.367750458020518e-10, + "loss": 1.129, + "step": 771 + }, + { + "epoch": 2.0, + "grad_norm": 43.2148652279276, + "learning_rate": 1.8419460964258505e-10, + "loss": 1.0835, + "step": 772 + } + ], + "logging_steps": 1, + "max_steps": 772, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 193, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3363988309999616e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff 
--git a/checkpoint-772/training_args.bin b/checkpoint-772/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4d661b15e5bbd8390fd11a502bea76680041301 --- /dev/null +++ b/checkpoint-772/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe76c44cf1ade69372a2b861f80cfcfc5ba88f283683f660a4a0605f642aee3 +size 8568 diff --git a/checkpoint-772/zero_to_fp32.py b/checkpoint-772/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-772/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-822/README.md b/checkpoint-822/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-822/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-822/adapter_config.json b/checkpoint-822/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/checkpoint-822/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-822/adapter_model.safetensors b/checkpoint-822/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bfc934021ae2f94535e9442dcecf9427f7b12c1 --- /dev/null +++ b/checkpoint-822/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9dabe0dcb2a00ba6eca0b1e4fb714d3c1d5289929ed928c9ab44c923fdb4073 +size 5579575888 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16815fd557595e661dab5a16408d01d8bc738a5d --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8217124921ff5a249c3953fe8750c111f39ee584b057ac5596ebc7e42b122e +size 2458601314 
diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41058968a54c3bfd5e358e754073c22bf7811ff8 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf46bfcdeced91f17b777758b5806c22a3f781b6a7ae5b7600171774a7671fc5 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5132756409fd067405e321b7018532c627b38684 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57fd306863f4304d57f9023d7c32314dfc0b620cc0c6367bc8e1d9e7fb11a012 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..216308849092812f84d7046ad32eed4104b8bf54 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de80e3a4e60771fb87094ed8cf54a31277bd49cffc9f7b584ed6644528236371 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49a891d22c90cba7425954eb0407bdd1c4efb3ed --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0a3634a9b0e7b5b158ebda08c1170da9c1fe2faa98325ee847cf175b72e68905 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..398d63e074368586e5a07b2572328a4346163c60 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b70a9bfa4920e0ca156acfc71a9c65d73792cc1477fb0b834a9d0c64a01a33f +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ba7ec2e2194d6cc8ec13f75c9675a133cbc51ff --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d15c17bc85a35923d0fd407b4b7284d4bc536654f79f4f615608e0bf68bb3232 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cce7869ee261dfb20609964701f785355c18595 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c6a26561aa408977821f667dd9ffad58042de3c5a3bd8755c8b17d94d965be0 +size 2458601314 diff --git a/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f490f3e848c1bf755364ff89f7c1f705f0eb805 --- 
/dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6e664a02794bb3bcea64d6dbccffbf8cd011135fd9d479bae3e940093260c0 +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fe52bc115942793e064cea7c14d3a518d57742c --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b804acd6b0587702229c9fc03ee6f33a832b75d651a3b9f36460a5d3bbc3327f +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceb5431be8f713f4fa27f3b49da1c4f23e238b04 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9be64f1086cb24858a8cbe60b9c8ba1bc4cdb724cf76362b19153f20e4a0fad +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41ec666967258ddb9184466e72490b7098db6aa5 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5e59dddf8daf3b5da78b90796d7c60a64dd05d09af6116b6a4c508fefe3260 +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..befdbc0e0502acc54b204842fbc529601551e5ad --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268c1f89f4fc75e297f0d683c7312808815c037812840c115152cc9b4646595a +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..969e10f9bda06b782290e0a3530a15d9fc623a75 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb39d1703bc2a45f71137c7a48ad375b703b40ad9a3e1ea40bbc117d0592506b +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6be1bd2852206278b51a2582ca948997f5139731 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8166dd6c0a2182a927ac738f3317f53998929e19a9024662cc4057587ddb0e4b +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c3ebfbba87842a3e5a237af7da03b24c0efaa0c --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a9e5eca51799f601e4db178a891384f5f49b72eb7dd3e3123ff45eb56ca4e6 +size 752148 diff --git a/checkpoint-822/latest b/checkpoint-822/latest new file mode 100644 index 
0000000000000000000000000000000000000000..3159aab1f7bb3903604150491f83c05295b87c00 --- /dev/null +++ b/checkpoint-822/latest @@ -0,0 +1 @@ +global_step821 \ No newline at end of file diff --git a/checkpoint-822/rng_state_0.pth b/checkpoint-822/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..008a6bab3696310472a1afaaf67aadd849da50c3 --- /dev/null +++ b/checkpoint-822/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb5fed332be2363e0d622c76a17b7a5b6d05bf89825570682adb3cce5ac3b32 +size 15984 diff --git a/checkpoint-822/rng_state_1.pth b/checkpoint-822/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a4cdf0595ae08ff971ae10aac00157c9ab410833 --- /dev/null +++ b/checkpoint-822/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9036603899dce8aed76bec4fedbc4a938c7ff8c25747841b38a8a6985bcc5258 +size 15984 diff --git a/checkpoint-822/rng_state_2.pth b/checkpoint-822/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7ce057356bc6924a6de2ba333e030246eb0ec97 --- /dev/null +++ b/checkpoint-822/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5383ee48caba99966f39ef74c58fd9b753b4e81b93e096480e12713d196444 +size 15984 diff --git a/checkpoint-822/rng_state_3.pth b/checkpoint-822/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5cb528dc950ed9161e91a8f0144b8f29af4452e7 --- /dev/null +++ b/checkpoint-822/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7fcbf4184b00ba751039333e9c778fd6d6248e42b7de7962bbaa421f2a9f01 +size 15984 diff --git a/checkpoint-822/rng_state_4.pth b/checkpoint-822/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..56ac3edfc264abf4bb62dbeaa93082b3eb8754f7 --- /dev/null +++ b/checkpoint-822/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:74b113cac9bc45f3a2b939d24f8bbcd4dd6e88d64c9d08763b93514f25d07726 +size 15984 diff --git a/checkpoint-822/rng_state_5.pth b/checkpoint-822/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63d3af4f1835f3ae4aa32a10fd4d1678d42 --- /dev/null +++ b/checkpoint-822/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561dc72467b5f7ee784383e8f29005c89d31198021d0fbe8f7ccb3ccec775670 +size 15984 diff --git a/checkpoint-822/rng_state_6.pth b/checkpoint-822/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..77436a3257a4c1cb7e32859741e535765b91a0e1 --- /dev/null +++ b/checkpoint-822/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80dcc08478ae8fa87319934fd245ff2b4e3e9e1aa8cc251bae816273cf2590cf +size 15984 diff --git a/checkpoint-822/rng_state_7.pth b/checkpoint-822/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..dfea8a63a18beab5870a1386ac65e6eb1fe78182 --- /dev/null +++ b/checkpoint-822/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09de15462c487019f9e0e4a3deee385b63c3fbfd9825baa43d347d9967f6f507 +size 15984 diff --git a/checkpoint-822/scheduler.pt b/checkpoint-822/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b3727e665d98d9888e069fffbfda0cbe4b2913a --- /dev/null +++ b/checkpoint-822/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f38eb6ccddcf04fc28a6ceca8a53e7217e0aa0d7768e55e066d15d6b242cd3 +size 1064 diff --git a/checkpoint-822/special_tokens_map.json b/checkpoint-822/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-822/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-822/tokenizer.json b/checkpoint-822/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-822/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-822/tokenizer_config.json b/checkpoint-822/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-822/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-822/trainer_state.json b/checkpoint-822/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2124d7628466454deb0df8507b054c6a0bbecab1 --- /dev/null +++ b/checkpoint-822/trainer_state.json @@ -0,0 +1,5852 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9963547995139734, + "eval_steps": 103, + "global_step": 822, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002430133657351154, + "grad_norm": 715.4923219036787, + "learning_rate": 0.0, + "loss": 1.3541, + "step": 1 + }, + { + "epoch": 0.002430133657351154, + "eval_loss": 1.3335719108581543, + "eval_runtime": 53.4883, + "eval_samples_per_second": 13.91, + "eval_steps_per_second": 1.739, + "step": 1 + }, + { + "epoch": 0.004860267314702308, + "grad_norm": 614.6970578314867, + "learning_rate": 5e-06, + "loss": 1.3775, + "step": 2 + }, + { + "epoch": 0.007290400972053463, + "grad_norm": 471.59017991123795, + "learning_rate": 1e-05, + "loss": 1.339, + 
"step": 3 + }, + { + "epoch": 0.009720534629404616, + "grad_norm": 238.72216262259653, + "learning_rate": 1.5e-05, + "loss": 1.3829, + "step": 4 + }, + { + "epoch": 0.012150668286755772, + "grad_norm": 355.68955726709873, + "learning_rate": 2e-05, + "loss": 1.3597, + "step": 5 + }, + { + "epoch": 0.014580801944106925, + "grad_norm": 414.5627284272111, + "learning_rate": 2.5e-05, + "loss": 1.3862, + "step": 6 + }, + { + "epoch": 0.01701093560145808, + "grad_norm": 534.9877222052693, + "learning_rate": 3e-05, + "loss": 1.2784, + "step": 7 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 153.38895635666677, + "learning_rate": 3.5e-05, + "loss": 1.3521, + "step": 8 + }, + { + "epoch": 0.02187120291616039, + "grad_norm": 858.293734138087, + "learning_rate": 4e-05, + "loss": 1.2461, + "step": 9 + }, + { + "epoch": 0.024301336573511544, + "grad_norm": 255.81989388533376, + "learning_rate": 4.5e-05, + "loss": 1.2778, + "step": 10 + }, + { + "epoch": 0.026731470230862697, + "grad_norm": 368.91949003479226, + "learning_rate": 5e-05, + "loss": 1.3412, + "step": 11 + }, + { + "epoch": 0.02916160388821385, + "grad_norm": 176.49481799555898, + "learning_rate": 5.500000000000001e-05, + "loss": 1.3437, + "step": 12 + }, + { + "epoch": 0.031591737545565005, + "grad_norm": 208.57742104974147, + "learning_rate": 6e-05, + "loss": 1.2859, + "step": 13 + }, + { + "epoch": 0.03402187120291616, + "grad_norm": 93.26742036471734, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1843, + "step": 14 + }, + { + "epoch": 0.03645200486026731, + "grad_norm": 145.53380444622215, + "learning_rate": 7e-05, + "loss": 1.4281, + "step": 15 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 126.56724937430516, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3908, + "step": 16 + }, + { + "epoch": 0.041312272174969626, + "grad_norm": 106.19246390662754, + "learning_rate": 8e-05, + "loss": 1.344, + "step": 17 + }, + { + "epoch": 0.04374240583232078, + "grad_norm": 289.348178084847, 
+ "learning_rate": 8.5e-05, + "loss": 1.2708, + "step": 18 + }, + { + "epoch": 0.046172539489671933, + "grad_norm": 286.63676887065634, + "learning_rate": 9e-05, + "loss": 1.3564, + "step": 19 + }, + { + "epoch": 0.04860267314702309, + "grad_norm": 269.6096299101413, + "learning_rate": 9.5e-05, + "loss": 1.2184, + "step": 20 + }, + { + "epoch": 0.05103280680437424, + "grad_norm": 151.28678796160915, + "learning_rate": 0.0001, + "loss": 1.2974, + "step": 21 + }, + { + "epoch": 0.053462940461725394, + "grad_norm": 265.5625538646362, + "learning_rate": 0.000105, + "loss": 1.2703, + "step": 22 + }, + { + "epoch": 0.05589307411907655, + "grad_norm": 724.7157187586193, + "learning_rate": 0.00011000000000000002, + "loss": 1.2691, + "step": 23 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 425.3768239347252, + "learning_rate": 0.00011499999999999999, + "loss": 1.375, + "step": 24 + }, + { + "epoch": 0.060753341433778855, + "grad_norm": 314.5119318308783, + "learning_rate": 0.00012, + "loss": 1.2952, + "step": 25 + }, + { + "epoch": 0.06318347509113001, + "grad_norm": 557.519173033834, + "learning_rate": 0.000125, + "loss": 1.2923, + "step": 26 + }, + { + "epoch": 0.06561360874848117, + "grad_norm": 211.4069356529637, + "learning_rate": 0.00013000000000000002, + "loss": 1.2629, + "step": 27 + }, + { + "epoch": 0.06804374240583232, + "grad_norm": 299.7742653722713, + "learning_rate": 0.00013500000000000003, + "loss": 1.3099, + "step": 28 + }, + { + "epoch": 0.07047387606318348, + "grad_norm": 182.18551965886013, + "learning_rate": 0.00014, + "loss": 1.2215, + "step": 29 + }, + { + "epoch": 0.07290400972053462, + "grad_norm": 153.38300520125887, + "learning_rate": 0.000145, + "loss": 1.2799, + "step": 30 + }, + { + "epoch": 0.07533414337788578, + "grad_norm": 849.4472853252786, + "learning_rate": 0.00015000000000000001, + "loss": 1.2012, + "step": 31 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 179.94814586965418, + "learning_rate": 0.000155, + "loss": 
1.2103, + "step": 32 + }, + { + "epoch": 0.08019441069258809, + "grad_norm": 180.36681057956048, + "learning_rate": 0.00016, + "loss": 1.2414, + "step": 33 + }, + { + "epoch": 0.08262454434993925, + "grad_norm": 113.72852454032189, + "learning_rate": 0.000165, + "loss": 1.2508, + "step": 34 + }, + { + "epoch": 0.0850546780072904, + "grad_norm": 150.53415363213057, + "learning_rate": 0.00017, + "loss": 1.2528, + "step": 35 + }, + { + "epoch": 0.08748481166464156, + "grad_norm": 156.19567878683574, + "learning_rate": 0.000175, + "loss": 1.2016, + "step": 36 + }, + { + "epoch": 0.0899149453219927, + "grad_norm": 416.34884765145057, + "learning_rate": 0.00018, + "loss": 1.254, + "step": 37 + }, + { + "epoch": 0.09234507897934387, + "grad_norm": 269.7105025581372, + "learning_rate": 0.00018500000000000002, + "loss": 1.2215, + "step": 38 + }, + { + "epoch": 0.09477521263669501, + "grad_norm": 249.35069047655023, + "learning_rate": 0.00019, + "loss": 1.2078, + "step": 39 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 167.16896045613478, + "learning_rate": 0.000195, + "loss": 1.1866, + "step": 40 + }, + { + "epoch": 0.09963547995139732, + "grad_norm": 248.22240554128427, + "learning_rate": 0.0002, + "loss": 1.252, + "step": 41 + }, + { + "epoch": 0.10206561360874848, + "grad_norm": 180.89520841022969, + "learning_rate": 0.0001999991930332148, + "loss": 1.2251, + "step": 42 + }, + { + "epoch": 0.10449574726609964, + "grad_norm": 614.4291375430485, + "learning_rate": 0.00019999677214588312, + "loss": 1.2563, + "step": 43 + }, + { + "epoch": 0.10692588092345079, + "grad_norm": 211.7523427355369, + "learning_rate": 0.00019999273737707646, + "loss": 1.193, + "step": 44 + }, + { + "epoch": 0.10935601458080195, + "grad_norm": 181.56788458769344, + "learning_rate": 0.00019998708879191335, + "loss": 1.2598, + "step": 45 + }, + { + "epoch": 0.1117861482381531, + "grad_norm": 157.5783414916277, + "learning_rate": 0.00019997982648155814, + "loss": 1.2663, + "step": 46 + }, + 
{ + "epoch": 0.11421628189550426, + "grad_norm": 155.78006251192625, + "learning_rate": 0.00019997095056321971, + "loss": 1.1637, + "step": 47 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 202.0253360488958, + "learning_rate": 0.00019996046118014955, + "loss": 1.2508, + "step": 48 + }, + { + "epoch": 0.11907654921020656, + "grad_norm": 192.7576297264874, + "learning_rate": 0.00019994835850163924, + "loss": 1.2014, + "step": 49 + }, + { + "epoch": 0.12150668286755771, + "grad_norm": 132.5484871621418, + "learning_rate": 0.00019993464272301804, + "loss": 1.2279, + "step": 50 + }, + { + "epoch": 0.12393681652490887, + "grad_norm": 128.32285438248965, + "learning_rate": 0.00019991931406564944, + "loss": 1.2179, + "step": 51 + }, + { + "epoch": 0.12636695018226002, + "grad_norm": 552.3669463716512, + "learning_rate": 0.00019990237277692788, + "loss": 1.1498, + "step": 52 + }, + { + "epoch": 0.12879708383961117, + "grad_norm": 86.17911790260192, + "learning_rate": 0.00019988381913027442, + "loss": 1.2784, + "step": 53 + }, + { + "epoch": 0.13122721749696234, + "grad_norm": 70.83294605515782, + "learning_rate": 0.00019986365342513265, + "loss": 1.2224, + "step": 54 + }, + { + "epoch": 0.1336573511543135, + "grad_norm": 45.23624563299466, + "learning_rate": 0.00019984187598696363, + "loss": 1.1746, + "step": 55 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 57.67645735585192, + "learning_rate": 0.00019981848716724073, + "loss": 1.2154, + "step": 56 + }, + { + "epoch": 0.1385176184690158, + "grad_norm": 45.661268047129674, + "learning_rate": 0.00019979348734344398, + "loss": 1.1411, + "step": 57 + }, + { + "epoch": 0.14094775212636695, + "grad_norm": 53.10628399970359, + "learning_rate": 0.00019976687691905393, + "loss": 1.2029, + "step": 58 + }, + { + "epoch": 0.1433778857837181, + "grad_norm": 38.71353325803162, + "learning_rate": 0.00019973865632354516, + "loss": 1.1976, + "step": 59 + }, + { + "epoch": 0.14580801944106925, + "grad_norm": 
42.789208063581114, + "learning_rate": 0.0001997088260123793, + "loss": 1.1477, + "step": 60 + }, + { + "epoch": 0.14823815309842042, + "grad_norm": 37.613194740192164, + "learning_rate": 0.0001996773864669978, + "loss": 1.2529, + "step": 61 + }, + { + "epoch": 0.15066828675577157, + "grad_norm": 47.96813084127655, + "learning_rate": 0.00019964433819481405, + "loss": 1.2328, + "step": 62 + }, + { + "epoch": 0.15309842041312272, + "grad_norm": 55.30483872428545, + "learning_rate": 0.00019960968172920516, + "loss": 1.1996, + "step": 63 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 35.58995799070749, + "learning_rate": 0.00019957341762950344, + "loss": 1.1248, + "step": 64 + }, + { + "epoch": 0.15795868772782504, + "grad_norm": 58.86131222300149, + "learning_rate": 0.00019953554648098748, + "loss": 1.3017, + "step": 65 + }, + { + "epoch": 0.16038882138517618, + "grad_norm": 32.12091331878439, + "learning_rate": 0.00019949606889487233, + "loss": 1.1961, + "step": 66 + }, + { + "epoch": 0.16281895504252733, + "grad_norm": 167.27433996357928, + "learning_rate": 0.0001994549855083001, + "loss": 1.1768, + "step": 67 + }, + { + "epoch": 0.1652490886998785, + "grad_norm": 32.3328494297432, + "learning_rate": 0.0001994122969843293, + "loss": 1.1802, + "step": 68 + }, + { + "epoch": 0.16767922235722965, + "grad_norm": 39.92530074438497, + "learning_rate": 0.0001993680040119244, + "loss": 1.2098, + "step": 69 + }, + { + "epoch": 0.1701093560145808, + "grad_norm": 45.60830517129956, + "learning_rate": 0.0001993221073059445, + "loss": 1.2159, + "step": 70 + }, + { + "epoch": 0.17253948967193194, + "grad_norm": 35.462695032736335, + "learning_rate": 0.00019927460760713197, + "loss": 1.1818, + "step": 71 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 43.05751624597826, + "learning_rate": 0.0001992255056821004, + "loss": 1.2011, + "step": 72 + }, + { + "epoch": 0.17739975698663427, + "grad_norm": 47.13143404969894, + "learning_rate": 0.00019917480232332224, + 
"loss": 1.1669, + "step": 73 + }, + { + "epoch": 0.1798298906439854, + "grad_norm": 72.07146401418987, + "learning_rate": 0.000199122498349116, + "loss": 1.181, + "step": 74 + }, + { + "epoch": 0.1822600243013366, + "grad_norm": 36.289202348834955, + "learning_rate": 0.00019906859460363307, + "loss": 1.1787, + "step": 75 + }, + { + "epoch": 0.18469015795868773, + "grad_norm": 46.92636167228936, + "learning_rate": 0.00019901309195684416, + "loss": 1.2316, + "step": 76 + }, + { + "epoch": 0.18712029161603888, + "grad_norm": 31.71425340357504, + "learning_rate": 0.00019895599130452505, + "loss": 1.1607, + "step": 77 + }, + { + "epoch": 0.18955042527339003, + "grad_norm": 43.94199928621344, + "learning_rate": 0.00019889729356824235, + "loss": 1.1919, + "step": 78 + }, + { + "epoch": 0.1919805589307412, + "grad_norm": 45.33073791860179, + "learning_rate": 0.0001988369996953386, + "loss": 1.2237, + "step": 79 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 135.89980489661897, + "learning_rate": 0.00019877511065891673, + "loss": 1.1822, + "step": 80 + }, + { + "epoch": 0.1968408262454435, + "grad_norm": 439.6770852212966, + "learning_rate": 0.00019871162745782478, + "loss": 1.1441, + "step": 81 + }, + { + "epoch": 0.19927095990279464, + "grad_norm": 80.73319798776026, + "learning_rate": 0.0001986465511166394, + "loss": 1.1709, + "step": 82 + }, + { + "epoch": 0.20170109356014582, + "grad_norm": 87.76515297497458, + "learning_rate": 0.00019857988268564953, + "loss": 1.1549, + "step": 83 + }, + { + "epoch": 0.20413122721749696, + "grad_norm": 70.08754986406095, + "learning_rate": 0.00019851162324083932, + "loss": 1.1771, + "step": 84 + }, + { + "epoch": 0.2065613608748481, + "grad_norm": 187.8198997057664, + "learning_rate": 0.0001984417738838709, + "loss": 1.2068, + "step": 85 + }, + { + "epoch": 0.20899149453219928, + "grad_norm": 127.78818684755072, + "learning_rate": 0.00019837033574206646, + "loss": 1.1974, + "step": 86 + }, + { + "epoch": 0.21142162818955043, 
+ "grad_norm": 127.82979216871074, + "learning_rate": 0.0001982973099683902, + "loss": 1.185, + "step": 87 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 142.35425084857746, + "learning_rate": 0.00019822269774142954, + "loss": 1.2225, + "step": 88 + }, + { + "epoch": 0.21628189550425272, + "grad_norm": 246.64019353564817, + "learning_rate": 0.0001981465002653763, + "loss": 1.2574, + "step": 89 + }, + { + "epoch": 0.2187120291616039, + "grad_norm": 189.88471076285524, + "learning_rate": 0.0001980687187700071, + "loss": 1.1635, + "step": 90 + }, + { + "epoch": 0.22114216281895505, + "grad_norm": 116.65693373141701, + "learning_rate": 0.00019798935451066361, + "loss": 1.1457, + "step": 91 + }, + { + "epoch": 0.2235722964763062, + "grad_norm": 71.76422539970217, + "learning_rate": 0.00019790840876823232, + "loss": 1.2354, + "step": 92 + }, + { + "epoch": 0.22600243013365734, + "grad_norm": 139.42330509386431, + "learning_rate": 0.0001978258828491236, + "loss": 1.18, + "step": 93 + }, + { + "epoch": 0.2284325637910085, + "grad_norm": 131.88308820601443, + "learning_rate": 0.00019774177808525113, + "loss": 1.1868, + "step": 94 + }, + { + "epoch": 0.23086269744835966, + "grad_norm": 85.81071125615291, + "learning_rate": 0.00019765609583400977, + "loss": 1.1814, + "step": 95 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 84.43756298541064, + "learning_rate": 0.00019756883747825424, + "loss": 1.1658, + "step": 96 + }, + { + "epoch": 0.23572296476306198, + "grad_norm": 114.24245545143974, + "learning_rate": 0.0001974800044262764, + "loss": 1.2497, + "step": 97 + }, + { + "epoch": 0.23815309842041313, + "grad_norm": 76.577511222722, + "learning_rate": 0.00019738959811178272, + "loss": 1.1414, + "step": 98 + }, + { + "epoch": 0.24058323207776428, + "grad_norm": 171.8084830895381, + "learning_rate": 0.00019729761999387103, + "loss": 1.1619, + "step": 99 + }, + { + "epoch": 0.24301336573511542, + "grad_norm": 221.87752250936416, + "learning_rate": 
0.00019720407155700707, + "loss": 1.2718, + "step": 100 + }, + { + "epoch": 0.2454434993924666, + "grad_norm": 205.64943975370608, + "learning_rate": 0.00019710895431100046, + "loss": 1.1786, + "step": 101 + }, + { + "epoch": 0.24787363304981774, + "grad_norm": 160.16582903260615, + "learning_rate": 0.00019701226979098037, + "loss": 1.1426, + "step": 102 + }, + { + "epoch": 0.2503037667071689, + "grad_norm": 82.85031394537334, + "learning_rate": 0.00019691401955737072, + "loss": 1.1718, + "step": 103 + }, + { + "epoch": 0.2503037667071689, + "eval_loss": 1.1633374691009521, + "eval_runtime": 52.6182, + "eval_samples_per_second": 14.14, + "eval_steps_per_second": 1.767, + "step": 103 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 94.74469296109082, + "learning_rate": 0.000196814205195865, + "loss": 1.2255, + "step": 104 + }, + { + "epoch": 0.2551640340218712, + "grad_norm": 126.15797466756656, + "learning_rate": 0.00019671282831740076, + "loss": 1.1623, + "step": 105 + }, + { + "epoch": 0.25759416767922233, + "grad_norm": 79.41156434272008, + "learning_rate": 0.0001966098905581334, + "loss": 1.1606, + "step": 106 + }, + { + "epoch": 0.2600243013365735, + "grad_norm": 70.33104031058372, + "learning_rate": 0.00019650539357941003, + "loss": 1.196, + "step": 107 + }, + { + "epoch": 0.2624544349939247, + "grad_norm": 69.57260733822498, + "learning_rate": 0.0001963993390677424, + "loss": 1.1939, + "step": 108 + }, + { + "epoch": 0.2648845686512758, + "grad_norm": 81.78820691772725, + "learning_rate": 0.00019629172873477995, + "loss": 1.2553, + "step": 109 + }, + { + "epoch": 0.267314702308627, + "grad_norm": 117.06324110268656, + "learning_rate": 0.00019618256431728194, + "loss": 1.2535, + "step": 110 + }, + { + "epoch": 0.26974483596597815, + "grad_norm": 83.26993317104247, + "learning_rate": 0.00019607184757708951, + "loss": 1.157, + "step": 111 + }, + { + "epoch": 0.27217496962332927, + "grad_norm": 51.990829456422375, + "learning_rate": 
0.00019595958030109735, + "loss": 1.1274, + "step": 112 + }, + { + "epoch": 0.27460510328068044, + "grad_norm": 119.7487160875729, + "learning_rate": 0.00019584576430122473, + "loss": 1.1422, + "step": 113 + }, + { + "epoch": 0.2770352369380316, + "grad_norm": 88.15636932272304, + "learning_rate": 0.00019573040141438624, + "loss": 1.1599, + "step": 114 + }, + { + "epoch": 0.27946537059538273, + "grad_norm": 62.346402225534774, + "learning_rate": 0.00019561349350246226, + "loss": 1.1909, + "step": 115 + }, + { + "epoch": 0.2818955042527339, + "grad_norm": 76.40612150653034, + "learning_rate": 0.0001954950424522688, + "loss": 1.1646, + "step": 116 + }, + { + "epoch": 0.284325637910085, + "grad_norm": 94.8711613055073, + "learning_rate": 0.00019537505017552716, + "loss": 1.1547, + "step": 117 + }, + { + "epoch": 0.2867557715674362, + "grad_norm": 63.86961661796314, + "learning_rate": 0.00019525351860883293, + "loss": 1.1841, + "step": 118 + }, + { + "epoch": 0.2891859052247874, + "grad_norm": 133.2417924150684, + "learning_rate": 0.00019513044971362494, + "loss": 1.1365, + "step": 119 + }, + { + "epoch": 0.2916160388821385, + "grad_norm": 133.44891510996445, + "learning_rate": 0.00019500584547615333, + "loss": 1.1696, + "step": 120 + }, + { + "epoch": 0.29404617253948967, + "grad_norm": 58.51701768739601, + "learning_rate": 0.00019487970790744774, + "loss": 1.1874, + "step": 121 + }, + { + "epoch": 0.29647630619684084, + "grad_norm": 49.536158238056196, + "learning_rate": 0.00019475203904328474, + "loss": 1.1798, + "step": 122 + }, + { + "epoch": 0.29890643985419196, + "grad_norm": 94.27608706983857, + "learning_rate": 0.000194622840944155, + "loss": 1.2443, + "step": 123 + }, + { + "epoch": 0.30133657351154314, + "grad_norm": 103.868243202843, + "learning_rate": 0.00019449211569523, + "loss": 1.1759, + "step": 124 + }, + { + "epoch": 0.3037667071688943, + "grad_norm": 73.31536435980003, + "learning_rate": 0.00019435986540632843, + "loss": 1.1885, + "step": 125 + }, + 
{ + "epoch": 0.30619684082624543, + "grad_norm": 64.91149114745738, + "learning_rate": 0.00019422609221188207, + "loss": 1.1864, + "step": 126 + }, + { + "epoch": 0.3086269744835966, + "grad_norm": 95.34449184763653, + "learning_rate": 0.00019409079827090145, + "loss": 1.1339, + "step": 127 + }, + { + "epoch": 0.3110571081409477, + "grad_norm": 67.36156159754226, + "learning_rate": 0.00019395398576694086, + "loss": 1.1845, + "step": 128 + }, + { + "epoch": 0.3134872417982989, + "grad_norm": 36.94913176821407, + "learning_rate": 0.00019381565690806328, + "loss": 1.2154, + "step": 129 + }, + { + "epoch": 0.3159173754556501, + "grad_norm": 69.05265214547647, + "learning_rate": 0.00019367581392680457, + "loss": 1.1642, + "step": 130 + }, + { + "epoch": 0.3183475091130012, + "grad_norm": 38.974761165559855, + "learning_rate": 0.00019353445908013755, + "loss": 1.1508, + "step": 131 + }, + { + "epoch": 0.32077764277035237, + "grad_norm": 48.47215142199794, + "learning_rate": 0.00019339159464943557, + "loss": 1.2011, + "step": 132 + }, + { + "epoch": 0.32320777642770354, + "grad_norm": 41.88512063342574, + "learning_rate": 0.00019324722294043558, + "loss": 1.1643, + "step": 133 + }, + { + "epoch": 0.32563791008505466, + "grad_norm": 25.59403215229145, + "learning_rate": 0.00019310134628320114, + "loss": 1.1954, + "step": 134 + }, + { + "epoch": 0.32806804374240583, + "grad_norm": 58.02634988046396, + "learning_rate": 0.00019295396703208453, + "loss": 1.1544, + "step": 135 + }, + { + "epoch": 0.330498177399757, + "grad_norm": 31.26218977398251, + "learning_rate": 0.00019280508756568896, + "loss": 1.1613, + "step": 136 + }, + { + "epoch": 0.33292831105710813, + "grad_norm": 31.81234539284103, + "learning_rate": 0.00019265471028683014, + "loss": 1.1892, + "step": 137 + }, + { + "epoch": 0.3353584447144593, + "grad_norm": 54.44930114675527, + "learning_rate": 0.00019250283762249748, + "loss": 1.2801, + "step": 138 + }, + { + "epoch": 0.3377885783718105, + "grad_norm": 
30.320486287732734, + "learning_rate": 0.00019234947202381486, + "loss": 1.1934, + "step": 139 + }, + { + "epoch": 0.3402187120291616, + "grad_norm": 32.76175001943503, + "learning_rate": 0.00019219461596600113, + "loss": 1.1436, + "step": 140 + }, + { + "epoch": 0.34264884568651277, + "grad_norm": 36.802264122697316, + "learning_rate": 0.00019203827194833026, + "loss": 1.1418, + "step": 141 + }, + { + "epoch": 0.3450789793438639, + "grad_norm": 35.03898729580271, + "learning_rate": 0.0001918804424940908, + "loss": 1.2479, + "step": 142 + }, + { + "epoch": 0.34750911300121506, + "grad_norm": 89.58068030461165, + "learning_rate": 0.00019172113015054532, + "loss": 1.2504, + "step": 143 + }, + { + "epoch": 0.34993924665856624, + "grad_norm": 30.05799668441019, + "learning_rate": 0.00019156033748888917, + "loss": 1.1662, + "step": 144 + }, + { + "epoch": 0.35236938031591736, + "grad_norm": 33.80121199203598, + "learning_rate": 0.00019139806710420914, + "loss": 1.1862, + "step": 145 + }, + { + "epoch": 0.35479951397326853, + "grad_norm": 31.510896023067872, + "learning_rate": 0.00019123432161544142, + "loss": 1.147, + "step": 146 + }, + { + "epoch": 0.3572296476306197, + "grad_norm": 32.92613286618093, + "learning_rate": 0.00019106910366532942, + "loss": 1.1421, + "step": 147 + }, + { + "epoch": 0.3596597812879708, + "grad_norm": 245.36013493823395, + "learning_rate": 0.00019090241592038113, + "loss": 1.1306, + "step": 148 + }, + { + "epoch": 0.362089914945322, + "grad_norm": 72.3061625644275, + "learning_rate": 0.000190734261070826, + "loss": 1.1144, + "step": 149 + }, + { + "epoch": 0.3645200486026732, + "grad_norm": 63.77748866336388, + "learning_rate": 0.00019056464183057157, + "loss": 1.1249, + "step": 150 + }, + { + "epoch": 0.3669501822600243, + "grad_norm": 633.2421324308109, + "learning_rate": 0.00019039356093715975, + "loss": 1.1359, + "step": 151 + }, + { + "epoch": 0.36938031591737547, + "grad_norm": 34.456657555313704, + "learning_rate": 
0.00019022102115172248, + "loss": 1.1397, + "step": 152 + }, + { + "epoch": 0.3718104495747266, + "grad_norm": 35.21328820959324, + "learning_rate": 0.00019004702525893732, + "loss": 1.1741, + "step": 153 + }, + { + "epoch": 0.37424058323207776, + "grad_norm": 90.32405227187036, + "learning_rate": 0.00018987157606698235, + "loss": 1.1844, + "step": 154 + }, + { + "epoch": 0.37667071688942894, + "grad_norm": 39.348755664527914, + "learning_rate": 0.000189694676407491, + "loss": 1.1216, + "step": 155 + }, + { + "epoch": 0.37910085054678005, + "grad_norm": 58.85540744859834, + "learning_rate": 0.00018951632913550626, + "loss": 1.115, + "step": 156 + }, + { + "epoch": 0.38153098420413123, + "grad_norm": 39.849945227365325, + "learning_rate": 0.0001893365371294346, + "loss": 1.1705, + "step": 157 + }, + { + "epoch": 0.3839611178614824, + "grad_norm": 40.300954908722304, + "learning_rate": 0.0001891553032909996, + "loss": 1.1831, + "step": 158 + }, + { + "epoch": 0.3863912515188335, + "grad_norm": 53.72009888405355, + "learning_rate": 0.00018897263054519498, + "loss": 1.1613, + "step": 159 + }, + { + "epoch": 0.3888213851761847, + "grad_norm": 142.22686975859034, + "learning_rate": 0.0001887885218402375, + "loss": 1.1639, + "step": 160 + }, + { + "epoch": 0.39125151883353587, + "grad_norm": 50.141889086717356, + "learning_rate": 0.00018860298014751944, + "loss": 1.1659, + "step": 161 + }, + { + "epoch": 0.393681652490887, + "grad_norm": 63.25519968311113, + "learning_rate": 0.0001884160084615604, + "loss": 1.168, + "step": 162 + }, + { + "epoch": 0.39611178614823817, + "grad_norm": 50.59325246324073, + "learning_rate": 0.0001882276097999592, + "loss": 1.1202, + "step": 163 + }, + { + "epoch": 0.3985419198055893, + "grad_norm": 58.32587879810431, + "learning_rate": 0.0001880377872033451, + "loss": 1.1587, + "step": 164 + }, + { + "epoch": 0.40097205346294046, + "grad_norm": 211.50882688314653, + "learning_rate": 0.00018784654373532866, + "loss": 1.1551, + "step": 165 + }, 
+ { + "epoch": 0.40340218712029163, + "grad_norm": 47.82888424614203, + "learning_rate": 0.00018765388248245246, + "loss": 1.2274, + "step": 166 + }, + { + "epoch": 0.40583232077764275, + "grad_norm": 97.94922685274778, + "learning_rate": 0.00018745980655414114, + "loss": 1.0872, + "step": 167 + }, + { + "epoch": 0.4082624544349939, + "grad_norm": 44.74994721544976, + "learning_rate": 0.0001872643190826512, + "loss": 1.1244, + "step": 168 + }, + { + "epoch": 0.4106925880923451, + "grad_norm": 53.84692426866845, + "learning_rate": 0.00018706742322302064, + "loss": 1.1576, + "step": 169 + }, + { + "epoch": 0.4131227217496962, + "grad_norm": 54.43599132185614, + "learning_rate": 0.0001868691221530178, + "loss": 1.0957, + "step": 170 + }, + { + "epoch": 0.4155528554070474, + "grad_norm": 39.21766518089018, + "learning_rate": 0.00018666941907309026, + "loss": 1.1625, + "step": 171 + }, + { + "epoch": 0.41798298906439857, + "grad_norm": 49.40030697752548, + "learning_rate": 0.000186468317206313, + "loss": 1.1556, + "step": 172 + }, + { + "epoch": 0.4204131227217497, + "grad_norm": 101.50309647820374, + "learning_rate": 0.0001862658197983366, + "loss": 1.1687, + "step": 173 + }, + { + "epoch": 0.42284325637910086, + "grad_norm": 105.41233861946563, + "learning_rate": 0.0001860619301173347, + "loss": 1.1687, + "step": 174 + }, + { + "epoch": 0.425273390036452, + "grad_norm": 103.99749987770305, + "learning_rate": 0.0001858566514539513, + "loss": 1.144, + "step": 175 + }, + { + "epoch": 0.42770352369380316, + "grad_norm": 78.83490301242213, + "learning_rate": 0.0001856499871212477, + "loss": 1.2318, + "step": 176 + }, + { + "epoch": 0.43013365735115433, + "grad_norm": 62.325757489859335, + "learning_rate": 0.00018544194045464886, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.43256379100850545, + "grad_norm": 81.32804926878099, + "learning_rate": 0.00018523251481188986, + "loss": 1.2233, + "step": 178 + }, + { + "epoch": 0.4349939246658566, + "grad_norm": 
38.97928032166606, + "learning_rate": 0.00018502171357296144, + "loss": 1.2371, + "step": 179 + }, + { + "epoch": 0.4374240583232078, + "grad_norm": 82.62345361244209, + "learning_rate": 0.0001848095401400555, + "loss": 1.1562, + "step": 180 + }, + { + "epoch": 0.4398541919805589, + "grad_norm": 47.793381366401626, + "learning_rate": 0.0001845959979375104, + "loss": 1.1249, + "step": 181 + }, + { + "epoch": 0.4422843256379101, + "grad_norm": 53.6022948471739, + "learning_rate": 0.00018438109041175532, + "loss": 1.1415, + "step": 182 + }, + { + "epoch": 0.44471445929526127, + "grad_norm": 65.92717051568573, + "learning_rate": 0.00018416482103125506, + "loss": 1.1748, + "step": 183 + }, + { + "epoch": 0.4471445929526124, + "grad_norm": 59.410481167619494, + "learning_rate": 0.0001839471932864537, + "loss": 1.1399, + "step": 184 + }, + { + "epoch": 0.44957472660996356, + "grad_norm": 64.22740395872977, + "learning_rate": 0.0001837282106897185, + "loss": 1.2193, + "step": 185 + }, + { + "epoch": 0.4520048602673147, + "grad_norm": 54.63497168787729, + "learning_rate": 0.00018350787677528306, + "loss": 1.153, + "step": 186 + }, + { + "epoch": 0.45443499392466585, + "grad_norm": 49.60676029637355, + "learning_rate": 0.00018328619509919044, + "loss": 1.1509, + "step": 187 + }, + { + "epoch": 0.456865127582017, + "grad_norm": 32.29074835877607, + "learning_rate": 0.00018306316923923563, + "loss": 1.1851, + "step": 188 + }, + { + "epoch": 0.45929526123936815, + "grad_norm": 61.13632454163589, + "learning_rate": 0.0001828388027949078, + "loss": 1.1323, + "step": 189 + }, + { + "epoch": 0.4617253948967193, + "grad_norm": 67.48617660835801, + "learning_rate": 0.00018261309938733238, + "loss": 1.1956, + "step": 190 + }, + { + "epoch": 0.4641555285540705, + "grad_norm": 38.31182257784929, + "learning_rate": 0.00018238606265921238, + "loss": 1.1379, + "step": 191 + }, + { + "epoch": 0.4665856622114216, + "grad_norm": 47.30995766708629, + "learning_rate": 0.00018215769627476984, + 
"loss": 1.1462, + "step": 192 + }, + { + "epoch": 0.4690157958687728, + "grad_norm": 34.57093925891121, + "learning_rate": 0.00018192800391968642, + "loss": 1.1979, + "step": 193 + }, + { + "epoch": 0.47144592952612396, + "grad_norm": 34.45645740457662, + "learning_rate": 0.0001816969893010442, + "loss": 1.1763, + "step": 194 + }, + { + "epoch": 0.4738760631834751, + "grad_norm": 39.21862152859671, + "learning_rate": 0.00018146465614726567, + "loss": 1.1514, + "step": 195 + }, + { + "epoch": 0.47630619684082626, + "grad_norm": 34.765347344568106, + "learning_rate": 0.00018123100820805355, + "loss": 1.1426, + "step": 196 + }, + { + "epoch": 0.4787363304981774, + "grad_norm": 35.04245362239315, + "learning_rate": 0.00018099604925433043, + "loss": 1.143, + "step": 197 + }, + { + "epoch": 0.48116646415552855, + "grad_norm": 103.45636476066032, + "learning_rate": 0.00018075978307817764, + "loss": 1.1713, + "step": 198 + }, + { + "epoch": 0.4835965978128797, + "grad_norm": 43.0297373660821, + "learning_rate": 0.00018052221349277442, + "loss": 1.2226, + "step": 199 + }, + { + "epoch": 0.48602673147023084, + "grad_norm": 32.80474372048966, + "learning_rate": 0.000180283344332336, + "loss": 1.1556, + "step": 200 + }, + { + "epoch": 0.488456865127582, + "grad_norm": 59.42688731224296, + "learning_rate": 0.00018004317945205197, + "loss": 1.1411, + "step": 201 + }, + { + "epoch": 0.4908869987849332, + "grad_norm": 102.0917822407188, + "learning_rate": 0.000179801722728024, + "loss": 1.1309, + "step": 202 + }, + { + "epoch": 0.4933171324422843, + "grad_norm": 309.9346821950787, + "learning_rate": 0.0001795589780572031, + "loss": 1.1953, + "step": 203 + }, + { + "epoch": 0.4957472660996355, + "grad_norm": 344.5019267346993, + "learning_rate": 0.0001793149493573271, + "loss": 1.1524, + "step": 204 + }, + { + "epoch": 0.49817739975698666, + "grad_norm": 50.075205946207085, + "learning_rate": 0.00017906964056685706, + "loss": 1.1495, + "step": 205 + }, + { + "epoch": 
0.5006075334143378, + "grad_norm": 132.32227258331488, + "learning_rate": 0.00017882305564491396, + "loss": 1.1976, + "step": 206 + }, + { + "epoch": 0.5006075334143378, + "eval_loss": 1.146019458770752, + "eval_runtime": 52.7816, + "eval_samples_per_second": 14.096, + "eval_steps_per_second": 1.762, + "step": 206 + }, + { + "epoch": 0.503037667071689, + "grad_norm": 138.57200377669218, + "learning_rate": 0.00017857519857121458, + "loss": 1.2159, + "step": 207 + }, + { + "epoch": 0.5054678007290401, + "grad_norm": 268.41109734161546, + "learning_rate": 0.00017832607334600746, + "loss": 1.1748, + "step": 208 + }, + { + "epoch": 0.5078979343863913, + "grad_norm": 72.44153953442401, + "learning_rate": 0.00017807568399000822, + "loss": 1.1758, + "step": 209 + }, + { + "epoch": 0.5103280680437424, + "grad_norm": 97.75400124096738, + "learning_rate": 0.00017782403454433477, + "loss": 1.1004, + "step": 210 + }, + { + "epoch": 0.5127582017010935, + "grad_norm": 84.19522802756285, + "learning_rate": 0.000177571129070442, + "loss": 1.1397, + "step": 211 + }, + { + "epoch": 0.5151883353584447, + "grad_norm": 132.95081835535706, + "learning_rate": 0.00017731697165005618, + "loss": 1.146, + "step": 212 + }, + { + "epoch": 0.5176184690157959, + "grad_norm": 560.3351292126325, + "learning_rate": 0.0001770615663851093, + "loss": 1.1937, + "step": 213 + }, + { + "epoch": 0.520048602673147, + "grad_norm": 252.72862614645885, + "learning_rate": 0.0001768049173976727, + "loss": 1.1213, + "step": 214 + }, + { + "epoch": 0.5224787363304981, + "grad_norm": 356.2985211032981, + "learning_rate": 0.0001765470288298905, + "loss": 1.22, + "step": 215 + }, + { + "epoch": 0.5249088699878494, + "grad_norm": 952.600672502031, + "learning_rate": 0.00017628790484391284, + "loss": 1.1321, + "step": 216 + }, + { + "epoch": 0.5273390036452005, + "grad_norm": 289.9357041930161, + "learning_rate": 0.0001760275496218288, + "loss": 1.1688, + "step": 217 + }, + { + "epoch": 0.5297691373025516, + 
"grad_norm": 48.69445264741508, + "learning_rate": 0.0001757659673655986, + "loss": 1.1551, + "step": 218 + }, + { + "epoch": 0.5321992709599028, + "grad_norm": 40.15160247154335, + "learning_rate": 0.0001755031622969862, + "loss": 1.1459, + "step": 219 + }, + { + "epoch": 0.534629404617254, + "grad_norm": 44.59390817019205, + "learning_rate": 0.00017523913865749078, + "loss": 1.2012, + "step": 220 + }, + { + "epoch": 0.5370595382746051, + "grad_norm": 30.189717624412484, + "learning_rate": 0.00017497390070827848, + "loss": 1.15, + "step": 221 + }, + { + "epoch": 0.5394896719319563, + "grad_norm": 27.185608574176108, + "learning_rate": 0.00017470745273011362, + "loss": 1.0763, + "step": 222 + }, + { + "epoch": 0.5419198055893074, + "grad_norm": 99.44121390806423, + "learning_rate": 0.00017443979902328956, + "loss": 1.1478, + "step": 223 + }, + { + "epoch": 0.5443499392466585, + "grad_norm": 29.684499344634585, + "learning_rate": 0.00017417094390755934, + "loss": 1.1123, + "step": 224 + }, + { + "epoch": 0.5467800729040098, + "grad_norm": 26.788847114635054, + "learning_rate": 0.00017390089172206592, + "loss": 1.1169, + "step": 225 + }, + { + "epoch": 0.5492102065613609, + "grad_norm": 31.84817878214798, + "learning_rate": 0.00017362964682527218, + "loss": 1.1524, + "step": 226 + }, + { + "epoch": 0.551640340218712, + "grad_norm": 34.834632993822424, + "learning_rate": 0.00017335721359489057, + "loss": 1.1761, + "step": 227 + }, + { + "epoch": 0.5540704738760632, + "grad_norm": 66.6084234453716, + "learning_rate": 0.00017308359642781242, + "loss": 1.1175, + "step": 228 + }, + { + "epoch": 0.5565006075334143, + "grad_norm": 35.15720180142773, + "learning_rate": 0.00017280879974003707, + "loss": 1.2012, + "step": 229 + }, + { + "epoch": 0.5589307411907655, + "grad_norm": 35.975450782756226, + "learning_rate": 0.00017253282796660056, + "loss": 1.1801, + "step": 230 + }, + { + "epoch": 0.5613608748481167, + "grad_norm": 83.49050230764925, + "learning_rate": 
0.0001722556855615039, + "loss": 1.1576, + "step": 231 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 150.44630441002784, + "learning_rate": 0.00017197737699764146, + "loss": 1.1826, + "step": 232 + }, + { + "epoch": 0.5662211421628189, + "grad_norm": 31.322382197739042, + "learning_rate": 0.00017169790676672858, + "loss": 1.1784, + "step": 233 + }, + { + "epoch": 0.56865127582017, + "grad_norm": 33.15983653687515, + "learning_rate": 0.0001714172793792291, + "loss": 1.1411, + "step": 234 + }, + { + "epoch": 0.5710814094775213, + "grad_norm": 22.206850165103052, + "learning_rate": 0.0001711354993642827, + "loss": 1.1772, + "step": 235 + }, + { + "epoch": 0.5735115431348724, + "grad_norm": 43.35721272668955, + "learning_rate": 0.00017085257126963152, + "loss": 1.0915, + "step": 236 + }, + { + "epoch": 0.5759416767922235, + "grad_norm": 29.57234737116712, + "learning_rate": 0.0001705684996615472, + "loss": 1.0977, + "step": 237 + }, + { + "epoch": 0.5783718104495748, + "grad_norm": 42.929644875053214, + "learning_rate": 0.00017028328912475668, + "loss": 1.1782, + "step": 238 + }, + { + "epoch": 0.5808019441069259, + "grad_norm": 32.15711272871687, + "learning_rate": 0.0001699969442623686, + "loss": 1.1855, + "step": 239 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 43.64453730184205, + "learning_rate": 0.00016970946969579887, + "loss": 1.1171, + "step": 240 + }, + { + "epoch": 0.5856622114216282, + "grad_norm": 26.145541544112593, + "learning_rate": 0.00016942087006469592, + "loss": 1.1656, + "step": 241 + }, + { + "epoch": 0.5880923450789793, + "grad_norm": 53.98173886095731, + "learning_rate": 0.00016913115002686616, + "loss": 1.1378, + "step": 242 + }, + { + "epoch": 0.5905224787363305, + "grad_norm": 50.851193586801195, + "learning_rate": 0.00016884031425819853, + "loss": 1.1338, + "step": 243 + }, + { + "epoch": 0.5929526123936817, + "grad_norm": 30.166674036386443, + "learning_rate": 0.0001685483674525891, + "loss": 1.1732, + "step": 244 + }, + { 
+ "epoch": 0.5953827460510328, + "grad_norm": 32.580505176392656, + "learning_rate": 0.00016825531432186543, + "loss": 1.143, + "step": 245 + }, + { + "epoch": 0.5978128797083839, + "grad_norm": 35.087231952662634, + "learning_rate": 0.0001679611595957103, + "loss": 1.212, + "step": 246 + }, + { + "epoch": 0.6002430133657352, + "grad_norm": 44.69578306542608, + "learning_rate": 0.00016766590802158566, + "loss": 1.1527, + "step": 247 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 39.8378839133733, + "learning_rate": 0.00016736956436465573, + "loss": 1.2174, + "step": 248 + }, + { + "epoch": 0.6051032806804374, + "grad_norm": 25.571860004032857, + "learning_rate": 0.0001670721334077103, + "loss": 1.1031, + "step": 249 + }, + { + "epoch": 0.6075334143377886, + "grad_norm": 27.626061413643438, + "learning_rate": 0.00016677361995108743, + "loss": 1.107, + "step": 250 + }, + { + "epoch": 0.6099635479951397, + "grad_norm": 47.405627339857176, + "learning_rate": 0.00016647402881259598, + "loss": 1.1521, + "step": 251 + }, + { + "epoch": 0.6123936816524909, + "grad_norm": 31.951762409660272, + "learning_rate": 0.00016617336482743794, + "loss": 1.174, + "step": 252 + }, + { + "epoch": 0.6148238153098421, + "grad_norm": 44.304437144236104, + "learning_rate": 0.00016587163284813032, + "loss": 1.1286, + "step": 253 + }, + { + "epoch": 0.6172539489671932, + "grad_norm": 21.990501251879344, + "learning_rate": 0.00016556883774442675, + "loss": 1.1927, + "step": 254 + }, + { + "epoch": 0.6196840826245443, + "grad_norm": 43.91119350789936, + "learning_rate": 0.00016526498440323914, + "loss": 1.1399, + "step": 255 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 28.064569132249982, + "learning_rate": 0.00016496007772855853, + "loss": 1.1913, + "step": 256 + }, + { + "epoch": 0.6245443499392467, + "grad_norm": 99.97142272243896, + "learning_rate": 0.0001646541226413761, + "loss": 1.1694, + "step": 257 + }, + { + "epoch": 0.6269744835965978, + "grad_norm": 
27.12524206817854, + "learning_rate": 0.00016434712407960373, + "loss": 1.2398, + "step": 258 + }, + { + "epoch": 0.6294046172539489, + "grad_norm": 42.99171796479219, + "learning_rate": 0.00016403908699799425, + "loss": 1.145, + "step": 259 + }, + { + "epoch": 0.6318347509113001, + "grad_norm": 24.064938768293658, + "learning_rate": 0.00016373001636806153, + "loss": 1.098, + "step": 260 + }, + { + "epoch": 0.6342648845686513, + "grad_norm": 31.72232981247621, + "learning_rate": 0.00016341991717800023, + "loss": 1.1779, + "step": 261 + }, + { + "epoch": 0.6366950182260024, + "grad_norm": 39.97326887390835, + "learning_rate": 0.00016310879443260528, + "loss": 1.3142, + "step": 262 + }, + { + "epoch": 0.6391251518833536, + "grad_norm": 27.519208072826963, + "learning_rate": 0.00016279665315319114, + "loss": 1.2039, + "step": 263 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 52.94895557810481, + "learning_rate": 0.00016248349837751062, + "loss": 1.1718, + "step": 264 + }, + { + "epoch": 0.6439854191980559, + "grad_norm": 23.603047222747566, + "learning_rate": 0.0001621693351596739, + "loss": 1.1155, + "step": 265 + }, + { + "epoch": 0.6464155528554071, + "grad_norm": 21.400341520569807, + "learning_rate": 0.00016185416857006647, + "loss": 1.1242, + "step": 266 + }, + { + "epoch": 0.6488456865127582, + "grad_norm": 51.167335508822276, + "learning_rate": 0.00016153800369526788, + "loss": 1.1746, + "step": 267 + }, + { + "epoch": 0.6512758201701093, + "grad_norm": 26.219581065473573, + "learning_rate": 0.00016122084563796905, + "loss": 1.0836, + "step": 268 + }, + { + "epoch": 0.6537059538274606, + "grad_norm": 56.820249886600706, + "learning_rate": 0.0001609026995168904, + "loss": 1.1625, + "step": 269 + }, + { + "epoch": 0.6561360874848117, + "grad_norm": 37.43384869992443, + "learning_rate": 0.00016058357046669898, + "loss": 1.2143, + "step": 270 + }, + { + "epoch": 0.6585662211421628, + "grad_norm": 31.885237168871473, + "learning_rate": 
0.00016026346363792567, + "loss": 1.1536, + "step": 271 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 34.66147983279251, + "learning_rate": 0.00015994238419688199, + "loss": 1.2095, + "step": 272 + }, + { + "epoch": 0.6634264884568651, + "grad_norm": 86.90365354594917, + "learning_rate": 0.00015962033732557686, + "loss": 1.1149, + "step": 273 + }, + { + "epoch": 0.6658566221142163, + "grad_norm": 52.21177462889067, + "learning_rate": 0.00015929732822163287, + "loss": 1.1861, + "step": 274 + }, + { + "epoch": 0.6682867557715675, + "grad_norm": 92.11184701145604, + "learning_rate": 0.00015897336209820239, + "loss": 1.1853, + "step": 275 + }, + { + "epoch": 0.6707168894289186, + "grad_norm": 30.662475573811115, + "learning_rate": 0.00015864844418388342, + "loss": 1.0912, + "step": 276 + }, + { + "epoch": 0.6731470230862697, + "grad_norm": 26.15855468837027, + "learning_rate": 0.00015832257972263523, + "loss": 1.1618, + "step": 277 + }, + { + "epoch": 0.675577156743621, + "grad_norm": 41.14250673970726, + "learning_rate": 0.00015799577397369375, + "loss": 1.1499, + "step": 278 + }, + { + "epoch": 0.6780072904009721, + "grad_norm": 31.93253644773631, + "learning_rate": 0.00015766803221148673, + "loss": 1.1229, + "step": 279 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 39.87120131585165, + "learning_rate": 0.00015733935972554844, + "loss": 1.1647, + "step": 280 + }, + { + "epoch": 0.6828675577156743, + "grad_norm": 52.741654062271124, + "learning_rate": 0.0001570097618204345, + "loss": 1.1362, + "step": 281 + }, + { + "epoch": 0.6852976913730255, + "grad_norm": 33.13137686002526, + "learning_rate": 0.0001566792438156362, + "loss": 1.1825, + "step": 282 + }, + { + "epoch": 0.6877278250303767, + "grad_norm": 20.284041564566042, + "learning_rate": 0.00015634781104549442, + "loss": 1.1439, + "step": 283 + }, + { + "epoch": 0.6901579586877278, + "grad_norm": 164.9222932471453, + "learning_rate": 0.00015601546885911404, + "loss": 1.122, + "step": 284 + }, + { 
+ "epoch": 0.692588092345079, + "grad_norm": 27.092346730158148, + "learning_rate": 0.00015568222262027717, + "loss": 1.157, + "step": 285 + }, + { + "epoch": 0.6950182260024301, + "grad_norm": 39.46898996008012, + "learning_rate": 0.00015534807770735664, + "loss": 1.1092, + "step": 286 + }, + { + "epoch": 0.6974483596597812, + "grad_norm": 30.00942949300714, + "learning_rate": 0.00015501303951322943, + "loss": 1.243, + "step": 287 + }, + { + "epoch": 0.6998784933171325, + "grad_norm": 31.435817418038887, + "learning_rate": 0.00015467711344518942, + "loss": 1.1034, + "step": 288 + }, + { + "epoch": 0.7023086269744836, + "grad_norm": 54.53572773177548, + "learning_rate": 0.00015434030492486023, + "loss": 1.2216, + "step": 289 + }, + { + "epoch": 0.7047387606318347, + "grad_norm": 24.51082708234768, + "learning_rate": 0.00015400261938810757, + "loss": 1.1532, + "step": 290 + }, + { + "epoch": 0.707168894289186, + "grad_norm": 104.85480514443172, + "learning_rate": 0.00015366406228495172, + "loss": 1.1156, + "step": 291 + }, + { + "epoch": 0.7095990279465371, + "grad_norm": 26.398830117870997, + "learning_rate": 0.0001533246390794794, + "loss": 1.0934, + "step": 292 + }, + { + "epoch": 0.7120291616038882, + "grad_norm": 25.062392373037707, + "learning_rate": 0.00015298435524975572, + "loss": 1.1453, + "step": 293 + }, + { + "epoch": 0.7144592952612394, + "grad_norm": 25.385505352027444, + "learning_rate": 0.0001526432162877356, + "loss": 1.1359, + "step": 294 + }, + { + "epoch": 0.7168894289185905, + "grad_norm": 18.00146943000571, + "learning_rate": 0.00015230122769917527, + "loss": 1.1129, + "step": 295 + }, + { + "epoch": 0.7193195625759417, + "grad_norm": 22.55383473288135, + "learning_rate": 0.00015195839500354335, + "loss": 1.142, + "step": 296 + }, + { + "epoch": 0.7217496962332929, + "grad_norm": 30.013723395820165, + "learning_rate": 0.00015161472373393186, + "loss": 1.1379, + "step": 297 + }, + { + "epoch": 0.724179829890644, + "grad_norm": 
40.566201545240425, + "learning_rate": 0.0001512702194369668, + "loss": 1.1326, + "step": 298 + }, + { + "epoch": 0.7266099635479951, + "grad_norm": 27.34716639907029, + "learning_rate": 0.00015092488767271857, + "loss": 1.0782, + "step": 299 + }, + { + "epoch": 0.7290400972053463, + "grad_norm": 45.0837594669075, + "learning_rate": 0.00015057873401461253, + "loss": 1.2054, + "step": 300 + }, + { + "epoch": 0.7314702308626975, + "grad_norm": 22.39794101270309, + "learning_rate": 0.00015023176404933874, + "loss": 1.1052, + "step": 301 + }, + { + "epoch": 0.7339003645200486, + "grad_norm": 21.818512025585306, + "learning_rate": 0.00014988398337676198, + "loss": 1.1664, + "step": 302 + }, + { + "epoch": 0.7363304981773997, + "grad_norm": 33.09386163968815, + "learning_rate": 0.00014953539760983122, + "loss": 1.1364, + "step": 303 + }, + { + "epoch": 0.7387606318347509, + "grad_norm": 26.3253592215911, + "learning_rate": 0.00014918601237448923, + "loss": 1.1093, + "step": 304 + }, + { + "epoch": 0.741190765492102, + "grad_norm": 32.54878723405212, + "learning_rate": 0.0001488358333095816, + "loss": 1.182, + "step": 305 + }, + { + "epoch": 0.7436208991494532, + "grad_norm": 28.645473311846015, + "learning_rate": 0.0001484848660667658, + "loss": 1.2064, + "step": 306 + }, + { + "epoch": 0.7460510328068044, + "grad_norm": 29.02693042820854, + "learning_rate": 0.00014813311631041995, + "loss": 1.1545, + "step": 307 + }, + { + "epoch": 0.7484811664641555, + "grad_norm": 20.28193033099828, + "learning_rate": 0.00014778058971755154, + "loss": 1.1885, + "step": 308 + }, + { + "epoch": 0.7509113001215066, + "grad_norm": 121.86121371804961, + "learning_rate": 0.00014742729197770552, + "loss": 1.095, + "step": 309 + }, + { + "epoch": 0.7509113001215066, + "eval_loss": 1.133868932723999, + "eval_runtime": 52.6711, + "eval_samples_per_second": 14.125, + "eval_steps_per_second": 1.766, + "step": 309 + }, + { + "epoch": 0.7533414337788579, + "grad_norm": 50.1793074315811, + 
"learning_rate": 0.00014707322879287276, + "loss": 1.1679, + "step": 310 + }, + { + "epoch": 0.755771567436209, + "grad_norm": 31.791309498678103, + "learning_rate": 0.00014671840587739783, + "loss": 1.1277, + "step": 311 + }, + { + "epoch": 0.7582017010935601, + "grad_norm": 56.88911226488106, + "learning_rate": 0.00014636282895788688, + "loss": 1.1492, + "step": 312 + }, + { + "epoch": 0.7606318347509113, + "grad_norm": 117.29437608667352, + "learning_rate": 0.00014600650377311522, + "loss": 1.1123, + "step": 313 + }, + { + "epoch": 0.7630619684082625, + "grad_norm": 107.56728772749254, + "learning_rate": 0.00014564943607393459, + "loss": 1.171, + "step": 314 + }, + { + "epoch": 0.7654921020656136, + "grad_norm": 34.085830256919685, + "learning_rate": 0.0001452916316231805, + "loss": 1.1854, + "step": 315 + }, + { + "epoch": 0.7679222357229648, + "grad_norm": 23.625747202851176, + "learning_rate": 0.000144933096195579, + "loss": 1.1622, + "step": 316 + }, + { + "epoch": 0.7703523693803159, + "grad_norm": 56.9917185309248, + "learning_rate": 0.00014457383557765386, + "loss": 1.2037, + "step": 317 + }, + { + "epoch": 0.772782503037667, + "grad_norm": 34.55554043725056, + "learning_rate": 0.00014421385556763266, + "loss": 1.1273, + "step": 318 + }, + { + "epoch": 0.7752126366950183, + "grad_norm": 34.205286759913115, + "learning_rate": 0.00014385316197535372, + "loss": 1.2039, + "step": 319 + }, + { + "epoch": 0.7776427703523694, + "grad_norm": 27.30015395778206, + "learning_rate": 0.00014349176062217195, + "loss": 1.1903, + "step": 320 + }, + { + "epoch": 0.7800729040097205, + "grad_norm": 23.077745147127867, + "learning_rate": 0.00014312965734086518, + "loss": 1.1539, + "step": 321 + }, + { + "epoch": 0.7825030376670717, + "grad_norm": 26.22112568156326, + "learning_rate": 0.00014276685797553977, + "loss": 1.1807, + "step": 322 + }, + { + "epoch": 0.7849331713244229, + "grad_norm": 34.813719314948514, + "learning_rate": 0.0001424033683815365, + "loss": 1.1247, + 
"step": 323 + }, + { + "epoch": 0.787363304981774, + "grad_norm": 27.109609629038324, + "learning_rate": 0.00014203919442533597, + "loss": 1.1735, + "step": 324 + }, + { + "epoch": 0.7897934386391251, + "grad_norm": 144.91672798575476, + "learning_rate": 0.00014167434198446383, + "loss": 1.1007, + "step": 325 + }, + { + "epoch": 0.7922235722964763, + "grad_norm": 42.19042828736382, + "learning_rate": 0.00014130881694739616, + "loss": 1.1398, + "step": 326 + }, + { + "epoch": 0.7946537059538274, + "grad_norm": 43.00144921766715, + "learning_rate": 0.00014094262521346427, + "loss": 1.1712, + "step": 327 + }, + { + "epoch": 0.7970838396111786, + "grad_norm": 26.343159670729925, + "learning_rate": 0.0001405757726927595, + "loss": 1.2103, + "step": 328 + }, + { + "epoch": 0.7995139732685298, + "grad_norm": 31.68271222195729, + "learning_rate": 0.00014020826530603776, + "loss": 1.1578, + "step": 329 + }, + { + "epoch": 0.8019441069258809, + "grad_norm": 39.08920292536896, + "learning_rate": 0.00013984010898462416, + "loss": 1.1377, + "step": 330 + }, + { + "epoch": 0.804374240583232, + "grad_norm": 34.56898084569197, + "learning_rate": 0.00013947130967031717, + "loss": 1.1886, + "step": 331 + }, + { + "epoch": 0.8068043742405833, + "grad_norm": 42.016356369933895, + "learning_rate": 0.00013910187331529276, + "loss": 1.1577, + "step": 332 + }, + { + "epoch": 0.8092345078979344, + "grad_norm": 21.25953597879822, + "learning_rate": 0.00013873180588200827, + "loss": 1.1259, + "step": 333 + }, + { + "epoch": 0.8116646415552855, + "grad_norm": 39.49634140985428, + "learning_rate": 0.0001383611133431062, + "loss": 1.173, + "step": 334 + }, + { + "epoch": 0.8140947752126367, + "grad_norm": 29.837690582268863, + "learning_rate": 0.00013798980168131794, + "loss": 1.1322, + "step": 335 + }, + { + "epoch": 0.8165249088699879, + "grad_norm": 23.510451396240928, + "learning_rate": 0.000137617876889367, + "loss": 1.1392, + "step": 336 + }, + { + "epoch": 0.818955042527339, + 
"grad_norm": 19.183017199526635, + "learning_rate": 0.00013724534496987247, + "loss": 1.157, + "step": 337 + }, + { + "epoch": 0.8213851761846902, + "grad_norm": 51.85037647612581, + "learning_rate": 0.0001368722119352521, + "loss": 1.1255, + "step": 338 + }, + { + "epoch": 0.8238153098420413, + "grad_norm": 31.635699477838273, + "learning_rate": 0.00013649848380762513, + "loss": 1.1429, + "step": 339 + }, + { + "epoch": 0.8262454434993924, + "grad_norm": 39.6479124739029, + "learning_rate": 0.00013612416661871533, + "loss": 1.1609, + "step": 340 + }, + { + "epoch": 0.8286755771567437, + "grad_norm": 21.453228401011238, + "learning_rate": 0.0001357492664097534, + "loss": 1.1247, + "step": 341 + }, + { + "epoch": 0.8311057108140948, + "grad_norm": 28.514958428145494, + "learning_rate": 0.00013537378923137973, + "loss": 1.0845, + "step": 342 + }, + { + "epoch": 0.8335358444714459, + "grad_norm": 26.98663985253516, + "learning_rate": 0.00013499774114354655, + "loss": 1.1092, + "step": 343 + }, + { + "epoch": 0.8359659781287971, + "grad_norm": 30.76143424141064, + "learning_rate": 0.00013462112821542016, + "loss": 1.1759, + "step": 344 + }, + { + "epoch": 0.8383961117861483, + "grad_norm": 39.023771167108656, + "learning_rate": 0.0001342439565252831, + "loss": 1.1024, + "step": 345 + }, + { + "epoch": 0.8408262454434994, + "grad_norm": 29.787639099820225, + "learning_rate": 0.0001338662321604358, + "loss": 1.2141, + "step": 346 + }, + { + "epoch": 0.8432563791008505, + "grad_norm": 25.60634301240642, + "learning_rate": 0.00013348796121709862, + "loss": 1.1244, + "step": 347 + }, + { + "epoch": 0.8456865127582017, + "grad_norm": 76.98542857181108, + "learning_rate": 0.00013310914980031334, + "loss": 1.19, + "step": 348 + }, + { + "epoch": 0.8481166464155528, + "grad_norm": 110.28982985071892, + "learning_rate": 0.0001327298040238446, + "loss": 1.1295, + "step": 349 + }, + { + "epoch": 0.850546780072904, + "grad_norm": 22.610631125609732, + "learning_rate": 
0.0001323499300100811, + "loss": 1.1445, + "step": 350 + }, + { + "epoch": 0.8529769137302552, + "grad_norm": 29.958515973723888, + "learning_rate": 0.00013196953388993726, + "loss": 1.2048, + "step": 351 + }, + { + "epoch": 0.8554070473876063, + "grad_norm": 30.691798031468103, + "learning_rate": 0.00013158862180275363, + "loss": 1.1628, + "step": 352 + }, + { + "epoch": 0.8578371810449574, + "grad_norm": 28.568576369680258, + "learning_rate": 0.00013120719989619833, + "loss": 1.0899, + "step": 353 + }, + { + "epoch": 0.8602673147023087, + "grad_norm": 42.12623456189728, + "learning_rate": 0.0001308252743261675, + "loss": 1.1451, + "step": 354 + }, + { + "epoch": 0.8626974483596598, + "grad_norm": 112.39248005736448, + "learning_rate": 0.00013044285125668614, + "loss": 1.154, + "step": 355 + }, + { + "epoch": 0.8651275820170109, + "grad_norm": 28.013602355549782, + "learning_rate": 0.0001300599368598086, + "loss": 1.1937, + "step": 356 + }, + { + "epoch": 0.8675577156743621, + "grad_norm": 27.763517972300694, + "learning_rate": 0.0001296765373155188, + "loss": 1.1243, + "step": 357 + }, + { + "epoch": 0.8699878493317132, + "grad_norm": 112.85815824767063, + "learning_rate": 0.0001292926588116308, + "loss": 1.1595, + "step": 358 + }, + { + "epoch": 0.8724179829890644, + "grad_norm": 27.085127886556087, + "learning_rate": 0.00012890830754368855, + "loss": 1.1196, + "step": 359 + }, + { + "epoch": 0.8748481166464156, + "grad_norm": 31.56336829128541, + "learning_rate": 0.00012852348971486617, + "loss": 1.1231, + "step": 360 + }, + { + "epoch": 0.8772782503037667, + "grad_norm": 31.904393738907178, + "learning_rate": 0.0001281382115358679, + "loss": 1.097, + "step": 361 + }, + { + "epoch": 0.8797083839611178, + "grad_norm": 25.034453894065827, + "learning_rate": 0.00012775247922482748, + "loss": 1.1246, + "step": 362 + }, + { + "epoch": 0.8821385176184691, + "grad_norm": 33.221958266501474, + "learning_rate": 0.0001273662990072083, + "loss": 1.1189, + "step": 363 + }, 
+ { + "epoch": 0.8845686512758202, + "grad_norm": 26.638980136773224, + "learning_rate": 0.00012697967711570242, + "loss": 1.1315, + "step": 364 + }, + { + "epoch": 0.8869987849331713, + "grad_norm": 27.231479341362885, + "learning_rate": 0.00012659261979013043, + "loss": 1.1464, + "step": 365 + }, + { + "epoch": 0.8894289185905225, + "grad_norm": 19.654091006710207, + "learning_rate": 0.0001262051332773404, + "loss": 1.1271, + "step": 366 + }, + { + "epoch": 0.8918590522478737, + "grad_norm": 50.3934263865559, + "learning_rate": 0.00012581722383110718, + "loss": 1.1002, + "step": 367 + }, + { + "epoch": 0.8942891859052248, + "grad_norm": 20.25952031318632, + "learning_rate": 0.00012542889771203166, + "loss": 1.0629, + "step": 368 + }, + { + "epoch": 0.8967193195625759, + "grad_norm": 19.16914945262315, + "learning_rate": 0.00012504016118743935, + "loss": 1.1597, + "step": 369 + }, + { + "epoch": 0.8991494532199271, + "grad_norm": 35.65941460173898, + "learning_rate": 0.00012465102053127957, + "loss": 1.1501, + "step": 370 + }, + { + "epoch": 0.9015795868772782, + "grad_norm": 26.093269180565315, + "learning_rate": 0.00012426148202402404, + "loss": 1.1455, + "step": 371 + }, + { + "epoch": 0.9040097205346294, + "grad_norm": 30.928987547424892, + "learning_rate": 0.00012387155195256537, + "loss": 1.1392, + "step": 372 + }, + { + "epoch": 0.9064398541919806, + "grad_norm": 20.17512596846915, + "learning_rate": 0.00012348123661011601, + "loss": 1.1196, + "step": 373 + }, + { + "epoch": 0.9088699878493317, + "grad_norm": 24.380789157356805, + "learning_rate": 0.00012309054229610623, + "loss": 1.1, + "step": 374 + }, + { + "epoch": 0.9113001215066828, + "grad_norm": 95.49408387682203, + "learning_rate": 0.00012269947531608276, + "loss": 1.1825, + "step": 375 + }, + { + "epoch": 0.913730255164034, + "grad_norm": 23.635286340368726, + "learning_rate": 0.0001223080419816069, + "loss": 1.1717, + "step": 376 + }, + { + "epoch": 0.9161603888213852, + "grad_norm": 
21.942478063568313, + "learning_rate": 0.00012191624861015254, + "loss": 1.1661, + "step": 377 + }, + { + "epoch": 0.9185905224787363, + "grad_norm": 74.12601397150299, + "learning_rate": 0.00012152410152500453, + "loss": 1.1967, + "step": 378 + }, + { + "epoch": 0.9210206561360875, + "grad_norm": 37.26720386499629, + "learning_rate": 0.00012113160705515625, + "loss": 1.1566, + "step": 379 + }, + { + "epoch": 0.9234507897934386, + "grad_norm": 34.080854733427635, + "learning_rate": 0.00012073877153520776, + "loss": 1.0847, + "step": 380 + }, + { + "epoch": 0.9258809234507898, + "grad_norm": 26.50842916877183, + "learning_rate": 0.0001203456013052634, + "loss": 1.0824, + "step": 381 + }, + { + "epoch": 0.928311057108141, + "grad_norm": 37.92039651416441, + "learning_rate": 0.00011995210271082944, + "loss": 1.1485, + "step": 382 + }, + { + "epoch": 0.9307411907654921, + "grad_norm": 38.56931832374284, + "learning_rate": 0.00011955828210271187, + "loss": 1.0737, + "step": 383 + }, + { + "epoch": 0.9331713244228432, + "grad_norm": 24.419015296791592, + "learning_rate": 0.0001191641458369136, + "loss": 1.1208, + "step": 384 + }, + { + "epoch": 0.9356014580801945, + "grad_norm": 28.75379656643836, + "learning_rate": 0.00011876970027453222, + "loss": 1.1071, + "step": 385 + }, + { + "epoch": 0.9380315917375456, + "grad_norm": 138.39305133994282, + "learning_rate": 0.00011837495178165706, + "loss": 1.1405, + "step": 386 + }, + { + "epoch": 0.9404617253948967, + "grad_norm": 22.200435229928654, + "learning_rate": 0.00011797990672926652, + "loss": 1.124, + "step": 387 + }, + { + "epoch": 0.9428918590522479, + "grad_norm": 40.21978055156661, + "learning_rate": 0.00011758457149312538, + "loss": 1.1875, + "step": 388 + }, + { + "epoch": 0.945321992709599, + "grad_norm": 23.592672098002485, + "learning_rate": 0.00011718895245368167, + "loss": 1.1748, + "step": 389 + }, + { + "epoch": 0.9477521263669502, + "grad_norm": 17.463183827323444, + "learning_rate": 
0.00011679305599596393, + "loss": 1.1794, + "step": 390 + }, + { + "epoch": 0.9501822600243013, + "grad_norm": 36.219441964332646, + "learning_rate": 0.00011639688850947799, + "loss": 1.1459, + "step": 391 + }, + { + "epoch": 0.9526123936816525, + "grad_norm": 23.727472560980413, + "learning_rate": 0.00011600045638810386, + "loss": 1.076, + "step": 392 + }, + { + "epoch": 0.9550425273390036, + "grad_norm": 57.63284414960702, + "learning_rate": 0.00011560376602999272, + "loss": 1.1919, + "step": 393 + }, + { + "epoch": 0.9574726609963548, + "grad_norm": 40.23829998466358, + "learning_rate": 0.00011520682383746333, + "loss": 1.0701, + "step": 394 + }, + { + "epoch": 0.959902794653706, + "grad_norm": 58.2018640218209, + "learning_rate": 0.00011480963621689905, + "loss": 1.1745, + "step": 395 + }, + { + "epoch": 0.9623329283110571, + "grad_norm": 27.693448904288406, + "learning_rate": 0.00011441220957864421, + "loss": 1.1323, + "step": 396 + }, + { + "epoch": 0.9647630619684082, + "grad_norm": 34.94430005820724, + "learning_rate": 0.00011401455033690076, + "loss": 1.1497, + "step": 397 + }, + { + "epoch": 0.9671931956257594, + "grad_norm": 17.521922247865188, + "learning_rate": 0.00011361666490962468, + "loss": 1.1319, + "step": 398 + }, + { + "epoch": 0.9696233292831106, + "grad_norm": 25.886687159935246, + "learning_rate": 0.00011321855971842243, + "loss": 1.1418, + "step": 399 + }, + { + "epoch": 0.9720534629404617, + "grad_norm": 31.388154506614836, + "learning_rate": 0.00011282024118844738, + "loss": 1.1282, + "step": 400 + }, + { + "epoch": 0.9744835965978129, + "grad_norm": 27.458601253675347, + "learning_rate": 0.00011242171574829599, + "loss": 1.1647, + "step": 401 + }, + { + "epoch": 0.976913730255164, + "grad_norm": 25.922873022924257, + "learning_rate": 0.00011202298982990411, + "loss": 1.091, + "step": 402 + }, + { + "epoch": 0.9793438639125152, + "grad_norm": 20.129467589894766, + "learning_rate": 0.00011162406986844323, + "loss": 1.2, + "step": 403 + }, 
+ { + "epoch": 0.9817739975698664, + "grad_norm": 25.11892123906363, + "learning_rate": 0.00011122496230221645, + "loss": 1.0731, + "step": 404 + }, + { + "epoch": 0.9842041312272175, + "grad_norm": 26.416884392453543, + "learning_rate": 0.00011082567357255484, + "loss": 1.1836, + "step": 405 + }, + { + "epoch": 0.9866342648845686, + "grad_norm": 18.768078773975784, + "learning_rate": 0.00011042621012371322, + "loss": 1.1275, + "step": 406 + }, + { + "epoch": 0.9890643985419199, + "grad_norm": 22.275756523796257, + "learning_rate": 0.00011002657840276627, + "loss": 1.1228, + "step": 407 + }, + { + "epoch": 0.991494532199271, + "grad_norm": 29.605335344828575, + "learning_rate": 0.00010962678485950455, + "loss": 1.0255, + "step": 408 + }, + { + "epoch": 0.9939246658566221, + "grad_norm": 41.1718200727633, + "learning_rate": 0.00010922683594633021, + "loss": 1.1876, + "step": 409 + }, + { + "epoch": 0.9963547995139733, + "grad_norm": 20.46397475257922, + "learning_rate": 0.00010882673811815304, + "loss": 1.1168, + "step": 410 + }, + { + "epoch": 0.9987849331713244, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010842649783228624, + "loss": 1.1948, + "step": 411 + }, + { + "epoch": 1.0, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010802612154834211, + "loss": 1.1076, + "step": 412 + }, + { + "epoch": 1.0, + "eval_loss": 1.121336579322815, + "eval_runtime": 52.7043, + "eval_samples_per_second": 14.116, + "eval_steps_per_second": 1.765, + "step": 412 + }, + { + "epoch": 1.0024301336573511, + "grad_norm": 35.25758968935371, + "learning_rate": 0.00010762561572812788, + "loss": 1.1335, + "step": 413 + }, + { + "epoch": 1.0048602673147022, + "grad_norm": 20.78715726366623, + "learning_rate": 0.0001072249868355415, + "loss": 1.1003, + "step": 414 + }, + { + "epoch": 1.0072904009720534, + "grad_norm": 31.01116633763719, + "learning_rate": 0.0001068242413364671, + "loss": 1.1225, + "step": 415 + }, + { + "epoch": 1.0097205346294047, + "grad_norm": 
19.050638172672897, + "learning_rate": 0.00010642338569867086, + "loss": 1.0595, + "step": 416 + }, + { + "epoch": 1.0121506682867558, + "grad_norm": 41.54235389574412, + "learning_rate": 0.00010602242639169648, + "loss": 1.1719, + "step": 417 + }, + { + "epoch": 1.014580801944107, + "grad_norm": 41.34218206464363, + "learning_rate": 0.00010562136988676078, + "loss": 1.1292, + "step": 418 + }, + { + "epoch": 1.017010935601458, + "grad_norm": 32.436985934581934, + "learning_rate": 0.0001052202226566494, + "loss": 1.1244, + "step": 419 + }, + { + "epoch": 1.0194410692588092, + "grad_norm": 19.631825450596665, + "learning_rate": 0.0001048189911756121, + "loss": 1.1323, + "step": 420 + }, + { + "epoch": 1.0218712029161603, + "grad_norm": 23.275029440216805, + "learning_rate": 0.00010441768191925847, + "loss": 1.1605, + "step": 421 + }, + { + "epoch": 1.0243013365735116, + "grad_norm": 21.44161988455765, + "learning_rate": 0.0001040163013644533, + "loss": 1.0886, + "step": 422 + }, + { + "epoch": 1.0267314702308628, + "grad_norm": 31.9765167465431, + "learning_rate": 0.00010361485598921212, + "loss": 1.1378, + "step": 423 + }, + { + "epoch": 1.0291616038882139, + "grad_norm": 22.340741556027833, + "learning_rate": 0.00010321335227259661, + "loss": 1.1278, + "step": 424 + }, + { + "epoch": 1.031591737545565, + "grad_norm": 29.27286563037163, + "learning_rate": 0.00010281179669461005, + "loss": 1.1186, + "step": 425 + }, + { + "epoch": 1.034021871202916, + "grad_norm": 65.85877610734141, + "learning_rate": 0.00010241019573609269, + "loss": 1.1673, + "step": 426 + }, + { + "epoch": 1.0364520048602672, + "grad_norm": 35.173784527846884, + "learning_rate": 0.00010200855587861724, + "loss": 1.0903, + "step": 427 + }, + { + "epoch": 1.0388821385176186, + "grad_norm": 29.91546238299385, + "learning_rate": 0.00010160688360438419, + "loss": 1.0884, + "step": 428 + }, + { + "epoch": 1.0413122721749697, + "grad_norm": 26.873308685100223, + "learning_rate": 0.0001012051853961172, + 
"loss": 1.1296, + "step": 429 + }, + { + "epoch": 1.0437424058323208, + "grad_norm": 25.90622275527891, + "learning_rate": 0.00010080346773695853, + "loss": 1.1349, + "step": 430 + }, + { + "epoch": 1.046172539489672, + "grad_norm": 21.388851321680434, + "learning_rate": 0.00010040173711036431, + "loss": 1.0947, + "step": 431 + }, + { + "epoch": 1.048602673147023, + "grad_norm": 31.206506843880053, + "learning_rate": 0.0001, + "loss": 1.1541, + "step": 432 + }, + { + "epoch": 1.0510328068043742, + "grad_norm": 19.486767323523555, + "learning_rate": 9.959826288963571e-05, + "loss": 1.1574, + "step": 433 + }, + { + "epoch": 1.0534629404617255, + "grad_norm": 102.81325604770561, + "learning_rate": 9.919653226304148e-05, + "loss": 1.1762, + "step": 434 + }, + { + "epoch": 1.0558930741190766, + "grad_norm": 17.18170280255333, + "learning_rate": 9.879481460388282e-05, + "loss": 1.1208, + "step": 435 + }, + { + "epoch": 1.0583232077764277, + "grad_norm": 29.88292309614927, + "learning_rate": 9.839311639561583e-05, + "loss": 1.1114, + "step": 436 + }, + { + "epoch": 1.0607533414337789, + "grad_norm": 23.50392429976475, + "learning_rate": 9.799144412138275e-05, + "loss": 1.2026, + "step": 437 + }, + { + "epoch": 1.06318347509113, + "grad_norm": 24.794408487434744, + "learning_rate": 9.758980426390732e-05, + "loss": 1.1587, + "step": 438 + }, + { + "epoch": 1.065613608748481, + "grad_norm": 38.726295800289655, + "learning_rate": 9.718820330538998e-05, + "loss": 1.14, + "step": 439 + }, + { + "epoch": 1.0680437424058322, + "grad_norm": 31.152256057732977, + "learning_rate": 9.678664772740343e-05, + "loss": 1.0882, + "step": 440 + }, + { + "epoch": 1.0704738760631836, + "grad_norm": 65.73380095432839, + "learning_rate": 9.638514401078788e-05, + "loss": 1.1213, + "step": 441 + }, + { + "epoch": 1.0729040097205347, + "grad_norm": 69.07317297910537, + "learning_rate": 9.598369863554673e-05, + "loss": 1.1285, + "step": 442 + }, + { + "epoch": 1.0753341433778858, + "grad_norm": 
62.55969576940585, + "learning_rate": 9.558231808074156e-05, + "loss": 1.1252, + "step": 443 + }, + { + "epoch": 1.077764277035237, + "grad_norm": 26.35106444530265, + "learning_rate": 9.51810088243879e-05, + "loss": 1.108, + "step": 444 + }, + { + "epoch": 1.080194410692588, + "grad_norm": 76.70006955440516, + "learning_rate": 9.477977734335061e-05, + "loss": 1.1144, + "step": 445 + }, + { + "epoch": 1.0826245443499392, + "grad_norm": 22.376983523395264, + "learning_rate": 9.437863011323922e-05, + "loss": 1.173, + "step": 446 + }, + { + "epoch": 1.0850546780072905, + "grad_norm": 33.51322062360491, + "learning_rate": 9.397757360830353e-05, + "loss": 1.089, + "step": 447 + }, + { + "epoch": 1.0874848116646416, + "grad_norm": 24.87252097324779, + "learning_rate": 9.357661430132915e-05, + "loss": 1.098, + "step": 448 + }, + { + "epoch": 1.0899149453219927, + "grad_norm": 48.95371674408058, + "learning_rate": 9.317575866353292e-05, + "loss": 1.0491, + "step": 449 + }, + { + "epoch": 1.0923450789793439, + "grad_norm": 25.50740340531524, + "learning_rate": 9.277501316445854e-05, + "loss": 1.0939, + "step": 450 + }, + { + "epoch": 1.094775212636695, + "grad_norm": 27.60998778610316, + "learning_rate": 9.23743842718721e-05, + "loss": 1.1564, + "step": 451 + }, + { + "epoch": 1.097205346294046, + "grad_norm": 63.99226186124907, + "learning_rate": 9.197387845165793e-05, + "loss": 1.1088, + "step": 452 + }, + { + "epoch": 1.0996354799513974, + "grad_norm": 36.441157466567596, + "learning_rate": 9.157350216771378e-05, + "loss": 1.0897, + "step": 453 + }, + { + "epoch": 1.1020656136087486, + "grad_norm": 32.32587774153429, + "learning_rate": 9.117326188184695e-05, + "loss": 1.1285, + "step": 454 + }, + { + "epoch": 1.1044957472660997, + "grad_norm": 33.39257750037465, + "learning_rate": 9.077316405366981e-05, + "loss": 1.1568, + "step": 455 + }, + { + "epoch": 1.1069258809234508, + "grad_norm": 45.03485873480868, + "learning_rate": 9.037321514049548e-05, + "loss": 1.0791, + 
"step": 456 + }, + { + "epoch": 1.109356014580802, + "grad_norm": 35.1451377482015, + "learning_rate": 8.997342159723371e-05, + "loss": 1.1243, + "step": 457 + }, + { + "epoch": 1.111786148238153, + "grad_norm": 67.01465976966, + "learning_rate": 8.957378987628682e-05, + "loss": 1.0978, + "step": 458 + }, + { + "epoch": 1.1142162818955041, + "grad_norm": 33.057859846207634, + "learning_rate": 8.917432642744518e-05, + "loss": 1.1431, + "step": 459 + }, + { + "epoch": 1.1166464155528555, + "grad_norm": 30.602840863536635, + "learning_rate": 8.877503769778356e-05, + "loss": 1.1157, + "step": 460 + }, + { + "epoch": 1.1190765492102066, + "grad_norm": 38.088467248288964, + "learning_rate": 8.83759301315568e-05, + "loss": 1.0776, + "step": 461 + }, + { + "epoch": 1.1215066828675577, + "grad_norm": 66.03671829863266, + "learning_rate": 8.797701017009591e-05, + "loss": 1.1468, + "step": 462 + }, + { + "epoch": 1.1239368165249088, + "grad_norm": 32.293691874682686, + "learning_rate": 8.757828425170404e-05, + "loss": 1.1115, + "step": 463 + }, + { + "epoch": 1.12636695018226, + "grad_norm": 32.70707175332633, + "learning_rate": 8.717975881155261e-05, + "loss": 1.1677, + "step": 464 + }, + { + "epoch": 1.128797083839611, + "grad_norm": 48.79069594971439, + "learning_rate": 8.678144028157759e-05, + "loss": 1.1341, + "step": 465 + }, + { + "epoch": 1.1312272174969624, + "grad_norm": 37.52808559072613, + "learning_rate": 8.638333509037536e-05, + "loss": 1.1414, + "step": 466 + }, + { + "epoch": 1.1336573511543135, + "grad_norm": 27.096068124970536, + "learning_rate": 8.598544966309925e-05, + "loss": 1.1719, + "step": 467 + }, + { + "epoch": 1.1360874848116647, + "grad_norm": 16.019227077248434, + "learning_rate": 8.55877904213558e-05, + "loss": 1.1148, + "step": 468 + }, + { + "epoch": 1.1385176184690158, + "grad_norm": 29.861941956913498, + "learning_rate": 8.519036378310096e-05, + "loss": 1.1486, + "step": 469 + }, + { + "epoch": 1.140947752126367, + "grad_norm": 
23.058998452019107, + "learning_rate": 8.47931761625367e-05, + "loss": 1.0745, + "step": 470 + }, + { + "epoch": 1.143377885783718, + "grad_norm": 24.486692418227875, + "learning_rate": 8.43962339700073e-05, + "loss": 1.1333, + "step": 471 + }, + { + "epoch": 1.1458080194410694, + "grad_norm": 31.632544516924323, + "learning_rate": 8.399954361189615e-05, + "loss": 1.1565, + "step": 472 + }, + { + "epoch": 1.1482381530984205, + "grad_norm": 21.67735267443374, + "learning_rate": 8.360311149052205e-05, + "loss": 1.109, + "step": 473 + }, + { + "epoch": 1.1506682867557716, + "grad_norm": 29.096918560226527, + "learning_rate": 8.320694400403606e-05, + "loss": 1.1517, + "step": 474 + }, + { + "epoch": 1.1530984204131227, + "grad_norm": 46.067313216206955, + "learning_rate": 8.281104754631835e-05, + "loss": 1.1043, + "step": 475 + }, + { + "epoch": 1.1555285540704738, + "grad_norm": 30.84953769166141, + "learning_rate": 8.241542850687465e-05, + "loss": 1.1081, + "step": 476 + }, + { + "epoch": 1.157958687727825, + "grad_norm": 39.34158523904847, + "learning_rate": 8.20200932707335e-05, + "loss": 1.1787, + "step": 477 + }, + { + "epoch": 1.160388821385176, + "grad_norm": 39.14663302484904, + "learning_rate": 8.162504821834295e-05, + "loss": 1.202, + "step": 478 + }, + { + "epoch": 1.1628189550425274, + "grad_norm": 49.7279004249915, + "learning_rate": 8.123029972546781e-05, + "loss": 1.1439, + "step": 479 + }, + { + "epoch": 1.1652490886998785, + "grad_norm": 35.49897960878779, + "learning_rate": 8.083585416308642e-05, + "loss": 1.0741, + "step": 480 + }, + { + "epoch": 1.1676792223572297, + "grad_norm": 31.306252618855535, + "learning_rate": 8.044171789728816e-05, + "loss": 1.0697, + "step": 481 + }, + { + "epoch": 1.1701093560145808, + "grad_norm": 22.40745672651249, + "learning_rate": 8.004789728917059e-05, + "loss": 1.1498, + "step": 482 + }, + { + "epoch": 1.172539489671932, + "grad_norm": 32.19326746671122, + "learning_rate": 7.965439869473664e-05, + "loss": 1.1392, 
+ "step": 483 + }, + { + "epoch": 1.1749696233292832, + "grad_norm": 33.66876390791385, + "learning_rate": 7.926122846479224e-05, + "loss": 1.1049, + "step": 484 + }, + { + "epoch": 1.1773997569866343, + "grad_norm": 35.43357233261174, + "learning_rate": 7.886839294484377e-05, + "loss": 1.0467, + "step": 485 + }, + { + "epoch": 1.1798298906439855, + "grad_norm": 50.660998166256256, + "learning_rate": 7.84758984749955e-05, + "loss": 1.1244, + "step": 486 + }, + { + "epoch": 1.1822600243013366, + "grad_norm": 41.356845334605936, + "learning_rate": 7.808375138984745e-05, + "loss": 1.1279, + "step": 487 + }, + { + "epoch": 1.1846901579586877, + "grad_norm": 22.947663723281487, + "learning_rate": 7.769195801839313e-05, + "loss": 1.0787, + "step": 488 + }, + { + "epoch": 1.1871202916160388, + "grad_norm": 36.434647074399905, + "learning_rate": 7.730052468391725e-05, + "loss": 1.1148, + "step": 489 + }, + { + "epoch": 1.18955042527339, + "grad_norm": 75.94549877059467, + "learning_rate": 7.690945770389377e-05, + "loss": 1.1127, + "step": 490 + }, + { + "epoch": 1.1919805589307413, + "grad_norm": 68.03126664734435, + "learning_rate": 7.6518763389884e-05, + "loss": 1.1672, + "step": 491 + }, + { + "epoch": 1.1944106925880924, + "grad_norm": 40.15361719091623, + "learning_rate": 7.612844804743466e-05, + "loss": 1.0962, + "step": 492 + }, + { + "epoch": 1.1968408262454435, + "grad_norm": 105.80023571763755, + "learning_rate": 7.573851797597602e-05, + "loss": 1.1091, + "step": 493 + }, + { + "epoch": 1.1992709599027946, + "grad_norm": 41.84401502420881, + "learning_rate": 7.534897946872042e-05, + "loss": 1.1359, + "step": 494 + }, + { + "epoch": 1.2017010935601458, + "grad_norm": 21.985533615468846, + "learning_rate": 7.495983881256067e-05, + "loss": 1.1024, + "step": 495 + }, + { + "epoch": 1.2041312272174969, + "grad_norm": 23.02649898605792, + "learning_rate": 7.457110228796838e-05, + "loss": 1.1089, + "step": 496 + }, + { + "epoch": 1.206561360874848, + "grad_norm": 
74.4950498938832, + "learning_rate": 7.418277616889282e-05, + "loss": 1.0439, + "step": 497 + }, + { + "epoch": 1.2089914945321993, + "grad_norm": 27.637660484960865, + "learning_rate": 7.379486672265964e-05, + "loss": 1.1453, + "step": 498 + }, + { + "epoch": 1.2114216281895505, + "grad_norm": 34.98561655821008, + "learning_rate": 7.340738020986961e-05, + "loss": 1.139, + "step": 499 + }, + { + "epoch": 1.2138517618469016, + "grad_norm": 28.47627677351389, + "learning_rate": 7.302032288429756e-05, + "loss": 1.0623, + "step": 500 + }, + { + "epoch": 1.2162818955042527, + "grad_norm": 39.551486186427596, + "learning_rate": 7.263370099279172e-05, + "loss": 1.1277, + "step": 501 + }, + { + "epoch": 1.2187120291616038, + "grad_norm": 44.12973085459368, + "learning_rate": 7.224752077517253e-05, + "loss": 1.1768, + "step": 502 + }, + { + "epoch": 1.2211421628189552, + "grad_norm": 84.84836585196132, + "learning_rate": 7.186178846413214e-05, + "loss": 1.1892, + "step": 503 + }, + { + "epoch": 1.2235722964763063, + "grad_norm": 34.94807915131505, + "learning_rate": 7.147651028513383e-05, + "loss": 1.1108, + "step": 504 + }, + { + "epoch": 1.2260024301336574, + "grad_norm": 46.19847384406232, + "learning_rate": 7.109169245631149e-05, + "loss": 1.0956, + "step": 505 + }, + { + "epoch": 1.2284325637910085, + "grad_norm": 38.58484473058957, + "learning_rate": 7.070734118836925e-05, + "loss": 1.1175, + "step": 506 + }, + { + "epoch": 1.2308626974483596, + "grad_norm": 37.84739298111386, + "learning_rate": 7.032346268448118e-05, + "loss": 1.1411, + "step": 507 + }, + { + "epoch": 1.2332928311057108, + "grad_norm": 53.5471335398439, + "learning_rate": 6.994006314019141e-05, + "loss": 1.1332, + "step": 508 + }, + { + "epoch": 1.2357229647630619, + "grad_norm": 91.55067777365485, + "learning_rate": 6.955714874331387e-05, + "loss": 1.1205, + "step": 509 + }, + { + "epoch": 1.2381530984204132, + "grad_norm": 27.05333642785952, + "learning_rate": 6.917472567383252e-05, + "loss": 
1.099, + "step": 510 + }, + { + "epoch": 1.2405832320777643, + "grad_norm": 24.519879042487336, + "learning_rate": 6.87928001038017e-05, + "loss": 1.1401, + "step": 511 + }, + { + "epoch": 1.2430133657351154, + "grad_norm": 33.763495598365786, + "learning_rate": 6.84113781972464e-05, + "loss": 1.2058, + "step": 512 + }, + { + "epoch": 1.2454434993924666, + "grad_norm": 34.49114206138826, + "learning_rate": 6.803046611006278e-05, + "loss": 1.1044, + "step": 513 + }, + { + "epoch": 1.2478736330498177, + "grad_norm": 74.20211157975073, + "learning_rate": 6.765006998991888e-05, + "loss": 1.111, + "step": 514 + }, + { + "epoch": 1.250303766707169, + "grad_norm": 32.30436806042553, + "learning_rate": 6.727019597615545e-05, + "loss": 1.1063, + "step": 515 + }, + { + "epoch": 1.250303766707169, + "eval_loss": 1.1128273010253906, + "eval_runtime": 53.4998, + "eval_samples_per_second": 13.907, + "eval_steps_per_second": 1.738, + "step": 515 + }, + { + "epoch": 1.25273390036452, + "grad_norm": 42.104054612880084, + "learning_rate": 6.689085019968669e-05, + "loss": 1.1315, + "step": 516 + }, + { + "epoch": 1.2551640340218713, + "grad_norm": 25.66097714624212, + "learning_rate": 6.651203878290139e-05, + "loss": 1.0916, + "step": 517 + }, + { + "epoch": 1.2575941676792224, + "grad_norm": 35.12310576456352, + "learning_rate": 6.613376783956423e-05, + "loss": 1.0699, + "step": 518 + }, + { + "epoch": 1.2600243013365735, + "grad_norm": 34.172951559594566, + "learning_rate": 6.575604347471695e-05, + "loss": 1.1412, + "step": 519 + }, + { + "epoch": 1.2624544349939246, + "grad_norm": 54.373563773275116, + "learning_rate": 6.537887178457984e-05, + "loss": 1.1255, + "step": 520 + }, + { + "epoch": 1.2648845686512757, + "grad_norm": 33.806385046788755, + "learning_rate": 6.500225885645346e-05, + "loss": 1.101, + "step": 521 + }, + { + "epoch": 1.267314702308627, + "grad_norm": 34.17813695957543, + "learning_rate": 6.46262107686203e-05, + "loss": 1.1226, + "step": 522 + }, + { + "epoch": 
1.2697448359659782, + "grad_norm": 24.68048087106548, + "learning_rate": 6.425073359024663e-05, + "loss": 1.1787, + "step": 523 + }, + { + "epoch": 1.2721749696233293, + "grad_norm": 32.78749757697808, + "learning_rate": 6.387583338128471e-05, + "loss": 1.0541, + "step": 524 + }, + { + "epoch": 1.2746051032806804, + "grad_norm": 30.906673844090044, + "learning_rate": 6.350151619237488e-05, + "loss": 1.0964, + "step": 525 + }, + { + "epoch": 1.2770352369380316, + "grad_norm": 32.571858392892736, + "learning_rate": 6.312778806474795e-05, + "loss": 1.1251, + "step": 526 + }, + { + "epoch": 1.2794653705953827, + "grad_norm": 43.02428916532565, + "learning_rate": 6.275465503012751e-05, + "loss": 1.0473, + "step": 527 + }, + { + "epoch": 1.2818955042527338, + "grad_norm": 60.93587506764561, + "learning_rate": 6.2382123110633e-05, + "loss": 1.078, + "step": 528 + }, + { + "epoch": 1.2843256379100851, + "grad_norm": 64.6934775930251, + "learning_rate": 6.201019831868208e-05, + "loss": 1.0904, + "step": 529 + }, + { + "epoch": 1.2867557715674363, + "grad_norm": 32.977077613035426, + "learning_rate": 6.16388866568938e-05, + "loss": 1.0705, + "step": 530 + }, + { + "epoch": 1.2891859052247874, + "grad_norm": 28.27407310492513, + "learning_rate": 6.126819411799175e-05, + "loss": 1.1252, + "step": 531 + }, + { + "epoch": 1.2916160388821385, + "grad_norm": 33.73515826089828, + "learning_rate": 6.0898126684707265e-05, + "loss": 1.1262, + "step": 532 + }, + { + "epoch": 1.2940461725394896, + "grad_norm": 25.370361818959903, + "learning_rate": 6.052869032968285e-05, + "loss": 1.0845, + "step": 533 + }, + { + "epoch": 1.296476306196841, + "grad_norm": 37.389287060597105, + "learning_rate": 6.015989101537586e-05, + "loss": 1.1352, + "step": 534 + }, + { + "epoch": 1.2989064398541919, + "grad_norm": 39.04755104008223, + "learning_rate": 5.979173469396227e-05, + "loss": 1.1538, + "step": 535 + }, + { + "epoch": 1.3013365735115432, + "grad_norm": 34.33676719612293, + "learning_rate": 
5.9424227307240554e-05, + "loss": 1.1725, + "step": 536 + }, + { + "epoch": 1.3037667071688943, + "grad_norm": 64.66076997769457, + "learning_rate": 5.905737478653572e-05, + "loss": 1.1146, + "step": 537 + }, + { + "epoch": 1.3061968408262454, + "grad_norm": 48.043289790386325, + "learning_rate": 5.8691183052603834e-05, + "loss": 1.1035, + "step": 538 + }, + { + "epoch": 1.3086269744835966, + "grad_norm": 49.08397341659928, + "learning_rate": 5.83256580155362e-05, + "loss": 1.1653, + "step": 539 + }, + { + "epoch": 1.3110571081409477, + "grad_norm": 46.688886812303515, + "learning_rate": 5.796080557466406e-05, + "loss": 1.1328, + "step": 540 + }, + { + "epoch": 1.313487241798299, + "grad_norm": 27.503882325413493, + "learning_rate": 5.7596631618463514e-05, + "loss": 1.1019, + "step": 541 + }, + { + "epoch": 1.3159173754556501, + "grad_norm": 48.88974129574653, + "learning_rate": 5.723314202446026e-05, + "loss": 1.121, + "step": 542 + }, + { + "epoch": 1.3183475091130012, + "grad_norm": 28.105881157995345, + "learning_rate": 5.687034265913485e-05, + "loss": 1.0898, + "step": 543 + }, + { + "epoch": 1.3207776427703524, + "grad_norm": 30.410731278414804, + "learning_rate": 5.6508239377828034e-05, + "loss": 1.07, + "step": 544 + }, + { + "epoch": 1.3232077764277035, + "grad_norm": 38.08324176765882, + "learning_rate": 5.614683802464631e-05, + "loss": 1.1503, + "step": 545 + }, + { + "epoch": 1.3256379100850546, + "grad_norm": 46.28952293745534, + "learning_rate": 5.578614443236738e-05, + "loss": 1.1282, + "step": 546 + }, + { + "epoch": 1.3280680437424057, + "grad_norm": 68.2597453597135, + "learning_rate": 5.542616442234618e-05, + "loss": 1.1373, + "step": 547 + }, + { + "epoch": 1.330498177399757, + "grad_norm": 30.351663825014143, + "learning_rate": 5.5066903804421025e-05, + "loss": 1.1633, + "step": 548 + }, + { + "epoch": 1.3329283110571082, + "grad_norm": 38.2711285636887, + "learning_rate": 5.470836837681954e-05, + "loss": 1.1604, + "step": 549 + }, + { + 
"epoch": 1.3353584447144593, + "grad_norm": 35.64230091531108, + "learning_rate": 5.4350563926065404e-05, + "loss": 1.0564, + "step": 550 + }, + { + "epoch": 1.3377885783718104, + "grad_norm": 44.869816046925564, + "learning_rate": 5.399349622688479e-05, + "loss": 1.1376, + "step": 551 + }, + { + "epoch": 1.3402187120291615, + "grad_norm": 26.681037126315633, + "learning_rate": 5.3637171042113146e-05, + "loss": 1.0867, + "step": 552 + }, + { + "epoch": 1.3426488456865129, + "grad_norm": 34.6124686262535, + "learning_rate": 5.32815941226022e-05, + "loss": 1.0474, + "step": 553 + }, + { + "epoch": 1.3450789793438638, + "grad_norm": 35.92639009060983, + "learning_rate": 5.2926771207127254e-05, + "loss": 1.0958, + "step": 554 + }, + { + "epoch": 1.3475091130012151, + "grad_norm": 39.08938922562224, + "learning_rate": 5.2572708022294504e-05, + "loss": 1.074, + "step": 555 + }, + { + "epoch": 1.3499392466585662, + "grad_norm": 76.06708166273745, + "learning_rate": 5.2219410282448514e-05, + "loss": 1.0865, + "step": 556 + }, + { + "epoch": 1.3523693803159174, + "grad_norm": 74.14222265654887, + "learning_rate": 5.1866883689580056e-05, + "loss": 1.1567, + "step": 557 + }, + { + "epoch": 1.3547995139732685, + "grad_norm": 34.82441678662901, + "learning_rate": 5.151513393323426e-05, + "loss": 1.0802, + "step": 558 + }, + { + "epoch": 1.3572296476306196, + "grad_norm": 75.53504846566143, + "learning_rate": 5.116416669041843e-05, + "loss": 1.0623, + "step": 559 + }, + { + "epoch": 1.359659781287971, + "grad_norm": 29.423475817434785, + "learning_rate": 5.0813987625510775e-05, + "loss": 1.077, + "step": 560 + }, + { + "epoch": 1.362089914945322, + "grad_norm": 44.607486168434534, + "learning_rate": 5.046460239016879e-05, + "loss": 1.096, + "step": 561 + }, + { + "epoch": 1.3645200486026732, + "grad_norm": 40.684125033315404, + "learning_rate": 5.011601662323807e-05, + "loss": 1.148, + "step": 562 + }, + { + "epoch": 1.3669501822600243, + "grad_norm": 47.33103026318705, + 
"learning_rate": 4.976823595066128e-05, + "loss": 1.1712, + "step": 563 + }, + { + "epoch": 1.3693803159173754, + "grad_norm": 51.17017845058186, + "learning_rate": 4.9421265985387476e-05, + "loss": 1.1287, + "step": 564 + }, + { + "epoch": 1.3718104495747265, + "grad_norm": 50.76665552103517, + "learning_rate": 4.907511232728145e-05, + "loss": 1.1156, + "step": 565 + }, + { + "epoch": 1.3742405832320777, + "grad_norm": 32.6007633025874, + "learning_rate": 4.872978056303327e-05, + "loss": 1.1477, + "step": 566 + }, + { + "epoch": 1.376670716889429, + "grad_norm": 29.696241441710107, + "learning_rate": 4.8385276266068146e-05, + "loss": 1.0874, + "step": 567 + }, + { + "epoch": 1.37910085054678, + "grad_norm": 58.96613500379004, + "learning_rate": 4.804160499645667e-05, + "loss": 1.0616, + "step": 568 + }, + { + "epoch": 1.3815309842041312, + "grad_norm": 37.104100020310334, + "learning_rate": 4.7698772300824756e-05, + "loss": 1.0878, + "step": 569 + }, + { + "epoch": 1.3839611178614823, + "grad_norm": 51.735902941979305, + "learning_rate": 4.735678371226441e-05, + "loss": 1.0836, + "step": 570 + }, + { + "epoch": 1.3863912515188335, + "grad_norm": 55.49190976804079, + "learning_rate": 4.7015644750244306e-05, + "loss": 1.0473, + "step": 571 + }, + { + "epoch": 1.3888213851761848, + "grad_norm": 34.27972449829039, + "learning_rate": 4.6675360920520625e-05, + "loss": 1.0723, + "step": 572 + }, + { + "epoch": 1.391251518833536, + "grad_norm": 28.508157856527724, + "learning_rate": 4.6335937715048306e-05, + "loss": 1.0723, + "step": 573 + }, + { + "epoch": 1.393681652490887, + "grad_norm": 106.84009565003795, + "learning_rate": 4.599738061189244e-05, + "loss": 1.149, + "step": 574 + }, + { + "epoch": 1.3961117861482382, + "grad_norm": 50.543394606036294, + "learning_rate": 4.565969507513981e-05, + "loss": 1.0991, + "step": 575 + }, + { + "epoch": 1.3985419198055893, + "grad_norm": 30.409124335052745, + "learning_rate": 4.532288655481062e-05, + "loss": 1.1157, + "step": 
576 + }, + { + "epoch": 1.4009720534629404, + "grad_norm": 89.92061876679301, + "learning_rate": 4.498696048677059e-05, + "loss": 1.1526, + "step": 577 + }, + { + "epoch": 1.4034021871202915, + "grad_norm": 84.27775422110602, + "learning_rate": 4.465192229264337e-05, + "loss": 1.1418, + "step": 578 + }, + { + "epoch": 1.4058323207776429, + "grad_norm": 40.7815489623743, + "learning_rate": 4.4317777379722866e-05, + "loss": 1.0831, + "step": 579 + }, + { + "epoch": 1.408262454434994, + "grad_norm": 66.6911504313278, + "learning_rate": 4.3984531140885943e-05, + "loss": 1.1088, + "step": 580 + }, + { + "epoch": 1.410692588092345, + "grad_norm": 137.00882181835217, + "learning_rate": 4.365218895450558e-05, + "loss": 1.1089, + "step": 581 + }, + { + "epoch": 1.4131227217496962, + "grad_norm": 41.139168895296855, + "learning_rate": 4.332075618436386e-05, + "loss": 1.1603, + "step": 582 + }, + { + "epoch": 1.4155528554070473, + "grad_norm": 35.443969765428506, + "learning_rate": 4.29902381795655e-05, + "loss": 1.0301, + "step": 583 + }, + { + "epoch": 1.4179829890643987, + "grad_norm": 32.931514576694674, + "learning_rate": 4.266064027445155e-05, + "loss": 1.1016, + "step": 584 + }, + { + "epoch": 1.4204131227217496, + "grad_norm": 64.21015694858382, + "learning_rate": 4.2331967788513295e-05, + "loss": 1.0789, + "step": 585 + }, + { + "epoch": 1.422843256379101, + "grad_norm": 84.13251752827094, + "learning_rate": 4.200422602630629e-05, + "loss": 1.1573, + "step": 586 + }, + { + "epoch": 1.425273390036452, + "grad_norm": 53.61636603108024, + "learning_rate": 4.167742027736482e-05, + "loss": 1.0942, + "step": 587 + }, + { + "epoch": 1.4277035236938032, + "grad_norm": 133.20877569415256, + "learning_rate": 4.135155581611661e-05, + "loss": 1.0877, + "step": 588 + }, + { + "epoch": 1.4301336573511543, + "grad_norm": 49.85736467319357, + "learning_rate": 4.102663790179764e-05, + "loss": 1.0619, + "step": 589 + }, + { + "epoch": 1.4325637910085054, + "grad_norm": 
91.13217639524017, + "learning_rate": 4.070267177836712e-05, + "loss": 1.1093, + "step": 590 + }, + { + "epoch": 1.4349939246658567, + "grad_norm": 49.25558128250457, + "learning_rate": 4.037966267442315e-05, + "loss": 1.1344, + "step": 591 + }, + { + "epoch": 1.4374240583232079, + "grad_norm": 95.87244356130316, + "learning_rate": 4.005761580311805e-05, + "loss": 1.0929, + "step": 592 + }, + { + "epoch": 1.439854191980559, + "grad_norm": 74.28903671045653, + "learning_rate": 3.973653636207437e-05, + "loss": 1.1263, + "step": 593 + }, + { + "epoch": 1.44228432563791, + "grad_norm": 53.99454529785116, + "learning_rate": 3.941642953330103e-05, + "loss": 1.0916, + "step": 594 + }, + { + "epoch": 1.4447144592952612, + "grad_norm": 113.26015597338959, + "learning_rate": 3.909730048310962e-05, + "loss": 1.1009, + "step": 595 + }, + { + "epoch": 1.4471445929526123, + "grad_norm": 134.4015550981493, + "learning_rate": 3.8779154362030986e-05, + "loss": 1.1351, + "step": 596 + }, + { + "epoch": 1.4495747266099634, + "grad_norm": 90.61611981238187, + "learning_rate": 3.846199630473216e-05, + "loss": 1.0827, + "step": 597 + }, + { + "epoch": 1.4520048602673148, + "grad_norm": 56.55050791518521, + "learning_rate": 3.814583142993352e-05, + "loss": 1.1145, + "step": 598 + }, + { + "epoch": 1.454434993924666, + "grad_norm": 265.6916535243014, + "learning_rate": 3.7830664840326145e-05, + "loss": 1.1459, + "step": 599 + }, + { + "epoch": 1.456865127582017, + "grad_norm": 72.81191101030372, + "learning_rate": 3.7516501622489367e-05, + "loss": 1.0903, + "step": 600 + }, + { + "epoch": 1.4592952612393681, + "grad_norm": 58.309143549086556, + "learning_rate": 3.720334684680889e-05, + "loss": 1.1041, + "step": 601 + }, + { + "epoch": 1.4617253948967193, + "grad_norm": 35.19205741792398, + "learning_rate": 3.689120556739475e-05, + "loss": 1.1523, + "step": 602 + }, + { + "epoch": 1.4641555285540706, + "grad_norm": 88.97226951757321, + "learning_rate": 3.6580082821999786e-05, + "loss": 
1.1117, + "step": 603 + }, + { + "epoch": 1.4665856622114215, + "grad_norm": 64.50873879301322, + "learning_rate": 3.6269983631938475e-05, + "loss": 1.1256, + "step": 604 + }, + { + "epoch": 1.4690157958687728, + "grad_norm": 78.10556611104111, + "learning_rate": 3.596091300200578e-05, + "loss": 1.0834, + "step": 605 + }, + { + "epoch": 1.471445929526124, + "grad_norm": 69.38449946362529, + "learning_rate": 3.565287592039628e-05, + "loss": 1.1026, + "step": 606 + }, + { + "epoch": 1.473876063183475, + "grad_norm": 79.60241521456905, + "learning_rate": 3.534587735862391e-05, + "loss": 1.0456, + "step": 607 + }, + { + "epoch": 1.4763061968408262, + "grad_norm": 89.68581306071424, + "learning_rate": 3.503992227144147e-05, + "loss": 1.0809, + "step": 608 + }, + { + "epoch": 1.4787363304981773, + "grad_norm": 68.570527237558, + "learning_rate": 3.473501559676088e-05, + "loss": 1.0754, + "step": 609 + }, + { + "epoch": 1.4811664641555287, + "grad_norm": 54.94762317625427, + "learning_rate": 3.4431162255573245e-05, + "loss": 1.1751, + "step": 610 + }, + { + "epoch": 1.4835965978128798, + "grad_norm": 109.12821602719706, + "learning_rate": 3.4128367151869714e-05, + "loss": 1.1055, + "step": 611 + }, + { + "epoch": 1.486026731470231, + "grad_norm": 198.79030469542352, + "learning_rate": 3.3826635172562094e-05, + "loss": 1.1369, + "step": 612 + }, + { + "epoch": 1.488456865127582, + "grad_norm": 62.002866716809, + "learning_rate": 3.352597118740404e-05, + "loss": 1.1611, + "step": 613 + }, + { + "epoch": 1.4908869987849331, + "grad_norm": 79.21193137029579, + "learning_rate": 3.3226380048912585e-05, + "loss": 1.1688, + "step": 614 + }, + { + "epoch": 1.4933171324422843, + "grad_norm": 68.6722934326242, + "learning_rate": 3.292786659228973e-05, + "loss": 1.1248, + "step": 615 + }, + { + "epoch": 1.4957472660996354, + "grad_norm": 104.34122241838278, + "learning_rate": 3.263043563534428e-05, + "loss": 1.1425, + "step": 616 + }, + { + "epoch": 1.4981773997569867, + "grad_norm": 
86.43862038340298, + "learning_rate": 3.233409197841437e-05, + "loss": 1.0562, + "step": 617 + }, + { + "epoch": 1.5006075334143378, + "grad_norm": 79.74137751394451, + "learning_rate": 3.2038840404289705e-05, + "loss": 1.1214, + "step": 618 + }, + { + "epoch": 1.5006075334143378, + "eval_loss": 1.1088899374008179, + "eval_runtime": 53.0545, + "eval_samples_per_second": 14.023, + "eval_steps_per_second": 1.753, + "step": 618 + }, + { + "epoch": 1.503037667071689, + "grad_norm": 126.19650708566132, + "learning_rate": 3.174468567813461e-05, + "loss": 1.181, + "step": 619 + }, + { + "epoch": 1.50546780072904, + "grad_norm": 64.86293986153461, + "learning_rate": 3.14516325474109e-05, + "loss": 1.0607, + "step": 620 + }, + { + "epoch": 1.5078979343863912, + "grad_norm": 62.06308896160908, + "learning_rate": 3.115968574180149e-05, + "loss": 1.0914, + "step": 621 + }, + { + "epoch": 1.5103280680437425, + "grad_norm": 168.27548636755165, + "learning_rate": 3.086884997313387e-05, + "loss": 1.1595, + "step": 622 + }, + { + "epoch": 1.5127582017010934, + "grad_norm": 156.46495738513647, + "learning_rate": 3.0579129935304066e-05, + "loss": 1.1263, + "step": 623 + }, + { + "epoch": 1.5151883353584448, + "grad_norm": 71.761765760571, + "learning_rate": 3.029053030420115e-05, + "loss": 1.049, + "step": 624 + }, + { + "epoch": 1.517618469015796, + "grad_norm": 87.26870047585324, + "learning_rate": 3.0003055737631403e-05, + "loss": 1.1917, + "step": 625 + }, + { + "epoch": 1.520048602673147, + "grad_norm": 142.01139847883954, + "learning_rate": 2.9716710875243326e-05, + "loss": 1.1038, + "step": 626 + }, + { + "epoch": 1.5224787363304981, + "grad_norm": 81.15254185021365, + "learning_rate": 2.9431500338452832e-05, + "loss": 1.0824, + "step": 627 + }, + { + "epoch": 1.5249088699878492, + "grad_norm": 68.21138775878333, + "learning_rate": 2.9147428730368475e-05, + "loss": 1.0676, + "step": 628 + }, + { + "epoch": 1.5273390036452006, + "grad_norm": 61.929977077152344, + 
"learning_rate": 2.886450063571735e-05, + "loss": 1.1928, + "step": 629 + }, + { + "epoch": 1.5297691373025515, + "grad_norm": 76.19248167649229, + "learning_rate": 2.858272062077091e-05, + "loss": 1.0737, + "step": 630 + }, + { + "epoch": 1.5321992709599028, + "grad_norm": 67.40817795826194, + "learning_rate": 2.8302093233271453e-05, + "loss": 1.0734, + "step": 631 + }, + { + "epoch": 1.534629404617254, + "grad_norm": 35.17352084915858, + "learning_rate": 2.802262300235857e-05, + "loss": 1.0062, + "step": 632 + }, + { + "epoch": 1.537059538274605, + "grad_norm": 97.0705094618675, + "learning_rate": 2.7744314438496088e-05, + "loss": 1.121, + "step": 633 + }, + { + "epoch": 1.5394896719319564, + "grad_norm": 52.21457659022329, + "learning_rate": 2.7467172033399458e-05, + "loss": 1.1864, + "step": 634 + }, + { + "epoch": 1.5419198055893073, + "grad_norm": 260.1057846866782, + "learning_rate": 2.7191200259962934e-05, + "loss": 1.1549, + "step": 635 + }, + { + "epoch": 1.5443499392466586, + "grad_norm": 66.65086231184844, + "learning_rate": 2.691640357218759e-05, + "loss": 1.1023, + "step": 636 + }, + { + "epoch": 1.5467800729040098, + "grad_norm": 680.8791021196618, + "learning_rate": 2.6642786405109475e-05, + "loss": 1.0943, + "step": 637 + }, + { + "epoch": 1.5492102065613609, + "grad_norm": 36.199872792671414, + "learning_rate": 2.6370353174727836e-05, + "loss": 1.0924, + "step": 638 + }, + { + "epoch": 1.551640340218712, + "grad_norm": 84.1148767833362, + "learning_rate": 2.6099108277934103e-05, + "loss": 1.1361, + "step": 639 + }, + { + "epoch": 1.5540704738760631, + "grad_norm": 81.84432345021693, + "learning_rate": 2.5829056092440662e-05, + "loss": 1.0868, + "step": 640 + }, + { + "epoch": 1.5565006075334145, + "grad_norm": 39.42683610456025, + "learning_rate": 2.556020097671046e-05, + "loss": 1.1506, + "step": 641 + }, + { + "epoch": 1.5589307411907654, + "grad_norm": 54.33249421192736, + "learning_rate": 2.5292547269886392e-05, + "loss": 1.0517, + "step": 642 
+ }, + { + "epoch": 1.5613608748481167, + "grad_norm": 410.5903072488164, + "learning_rate": 2.5026099291721516e-05, + "loss": 1.0995, + "step": 643 + }, + { + "epoch": 1.5637910085054678, + "grad_norm": 83.574545998207, + "learning_rate": 2.4760861342509233e-05, + "loss": 1.0792, + "step": 644 + }, + { + "epoch": 1.566221142162819, + "grad_norm": 399.66181496308434, + "learning_rate": 2.449683770301382e-05, + "loss": 1.2167, + "step": 645 + }, + { + "epoch": 1.56865127582017, + "grad_norm": 55.12309263364805, + "learning_rate": 2.4234032634401406e-05, + "loss": 1.0332, + "step": 646 + }, + { + "epoch": 1.5710814094775212, + "grad_norm": 61.30588953316776, + "learning_rate": 2.397245037817125e-05, + "loss": 1.0659, + "step": 647 + }, + { + "epoch": 1.5735115431348725, + "grad_norm": 75.74467195338701, + "learning_rate": 2.371209515608718e-05, + "loss": 1.1254, + "step": 648 + }, + { + "epoch": 1.5759416767922234, + "grad_norm": 67.98309962901806, + "learning_rate": 2.345297117010954e-05, + "loss": 1.1119, + "step": 649 + }, + { + "epoch": 1.5783718104495748, + "grad_norm": 59.08178521357814, + "learning_rate": 2.3195082602327312e-05, + "loss": 1.0866, + "step": 650 + }, + { + "epoch": 1.5808019441069259, + "grad_norm": 94.26571313695092, + "learning_rate": 2.2938433614890697e-05, + "loss": 1.1742, + "step": 651 + }, + { + "epoch": 1.583232077764277, + "grad_norm": 92.74387959878898, + "learning_rate": 2.2683028349943815e-05, + "loss": 1.1765, + "step": 652 + }, + { + "epoch": 1.5856622114216283, + "grad_norm": 54.0790750014235, + "learning_rate": 2.242887092955801e-05, + "loss": 1.0979, + "step": 653 + }, + { + "epoch": 1.5880923450789792, + "grad_norm": 55.72195824432094, + "learning_rate": 2.2175965455665226e-05, + "loss": 1.0826, + "step": 654 + }, + { + "epoch": 1.5905224787363306, + "grad_norm": 60.8162820416134, + "learning_rate": 2.1924316009991787e-05, + "loss": 1.0884, + "step": 655 + }, + { + "epoch": 1.5929526123936817, + "grad_norm": 67.20621804796278, 
+ "learning_rate": 2.167392665399256e-05, + "loss": 1.1426, + "step": 656 + }, + { + "epoch": 1.5953827460510328, + "grad_norm": 63.50889552696206, + "learning_rate": 2.1424801428785447e-05, + "loss": 1.1819, + "step": 657 + }, + { + "epoch": 1.597812879708384, + "grad_norm": 60.34121097929382, + "learning_rate": 2.1176944355086058e-05, + "loss": 1.1051, + "step": 658 + }, + { + "epoch": 1.600243013365735, + "grad_norm": 91.95807405182529, + "learning_rate": 2.0930359433142932e-05, + "loss": 1.0768, + "step": 659 + }, + { + "epoch": 1.6026731470230864, + "grad_norm": 33.84817514299781, + "learning_rate": 2.068505064267292e-05, + "loss": 1.1556, + "step": 660 + }, + { + "epoch": 1.6051032806804373, + "grad_norm": 44.846129252871364, + "learning_rate": 2.0441021942796944e-05, + "loss": 1.192, + "step": 661 + }, + { + "epoch": 1.6075334143377886, + "grad_norm": 104.85494442468764, + "learning_rate": 2.0198277271976052e-05, + "loss": 1.1912, + "step": 662 + }, + { + "epoch": 1.6099635479951397, + "grad_norm": 59.541562510020924, + "learning_rate": 1.995682054794803e-05, + "loss": 1.0932, + "step": 663 + }, + { + "epoch": 1.6123936816524909, + "grad_norm": 57.73876590809742, + "learning_rate": 1.9716655667664008e-05, + "loss": 1.1691, + "step": 664 + }, + { + "epoch": 1.6148238153098422, + "grad_norm": 37.00550106127363, + "learning_rate": 1.9477786507225616e-05, + "loss": 1.0974, + "step": 665 + }, + { + "epoch": 1.617253948967193, + "grad_norm": 271.6238263663105, + "learning_rate": 1.924021692182236e-05, + "loss": 1.1196, + "step": 666 + }, + { + "epoch": 1.6196840826245444, + "grad_norm": 69.94535819115217, + "learning_rate": 1.900395074566962e-05, + "loss": 1.1219, + "step": 667 + }, + { + "epoch": 1.6221142162818953, + "grad_norm": 64.77937566314249, + "learning_rate": 1.8768991791946456e-05, + "loss": 1.0457, + "step": 668 + }, + { + "epoch": 1.6245443499392467, + "grad_norm": 91.1799572658908, + "learning_rate": 1.8535343852734332e-05, + "loss": 1.1082, + 
"step": 669 + }, + { + "epoch": 1.6269744835965978, + "grad_norm": 140.3320781032681, + "learning_rate": 1.8303010698955804e-05, + "loss": 1.1587, + "step": 670 + }, + { + "epoch": 1.629404617253949, + "grad_norm": 129.9206563142473, + "learning_rate": 1.8071996080313602e-05, + "loss": 1.0436, + "step": 671 + }, + { + "epoch": 1.6318347509113003, + "grad_norm": 57.52355335064491, + "learning_rate": 1.784230372523018e-05, + "loss": 1.0777, + "step": 672 + }, + { + "epoch": 1.6342648845686512, + "grad_norm": 45.59691137086442, + "learning_rate": 1.76139373407876e-05, + "loss": 1.1133, + "step": 673 + }, + { + "epoch": 1.6366950182260025, + "grad_norm": 174.9829716096277, + "learning_rate": 1.7386900612667633e-05, + "loss": 1.1704, + "step": 674 + }, + { + "epoch": 1.6391251518833536, + "grad_norm": 106.67575565748977, + "learning_rate": 1.7161197205092216e-05, + "loss": 1.108, + "step": 675 + }, + { + "epoch": 1.6415552855407047, + "grad_norm": 80.2118578939736, + "learning_rate": 1.69368307607644e-05, + "loss": 1.1134, + "step": 676 + }, + { + "epoch": 1.6439854191980559, + "grad_norm": 50.075694613199865, + "learning_rate": 1.6713804900809582e-05, + "loss": 1.103, + "step": 677 + }, + { + "epoch": 1.646415552855407, + "grad_norm": 69.23038320811604, + "learning_rate": 1.649212322471695e-05, + "loss": 1.1189, + "step": 678 + }, + { + "epoch": 1.6488456865127583, + "grad_norm": 33.2935221457007, + "learning_rate": 1.6271789310281517e-05, + "loss": 1.0763, + "step": 679 + }, + { + "epoch": 1.6512758201701092, + "grad_norm": 74.75507124872362, + "learning_rate": 1.605280671354632e-05, + "loss": 1.0983, + "step": 680 + }, + { + "epoch": 1.6537059538274606, + "grad_norm": 72.6880045095337, + "learning_rate": 1.583517896874498e-05, + "loss": 1.1151, + "step": 681 + }, + { + "epoch": 1.6561360874848117, + "grad_norm": 59.70666181469054, + "learning_rate": 1.561890958824469e-05, + "loss": 1.1202, + "step": 682 + }, + { + "epoch": 1.6585662211421628, + "grad_norm": 
136.06883726877848, + "learning_rate": 1.540400206248963e-05, + "loss": 1.114, + "step": 683 + }, + { + "epoch": 1.6609963547995141, + "grad_norm": 48.25877797639542, + "learning_rate": 1.5190459859944505e-05, + "loss": 1.0926, + "step": 684 + }, + { + "epoch": 1.663426488456865, + "grad_norm": 99.27065031977625, + "learning_rate": 1.4978286427038601e-05, + "loss": 1.0938, + "step": 685 + }, + { + "epoch": 1.6658566221142164, + "grad_norm": 73.70604863380417, + "learning_rate": 1.4767485188110152e-05, + "loss": 1.0955, + "step": 686 + }, + { + "epoch": 1.6682867557715675, + "grad_norm": 97.29634642853938, + "learning_rate": 1.4558059545351143e-05, + "loss": 1.0993, + "step": 687 + }, + { + "epoch": 1.6707168894289186, + "grad_norm": 169.33237029052367, + "learning_rate": 1.435001287875234e-05, + "loss": 1.1484, + "step": 688 + }, + { + "epoch": 1.6731470230862697, + "grad_norm": 51.080335246500006, + "learning_rate": 1.4143348546048707e-05, + "loss": 1.1279, + "step": 689 + }, + { + "epoch": 1.6755771567436208, + "grad_norm": 123.74332262351422, + "learning_rate": 1.3938069882665328e-05, + "loss": 1.144, + "step": 690 + }, + { + "epoch": 1.6780072904009722, + "grad_norm": 150.6264388349919, + "learning_rate": 1.3734180201663439e-05, + "loss": 1.048, + "step": 691 + }, + { + "epoch": 1.680437424058323, + "grad_norm": 45.78978589208615, + "learning_rate": 1.3531682793687028e-05, + "loss": 1.0943, + "step": 692 + }, + { + "epoch": 1.6828675577156744, + "grad_norm": 59.23541668296553, + "learning_rate": 1.3330580926909763e-05, + "loss": 1.1422, + "step": 693 + }, + { + "epoch": 1.6852976913730255, + "grad_norm": 83.37564839198684, + "learning_rate": 1.3130877846982204e-05, + "loss": 1.1167, + "step": 694 + }, + { + "epoch": 1.6877278250303767, + "grad_norm": 169.89181363126755, + "learning_rate": 1.2932576776979377e-05, + "loss": 1.0153, + "step": 695 + }, + { + "epoch": 1.6901579586877278, + "grad_norm": 41.65359342112402, + "learning_rate": 1.2735680917348802e-05, + 
"loss": 1.0842, + "step": 696 + }, + { + "epoch": 1.692588092345079, + "grad_norm": 91.76072613046553, + "learning_rate": 1.2540193445858883e-05, + "loss": 1.1274, + "step": 697 + }, + { + "epoch": 1.6950182260024302, + "grad_norm": 86.16989165645253, + "learning_rate": 1.2346117517547551e-05, + "loss": 1.106, + "step": 698 + }, + { + "epoch": 1.6974483596597811, + "grad_norm": 75.86627467070798, + "learning_rate": 1.2153456264671337e-05, + "loss": 1.0642, + "step": 699 + }, + { + "epoch": 1.6998784933171325, + "grad_norm": 78.47579727138226, + "learning_rate": 1.1962212796654926e-05, + "loss": 1.053, + "step": 700 + }, + { + "epoch": 1.7023086269744836, + "grad_norm": 81.45952046323904, + "learning_rate": 1.1772390200040817e-05, + "loss": 1.1003, + "step": 701 + }, + { + "epoch": 1.7047387606318347, + "grad_norm": 81.5215081559605, + "learning_rate": 1.1583991538439598e-05, + "loss": 1.0789, + "step": 702 + }, + { + "epoch": 1.707168894289186, + "grad_norm": 123.8954411953181, + "learning_rate": 1.139701985248055e-05, + "loss": 1.0574, + "step": 703 + }, + { + "epoch": 1.709599027946537, + "grad_norm": 66.51876171521589, + "learning_rate": 1.1211478159762478e-05, + "loss": 1.0866, + "step": 704 + }, + { + "epoch": 1.7120291616038883, + "grad_norm": 88.7505135509034, + "learning_rate": 1.1027369454805058e-05, + "loss": 1.1039, + "step": 705 + }, + { + "epoch": 1.7144592952612394, + "grad_norm": 51.948320911337355, + "learning_rate": 1.0844696709000435e-05, + "loss": 1.0891, + "step": 706 + }, + { + "epoch": 1.7168894289185905, + "grad_norm": 116.12502404263041, + "learning_rate": 1.0663462870565411e-05, + "loss": 1.1284, + "step": 707 + }, + { + "epoch": 1.7193195625759417, + "grad_norm": 49.752442053177056, + "learning_rate": 1.0483670864493778e-05, + "loss": 1.11, + "step": 708 + }, + { + "epoch": 1.7217496962332928, + "grad_norm": 89.67691421405478, + "learning_rate": 1.0305323592509009e-05, + "loss": 1.1504, + "step": 709 + }, + { + "epoch": 1.7241798298906441, 
+ "grad_norm": 84.9951363796106, + "learning_rate": 1.0128423933017671e-05, + "loss": 1.1163, + "step": 710 + }, + { + "epoch": 1.726609963547995, + "grad_norm": 53.83015858877197, + "learning_rate": 9.952974741062703e-06, + "loss": 1.0768, + "step": 711 + }, + { + "epoch": 1.7290400972053463, + "grad_norm": 87.01137462153444, + "learning_rate": 9.77897884827752e-06, + "loss": 1.0505, + "step": 712 + }, + { + "epoch": 1.7314702308626975, + "grad_norm": 119.85348125427905, + "learning_rate": 9.606439062840256e-06, + "loss": 1.1866, + "step": 713 + }, + { + "epoch": 1.7339003645200486, + "grad_norm": 38.86482306830089, + "learning_rate": 9.435358169428442e-06, + "loss": 1.1203, + "step": 714 + }, + { + "epoch": 1.7363304981773997, + "grad_norm": 105.47836599222568, + "learning_rate": 9.265738929174051e-06, + "loss": 1.1219, + "step": 715 + }, + { + "epoch": 1.7387606318347508, + "grad_norm": 97.01504953945435, + "learning_rate": 9.097584079618893e-06, + "loss": 1.0897, + "step": 716 + }, + { + "epoch": 1.7411907654921022, + "grad_norm": 55.37203351389315, + "learning_rate": 8.93089633467058e-06, + "loss": 1.0747, + "step": 717 + }, + { + "epoch": 1.743620899149453, + "grad_norm": 53.68546468478919, + "learning_rate": 8.765678384558607e-06, + "loss": 1.0636, + "step": 718 + }, + { + "epoch": 1.7460510328068044, + "grad_norm": 93.22850661983693, + "learning_rate": 8.601932895790877e-06, + "loss": 1.0801, + "step": 719 + }, + { + "epoch": 1.7484811664641555, + "grad_norm": 75.10018201630282, + "learning_rate": 8.439662511110847e-06, + "loss": 1.1608, + "step": 720 + }, + { + "epoch": 1.7509113001215066, + "grad_norm": 75.88601313663253, + "learning_rate": 8.278869849454718e-06, + "loss": 1.0286, + "step": 721 + }, + { + "epoch": 1.7509113001215066, + "eval_loss": 1.1075224876403809, + "eval_runtime": 53.2869, + "eval_samples_per_second": 13.962, + "eval_steps_per_second": 1.745, + "step": 721 + }, + { + "epoch": 1.753341433778858, + "grad_norm": 75.94291636970333, + 
"learning_rate": 8.119557505909215e-06, + "loss": 1.1615, + "step": 722 + }, + { + "epoch": 1.7557715674362089, + "grad_norm": 85.61204534745477, + "learning_rate": 7.961728051669737e-06, + "loss": 1.1312, + "step": 723 + }, + { + "epoch": 1.7582017010935602, + "grad_norm": 42.0496338509614, + "learning_rate": 7.805384033998875e-06, + "loss": 1.1068, + "step": 724 + }, + { + "epoch": 1.7606318347509113, + "grad_norm": 67.9823900081791, + "learning_rate": 7.650527976185173e-06, + "loss": 1.134, + "step": 725 + }, + { + "epoch": 1.7630619684082625, + "grad_norm": 50.797982181202315, + "learning_rate": 7.497162377502542e-06, + "loss": 1.0903, + "step": 726 + }, + { + "epoch": 1.7654921020656136, + "grad_norm": 66.34495889496102, + "learning_rate": 7.3452897131698564e-06, + "loss": 1.0895, + "step": 727 + }, + { + "epoch": 1.7679222357229647, + "grad_norm": 97.21072984563654, + "learning_rate": 7.194912434311052e-06, + "loss": 1.0891, + "step": 728 + }, + { + "epoch": 1.770352369380316, + "grad_norm": 153.67433901334545, + "learning_rate": 7.046032967915483e-06, + "loss": 1.1057, + "step": 729 + }, + { + "epoch": 1.772782503037667, + "grad_norm": 65.34101790074203, + "learning_rate": 6.898653716798887e-06, + "loss": 1.1252, + "step": 730 + }, + { + "epoch": 1.7752126366950183, + "grad_norm": 60.35832905175029, + "learning_rate": 6.75277705956443e-06, + "loss": 1.1177, + "step": 731 + }, + { + "epoch": 1.7776427703523694, + "grad_norm": 47.338317641259096, + "learning_rate": 6.60840535056445e-06, + "loss": 1.0986, + "step": 732 + }, + { + "epoch": 1.7800729040097205, + "grad_norm": 50.2479403169235, + "learning_rate": 6.465540919862456e-06, + "loss": 1.0675, + "step": 733 + }, + { + "epoch": 1.7825030376670719, + "grad_norm": 76.05847584461722, + "learning_rate": 6.32418607319546e-06, + "loss": 1.0962, + "step": 734 + }, + { + "epoch": 1.7849331713244228, + "grad_norm": 5776.25484119808, + "learning_rate": 6.184343091936751e-06, + "loss": 1.1224, + "step": 735 + }, + { 
+ "epoch": 1.787363304981774, + "grad_norm": 60.26557281969165, + "learning_rate": 6.046014233059161e-06, + "loss": 1.1682, + "step": 736 + }, + { + "epoch": 1.789793438639125, + "grad_norm": 173.76865709745172, + "learning_rate": 5.909201729098579e-06, + "loss": 1.1463, + "step": 737 + }, + { + "epoch": 1.7922235722964763, + "grad_norm": 44.51475254326123, + "learning_rate": 5.77390778811796e-06, + "loss": 1.1127, + "step": 738 + }, + { + "epoch": 1.7946537059538274, + "grad_norm": 62.21753016508825, + "learning_rate": 5.640134593671598e-06, + "loss": 1.1897, + "step": 739 + }, + { + "epoch": 1.7970838396111786, + "grad_norm": 57.213736643350934, + "learning_rate": 5.5078843047700275e-06, + "loss": 1.1004, + "step": 740 + }, + { + "epoch": 1.79951397326853, + "grad_norm": 73.79879091710353, + "learning_rate": 5.3771590558450265e-06, + "loss": 1.2378, + "step": 741 + }, + { + "epoch": 1.8019441069258808, + "grad_norm": 69.33802717306622, + "learning_rate": 5.247960956715259e-06, + "loss": 1.078, + "step": 742 + }, + { + "epoch": 1.8043742405832321, + "grad_norm": 70.1118673770208, + "learning_rate": 5.12029209255227e-06, + "loss": 1.1082, + "step": 743 + }, + { + "epoch": 1.8068043742405833, + "grad_norm": 60.49720666164233, + "learning_rate": 4.994154523846695e-06, + "loss": 1.1694, + "step": 744 + }, + { + "epoch": 1.8092345078979344, + "grad_norm": 62.67576959014564, + "learning_rate": 4.869550286375091e-06, + "loss": 1.1017, + "step": 745 + }, + { + "epoch": 1.8116646415552855, + "grad_norm": 52.797557858037294, + "learning_rate": 4.746481391167068e-06, + "loss": 1.0547, + "step": 746 + }, + { + "epoch": 1.8140947752126366, + "grad_norm": 78.39250293351613, + "learning_rate": 4.624949824472858e-06, + "loss": 1.1395, + "step": 747 + }, + { + "epoch": 1.816524908869988, + "grad_norm": 345.5438304913529, + "learning_rate": 4.504957547731214e-06, + "loss": 1.1248, + "step": 748 + }, + { + "epoch": 1.8189550425273389, + "grad_norm": 128.04277202807285, + 
"learning_rate": 4.386506497537757e-06, + "loss": 1.2115, + "step": 749 + }, + { + "epoch": 1.8213851761846902, + "grad_norm": 81.9882481842496, + "learning_rate": 4.269598585613776e-06, + "loss": 1.071, + "step": 750 + }, + { + "epoch": 1.8238153098420413, + "grad_norm": 99.80236227862193, + "learning_rate": 4.154235698775277e-06, + "loss": 1.1591, + "step": 751 + }, + { + "epoch": 1.8262454434993924, + "grad_norm": 152.61066223998088, + "learning_rate": 4.040419698902631e-06, + "loss": 1.1322, + "step": 752 + }, + { + "epoch": 1.8286755771567438, + "grad_norm": 32.12973237305346, + "learning_rate": 3.928152422910491e-06, + "loss": 1.0985, + "step": 753 + }, + { + "epoch": 1.8311057108140947, + "grad_norm": 42.81358112745556, + "learning_rate": 3.817435682718096e-06, + "loss": 1.1252, + "step": 754 + }, + { + "epoch": 1.833535844471446, + "grad_norm": 54.37793958217706, + "learning_rate": 3.7082712652200867e-06, + "loss": 1.1263, + "step": 755 + }, + { + "epoch": 1.8359659781287971, + "grad_norm": 76.10189962024336, + "learning_rate": 3.6006609322576156e-06, + "loss": 1.2002, + "step": 756 + }, + { + "epoch": 1.8383961117861483, + "grad_norm": 65.3082792846633, + "learning_rate": 3.4946064205899965e-06, + "loss": 1.074, + "step": 757 + }, + { + "epoch": 1.8408262454434994, + "grad_norm": 51.04243707129297, + "learning_rate": 3.390109441866618e-06, + "loss": 1.1253, + "step": 758 + }, + { + "epoch": 1.8432563791008505, + "grad_norm": 162.25838422994087, + "learning_rate": 3.287171682599255e-06, + "loss": 1.0746, + "step": 759 + }, + { + "epoch": 1.8456865127582018, + "grad_norm": 53.42288615863692, + "learning_rate": 3.1857948041349894e-06, + "loss": 1.0434, + "step": 760 + }, + { + "epoch": 1.8481166464155527, + "grad_norm": 47.70669365600897, + "learning_rate": 3.085980442629288e-06, + "loss": 1.0694, + "step": 761 + }, + { + "epoch": 1.850546780072904, + "grad_norm": 35.5865078619899, + "learning_rate": 2.9877302090196346e-06, + "loss": 1.1292, + "step": 762 + 
}, + { + "epoch": 1.8529769137302552, + "grad_norm": 259.96541089150685, + "learning_rate": 2.8910456889995498e-06, + "loss": 1.1138, + "step": 763 + }, + { + "epoch": 1.8554070473876063, + "grad_norm": 100.9818148806922, + "learning_rate": 2.7959284429929456e-06, + "loss": 1.1414, + "step": 764 + }, + { + "epoch": 1.8578371810449574, + "grad_norm": 58.16414187512478, + "learning_rate": 2.7023800061289907e-06, + "loss": 1.1076, + "step": 765 + }, + { + "epoch": 1.8602673147023085, + "grad_norm": 48.78094545883972, + "learning_rate": 2.6104018882173064e-06, + "loss": 1.1061, + "step": 766 + }, + { + "epoch": 1.86269744835966, + "grad_norm": 32.1696048844329, + "learning_rate": 2.5199955737236104e-06, + "loss": 1.0771, + "step": 767 + }, + { + "epoch": 1.8651275820170108, + "grad_norm": 67.39153628388367, + "learning_rate": 2.4311625217457778e-06, + "loss": 1.1179, + "step": 768 + }, + { + "epoch": 1.8675577156743621, + "grad_norm": 57.3659391222766, + "learning_rate": 2.3439041659902407e-06, + "loss": 1.1348, + "step": 769 + }, + { + "epoch": 1.8699878493317132, + "grad_norm": 48.86332839622862, + "learning_rate": 2.2582219147489147e-06, + "loss": 1.067, + "step": 770 + }, + { + "epoch": 1.8724179829890644, + "grad_norm": 59.25100384913438, + "learning_rate": 2.174117150876398e-06, + "loss": 1.0814, + "step": 771 + }, + { + "epoch": 1.8748481166464157, + "grad_norm": 63.612150673079476, + "learning_rate": 2.091591231767709e-06, + "loss": 1.149, + "step": 772 + }, + { + "epoch": 1.8772782503037666, + "grad_norm": 76.64767939109268, + "learning_rate": 2.010645489336382e-06, + "loss": 1.1809, + "step": 773 + }, + { + "epoch": 1.879708383961118, + "grad_norm": 73.74931792569282, + "learning_rate": 1.9312812299929094e-06, + "loss": 1.1327, + "step": 774 + }, + { + "epoch": 1.882138517618469, + "grad_norm": 58.80672287733774, + "learning_rate": 1.8534997346237093e-06, + "loss": 1.0941, + "step": 775 + }, + { + "epoch": 1.8845686512758202, + "grad_norm": 
38.760707938386155, + "learning_rate": 1.777302258570479e-06, + "loss": 1.0785, + "step": 776 + }, + { + "epoch": 1.8869987849331713, + "grad_norm": 78.58894037195526, + "learning_rate": 1.7026900316098215e-06, + "loss": 1.1067, + "step": 777 + }, + { + "epoch": 1.8894289185905224, + "grad_norm": 34.985018454660455, + "learning_rate": 1.6296642579335496e-06, + "loss": 1.0454, + "step": 778 + }, + { + "epoch": 1.8918590522478738, + "grad_norm": 56.94816404212683, + "learning_rate": 1.5582261161291245e-06, + "loss": 1.1402, + "step": 779 + }, + { + "epoch": 1.8942891859052247, + "grad_norm": 88.47683213840897, + "learning_rate": 1.4883767591606924e-06, + "loss": 1.1847, + "step": 780 + }, + { + "epoch": 1.896719319562576, + "grad_norm": 164.4313185857989, + "learning_rate": 1.4201173143504888e-06, + "loss": 1.0246, + "step": 781 + }, + { + "epoch": 1.8991494532199271, + "grad_norm": 44.826186028268175, + "learning_rate": 1.3534488833605974e-06, + "loss": 1.1285, + "step": 782 + }, + { + "epoch": 1.9015795868772782, + "grad_norm": 75.6977790059232, + "learning_rate": 1.2883725421752201e-06, + "loss": 1.111, + "step": 783 + }, + { + "epoch": 1.9040097205346294, + "grad_norm": 74.26685984847175, + "learning_rate": 1.2248893410832685e-06, + "loss": 1.0741, + "step": 784 + }, + { + "epoch": 1.9064398541919805, + "grad_norm": 54.63352143603637, + "learning_rate": 1.1630003046614323e-06, + "loss": 1.0816, + "step": 785 + }, + { + "epoch": 1.9088699878493318, + "grad_norm": 55.31777597928668, + "learning_rate": 1.1027064317576385e-06, + "loss": 1.1589, + "step": 786 + }, + { + "epoch": 1.9113001215066827, + "grad_norm": 81.61311763219298, + "learning_rate": 1.0440086954749517e-06, + "loss": 1.1165, + "step": 787 + }, + { + "epoch": 1.913730255164034, + "grad_norm": 49.503370916119344, + "learning_rate": 9.869080431558542e-07, + "loss": 1.1239, + "step": 788 + }, + { + "epoch": 1.9161603888213852, + "grad_norm": 47.648410246233475, + "learning_rate": 9.314053963669245e-07, + 
"loss": 1.1121, + "step": 789 + }, + { + "epoch": 1.9185905224787363, + "grad_norm": 66.83906080469764, + "learning_rate": 8.775016508840272e-07, + "loss": 1.1461, + "step": 790 + }, + { + "epoch": 1.9210206561360876, + "grad_norm": 148.92765988977712, + "learning_rate": 8.251976766777913e-07, + "loss": 1.1627, + "step": 791 + }, + { + "epoch": 1.9234507897934385, + "grad_norm": 70.09639733538229, + "learning_rate": 7.744943178996101e-07, + "loss": 1.0935, + "step": 792 + }, + { + "epoch": 1.9258809234507899, + "grad_norm": 85.9264341638202, + "learning_rate": 7.253923928680406e-07, + "loss": 1.1071, + "step": 793 + }, + { + "epoch": 1.928311057108141, + "grad_norm": 76.17401772146403, + "learning_rate": 6.778926940555152e-07, + "loss": 1.1448, + "step": 794 + }, + { + "epoch": 1.930741190765492, + "grad_norm": 91.15259575797278, + "learning_rate": 6.319959880756177e-07, + "loss": 1.1101, + "step": 795 + }, + { + "epoch": 1.9331713244228432, + "grad_norm": 31.655747702448462, + "learning_rate": 5.877030156707042e-07, + "loss": 1.039, + "step": 796 + }, + { + "epoch": 1.9356014580801943, + "grad_norm": 50.84824944272368, + "learning_rate": 5.450144916999134e-07, + "loss": 1.0511, + "step": 797 + }, + { + "epoch": 1.9380315917375457, + "grad_norm": 65.68864220454576, + "learning_rate": 5.039311051276752e-07, + "loss": 1.1926, + "step": 798 + }, + { + "epoch": 1.9404617253948966, + "grad_norm": 87.84909056776107, + "learning_rate": 4.644535190125421e-07, + "loss": 1.1022, + "step": 799 + }, + { + "epoch": 1.942891859052248, + "grad_norm": 49.60648808282262, + "learning_rate": 4.2658237049655323e-07, + "loss": 1.1283, + "step": 800 + }, + { + "epoch": 1.945321992709599, + "grad_norm": 60.29916671699956, + "learning_rate": 3.903182707948649e-07, + "loss": 1.0659, + "step": 801 + }, + { + "epoch": 1.9477521263669502, + "grad_norm": 85.60352537490172, + "learning_rate": 3.556618051859584e-07, + "loss": 1.1473, + "step": 802 + }, + { + "epoch": 1.9501822600243013, + 
"grad_norm": 63.26392365183893, + "learning_rate": 3.2261353300219176e-07, + "loss": 1.1018, + "step": 803 + }, + { + "epoch": 1.9526123936816524, + "grad_norm": 58.902160704029676, + "learning_rate": 2.9117398762069647e-07, + "loss": 1.1158, + "step": 804 + }, + { + "epoch": 1.9550425273390037, + "grad_norm": 95.75043109531097, + "learning_rate": 2.613436764548505e-07, + "loss": 1.1034, + "step": 805 + }, + { + "epoch": 1.9574726609963546, + "grad_norm": 61.028532599630765, + "learning_rate": 2.3312308094607382e-07, + "loss": 1.1239, + "step": 806 + }, + { + "epoch": 1.959902794653706, + "grad_norm": 42.40393099941929, + "learning_rate": 2.0651265655603492e-07, + "loss": 1.0899, + "step": 807 + }, + { + "epoch": 1.962332928311057, + "grad_norm": 72.5324497099459, + "learning_rate": 1.8151283275928964e-07, + "loss": 1.0923, + "step": 808 + }, + { + "epoch": 1.9647630619684082, + "grad_norm": 70.09629889999877, + "learning_rate": 1.5812401303639813e-07, + "loss": 1.122, + "step": 809 + }, + { + "epoch": 1.9671931956257596, + "grad_norm": 51.82649450794088, + "learning_rate": 1.3634657486737424e-07, + "loss": 1.1976, + "step": 810 + }, + { + "epoch": 1.9696233292831105, + "grad_norm": 38.14749887357619, + "learning_rate": 1.1618086972559062e-07, + "loss": 1.1402, + "step": 811 + }, + { + "epoch": 1.9720534629404618, + "grad_norm": 63.952260610433626, + "learning_rate": 9.762722307213868e-08, + "loss": 1.1099, + "step": 812 + }, + { + "epoch": 1.974483596597813, + "grad_norm": 67.73936930746979, + "learning_rate": 8.068593435055505e-08, + "loss": 1.0666, + "step": 813 + }, + { + "epoch": 1.976913730255164, + "grad_norm": 199.80849466800598, + "learning_rate": 6.535727698199213e-08, + "loss": 1.1676, + "step": 814 + }, + { + "epoch": 1.9793438639125152, + "grad_norm": 128.36184695966296, + "learning_rate": 5.164149836077714e-08, + "loss": 1.1347, + "step": 815 + }, + { + "epoch": 1.9817739975698663, + "grad_norm": 117.93650898577977, + "learning_rate": 
3.953881985047092e-08, + "loss": 1.0093, + "step": 816 + }, + { + "epoch": 1.9842041312272176, + "grad_norm": 133.66858791525644, + "learning_rate": 2.9049436780281825e-08, + "loss": 1.132, + "step": 817 + }, + { + "epoch": 1.9866342648845685, + "grad_norm": 45.0102112528463, + "learning_rate": 2.0173518441868324e-08, + "loss": 1.1243, + "step": 818 + }, + { + "epoch": 1.9890643985419199, + "grad_norm": 99.86626236714106, + "learning_rate": 1.2911208086663351e-08, + "loss": 1.1503, + "step": 819 + }, + { + "epoch": 1.991494532199271, + "grad_norm": 76.9095216170618, + "learning_rate": 7.262622923531747e-09, + "loss": 1.1219, + "step": 820 + }, + { + "epoch": 1.993924665856622, + "grad_norm": 53.6297519446309, + "learning_rate": 3.2278541168717646e-09, + "loss": 1.1142, + "step": 821 + }, + { + "epoch": 1.9963547995139734, + "grad_norm": 113.49011128083137, + "learning_rate": 8.069667851939855e-10, + "loss": 1.1222, + "step": 822 + } + ], + "logging_steps": 1, + "max_steps": 822, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.218071900027093e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-822/training_args.bin b/checkpoint-822/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/checkpoint-822/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568 diff --git a/checkpoint-822/zero_to_fp32.py b/checkpoint-822/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- 
/dev/null +++ b/checkpoint-822/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cdf48b3a202fbd0a012bd0c34f94c036b0ff0d8c --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "_attn_implementation_autoset": true, + "architectures": [ + "Glm4ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "eos_token_id": 151336, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 23040, + "max_position_embeddings": 32768, + "model_type": "glm4", + "num_attention_heads": 48, + "num_hidden_layers": 61, + "num_key_value_heads": 2, + "pad_token_id": 151329, + "partial_rotary_factor": 0.5, + "rms_norm_eps": 1e-05, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 151552 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..243977cf4d01fec6edb0a8f97e440826b936aee7 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "do_sample": true, + "eos_token_id": [ + 151329, + 151336, + 151338 + ], + "pad_token_id": 151329, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7930155509434a0398f24149f374be2488e61633 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5ae215fc1360a879202bd292aed15250010a2d5206626b9aa54c8d5f4acc0df2 +size 57136 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool 
%}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568